diff --git a/.travis.yml b/.travis.yml index 2317f3e75..b85b5edd6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,8 +16,10 @@ install: - source ${VIRTUAL_ENV}/bin/activate - sudo -H ./provision.sh install_deps install_splash install_python_deps - sudo -H pip install tox -- python -c 'import splash, qt5reactor' # Check it's in the python path +- python2.7 -c 'import splash, qt5reactor' # Check it's in the python path - cd portiaui +- nvm install 6.10.0 +- nvm use 6.10.0 - npm install -g bower - npm install - bower install @@ -29,7 +31,7 @@ before_script: - source ${VIRTUAL_ENV}/bin/activate - export PYTHONPATH=`pwd`/slybot:`pwd`/slyd - cd slyd -- python tests/testserver/server.py 2>&1 | grep -v 'HTTP/1.1" 200' & +- python2.7 tests/testserver/server.py 2>&1 | grep -v 'HTTP/1.1" 200' & - cd .. - sleep 3 # give xvfb some time to start script: @@ -39,6 +41,7 @@ script: - ./manage.py test portia_orm.tests - ./manage.py test portia_api.tests - cd ../portiaui +- npm rebuild node-sass - npm test before_deploy: - cd ../slybot diff --git a/portia_server/portia_orm/models.py b/portia_server/portia_orm/models.py index 84720a01e..b7d7eb7cc 100644 --- a/portia_server/portia_orm/models.py +++ b/portia_server/portia_orm/models.py @@ -409,12 +409,13 @@ def migrate_sample(self, data): if not data.get('name'): data['name'] = (data.get('id', data.get('page_id', u'')[:20]) or strip_json(self.context['path'].split('/')[-1])) - if data.get('version', '') >= '0.13.1': + version = data.get('version', '') + if version == '0.13.0' or version >= '0.13.1': return data if any(body in data for body in ('original_body', 'rendered_body')): self._migrate_html(self, data) schemas = json.load(self.context['storage'].open('items.json')) - if data.get('version', '') > '0.13.0': + if version > '0.13.0': schema_id, new_schemas = guess_schema(data, schemas) self._add_schemas(self, new_schemas) # Add the most likely schema id to the base containers if needed diff --git a/portia_server/portia_orm/tests/test_model.py b/portia_server/portia_orm/tests/test_model.py index 65398d4f5..d8c08c5a4 100644 --- a/portia_server/portia_orm/tests/test_model.py +++ b/portia_server/portia_orm/tests/test_model.py @@ -1476,7 +1476,7 @@ def test_save_edit(self): spider.id = 'test-id' spider.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -1571,7 +1571,7 @@ def test_delete(self): spider = project.spiders['shop-crawler'] spider.delete() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 3) self.storage.open.assert_has_calls([ mock.call('items.json'), mock.call('spiders/shop-crawler.json'), @@ -1750,7 +1750,7 @@ def test_load_through_project(self): 'version': SLYBOT_VERSION, }, ]) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -1859,7 +1859,7 @@ def test_load_through_partial(self): 'extractors': {}, 'version': SLYBOT_VERSION, }) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -1869,7 +1869,7 @@ def test_save_edit(self): spider=Spider(self.storage, id='shop-crawler')) sample.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -1882,7 +1882,7 @@ def test_save_edit(self): sample.page_id = sample.id sample.save() - self.assertEqual(self.storage.open.call_count, 5) + self.assertEqual(self.storage.open.call_count, 3) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -1993,7 +1993,7 @@ def test_save_edit(self): sample.id = 'test-id' sample.save() - self.assertEqual(self.storage.open.call_count, 5) + self.assertEqual(self.storage.open.call_count, 3) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -2206,7 +2206,7 @@ def test_delete(self): sample = spider.samples['1ddc-4043-ac4d'] sample.delete() - self.assertEqual(self.storage.open.call_count, 6) + self.assertEqual(self.storage.open.call_count, 5) self.storage.open.assert_has_calls([ mock.call('items.json'), mock.call('spiders/shop-crawler.json'), @@ -2580,7 +2580,7 @@ def test_load_through_project(self): items = project.spiders['shop-crawler'].samples['1ddc-4043-ac4d'].items self.assertListEqual(items.keys(), ['1e47-4833-a4d4']) self.assertIsInstance(items, Item.collection) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -2701,7 +2701,7 @@ def test_load_through_project(self): 'text-content': '#portia-content', }, ]) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -2827,11 +2827,10 @@ def test_load_through_partial(self): 'tagid': None, 'text-content': '#portia-content', }) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) def test_save_edit(self): item = Item( @@ -2841,21 +2840,19 @@ def test_save_edit(self): spider=Spider(self.storage, id='shop-crawler'))) item.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.storage.save.assert_not_called() item.selector = '#test' item.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.storage.save.assert_called_once_with( 'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY) self.assertEqual( @@ -3005,11 +3002,10 @@ def test_save_edit(self): item.id = 'test-id' item.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.assertEqual(self.storage.save.call_count, 2) self.storage.save.assert_has_calls([ mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY), @@ -3166,11 +3162,10 @@ def test_save_new(self): item = Item(self.storage, id='test1', sample=sample) item.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.storage.save.assert_called_once_with( 'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY) self.assertEqual( @@ -3337,11 +3332,10 @@ def test_save_new(self): repeated_selector='.yyy')) sample.items[0].save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.assertEqual(self.storage.save.call_count, 2) self.storage.save.assert_has_calls([ mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY), @@ -3545,7 +3539,7 @@ def test_delete(self): item = sample.items['1e47-4833-a4d4'] item.delete() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 3) self.storage.open.assert_has_calls([ mock.call('items.json'), mock.call('spiders/shop-crawler.json'), @@ -3689,7 +3683,7 @@ def test_load_through_project(self): ['3606-4d68-a6a0|d1e2-4673-a72a', '5c18-40cf-8809|de35-49b5-b90b']) self.assertIsInstance(annotations, BaseAnnotation.collection) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -3749,7 +3743,7 @@ def test_load_through_project(self): 'xpath': None, }, ]) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) @@ -3790,11 +3784,10 @@ def test_load_through_partial(self): 'tagid': None, 'xpath': None, }) - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) def test_save_edit(self): annotation = Annotation( @@ -3806,21 +3799,19 @@ def test_save_edit(self): spider=Spider(self.storage, id='shop-crawler')))) annotation.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.storage.save.assert_not_called() annotation.selector = '.test' annotation.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.storage.save.assert_called_once_with( 'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY) self.assertEqual( @@ -3930,11 +3921,10 @@ def test_save_edit(self): annotation.id = 'test-id|data-id' annotation.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.assertEqual(self.storage.save.call_count, 2) self.storage.save.assert_has_calls([ mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY), @@ -4052,11 +4042,10 @@ def test_save_new(self): annotation = Annotation(self.storage, id='test1|data1', parent=item) annotation.save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.storage.save.assert_called_once_with( 'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY) self.assertEqual( @@ -4189,11 +4178,10 @@ def test_save_new(self): item.annotations.insert(0, Annotation(self.storage, id='test2|data2')) item.annotations[0].save() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 2) self.storage.open.assert_has_calls([ mock.call('spiders/shop-crawler.json'), - mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'), - mock.call('items.json')]) + mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')]) self.assertEqual(self.storage.save.call_count, 2) self.storage.save.assert_has_calls([ mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY), @@ -4360,7 +4348,7 @@ def test_delete(self): annotation = item.annotations['3606-4d68-a6a0|d1e2-4673-a72a'] annotation.delete() - self.assertEqual(self.storage.open.call_count, 4) + self.assertEqual(self.storage.open.call_count, 3) self.storage.open.assert_has_calls([ mock.call('items.json'), mock.call('spiders/shop-crawler.json'), diff --git a/slybot/CHANGES b/slybot/CHANGES index bd5274fdd..4292371e8 100644 --- a/slybot/CHANGES +++ b/slybot/CHANGES @@ -1,4 +1,13 @@ -Slybot 0.13.0b37 - next +Slybot 0.13.0 + +Update slybot to use the most recent libraries +Add a DropMetaPipeline to remove unwanted meta fields from items +Allow `\r` character to be used in start url feeds +Use `rendered_html` when building sample if JS enabled +Limit CSS selector annotations to a single element +Allow scrapy shell to find a spider if the name matches the url hostname + +Slybot 0.13.0b37 Do not create repeated extractor for CSS/XPath annotations Handle parsing Empty HTML response without raising error diff --git a/slybot/requirements-test.txt b/slybot/requirements-test.txt index f679f718e..356d1aa8f 100644 --- a/slybot/requirements-test.txt +++ b/slybot/requirements-test.txt @@ -2,3 +2,4 @@ tox==2.5.0 nose==1.3.7 nose-timer==0.6.0 doctest-ignore-unicode==0.1.2 +setuptools>=36.0.1 diff --git a/slybot/slybot/__init__.py b/slybot/slybot/__init__.py index 730c3be36..2d7893e3d 100644 --- a/slybot/slybot/__init__.py +++ b/slybot/slybot/__init__.py @@ -1 +1 @@ -__version__ = '0.13.0b37' +__version__ = '0.13.0' diff --git a/slybot/slybot/plugins/scrapely_annotations/annotations.py b/slybot/slybot/plugins/scrapely_annotations/annotations.py index 6562d3365..c18e9e498 100644 --- a/slybot/slybot/plugins/scrapely_annotations/annotations.py +++ b/slybot/slybot/plugins/scrapely_annotations/annotations.py @@ -112,10 +112,14 @@ def setup_bot(self, settings, spider, spec, items, extractors, logger): self.clustering = None def _get_annotated_template(self, template): - if (template.get('version', '0.12.0') >= '0.13.0' and - not template.get('annotated')): + changed = False + if template.get('version', '0.12.0') >= '0.13.0': using_js = self.spider._filter_js_urls(template['url']) - template['body'] = 'rendered_body' if using_js else 'original_body' + body = 'rendered_body' if using_js else 'original_body' + if template.get('body') != body: + template['body'] = body + changed = True + if changed or not template.get('annotated'): _build_sample(template) return template diff --git a/slybot/slybot/plugins/scrapely_annotations/builder.py b/slybot/slybot/plugins/scrapely_annotations/builder.py index c261f7558..9f4a6d149 100644 --- a/slybot/slybot/plugins/scrapely_annotations/builder.py +++ b/slybot/slybot/plugins/scrapely_annotations/builder.py @@ -1,3 +1,4 @@ +import copy import json import six @@ -18,6 +19,8 @@ class Annotations(object): def __init__(self, sample, **options): self.sample = sample + page_id = sample.get('page_id') or sample.get('id') or "" + sample['page_id'] = page_id plugins = sample.setdefault('plugins', {}) self.data = plugins.setdefault('annotations-plugin', {'extracts': []}) self.annotations = self.data['extracts'] @@ -32,13 +35,8 @@ def html(self): template = self.sample body = template.get('body') or 'original_body' if body not in template: - if 'original_body' in template: - body = 'original_body' - else: - bodies = [k for k, v in template.items() - if v and k.endswith('_body')] - if bodies: - body = bodies[0] + body = next((k for k, v in template.items() + if v and k.endswith('_body')), body) self._html = template[body] return self._html @@ -306,7 +304,7 @@ def apply(self): def split(self): selector, tagid = [], [] - for ann in self.annotations: + for ann in copy.deepcopy(self.annotations): if ann: if ann.get('selector'): selector.append(ann) diff --git a/slybot/slybot/plugins/scrapely_annotations/processors.py b/slybot/slybot/plugins/scrapely_annotations/processors.py index 61d84824d..a71001425 100644 --- a/slybot/slybot/plugins/scrapely_annotations/processors.py +++ b/slybot/slybot/plugins/scrapely_annotations/processors.py @@ -81,6 +81,10 @@ def region_id(self): def metadata(self): return self.annotation.metadata + @cached_property + def repeated(self): + return self.metadata.get('repeated', False) + def attribute_query(self, metadata): """Extract attribute or content from a region.""" content_field = metadata.get(u'text-content', u'content') @@ -200,6 +204,12 @@ def _selector_annotations(self): new_attribute.update(attribute) yield new_attribute + @staticmethod + def get_region_id(region): + if hasattr(region, 'attributes'): + return region.attributes.get('data-tagid') + return region + def _process_css_and_xpath(self, annotations, selector): schema, modifiers, page = self.schema, self.modifiers, self.htmlpage region_ids = list(filter(bool, (region_id(r) for r in self.regions))) @@ -207,7 +217,13 @@ def _process_css_and_xpath(self, annotations, selector): parents = {e._root for e in selector.css(query)} containers = () if self.parent_region: - pquery = '[data-tagid="%s"]' % self.parent_region + if isinstance(self.parent_region, list): + pquery = ', '.join( + '[data-tagid="{}"]'.format(self.get_region_id(r)) + for r in self.parent_region) + else: + pquery = '[data-tagid="{}"]'.format( + self.get_region_id(self.parent_region)) containers = {e._root for e in selector.css(pquery)} for i, a in enumerate(annotations, start=len(self.fields)): mode = a.get(u'selection_mode') @@ -244,7 +260,7 @@ def _pick_elems(self, elements, parents, containers): other_elements.append(element) if closest_elements: return closest_elements - elif other_elements: + elif (self.repeated and containers) or other_elements: return other_elements return elements diff --git a/slybot/slybot/spidermanager.py b/slybot/slybot/spidermanager.py index 07cde36a8..13d166bf5 100644 --- a/slybot/slybot/spidermanager.py +++ b/slybot/slybot/spidermanager.py @@ -4,6 +4,8 @@ import atexit import logging +from six.moves.urllib.parse import urlparse + import slybot from zipfile import ZipFile @@ -67,8 +69,10 @@ def list(self): return list(self._specs["spiders"].keys()) def find_by_request(self, request): - """Placeholder to meet SpiderManager interface""" - raise NotImplementedError() + parsed = urlparse(request.url) + if parsed.hostname in self._specs['spiders'].spider_names: + return [parsed.hostname] + # TODO: Look at start urls and samples class ZipfileSlybotSpiderManager(SlybotSpiderManager): diff --git a/slybot/slybot/tests/data/templates/firmen.wko.at.html b/slybot/slybot/tests/data/templates/firmen.wko.at.html new file mode 100644 index 000000000..9fa3f400c --- /dev/null +++ b/slybot/slybot/tests/data/templates/firmen.wko.at.html @@ -0,0 +1,1353 @@ + + + + + Elektrotechnik - Firmen A-Z + + + + +
+ + + + + + + + + + + + + + + + + + + arrow-up + + + + arrow-right + + + + arrow-left + + + + arrow-down + + + + close + + + + help + + + + popup + + + + + + + + phone + + + + mobile + + + + fax + + + + email + + + + + web + + + + + + + + + + editieren + + + + + + location + + + + search + + + + th + + + + + location-1 + + + + + location-2 + + 2 + + + location-3 + + 3 + + + location-4 + + 4 + + + location-5 + + 5 + + + location-6 + + 6 + + + location-7 + + 7 + + + location-8 + + 8 + + + location-9 + + 9 + + + location-10 + + 10 + + + + + +
+ + + +
+
+ + +
+ +
+ +
+ + +
+
+

Ihre Suchabfrage Elektrotechnik liefert mehr als 1000 Treffer

+
+
+ + +
+
+ + + + +
+ + + +
+
+

Filtern nach:

+
+
+ +
+
+ + +
+
+ +
+
+

Treffer ohne Kontaktdaten:

+
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+

Karte

+ +
+ + +
+
+

Sortieren nach:

+
+
+ +
+
+ +
+
+
+
+ +
+
+ + +
+
+ + + + +
+

A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H.

+ + +
+
+
+ + + +
+
+
+ Standort + + + +
+ +
+
+
Am Tabor 1-3
+ 1020 + Wien +
Elektrotechnik
+
+
+
+ + + + +
+ + + + Telefon 01/7299548 +
+ + + + +
+
+ + + + + + + + + + +
+
+
+
+ + + +
+
+
+ + + +
+
+
+ Standort + + + +
+ +
+
+
Kolpingstraße 4
+ 1230 + Wien +
Elektrotechnik
+
+
+
+ + + + +
+ + + + Telefon 01/7299548 +
+ + + + +
+
+ + + + + + + + + + +
+
+
+
+ + +
+ +
+ +
+
+
+ + + +
+
+
+ Standort + + + +
+ +
+
+
Hauptstraße 53/1
+ 4092 + Esternberg +
Elektrotechnik; Errichter von Alarmanlagen
+
+
+
+ + + + +
+ + + + Telefon +43 (720) 59001910 +
+ + + + + +
+ + + + E-Mail + office@abmtech.at +
+ + +
+
+ + + + + + + + + + +
+
+
+
+ + +
+ + + + + + + + + + +
+
+ + +
+ +
+ +
+
+ +
+ +
+ + + + + + + + + + + + + diff --git a/slybot/slybot/tests/data/templates/firmen.wko.at.json b/slybot/slybot/tests/data/templates/firmen.wko.at.json new file mode 100644 index 000000000..3bd598891 --- /dev/null +++ b/slybot/slybot/tests/data/templates/firmen.wko.at.json @@ -0,0 +1,609 @@ +{ + "extractors": {}, + "id": "2688-4a8e-8b29", + "name": "Elektrotechnik - Firmen A-Z", + "page_id": "", + "page_type": "item", + "plugins": { + "annotations-plugin": { + "extracts": [ + { + "annotations": { + "#portia-content": "#dummy" + }, + "container_id": null, + "id": "75f8-4fdc-b4e3#parent", + "item_container": true, + "repeated": false, + "required": [], + "schema_id": "020e-42fe-8006", + "selector": "#result > .container", + "siblings": 0, + "tagid": null, + "text-content": "#portia-content" + }, + { + "annotations": { + "#portia-content": "#dummy" + }, + "container_id": "75f8-4fdc-b4e3#parent", + "id": "75f8-4fdc-b4e3", + "item_container": true, + "repeated": true, + "required": [], + "schema_id": "020e-42fe-8006", + "selector": ".col1-right", + "siblings": 0, + "tagid": null, + "text-content": "#portia-content" + }, + { + "accept_selectors": [ + "[itemprop= \"name\"]" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "e04a-4184-91a1": { + "attribute": "content", + "extractors": {}, + "field": "5adf-43c6-8113", + "required": false + } + }, + "id": "24dd-4492-95a6", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": true, + "required": [], + "selection_mode": "css", + "selector": "[itemprop= \"name\"]", + "tagid": null, + "text-content": "content", + "xpath": "//" + }, + { + "accept_selectors": [ + "[itemprop=\"streetAddress\"]" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "cd41-429d-a779": { + "attribute": "content", + "extractors": {}, + "field": "0247-457c-8591", + "required": false + } + }, + "id": "80f0-4a7f-a632", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": true, + "required": [], + "selection_mode": "css", + "selector": "[itemprop=\"streetAddress\"]", + "tagid": null, + "text-content": "content", + "xpath": "//" + }, + { + "accept_selectors": [ + "[itemprop=\"addressLocality\"]" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "4b2e-4235-9bb3": { + "attribute": "content", + "extractors": {}, + "field": "b9eb-4347-b68e", + "required": false + } + }, + "id": "0523-45f0-b261", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": true, + "required": [], + "selection_mode": "css", + "selector": "[itemprop=\"addressLocality\"]", + "tagid": null, + "text-content": "content", + "xpath": "//" + }, + { + "accept_selectors": [ + "div [class = \"authorization\"]" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "c230-47ae-abfd": { + "attribute": "content", + "extractors": {}, + "field": "ae31-4c8e-b4e9", + "required": false + } + }, + "id": "4cd3-42fd-b9ad", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": true, + "required": [], + "selection_mode": "css", + "selector": "div [class = \"authorization\"]", + "tagid": null, + "text-content": "content", + "xpath": "//" + }, + { + "accept_selectors": [ + "article:nth-child(1) > .row > .col-md-12 > .row > .col1 > .col1-right > .zip", + "article:nth-child(2) > .row > .col-md-12 > .row > .col1 > .col1-right > .zip" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "5a8e-420e-ac4a": { + "attribute": "content", + "extractors": {}, + "field": "d07b-407d-9306", + "required": false + } + }, + "id": "7515-40d7-b291", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": false, + "required": [], + "selection_mode": "auto", + "selector": ".col1-right > .zip", + "tagid": null, + "text-content": "content", + "xpath": "//*[contains(concat(\" \", @class, \" \"), \" col1-right \")]/*[contains(concat(\" \", @class, \" \"), \" zip \")]" + }, + { + "accept_selectors": [ + "[itemprop=\"email\"]" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "5af1-424e-bc2b": { + "attribute": "content", + "extractors": {}, + "field": "3a2a-459d-9468", + "required": false + } + }, + "id": "b23c-4d06-8b73", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": false, + "required": [], + "selection_mode": "css", + "selector": "[itemprop=\"email\"]", + "tagid": null, + "text-content": "content", + "xpath": "//" + }, + { + "accept_selectors": [ + "[itemprop=\"url\"]" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "9504-477d-b1c7": { + "attribute": "href", + "extractors": {}, + "field": "27f0-400d-98a7", + "required": false + } + }, + "id": "7884-4ce8-93f4", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": false, + "required": [], + "selection_mode": "css", + "selector": "[itemprop=\"url\"]", + "tagid": null, + "text-content": "content", + "xpath": "//" + }, + { + "accept_selectors": [ + "div[class=\"icon-mobile vcard__info\"] > a" + ], + "container_id": "75f8-4fdc-b4e3", + "data": { + "88ce-4d5e-8c08": { + "attribute": "content", + "extractors": {}, + "field": "db09-4a64-9f64", + "required": false + } + }, + "id": "be35-47ad-987d", + "post_text": null, + "pre_text": null, + "reject_selectors": [], + "repeated": false, + "required": [], + "selection_mode": "css", + "selector": "div[class=\"icon-mobile vcard__info\"] > a", + "tagid": null, + "text-content": "content", + "xpath": "///a" + } + ] + } + }, + "results": [{ + "website": [ + "http://www.3e-technik.at/" + ], + "city": [ + "Wien" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "3E-Technik e.U." + ], + "_index": 1, + "mobile": [ + "+43 699 112 199 14" + ], + "url": "http://url", + "street": [ + "Grellgasse 9/3/2" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "office@3e-technik.at" + ] + }, { + "website": [ + "http://www.elektro-jamnik.at/" + ], + "city": [ + "Seefeld in Tirol" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "A. Jamnik Elektro GmbH" + ], + "_index": 5, + "mobile": [ + "A. Jamnik Elektro GmbH" + ], + "url": "http://url", + "street": [ + "M\u00fcnchner Stra\u00dfe 487" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "a.jamnik.elektro@aon.at" + ] + }, { + "website": [ + "http://url/Web/DetailsKontakt.aspx?FirmaID=745d2dc3-c0e2-4c26-bf14-b83ba23d4849&StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Page=1&Filter=1" + ], + "city": [ + "Wien" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H." + ], + "_index": 9, + "mobile": [ + "A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H." + ], + "url": "http://url", + "street": [ + "Am Tabor 1-3" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H." + ] + }, { + "website": [ + "http://url/Web/DetailsKontakt.aspx?FirmaID=bed7189f-12e0-4613-a301-f18d2c9e3529&StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Page=1&Filter=1" + ], + "city": [ + "Wien" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H." + ], + "_index": 13, + "mobile": [ + "A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H." + ], + "url": "http://url", + "street": [ + "Kolpingstra\u00dfe 4" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H." + ] + }, { + "website": [ + "http://www.aaves.at/" + ], + "city": [ + "Ebenthal" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "AAVES Sicherheitstechnik GmbH" + ], + "_index": 17, + "mobile": [ + "+43 (0) 664 203 75 95" + ], + "url": "http://url", + "street": [ + "Gewerbezone, Zeiss Stra\u00dfe 16" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "office@aaves.at" + ] + }, { + "website": [ + "http://url/Web/DetailsKontakt.aspx?FirmaID=c1a2b9b2-e9be-41fe-a427-4f1301e8159b&StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Page=1&Filter=1" + ], + "city": [ + "Esternberg" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "ABM Tech GmbH" + ], + "_index": 21, + "mobile": [ + "ABM Tech GmbH" + ], + "url": "http://url", + "street": [ + "Hauptstra\u00dfe 53/1" + ], + "branch": [ + "Elektrotechnik; Errichter von Alarmanlagen" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "office@abmtech.at" + ] + }, { + "website": [ + "http://www.elektro-acherer.at/" + ], + "city": [ + "Kufstein" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "Harald Acherer - ELEKTRO ACHERER" + ], + "_index": 25, + "mobile": [ + "Harald Acherer - ELEKTRO ACHERER" + ], + "url": "http://url", + "street": [ + "Herzog-Erich-Stra\u00dfe 12" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "el.acherer@kufnet.at" + ] + }, { + "website": [ + "http://www.aes-energietechnik.at/" + ], + "city": [ + "Ottenschlag" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "AES - Energie Technik GmbH Alternative Energie Systeme" + ], + "_index": 29, + "mobile": [ + "0664 885 018 83" + ], + "url": "http://url", + "street": [ + "Spitzerstr. 24" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "office@aes-energietechnik.at" + ] + }, { + "website": [ + "http://www.aes.co.at/" + ], + "city": [ + "Treibach-Althofen" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "AES alternativENERGIEsysteme e.U." + ], + "_index": 33, + "mobile": [ + "+43 664 307 47 03" + ], + "url": "http://url", + "street": [ + "Hauptplatz 3 - 4" + ], + "branch": [ + "Errichter von Photovoltaikanlagen" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "elpo1@gmx.at" + ] + }, { + "website": [ + "http://www.agetech.at/" + ], + "city": [ + "Lienz" + ], + "_type": "Elektrotechnik - Firmen A-Z", + "name": [ + "AGEtech GmbH - smart electric" + ], + "_index": 37, + "mobile": [ + "AGEtech GmbH - smart electric" + ], + "url": "http://url", + "street": [ + "Beda-Weber-Gasse 10" + ], + "branch": [ + "Elektrotechnik" + ], + "_template": "2688-4a8e-8b29", + "email": [ + "info@agetech.at" + ] + } + ], + "scrapes": "020e-42fe-8006", + "schemas": { + "020e-42fe-8006": { + "fields": { + "5adf-43c6-8113": { + "id": "5adf-43c6-8113", + "name": "name", + "required": true, + "type": "text", + "vary": false + }, + "0247-457c-8591": { + "id": "0247-457c-8591", + "name": "street", + "required": false, + "type": "text", + "vary": false + }, + "84c7-48fa-b3cb": { + "id": "84c7-48fa-b3cb", + "name": "plz", + "required": false, + "type": "text", + "vary": false + }, + "b9eb-4347-b68e": { + "id": "b9eb-4347-b68e", + "name": "city", + "required": false, + "type": "text", + "vary": false + }, + "ae31-4c8e-b4e9": { + "id": "ae31-4c8e-b4e9", + "name": "branch", + "required": true, + "type": "text", + "vary": false + }, + "8953-460a-b810": { + "id": "8953-460a-b810", + "name": "landline", + "required": false, + "type": "text", + "vary": false + }, + "7a4b-4723-a3a1": { + "id": "7a4b-4723-a3a1", + "name": "phone_2", + "required": false, + "type": "text", + "vary": false + }, + "db09-4a64-9f64": { + "id": "db09-4a64-9f64", + "name": "mobile", + "required": false, + "type": "text", + "vary": false + }, + "e706-4d94-be4f": { + "id": "e706-4d94-be4f", + "name": "fax", + "required": false, + "type": "text", + "vary": false + }, + "2e6e-498d-81c3": { + "id": "2e6e-498d-81c3", + "name": "e-mail", + "required": false, + "type": "text", + "vary": false + }, + "27f0-400d-98a7": { + "id": "27f0-400d-98a7", + "name": "website", + "required": false, + "type": "url", + "vary": false + }, + "d07b-407d-9306": { + "auto_created": true, + "id": "d07b-407d-9306", + "name": "field1", + "required": false, + "type": "text", + "vary": false + }, + "3a2a-459d-9468": { + "id": "3a2a-459d-9468", + "name": "email", + "required": false, + "type": "text", + "vary": false + } + }, + "name": "Elektrotechnik - Firmen A-Z" + } + }, + "spider": "firmen.wko.at", + "url": "https://firmen.wko.at/Web/Ergebnis.aspx?StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Filter=1&Page=3", + "version": "0.13.0b37" +} diff --git a/slybot/slybot/tests/test_dropmeta.py b/slybot/slybot/tests/test_dropmeta.py index e0a5ea305..0f62a0ab2 100644 --- a/slybot/slybot/tests/test_dropmeta.py +++ b/slybot/slybot/tests/test_dropmeta.py @@ -8,7 +8,7 @@ from .utils import PATH -class DupeFilterTest(TestCase): +class DropMetaTest(TestCase): def test_dupefilter(self): smanager = SlybotSpiderManager("%s/data/SampleProject" % PATH) @@ -19,10 +19,39 @@ def test_dupefilter(self): result = { "breadcrumbs": ["Home", "Books", "Mystery"], "description": [ - u"WICKED above her hipbone, GIRL across her heart Words are like a road map to reporter Camille Preaker’s troubled past. Fresh from a brief stay at a psych hospital, Camille’s first assignment from the second-rate daily paper where she works brings her reluctantly back to her hometown to cover the murders of two preteen girls. NASTY on her kneecap, BABYDOLL on her leg Since WICKED above her hipbone, GIRL across her heart Words are like a road map to reporter Camille Preaker’s troubled past. Fresh from a brief stay at a psych hospital, Camille’s first assignment from the second-rate daily paper where she works brings her reluctantly back to her hometown to cover the murders of two preteen girls. NASTY on her kneecap, BABYDOLL on her leg Since she left town eight years ago, Camille has hardly spoken to her neurotic, hypochondriac mother or to the half-sister she barely knows: a beautiful thirteen-year-old with an eerie grip on the town. Now, installed again in her family’s Victorian mansion, Camille is haunted by the childhood tragedy she has spent her whole life trying to cut from her memory. HARMFUL on her wrist, WHORE on her ankle As Camille works to uncover the truth about these violent crimes, she finds herself identifying with the young victims—a bit too strongly. Clues keep leading to dead ends, forcing Camille to unravel the psychological puzzle of her own past to get at the story. Dogged by her own demons, Camille will have to confront what happened to her years before if she wants to survive this homecoming.With its taut, crafted writing, Sharp Objects is addictive, haunting, and unforgettable. ...more" + u"WICKED above her hipbone, GIRL across her heart Words are " + u"like a road map to reporter Camille Preaker’s troubled past." + u" Fresh from a brief stay at a psych hospital, Camille’s " + u"first assignment from the second-rate daily paper where she " + u"works brings her reluctantly back to her hometown to cover " + u"the murders of two preteen girls. NASTY on her kneecap, " + u"BABYDOLL on her leg Since WICKED above her hipbone, GIRL " + u"across her heart Words are like a road map to reporter " + u"Camille Preaker’s troubled past. Fresh from a brief stay at " + u"a psych hospital, Camille’s first assignment from the " + u"second-rate daily paper where she works brings her " + u"reluctantly back to her hometown to cover the murders of " + u"two preteen girls. NASTY on her kneecap, BABYDOLL on her leg" + u" Since she left town eight years ago, Camille has hardly " + u"spoken to her neurotic, hypochondriac mother or to the " + u"half-sister she barely knows: a beautiful thirteen-year-old " + u"with an eerie grip on the town. Now, installed again in her " + u"family’s Victorian mansion, Camille is haunted by the " + u"childhood tragedy she has spent her whole life trying to cut" + u" from her memory. HARMFUL on her wrist, WHORE on her ankle " + u"As Camille works to uncover the truth about these violent " + u"crimes, she finds herself identifying with the young " + u"victims—a bit too strongly. Clues keep leading to dead ends," + u" forcing Camille to unravel the psychological puzzle of her " + u"own past to get at the story. Dogged by her own demons, " + u"Camille will have to confront what happened to her years " + u"before if she wants to survive this homecoming.With its " + u"taut, crafted writing, Sharp Objects is addictive, haunting," + u" and unforgettable. ...more" ], "image": [ - "http://books.toscrape.com/media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg" + "http://books.toscrape.com/media/cache/c0/59/c05972805aa720117" + "1b8fc71a5b00292.jpg" ], "info": { "price": ["47.82"], @@ -30,7 +59,8 @@ def test_dupefilter(self): "tax": ["0.00"], "type": ["Books"], "upc": ["e00eb4fd7b871a48"]}, - "url": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html" + "url": ("http://books.toscrape.com/catalogue/" + "sharp-objects_997/index.html") } tid = '3617-44af-a2f0' extracted = next(t for t in spec["templates"] if t['page_id'] == tid) diff --git a/slybot/slybot/tests/test_multiple_item_extraction.py b/slybot/slybot/tests/test_multiple_item_extraction.py index 2ef2864f0..f1a32cee4 100644 --- a/slybot/slybot/tests/test_multiple_item_extraction.py +++ b/slybot/slybot/tests/test_multiple_item_extraction.py @@ -369,3 +369,6 @@ def test_repeated_css_extractors(self): spider, page, results = open_spider_page_and_results('stips.co.il.json') items = [i for i in spider.parse(page) if not isinstance(i, Request)] self.assertEqual(items, results) + spider, page, results = open_spider_page_and_results('firmen.wko.at.json') + items = [i for i in spider.parse(page) if not isinstance(i, Request)] + self.assertEqual(items, results) diff --git a/slybot/slybot/tests/test_spider.py b/slybot/slybot/tests/test_spider.py index 6c6fcaebc..43bd91113 100644 --- a/slybot/slybot/tests/test_spider.py +++ b/slybot/slybot/tests/test_spider.py @@ -150,12 +150,11 @@ def test_generic_form_requests(self): url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=open(join(_PATH, "data", "ebay_advanced_search.html")).read()) response.request = generic_form_request - request_list = [request_to_dict(req, spider) + request_list = [{k: v for k, v in request_to_dict(req, spider).items() + if not k.startswith('_')} for req in generic_form_request.callback(response)] expected = [{ - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -171,9 +170,7 @@ def test_generic_form_requests(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -189,9 +186,7 @@ def test_generic_form_requests(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -207,9 +202,7 @@ def test_generic_form_requests(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -225,8 +218,7 @@ def test_generic_form_requests(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50') }, { - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'parse', 'cookies': {}, 'dont_filter': True, @@ -337,12 +329,11 @@ def test_generic_form_requests_with_file_field(self): url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=open(join(_PATH, "data", "ebay_advanced_search.html")).read()) response.request = generic_form_request - request_list = [request_to_dict(req, spider) + request_list = [{k: v for k, v in request_to_dict(req, spider).items() + if not k.startswith('_')} for req in generic_form_request.callback(response)] expected = [{ - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -358,9 +349,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -376,9 +365,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -394,9 +381,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -412,9 +397,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -430,9 +413,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -448,9 +429,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -466,9 +445,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -484,9 +461,7 @@ def test_generic_form_requests_with_file_field(self): 'dio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dm' 'd=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -502,9 +477,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -520,9 +493,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -538,9 +509,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -556,9 +525,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Cars') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -574,9 +541,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -592,9 +557,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -610,9 +573,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Boats') }, { - '_class': 'scrapy.http.request.form.FormRequest', - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'after_form_page', 'cookies': {}, 'dont_filter': True, @@ -628,8 +589,7 @@ def test_generic_form_requests_with_file_field(self): 'adio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_d' 'md=1&_ipg=50&_nkw2=Boats') }, { - '_encoding': 'utf-8', - 'body': '', + 'body': b'', 'callback': 'parse', 'cookies': {}, 'dont_filter': True, diff --git a/slybot/slybot/tests/test_starturls.py b/slybot/slybot/tests/test_starturls.py index 03c94a410..7360b4143 100644 --- a/slybot/slybot/tests/test_starturls.py +++ b/slybot/slybot/tests/test_starturls.py @@ -489,7 +489,8 @@ def test_feed_url(self): 'http://example.com/1\r' 'http://example.com/2\r\n' 'http://example.com/3\n\r' - 'http://example.com/4\n')) + 'http://example.com/4\n'), + encoding='utf-8') self.assertEqual([r.url for r in feed.parse_urls(response)], [ 'http://example.com/1', 'http://example.com/2', diff --git a/slybot/slybot/utils.py b/slybot/slybot/utils.py index f808d29a1..efc4dce38 100644 --- a/slybot/slybot/utils.py +++ b/slybot/slybot/utils.py @@ -104,7 +104,6 @@ def read(fp, encoding='utf-8'): def _build_sample(sample, legacy=False): from slybot.plugins.scrapely_annotations.builder import Annotations Annotations(sample, legacy=legacy).build() - sample['page_id'] = sample.get('page_id') or sample.get('id') or "" sample['annotated'] = True return sample diff --git a/slybot/tox.ini b/slybot/tox.ini index 4bfb09217..c6247d745 100644 --- a/slybot/tox.ini +++ b/slybot/tox.ini @@ -4,12 +4,12 @@ # and then run "tox" from this directory. [tox] -envlist = py27,py34 +envlist = py27 [testenv] deps = - -r{toxinidir}/requirements.txt -r{toxinidir}/requirements-test.txt + -r{toxinidir}/requirements.txt commands = nosetests \ --with-doctest \