Skip to content

Commit

Permalink
Release Slybot 0.13.0
Browse files Browse the repository at this point in the history
Update slybot to use the most recent libraries
Add a DropMetaPipeline to remove unwanted meta fields from items
Allow `\r` character to be used in start url feeds
Use `rendered_html` when building sample if JS enabled
Limit CSS selector annotations to a single element
Allow scrapy shell to find a spider if the name matches the url hostname
  • Loading branch information
ruairif committed Jun 8, 2017
1 parent ca24b40 commit c232819
Show file tree
Hide file tree
Showing 18 changed files with 2,124 additions and 145 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ install:
- source ${VIRTUAL_ENV}/bin/activate
- sudo -H ./provision.sh install_deps install_splash install_python_deps
- sudo -H pip install tox
- python -c 'import splash, qt5reactor' # Check it's in the python path
- python2.7 -c 'import splash, qt5reactor' # Check it's in the python path
- cd portiaui
- nvm install 6.10.0
- nvm use 6.10.0
- npm install -g bower
- npm install
- bower install
Expand All @@ -29,7 +31,7 @@ before_script:
- source ${VIRTUAL_ENV}/bin/activate
- export PYTHONPATH=`pwd`/slybot:`pwd`/slyd
- cd slyd
- python tests/testserver/server.py 2>&1 | grep -v 'HTTP/1.1" 200' &
- python2.7 tests/testserver/server.py 2>&1 | grep -v 'HTTP/1.1" 200' &
- cd ..
- sleep 3 # give xvfb some time to start
script:
Expand All @@ -39,6 +41,7 @@ script:
- ./manage.py test portia_orm.tests
- ./manage.py test portia_api.tests
- cd ../portiaui
- npm rebuild node-sass
- npm test
before_deploy:
- cd ../slybot
Expand Down
5 changes: 3 additions & 2 deletions portia_server/portia_orm/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,12 +409,13 @@ def migrate_sample(self, data):
if not data.get('name'):
data['name'] = (data.get('id', data.get('page_id', u'')[:20]) or
strip_json(self.context['path'].split('/')[-1]))
if data.get('version', '') >= '0.13.1':
version = data.get('version', '')
if version == '0.13.0' or version >= '0.13.1':
return data
if any(body in data for body in ('original_body', 'rendered_body')):
self._migrate_html(self, data)
schemas = json.load(self.context['storage'].open('items.json'))
if data.get('version', '') > '0.13.0':
if version > '0.13.0':
schema_id, new_schemas = guess_schema(data, schemas)
self._add_schemas(self, new_schemas)
# Add the most likely schema id to the base containers if needed
Expand Down
88 changes: 38 additions & 50 deletions portia_server/portia_orm/tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1476,7 +1476,7 @@ def test_save_edit(self):
spider.id = 'test-id'
spider.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -1571,7 +1571,7 @@ def test_delete(self):
spider = project.spiders['shop-crawler']
spider.delete()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 3)
self.storage.open.assert_has_calls([
mock.call('items.json'),
mock.call('spiders/shop-crawler.json'),
Expand Down Expand Up @@ -1750,7 +1750,7 @@ def test_load_through_project(self):
'version': SLYBOT_VERSION,
},
])
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -1859,7 +1859,7 @@ def test_load_through_partial(self):
'extractors': {},
'version': SLYBOT_VERSION,
})
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand All @@ -1869,7 +1869,7 @@ def test_save_edit(self):
spider=Spider(self.storage, id='shop-crawler'))
sample.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand All @@ -1882,7 +1882,7 @@ def test_save_edit(self):
sample.page_id = sample.id
sample.save()

self.assertEqual(self.storage.open.call_count, 5)
self.assertEqual(self.storage.open.call_count, 3)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -1993,7 +1993,7 @@ def test_save_edit(self):
sample.id = 'test-id'
sample.save()

self.assertEqual(self.storage.open.call_count, 5)
self.assertEqual(self.storage.open.call_count, 3)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -2206,7 +2206,7 @@ def test_delete(self):
sample = spider.samples['1ddc-4043-ac4d']
sample.delete()

self.assertEqual(self.storage.open.call_count, 6)
self.assertEqual(self.storage.open.call_count, 5)
self.storage.open.assert_has_calls([
mock.call('items.json'),
mock.call('spiders/shop-crawler.json'),
Expand Down Expand Up @@ -2580,7 +2580,7 @@ def test_load_through_project(self):
items = project.spiders['shop-crawler'].samples['1ddc-4043-ac4d'].items
self.assertListEqual(items.keys(), ['1e47-4833-a4d4'])
self.assertIsInstance(items, Item.collection)
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -2701,7 +2701,7 @@ def test_load_through_project(self):
'text-content': '#portia-content',
},
])
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -2827,11 +2827,10 @@ def test_load_through_partial(self):
'tagid': None,
'text-content': '#portia-content',
})
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])

def test_save_edit(self):
item = Item(
Expand All @@ -2841,21 +2840,19 @@ def test_save_edit(self):
spider=Spider(self.storage, id='shop-crawler')))
item.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.storage.save.assert_not_called()

item.selector = '#test'
item.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.storage.save.assert_called_once_with(
'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY)
self.assertEqual(
Expand Down Expand Up @@ -3005,11 +3002,10 @@ def test_save_edit(self):
item.id = 'test-id'
item.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.assertEqual(self.storage.save.call_count, 2)
self.storage.save.assert_has_calls([
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY),
Expand Down Expand Up @@ -3166,11 +3162,10 @@ def test_save_new(self):
item = Item(self.storage, id='test1', sample=sample)
item.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.storage.save.assert_called_once_with(
'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY)
self.assertEqual(
Expand Down Expand Up @@ -3337,11 +3332,10 @@ def test_save_new(self):
repeated_selector='.yyy'))
sample.items[0].save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.assertEqual(self.storage.save.call_count, 2)
self.storage.save.assert_has_calls([
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY),
Expand Down Expand Up @@ -3545,7 +3539,7 @@ def test_delete(self):
item = sample.items['1e47-4833-a4d4']
item.delete()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 3)
self.storage.open.assert_has_calls([
mock.call('items.json'),
mock.call('spiders/shop-crawler.json'),
Expand Down Expand Up @@ -3689,7 +3683,7 @@ def test_load_through_project(self):
['3606-4d68-a6a0|d1e2-4673-a72a',
'5c18-40cf-8809|de35-49b5-b90b'])
self.assertIsInstance(annotations, BaseAnnotation.collection)
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -3749,7 +3743,7 @@ def test_load_through_project(self):
'xpath': None,
},
])
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
Expand Down Expand Up @@ -3790,11 +3784,10 @@ def test_load_through_partial(self):
'tagid': None,
'xpath': None,
})
self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])

def test_save_edit(self):
annotation = Annotation(
Expand All @@ -3806,21 +3799,19 @@ def test_save_edit(self):
spider=Spider(self.storage, id='shop-crawler'))))
annotation.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.storage.save.assert_not_called()

annotation.selector = '.test'
annotation.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.storage.save.assert_called_once_with(
'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY)
self.assertEqual(
Expand Down Expand Up @@ -3930,11 +3921,10 @@ def test_save_edit(self):
annotation.id = 'test-id|data-id'
annotation.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.assertEqual(self.storage.save.call_count, 2)
self.storage.save.assert_has_calls([
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY),
Expand Down Expand Up @@ -4052,11 +4042,10 @@ def test_save_new(self):
annotation = Annotation(self.storage, id='test1|data1', parent=item)
annotation.save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.storage.save.assert_called_once_with(
'spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY)
self.assertEqual(
Expand Down Expand Up @@ -4189,11 +4178,10 @@ def test_save_new(self):
item.annotations.insert(0, Annotation(self.storage, id='test2|data2'))
item.annotations[0].save()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 2)
self.storage.open.assert_has_calls([
mock.call('spiders/shop-crawler.json'),
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json'),
mock.call('items.json')])
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json')])
self.assertEqual(self.storage.save.call_count, 2)
self.storage.save.assert_has_calls([
mock.call('spiders/shop-crawler/1ddc-4043-ac4d.json', mock.ANY),
Expand Down Expand Up @@ -4360,7 +4348,7 @@ def test_delete(self):
annotation = item.annotations['3606-4d68-a6a0|d1e2-4673-a72a']
annotation.delete()

self.assertEqual(self.storage.open.call_count, 4)
self.assertEqual(self.storage.open.call_count, 3)
self.storage.open.assert_has_calls([
mock.call('items.json'),
mock.call('spiders/shop-crawler.json'),
Expand Down
11 changes: 10 additions & 1 deletion slybot/CHANGES
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
Slybot 0.13.0b37 - next
Slybot 0.13.0

Update slybot to use the most recent libraries
Add a DropMetaPipeline to remove unwanted meta fields from items
Allow `\r` character to be used in start url feeds
Use `rendered_html` when building sample if JS enabled
Limit CSS selector annotations to a single element
Allow scrapy shell to find a spider if the name matches the url hostname

Slybot 0.13.0b37

Do not create repeated extractor for CSS/XPath annotations
Handle parsing Empty HTML response without raising error
Expand Down
1 change: 1 addition & 0 deletions slybot/requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ tox==2.5.0
nose==1.3.7
nose-timer==0.6.0
doctest-ignore-unicode==0.1.2
setuptools>=36.0.1
2 changes: 1 addition & 1 deletion slybot/slybot/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.13.0b37'
__version__ = '0.13.0'
10 changes: 7 additions & 3 deletions slybot/slybot/plugins/scrapely_annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,14 @@ def setup_bot(self, settings, spider, spec, items, extractors, logger):
self.clustering = None

def _get_annotated_template(self, template):
    """Return *template* with its ``body`` field pointing at the correct
    stored page and make sure the sample has been built/annotated.

    For samples at version >= 0.13.0 the body used for extraction depends
    on whether the spider renders JavaScript for the sample's URL:
    JS-enabled pages use ``rendered_body``, otherwise ``original_body``.
    The sample is (re)built via ``_build_sample`` whenever the body choice
    changed or the template has never been annotated.
    """
    changed = False
    # NOTE(review): lexicographic string comparison of versions; fine within
    # the 0.1x series but fragile in general (e.g. '0.9' >= '0.13' is True)
    # — confirm version strings stay comparable.
    if template.get('version', '0.12.0') >= '0.13.0':
        using_js = self.spider._filter_js_urls(template['url'])
        body = 'rendered_body' if using_js else 'original_body'
        # Only mark as changed when the stored body source actually differs,
        # so an already-correct template is not rebuilt needlessly.
        if template.get('body') != body:
            template['body'] = body
            changed = True
    # Rebuild when the body source changed or the sample was never annotated.
    if changed or not template.get('annotated'):
        _build_sample(template)
    return template

Expand Down
Loading

0 comments on commit c232819

Please sign in to comment.