Support easy session reuse #85

Open · wants to merge 2 commits into master
41 changes: 41 additions & 0 deletions docs/index.rst
@@ -89,6 +89,47 @@ requests with `DEFAULT_REQUEST_HEADERS <http://doc.scrapy.org/en/1.0/topics/sett

This Middleware also adds some configurable Scrapy Settings, check :ref:`the complete list here <settings>`.


Reusing sessions
================

To create a request in a callback and have that request reuse the Crawlera
session of the response received by that callback, you normally have to write
something like::

def callback(self, response):
session = response.headers.get('X-Crawlera-Session')
# …
headers = {}
if session:
headers = {'X-Crawlera-Session': session}
yield Request(url, callback=self.callback, headers=headers)

scrapy-crawlera provides an optional spider middleware that, if enabled, lets
you set ``crawlera_session_reuse`` to ``True`` in your request metadata
(``meta``) to reuse the Crawlera session of the source response::

def callback(self, response):
meta = {'crawlera_session_reuse': True}
yield Request(url, callback=self.callback, meta=meta)

To enable the Crawlera session reuse spider middleware, add it to your
``SPIDER_MIDDLEWARES`` setting::

SPIDER_MIDDLEWARES = {
'scrapy_crawlera.CrawleraSessionReuseMiddleware': 1000,
}

By default, ``CrawleraSessionReuseMiddleware`` removes ``X-Crawlera-Session``
from the request headers if the source response did not use a Crawlera session,
or the source Crawlera session ID was bad. Use the
``CRAWLERA_SESSION_REUSE_DEFAULT_SESSION`` setting to set a fallback Crawlera
session value instead. For example, to create a new Crawlera session on
requests that come from responses without a Crawlera session or with a bad
Crawlera session ID::

CRAWLERA_SESSION_REUSE_DEFAULT_SESSION = 'create'
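
For reference, here is a sketch of how the pieces might fit together in a
project's ``settings.py``, combining the Crawlera downloader middleware with
the session reuse spider middleware; the values shown are illustrative and the
API key is a placeholder::

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_crawlera.CrawleraMiddleware': 610,
    }
    SPIDER_MIDDLEWARES = {
        'scrapy_crawlera.CrawleraSessionReuseMiddleware': 1000,
    }
    CRAWLERA_ENABLED = True
    CRAWLERA_APIKEY = '<your API key>'  # placeholder
    CRAWLERA_SESSION_REUSE_DEFAULT_SESSION = 'create'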


All the rest
============

1 change: 1 addition & 0 deletions scrapy_crawlera/__init__.py
@@ -1,4 +1,5 @@
from .middleware import CrawleraMiddleware
from .spidermiddlewares import CrawleraSessionReuseMiddleware


__version__ = '1.6.0'
37 changes: 37 additions & 0 deletions scrapy_crawlera/spidermiddlewares.py
@@ -0,0 +1,37 @@
from scrapy import Request


class CrawleraSessionReuseMiddleware(object):
    """Spider middleware that copies the ``X-Crawlera-Session`` header of
    a response into the requests generated from it, for requests that set
    ``crawlera_session_reuse`` to ``True`` in their ``meta``."""

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        # Optional fallback session value (e.g. 'create') applied when
        # the source response has no reusable session.
        setting = 'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION'
        self._default_session = crawler.settings.get(setting)

    def process_spider_output(self, response, result, spider):
        def _set_session(request_or_item):
            # Items (and anything else that is not a Request) pass
            # through untouched.
            if not isinstance(request_or_item, Request):
                return request_or_item

            request = request_or_item
            header = b'X-Crawlera-Session'
            meta_key = 'crawlera_session_reuse'

            # Only act on requests that explicitly opt in.
            if request.meta.get(meta_key) is not True:
                return request

            session = response.headers.get(header)
            error = response.headers.get(b'X-Crawlera-Error')
            session_is_bad = error == b'bad_session_id'

            if session is not None and not session_is_bad:
                # Reuse the session of the source response.
                request.headers[header] = session
            elif self._default_session:
                # Fall back to the configured default session value.
                request.headers[header] = self._default_session
            else:
                # No valid session to reuse: drop any stale session
                # header, as the documentation describes.
                request.headers.pop(header, None)
            return request

        return (_set_session(request_or_item)
                for request_or_item in result or ())
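
As an end-to-end illustration of the middleware above, a minimal spider might
look like the following sketch; the spider name and URLs are placeholders, not
part of this patch::

    import scrapy


    class SessionSpider(scrapy.Spider):
        name = 'session-example'  # placeholder name
        start_urls = ['https://example.com']  # placeholder URL

        custom_settings = {
            'SPIDER_MIDDLEWARES': {
                'scrapy_crawlera.CrawleraSessionReuseMiddleware': 1000,
            },
        }

        def parse(self, response):
            # Because of the meta flag, the middleware copies this
            # response's X-Crawlera-Session header onto the new request.
            yield scrapy.Request(
                'https://example.com/next',  # placeholder URL
                callback=self.parse,
                meta={'crawlera_session_reuse': True},
            )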
211 changes: 211 additions & 0 deletions tests/test_spidermiddlewares_session.py
@@ -0,0 +1,211 @@
import pytest
from scrapy import Spider as _Spider
from scrapy.http import Response, Request
from scrapy.item import Item
from scrapy.utils.reqser import request_to_dict
from scrapy.utils.test import get_crawler

from scrapy_crawlera.spidermiddlewares import CrawleraSessionReuseMiddleware


SESSION = '1'


def compare_requests(request1, request2):
assert request_to_dict(request1) == request_to_dict(request2)


def process_output(response, result, settings=None):
crawler = get_crawler(Spider, settings)
mw = CrawleraSessionReuseMiddleware.from_crawler(crawler)
generator = mw.process_spider_output(response, [result], Spider())
return list(generator)[0]


def get_request(reuse=False, session=None):
headers = {}
if session is not None:
headers['X-Crawlera-Session'] = session
meta = {}
if reuse is True:
meta['crawlera_session_reuse'] = True
return Request('https://example.com', headers=headers, meta=meta)


def get_response(session=None, error=None):
headers = {}
if session is not None:
headers['X-Crawlera-Session'] = session
if error is not None:
headers['X-Crawlera-Error'] = error
return Response('https://example.com', headers=headers)


class Spider(_Spider):
name = 'spider'


@pytest.mark.parametrize(
    'item',
    [
        {},
        Item(),
    ]
)
def test_item(item):
response = get_response(session=SESSION)
assert process_output(response, item) == item


def test_no_session():
response = get_response()
input_request = get_request()
processed_request = process_output(response, input_request)
expected_request = get_request()
compare_requests(processed_request, expected_request)


def test_bad_session_id():
response = get_response(session=SESSION, error='bad_session_id')
input_request = get_request(reuse=True)
processed_request = process_output(response, input_request)
expected_request = get_request(reuse=True)
compare_requests(processed_request, expected_request)


def test_bad_session_id_default_session():
response = get_response(session=SESSION, error='bad_session_id')
input_request = get_request(reuse=True)
settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
processed_request = process_output(response, input_request, settings)
expected_request = get_request(reuse=True, session='create')
compare_requests(processed_request, expected_request)


def test_user_session_limit():
# This session error is only expected to come from a response that has no
# ``X-Crawlera-Session`` value, caused by a request with ``create`` as
# ``X-Crawlera-Session`` value.
response = get_response(error='user_session_limit')
input_request = get_request(reuse=True)
processed_request = process_output(response, input_request)
expected_request = get_request(reuse=True)
compare_requests(processed_request, expected_request)


@pytest.mark.parametrize(
    'error',
    [
        # https://doc.scrapinghub.com/crawlera.html#errors
        'bad_proxy_auth',
        'too_many_conns',
        'header_auth',
        '',
        'nxdomain',
        'ehostunreach',
        'econnrefused',
        'econnreset',
        'socket_closed_remotely',
        'client_conn_closed',
        'noslaves',
        'banned',
        'serverbusy',
        'timeout',
        'msgtimeout',
        'domain_forbidden',
        'bad_header',
        'data_error',
    ]
)
def test_non_session_error(error):
session = SESSION
response = get_response(session=session, error=error)
input_request = get_request(reuse=True)
processed_request = process_output(response, input_request)
expected_request = get_request(reuse=True, session=SESSION)
compare_requests(processed_request, expected_request)


def test_session():
session = SESSION
response = get_response(session=session)
input_request = get_request(reuse=True)
processed_request = process_output(response, input_request)
expected_request = get_request(reuse=True, session=SESSION)
compare_requests(processed_request, expected_request)


def test_create_on_sessionless_reuse():
response = get_response()
input_request = get_request(reuse=True)
settings = {'CRAWLERA_SESSION_REUSE_DEFAULT_SESSION': 'create'}
processed_request = process_output(response, input_request, settings)
expected_request = get_request(reuse=True, session='create')
compare_requests(processed_request, expected_request)


def test_dont_create_on_sessionless_reuse():
response = get_response()
input_request = get_request(reuse=True)
processed_request = process_output(response, input_request)
expected_request = get_request(reuse=True)
compare_requests(processed_request, expected_request)


@pytest.mark.parametrize(
    'session',
    [
        SESSION,
        'create',
    ]
)
def test_header_without_reuse(session):
response = get_response()
input_request = get_request(session=session)
processed_request = process_output(response, input_request)
expected_request = get_request(session=session)
compare_requests(processed_request, expected_request)
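
One edge case implied by the middleware's ``is not True`` check, sketched here
as an additional hypothetical test that is not part of this patch: truthy meta
values other than the boolean ``True`` do not trigger session reuse::

    def test_truthy_non_boolean_reuse_flag():
        # The middleware requires crawlera_session_reuse to be exactly
        # True, so a truthy value such as 1 leaves the request untouched.
        response = get_response(session=SESSION)
        meta = {'crawlera_session_reuse': 1}
        input_request = Request('https://example.com', meta=meta)
        processed_request = process_output(response, input_request)
        expected_request = Request('https://example.com', meta=meta)
        compare_requests(processed_request, expected_request)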