Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor spidermon.utils.zyte.Client to use crawler settings #411

Merged
merged 3 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions spidermon/contrib/scrapy/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from spidermon.python import factory
from spidermon.python.monitors import ExpressionsMonitor
from spidermon.utils.field_coverage import calculate_field_coverage
from spidermon.utils.zyte import Client


class Spidermon:
Expand Down Expand Up @@ -54,6 +55,7 @@ def __init__(

self.periodic_suites = periodic_suites or {}
self.periodic_tasks = {}
self.client = Client(self.crawler.settings)

def load_suite(self, suite_to_load):
try:
Expand Down Expand Up @@ -204,14 +206,12 @@ def _run_suites(self, spider, suites):
runner.run(suite, **data)

def _generate_data_for_spider(self, spider):
from spidermon.utils.zyte import client

return {
"stats": self.crawler.stats.get_stats(spider),
"stats_history": spider.stats_history
if hasattr(spider, "stats_history")
else [],
"crawler": self.crawler,
"spider": spider,
"job": client.job if client.available else None,
"job": self.client.job if self.client.available else None,
}
8 changes: 5 additions & 3 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from spidermon import Monitor, monitors
from spidermon.exceptions import NotConfigured
from spidermon.utils import zyte
from spidermon.utils.zyte import Client
from spidermon.utils.settings import getdictorlist
from spidermon.contrib.monitors.mixins.stats import StatsMonitorMixin

Expand Down Expand Up @@ -554,7 +554,9 @@ def _get_jobs(self, states, number_of_jobs):

jobs = []
start = 0
_jobs = zyte.client.spider.jobs.list(
client = Client(self.crawler.settings)

_jobs = client.spider.jobs.list(
start=start,
state=states,
count=number_of_jobs,
Expand All @@ -563,7 +565,7 @@ def _get_jobs(self, states, number_of_jobs):
while _jobs:
jobs.extend(_jobs)
start += 1000
_jobs = zyte.client.spider.jobs.list(
_jobs = client.spider.jobs.list(
start=start,
state=states,
count=number_of_jobs,
Expand Down
28 changes: 13 additions & 15 deletions spidermon/utils/zyte.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
"""
Module to hold a singleton to Zyte's Scrapy Cloud client.

It expects that "SHUB_JOBKEY" be present as environment variable.
Also, it will look up for Scrapy Cloud API key in `SHUB_APIKEY` Scrapy
setting, then for the environment variables `SH_APIKEY` and `SHUB_JOBAUTH`.
Note that "SHUB_JOBAUTH" can't access all API endpoints.
"""
import os

try:
Expand All @@ -17,12 +9,23 @@


class Client:
def __init__(self):
"""
Wrapper over ScrapinghubClient that indicates with its available
attribute if it can be used to fetch data from Scrapy Cloud API.

It expects that "SHUB_JOBKEY" be present as environment variable.
Also, it will look up for Scrapy Cloud API key in `SHUB_APIKEY` Scrapy
setting, then for the environment variables `SH_APIKEY` and `SHUB_JOBAUTH`.
Note that "SHUB_JOBAUTH" can't access all API endpoints.
"""

def __init__(self, settings):
self.available = HAS_CLIENT and "SHUB_JOBKEY" in os.environ
self._client = None
self._project = None
self._spider = None
self._job = None
self._settings = settings
if self.available:
self.job_key = os.environ["SHUB_JOBKEY"]
self.project_id, self.spider_id, self.job_id = map(
Expand All @@ -40,10 +43,8 @@ def client(self):
return self._client

def _apikey(self):
from scrapy.utils.project import get_project_settings

apikey = (
get_project_settings().get("SHUB_APIKEY")
self._settings.get("SHUB_APIKEY")
or os.environ.get("SH_APIKEY")
or os.environ.get("SHUB_JOBAUTH")
)
Expand Down Expand Up @@ -76,6 +77,3 @@ def job(self):
def close(self):
if self._client:
self._client.close()


client = Client()
13 changes: 7 additions & 6 deletions tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,11 @@ def get_paginated_jobs(**kwargs):

monkeypatch.setenv("SHUB_JOB_DATA", '{"tags":["tag1","tag2","tag3"]}')

monkeypatch.setattr(monitors, "zyte", Mock())
monitors.zyte.client.spider.jobs.list.side_effect = get_paginated_jobs
mock_client = Mock()
mock_client.spider.jobs.list.side_effect = get_paginated_jobs

return MonitorSuite(monitors=[monitors.ZyteJobsComparisonMonitor]), monitors.zyte
monkeypatch.setattr(monitors, "Client", Mock(side_effect=[mock_client]))
return MonitorSuite(monitors=[monitors.ZyteJobsComparisonMonitor]), mock_client


@pytest.mark.parametrize(
Expand Down Expand Up @@ -137,12 +138,12 @@ def test_arguments_passed_to_zyte_client(
SPIDERMON_JOBS_COMPARISON_THRESHOLD: threshold,
}
)
suite, zyte_module = mock_suite_and_zyte_client
suite, mock_client = mock_suite_and_zyte_client
runner = data.pop("runner")
runner.run(suite, **data)

calls = [
call.zyte_module.client.spider.jobs.list(
call.mock_client.spider.jobs.list(
start=n,
state=list(states),
count=number_of_jobs,
Expand All @@ -151,4 +152,4 @@ def test_arguments_passed_to_zyte_client(
for n in range(0, number_of_jobs + 1000, 1000)
]

zyte_module.client.spider.jobs.list.assert_has_calls(calls)
mock_client.spider.jobs.list.assert_has_calls(calls)
82 changes: 57 additions & 25 deletions tests/utils/test_zyte.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest import mock

import pytest
from scrapy.utils.test import get_crawler
from spidermon.utils import zyte


Expand All @@ -19,60 +20,65 @@ def mock_module(monkeypatch):
return zyte


def test_client_creation_no_env(no_env_mock_module):
client = no_env_mock_module.Client()
assert client.available == False
assert client.project_id == None
assert client.spider_id == None
assert client.job_id == None
@pytest.fixture
def settings():
return get_crawler().settings


def test_client_creation_no_env(no_env_mock_module, settings):
client = no_env_mock_module.Client(settings)
assert client.available is False
assert client.project_id is None
assert client.spider_id is None
assert client.job_id is None

with pytest.raises(RuntimeError):
client._apikey()


def test_client_creation(mock_module):
client = mock_module.Client()
assert client.available == True
def test_client_creation(mock_module, settings):
client = mock_module.Client(settings)
assert client.available
assert client.project_id == 123
assert client.spider_id == 456
assert client.job_id == 789


def test_client_property_project(mock_module):
client = mock_module.Client()
def test_client_property_project(mock_module, settings):
client = mock_module.Client(settings)

assert client._project == None
assert client._project is None
client.project
assert client._project != None
assert client._project is not None
client._client.get_project.assert_called_with("123")


def test_client_property_job(mock_module):
client = mock_module.Client()
def test_client_property_job(mock_module, settings):
client = mock_module.Client(settings)

assert client._job == None
assert client._job is None
client.job
assert client._job != None
assert client._job is not None
client._client.get_job.assert_called_with("123/456/789")


def test_client_property_spider(mock_module):
client = mock_module.Client()
def test_client_property_spider(mock_module, settings):
client = mock_module.Client(settings)
client._job = mock.Mock()
client._job.metadata.get.return_value = "my_awesome_spider"

assert client._spider == None
assert client._spider is None
client.spider
assert client._project != None
assert client._job != None
assert client._spider != None
assert client._project is not None
assert client._job is not None
assert client._spider is not None

client._job.metadata.get.assert_called_with("spider")
client._project.spiders.get.assert_called_with("my_awesome_spider")


def test_client_close(mock_module):
client = mock_module.Client()
def test_client_close(mock_module, settings):
client = mock_module.Client(settings)

client.client
client.close()
Expand All @@ -89,3 +95,29 @@ def test_has_client(monkeypatch, expected):

reload(zyte)
assert zyte.HAS_CLIENT == expected


def test_client_settings_priority(mock_module, monkeypatch):
shub_apikey = "SHUB_APIKEY"
shub_jobauth = "SHUB_JOBAUTH"
monkeypatch.setenv("SHUB_JOBAUTH", shub_jobauth)

# Crawler settings has top priority
settings = get_crawler(settings_dict={"SHUB_APIKEY": shub_apikey}).settings
client = mock_module.Client(settings)
assert client._apikey() == shub_apikey

# then, SH_APIKEY
empty_settings = get_crawler().settings
client = mock_module.Client(empty_settings)
assert client._apikey() == "foobar"

# then, SHUB_JOBAUTH
monkeypatch.delenv("SH_APIKEY")
assert client._apikey() == shub_jobauth

# Otherwise, raise runtime error
monkeypatch.delenv("SHUB_JOBAUTH")
client = mock_module.Client(empty_settings)
with pytest.raises(RuntimeError):
client._apikey()
Loading