Skip to content

Commit

Permalink
[434] Allow to filter jobs in ZyteJobsComparisonMonitor by close_reas…
Browse files Browse the repository at this point in the history
…on (#440)

* added filter based on close reason

* handled pr change requests

* added test cases

* updated testcase name

* fixed test cases

* added new unit test cases

* linting

* updated doc string

---------

Co-authored-by: Víctor Ruiz <[email protected]>
  • Loading branch information
shafiq-muhammad and VMRuiz committed Jul 15, 2024
1 parent 6fe8928 commit d7d553a
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 2 deletions.
1 change: 1 addition & 0 deletions spidermon/contrib/scrapy/monitors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
SPIDERMON_JOBS_COMPARISON_TAGS,
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
SPIDERMON_ITEM_COUNT_INCREASE,
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
)
from .suites import (
SpiderCloseMonitorSuite,
Expand Down
17 changes: 15 additions & 2 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
SPIDERMON_JOBS_COMPARISON = "SPIDERMON_JOBS_COMPARISON"
SPIDERMON_JOBS_COMPARISON_STATES = "SPIDERMON_JOBS_COMPARISON_STATES"
SPIDERMON_JOBS_COMPARISON_TAGS = "SPIDERMON_JOBS_COMPARISON_TAGS"
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = "SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS"
SPIDERMON_JOBS_COMPARISON_THRESHOLD = "SPIDERMON_JOBS_COMPARISON_THRESHOLD"
SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE"

Expand Down Expand Up @@ -528,6 +529,11 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor):
You can also filter which jobs to compare based on their tags using the
``SPIDERMON_JOBS_COMPARISON_TAGS`` setting. Among the defined tags we consider only those
that are also present in the current job.
You can also filter which jobs to compare based on their close reason using the
``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``,
which doesn't filter any job based on close_reason. To only consider successfully finished jobs,
use ``("finished",)`` instead.
"""

stat_name = "item_scraped_count"
Expand Down Expand Up @@ -556,6 +562,9 @@ def run(self, result):

def _get_jobs(self, states, number_of_jobs):
tags = self._get_tags_to_filter()
close_reasons = self.crawler.settings.getlist(
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ()
)

total_jobs = []
start = 0
Expand All @@ -571,9 +580,13 @@ def _get_jobs(self, states, number_of_jobs):
count=count,
has_tag=tags or None,
)
total_jobs.extend(current_jobs)

if len(current_jobs) < MAX_API_COUNT or len(total_jobs) == number_of_jobs:
for job in current_jobs:
if close_reasons and job.get("close_reason") not in close_reasons:
continue
total_jobs.append(job)

if len(current_jobs) < MAX_API_COUNT or len(total_jobs) >= number_of_jobs:
# Stop paginating if results are less than 1000 (pagination not required)
# or target jobs was reached - no more pagination required
break
Expand Down
77 changes: 77 additions & 0 deletions tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
SPIDERMON_JOBS_COMPARISON_STATES,
SPIDERMON_JOBS_COMPARISON_TAGS,
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
ZyteJobsComparisonMonitor,
monitors,
)
Expand All @@ -20,6 +21,17 @@ def mock_jobs(previous_counts):
return Mock(return_value=[dict(items=c) for c in previous_counts])


@pytest.fixture
def mock_jobs_with_close_reason(previous_job_objs, close_reasons):
return Mock(
return_value=[
dict(items=j["items"], close_reason=j["close_reason"])
for j in previous_job_objs
if j["close_reason"] in close_reasons
]
)


@pytest.fixture
def mock_suite(mock_jobs, monkeypatch):
monkeypatch.setattr(ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs)
Expand All @@ -30,6 +42,34 @@ def get_paginated_jobs(**kwargs):
return [Mock() for _ in range(kwargs["count"])]


def get_paginated_jobs_with_finished_close_reason(**kwargs):
objs = []
for _ in range(kwargs["count"]):
obj = Mock()
obj.get.return_value = "finished"
objs.append(obj)

return objs


def get_paginated_jobs_with_cancel_close_reason(**kwargs):
objs = []
for _ in range(kwargs["count"]):
obj = Mock()
obj.get.return_value = "cancel"
objs.append(obj)

return objs


@pytest.fixture
def mock_suite_with_close_reason(mock_jobs_with_close_reason, monkeypatch):
monkeypatch.setattr(
ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs_with_close_reason
)
return MonitorSuite(monitors=[ZyteJobsComparisonMonitor])


@pytest.fixture
def mock_suite_and_zyte_client(
monkeypatch,
Expand Down Expand Up @@ -119,6 +159,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Return exact number of jobs
Expand All @@ -134,6 +175,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
output = [Mock(), Mock()]
mock_client.spider.jobs.list = Mock(return_value=output)

Expand All @@ -149,13 +191,48 @@ def test_jobs_comparison_monitor_get_jobs():
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Jobs bigger than 1000
jobs = monitor._get_jobs(states=None, number_of_jobs=2500)
assert len(jobs) == 2500
assert mock_client.spider.jobs.list.call_count == 3

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
mock_client.spider.jobs.list = Mock(
side_effect=get_paginated_jobs_with_finished_close_reason
)

# Return exact number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 50

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
mock_client.spider.jobs.list = Mock(
side_effect=get_paginated_jobs_with_cancel_close_reason
)

# Return no jobs as all will be filtered due to close reason
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 0


@pytest.mark.parametrize(
["item_count", "previous_counts", "threshold", "should_raise"],
Expand Down

0 comments on commit d7d553a

Please sign in to comment.