diff --git a/spidermon/contrib/scrapy/monitors/__init__.py b/spidermon/contrib/scrapy/monitors/__init__.py index 91a3247c..cdbc1c58 100644 --- a/spidermon/contrib/scrapy/monitors/__init__.py +++ b/spidermon/contrib/scrapy/monitors/__init__.py @@ -27,6 +27,7 @@ SPIDERMON_JOBS_COMPARISON_TAGS, SPIDERMON_JOBS_COMPARISON_THRESHOLD, SPIDERMON_ITEM_COUNT_INCREASE, + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ) from .suites import ( SpiderCloseMonitorSuite, diff --git a/spidermon/contrib/scrapy/monitors/monitors.py b/spidermon/contrib/scrapy/monitors/monitors.py index 6221a19b..e7679227 100644 --- a/spidermon/contrib/scrapy/monitors/monitors.py +++ b/spidermon/contrib/scrapy/monitors/monitors.py @@ -21,6 +21,7 @@ SPIDERMON_JOBS_COMPARISON = "SPIDERMON_JOBS_COMPARISON" SPIDERMON_JOBS_COMPARISON_STATES = "SPIDERMON_JOBS_COMPARISON_STATES" SPIDERMON_JOBS_COMPARISON_TAGS = "SPIDERMON_JOBS_COMPARISON_TAGS" +SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = "SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS" SPIDERMON_JOBS_COMPARISON_THRESHOLD = "SPIDERMON_JOBS_COMPARISON_THRESHOLD" SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE" @@ -528,6 +529,11 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor): You can also filter which jobs to compare based on their tags using the ``SPIDERMON_JOBS_COMPARISON_TAGS`` setting. Among the defined tags we consider only those that are also present in the current job. + + You can also filter which jobs to compare based on their close reason using the + ``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``, + which doesn't filter any job based on close_reason. 
To only consider successfully finished jobs, + use ``("finished", )`` instead. """ stat_name = "item_scraped_count" @@ -556,6 +562,9 @@ def run(self, result): def _get_jobs(self, states, number_of_jobs): tags = self._get_tags_to_filter() + close_reasons = self.crawler.settings.getlist( + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, () + ) total_jobs = [] start = 0 @@ -571,9 +580,13 @@ def _get_jobs(self, states, number_of_jobs): count=count, has_tag=tags or None, ) - total_jobs.extend(current_jobs) - if len(current_jobs) < MAX_API_COUNT or len(total_jobs) == number_of_jobs: + for job in current_jobs: + if close_reasons and job.get("close_reason") not in close_reasons: + continue + total_jobs.append(job) + + if len(current_jobs) < MAX_API_COUNT or len(total_jobs) >= number_of_jobs: # Stop paginating if results are less than 1000 (pagination not required) # or target jobs was reached - no more pagination required break diff --git a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py index f01125a5..9d455949 100644 --- a/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py +++ b/tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py @@ -8,6 +8,7 @@ SPIDERMON_JOBS_COMPARISON_STATES, SPIDERMON_JOBS_COMPARISON_TAGS, SPIDERMON_JOBS_COMPARISON_THRESHOLD, + SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ZyteJobsComparisonMonitor, monitors, ) @@ -20,6 +21,17 @@ def mock_jobs(previous_counts): return Mock(return_value=[dict(items=c) for c in previous_counts]) +@pytest.fixture +def mock_jobs_with_close_reason(previous_job_objs, close_reasons): + return Mock( + return_value=[ + dict(items=j["items"], close_reason=j["close_reason"]) + for j in previous_job_objs + if j["close_reason"] in close_reasons + ] + ) + + @pytest.fixture def mock_suite(mock_jobs, monkeypatch): monkeypatch.setattr(ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs) @@ -30,6 +42,34 @@ def get_paginated_jobs(**kwargs): return 
[Mock() for _ in range(kwargs["count"])] +def get_paginated_jobs_with_finished_close_reason(**kwargs): + objs = [] + for _ in range(kwargs["count"]): + obj = Mock() + obj.get.return_value = "finished" + objs.append(obj) + + return objs + + +def get_paginated_jobs_with_cancel_close_reason(**kwargs): + objs = [] + for _ in range(kwargs["count"]): + obj = Mock() + obj.get.return_value = "cancel" + objs.append(obj) + + return objs + + +@pytest.fixture +def mock_suite_with_close_reason(mock_jobs_with_close_reason, monkeypatch): + monkeypatch.setattr( + ZyteJobsComparisonMonitor, "_get_jobs", mock_jobs_with_close_reason + ) + return MonitorSuite(monitors=[ZyteJobsComparisonMonitor]) + + @pytest.fixture def mock_suite_and_zyte_client( monkeypatch, @@ -119,6 +159,7 @@ def test_jobs_comparison_monitor_get_jobs(): monitor = TestZyteJobsComparisonMonitor() monitor._get_tags_to_filter = Mock(side_effect=lambda: None) monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = None mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs) # Return exact number of jobs @@ -134,6 +175,7 @@ def test_jobs_comparison_monitor_get_jobs(): monitor = TestZyteJobsComparisonMonitor() monitor._get_tags_to_filter = Mock(side_effect=lambda: None) monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = None output = [Mock(), Mock()] mock_client.spider.jobs.list = Mock(return_value=output) @@ -149,6 +191,7 @@ def test_jobs_comparison_monitor_get_jobs(): monitor = TestZyteJobsComparisonMonitor() monitor._get_tags_to_filter = Mock(side_effect=lambda: None) monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = None mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs) # Jobs bigger than 1000 @@ -156,6 +199,40 @@ def test_jobs_comparison_monitor_get_jobs(): assert len(jobs) == 2500 assert mock_client.spider.jobs.list.call_count == 3 + mock_client = Mock() + with patch( + "spidermon.contrib.scrapy.monitors.monitors.Client" + ) as 
mock_client_class: + mock_client_class.return_value = mock_client + monitor = TestZyteJobsComparisonMonitor() + monitor._get_tags_to_filter = Mock(side_effect=lambda: None) + monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = ["finished"] + mock_client.spider.jobs.list = Mock( + side_effect=get_paginated_jobs_with_finished_close_reason + ) + + # Return exact number of jobs + jobs = monitor._get_jobs(states=None, number_of_jobs=50) + assert len(jobs) == 50 + + mock_client = Mock() + with patch( + "spidermon.contrib.scrapy.monitors.monitors.Client" + ) as mock_client_class: + mock_client_class.return_value = mock_client + monitor = TestZyteJobsComparisonMonitor() + monitor._get_tags_to_filter = Mock(side_effect=lambda: None) + monitor.data = Mock() + monitor.crawler.settings.getlist.return_value = ["finished"] + mock_client.spider.jobs.list = Mock( + side_effect=get_paginated_jobs_with_cancel_close_reason + ) + + # Return no jobs as all will be filtered due to close reason + jobs = monitor._get_jobs(states=None, number_of_jobs=50) + assert len(jobs) == 0 + @pytest.mark.parametrize( ["item_count", "previous_counts", "threshold", "should_raise"],