Skip to content

Commit

Permalink
Addition of SPIDERMON_MONITOR_SKIPPING_RULES (#384)
Browse files Browse the repository at this point in the history
* added skip rules

* added passing method to skip rule support

* added doc string

* pre-commit

* pre-commit

* - enhanced operators in comparison
- updated doc string placement

* Added test cases

* handled requested changes on PR
  • Loading branch information
shafiq-muhammad committed Aug 10, 2023
1 parent e6509dd commit aa76291
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 0 deletions.
79 changes: 79 additions & 0 deletions spidermon/contrib/scrapy/monitors/base.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,97 @@
import operator
import logging

from spidermon import Monitor
from spidermon.exceptions import NotConfigured

from ...monitors.mixins.spider import SpiderMonitorMixin

logger = logging.getLogger(__name__)


class BaseScrapyMonitor(Monitor, SpiderMonitorMixin):
    """
    Monitor can be skipped based on conditions given in the settings.
    The purpose is to skip a monitor based on stat value or any custom
    function. A scenario could be skipping the Field Coverage Monitor
    when a spider produced no items. Following is a code block of
    examples of how we can configure the skip rules in settings.
    Example #1: skip rules based on stat values
    .. code-block:: python
        class QuotesSpider(scrapy.Spider):
            name = "quotes"
            custom_settings = {
                "SPIDERMON_FIELD_COVERAGE_RULES": {
                    "dict/quote": 1,
                    "dict/author": 1,
                },
                "SPIDERMON_MONITOR_SKIPPING_RULES": {
                    "Field Coverage Monitor": [["item_scraped_count", "==", 0]],
                }
            }
    Example #2: skip rules based on a custom function
    .. code-block:: python
        def skip_function(monitor):
            return "item_scraped_count" not in monitor.data.stats
        class QuotesSpider(scrapy.Spider):
            name = "quotes"
            custom_settings = {
                "SPIDERMON_FIELD_COVERAGE_RULES": {
                    "dict/quote": 1,
                    "dict/author": 1,
                },
                "SPIDERMON_MONITOR_SKIPPING_RULES": {
                    "Field Coverage Monitor": [skip_function],
                }
            }
    """

    longMessage = False
    # Comparison operators supported in stat-based skip rules
    # (["stat_name", "<op>", value] triples).
    ops = {
        ">": operator.gt,
        ">=": operator.ge,
        "<": operator.lt,
        "<=": operator.le,
        "==": operator.eq,
        "!=": operator.ne,
    }

    @property
    def monitor_description(self):
        # Use the first line of the subclass docstring as the description,
        # falling back to the base-class behavior when there is none.
        if self.__class__.__doc__:
            return self.__class__.__doc__.split("\n")[0]
        return super().monitor_description

    def run(self, result):
        # Short-circuit the whole monitor when any skip rule matches.
        if self.check_if_skip_rule_met():
            # Lazy %-style args so the message is only formatted when emitted.
            logger.info("Skipping %s monitor", self.monitor_name)
            return None

        return super().run(result)

    def check_if_skip_rule_met(self):
        """Return True when any configured skip rule for this monitor matches.

        Rules come from ``self.skip_rules`` (set by the suite from the
        SPIDERMON_MONITOR_SKIPPING_RULES setting), keyed by monitor name.
        A rule is either a callable receiving this monitor, or a
        ``[stat_name, operator, value]`` triple evaluated against
        ``self.data.stats`` (missing stats default to 0).
        """
        monitor_name = getattr(self, "monitor_name", None)
        rules = getattr(self, "skip_rules", {}).get(monitor_name) if monitor_name else None
        if not rules:
            return False

        for rule in rules:
            if callable(rule):  # idiomatic replacement for hasattr(rule, "__call__")
                if rule(self):
                    return True
                continue
            stat_name, op_symbol, expected = rule
            stats_value = self.data.stats.get(stat_name, 0)
            if self.ops[op_symbol](stats_value, expected):
                return True

        return False


class BaseStatMonitor(BaseScrapyMonitor):
"""Base Monitor class for stat-related monitors.
Expand Down
24 changes: 24 additions & 0 deletions spidermon/contrib/scrapy/monitors/suites.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,30 @@ class SpiderCloseMonitorSuite(MonitorSuite):
)
"""

def __init__(
    self,
    name=None,
    monitors=None,
    monitors_finished_actions=None,
    monitors_passed_actions=None,
    monitors_failed_actions=None,
    order=None,
    crawler=None,
):
    """Build the suite and propagate SPIDERMON_MONITOR_SKIPPING_RULES.

    After the base suite is constructed, each monitor gets a
    ``skip_rules`` attribute so BaseScrapyMonitor.run() can decide
    whether to skip it.
    """
    super().__init__(
        name,
        monitors,
        monitors_finished_actions,
        monitors_passed_actions,
        monitors_failed_actions,
        order,
        crawler,
    )
    # Guard against crawler=None (the parameter's default) — the original
    # dict(crawler.settings) would raise AttributeError. Also read the
    # setting once instead of converting to dict and looking it up twice.
    skip_rules = (
        crawler.settings.get("SPIDERMON_MONITOR_SKIPPING_RULES")
        if crawler is not None
        else None
    )
    if skip_rules:
        for monitor in self.monitors:
            monitor.skip_rules = skip_rules

monitors = [
ItemCountMonitor,
ItemValidationMonitor,
Expand Down
115 changes: 115 additions & 0 deletions tests/contrib/scrapy/monitors/test_monitor_skipping_rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import operator
import pytest
from spidermon.contrib.scrapy.monitors import ItemCountMonitor
from spidermon import settings
from scrapy.utils.test import get_crawler
from spidermon.contrib.scrapy.monitors.suites import SpiderCloseMonitorSuite


# Mirror of BaseScrapyMonitor.ops: maps rule operator symbols to comparison
# functions, used below to recompute whether a skip rule should have matched.
ops = {
    ">": operator.gt,
    ">=": operator.ge,
    "<": operator.lt,
    "<=": operator.le,
    "==": operator.eq,
    "!=": operator.ne,
}


def always_skip(monitor):
    """Custom skip rule that unconditionally skips the monitor."""
    return True


def never_skip(monitor):
    """Custom skip rule that never skips the monitor."""
    return False


# Minimal suite wired with a single monitor for the tests below.
# NOTE(review): class name should be PascalCase per PEP 8 (e.g. TestMonitorSuite),
# but renaming would touch every test referencing it.
class monitorSuite(SpiderCloseMonitorSuite):
    monitors = [ItemCountMonitor]


@pytest.mark.parametrize(
    "value,threshold,expected_status,rules",
    [
        (100, 100, settings.MONITOR.STATUS.SUCCESS, None),
        (1000, 1, settings.MONITOR.STATUS.SUCCESS, None),
        (1, 0, settings.MONITOR.STATUS.SUCCESS, None),
        (0, 10, None, {"Extracted Items Monitor": [["item_scraped_count", "<", 1]]}),
        (50, 100, None, {"Extracted Items Monitor": [["item_scraped_count", "<", 60]]}),
        (
            99,
            100,
            settings.MONITOR.STATUS.FAILURE,
            {"Extracted Items Monitor": [["item_scraped_count", "<", 1]]},
        ),
        (
            101,
            100,
            settings.MONITOR.STATUS.SUCCESS,
            {"Extracted Items Monitor": [["item_scraped_count", "<", 1]]},
        ),
    ],
)
def test_skipping_rule_on_stats_value(
    make_data, value, threshold, expected_status, rules
):
    """A monitor is skipped when a stat-based rule matches, and runs normally
    (producing the expected status) when no rule matches or none is configured.
    """
    data = make_data(
        {
            ItemCountMonitor.threshold_setting: threshold,
        }
    )

    # Named settings_dict to avoid shadowing the imported ``spidermon.settings``.
    settings_dict = {"SPIDERMON_MONITOR_SKIPPING_RULES": rules}
    crawler = get_crawler(settings_dict=settings_dict)
    new_suite = monitorSuite(crawler=crawler)

    runner = data.pop("runner")
    data["stats"][ItemCountMonitor.stat_name] = value
    runner.run(new_suite, **data)

    if rules:
        rule = rules["Extracted Items Monitor"][0]
        # (A stray, result-discarding duplicate of this call was removed.)
        if ops[rule[1]](value, rule[2]):  # Rule matched, so the monitor didn't run.
            assert runner.result.monitor_results == []
            return

    assert runner.result.monitor_results[0].status == expected_status


@pytest.mark.parametrize(
    "value,threshold,expected_status,rules",
    [
        (0, 10, None, {"Extracted Items Monitor": [always_skip]}),
        (
            50,
            100,
            settings.MONITOR.STATUS.FAILURE,
            {"Extracted Items Monitor": [never_skip]},
        ),
    ],
)
def test_skipping_rule_on_callable_function(
    make_data, value, threshold, expected_status, rules
):
    """A monitor is skipped when a callable rule returns True, and runs
    normally when the callable returns False.
    """
    data = make_data(
        {
            ItemCountMonitor.threshold_setting: threshold,
        }
    )

    # Named settings_dict to avoid shadowing the imported ``spidermon.settings``.
    settings_dict = {"SPIDERMON_MONITOR_SKIPPING_RULES": rules}
    crawler = get_crawler(settings_dict=settings_dict)
    new_suite = monitorSuite(crawler=crawler)

    runner = data.pop("runner")
    data["stats"][ItemCountMonitor.stat_name] = value
    runner.run(new_suite, **data)

    if rules:
        rule = rules["Extracted Items Monitor"][0]
        # Identity check is more robust than comparing __name__ strings.
        if rule is always_skip:
            assert runner.result.monitor_results == []
            return

    assert runner.result.monitor_results[0].status == expected_status

0 comments on commit aa76291

Please sign in to comment.