Skip to content

Commit

Permalink
Addition of SPIDERMON_MONITOR_SKIPPING_RULES (#384)
Browse files Browse the repository at this point in the history
* added skip rules

* added passing method to skip rule support

* added doc string

* pre-commit

* pre-commit

* - enhanced operators in comparison
- updated doc string placement

* Added test cases

* handled requested changes on PR
  • Loading branch information
shafiq-muhammad committed Aug 10, 2023
1 parent e6509dd commit aa76291
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 0 deletions.
79 changes: 79 additions & 0 deletions spidermon/contrib/scrapy/monitors/base.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,97 @@
import operator
import logging

from spidermon import Monitor
from spidermon.exceptions import NotConfigured

from ...monitors.mixins.spider import SpiderMonitorMixin

logger = logging.getLogger(__name__)


class BaseScrapyMonitor(Monitor, SpiderMonitorMixin):
    """
    Monitor can be skipped based on conditions given in the settings.
    The purpose is to skip a monitor based on stat value or any custom
    function. A scenario could be skipping the Field Coverage Monitor
    when a spider produced no items. Following is a code block of
    examples of how we can configure the skip rules in settings.
    Example #1: skip rules based on stat values
    .. code-block:: python
        class QuotesSpider(scrapy.Spider):
            name = "quotes"
            custom_settings = {
                "SPIDERMON_FIELD_COVERAGE_RULES": {
                    "dict/quote": 1,
                    "dict/author": 1,
                },
                "SPIDERMON_MONITOR_SKIPPING_RULES": {
                    "Field Coverage Monitor": [["item_scraped_count", "==", 0]],
                }
            }
    Example #2: skip rules based on a custom function
    .. code-block:: python
        def skip_function(monitor):
            return "item_scraped_count" not in monitor.data.stats
        class QuotesSpider(scrapy.Spider):
            name = "quotes"
            custom_settings = {
                "SPIDERMON_FIELD_COVERAGE_RULES": {
                    "dict/quote": 1,
                    "dict/author": 1,
                },
                "SPIDERMON_MONITOR_SKIPPING_RULES": {
                    "Field Coverage Monitor": [skip_function],
                }
            }
    """

    longMessage = False
    # Comparison operators supported in stat-based skip rules
    # (["stat_name", "<op>", value] triples).
    ops = {
        ">": operator.gt,
        ">=": operator.ge,
        "<": operator.lt,
        "<=": operator.le,
        "==": operator.eq,
        "!=": operator.ne,
    }

    @property
    def monitor_description(self):
        # Use the first line of the subclass docstring as the description,
        # falling back to the base-class behavior when there is none.
        if self.__class__.__doc__:
            return self.__class__.__doc__.split("\n")[0]
        return super().monitor_description

    def run(self, result):
        # Short-circuit the whole monitor when any skip rule matches.
        if self.check_if_skip_rule_met():
            # Lazy %-style args so the message is only formatted when emitted.
            logger.info("Skipping %s monitor", self.monitor_name)
            return None

        return super().run(result)

    def check_if_skip_rule_met(self):
        """Return True when any configured skip rule for this monitor matches.

        Rules come from ``self.skip_rules`` (set by the suite from the
        SPIDERMON_MONITOR_SKIPPING_RULES setting), keyed by monitor name.
        A rule is either a callable receiving this monitor, or a
        ``[stat_name, operator, value]`` triple evaluated against
        ``self.data.stats`` (missing stats default to 0).
        """
        monitor_name = getattr(self, "monitor_name", None)
        rules = getattr(self, "skip_rules", {}).get(monitor_name) if monitor_name else None
        if not rules:
            return False

        for rule in rules:
            if callable(rule):  # idiomatic replacement for hasattr(rule, "__call__")
                if rule(self):
                    return True
                continue
            stat_name, op_symbol, expected = rule
            stats_value = self.data.stats.get(stat_name, 0)
            if self.ops[op_symbol](stats_value, expected):
                return True

        return False


class BaseStatMonitor(BaseScrapyMonitor):
"""Base Monitor class for stat-related monitors.
Expand Down
24 changes: 24 additions & 0 deletions spidermon/contrib/scrapy/monitors/suites.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,30 @@ class SpiderCloseMonitorSuite(MonitorSuite):
)
"""

def __init__(
    self,
    name=None,
    monitors=None,
    monitors_finished_actions=None,
    monitors_passed_actions=None,
    monitors_failed_actions=None,
    order=None,
    crawler=None,
):
    """Build the suite and propagate SPIDERMON_MONITOR_SKIPPING_RULES.

    After the base suite is constructed, each monitor gets a
    ``skip_rules`` attribute so BaseScrapyMonitor.run() can decide
    whether to skip it.
    """
    super().__init__(
        name,
        monitors,
        monitors_finished_actions,
        monitors_passed_actions,
        monitors_failed_actions,
        order,
        crawler,
    )
    # Guard against crawler=None (the parameter's default) — the original
    # dict(crawler.settings) would raise AttributeError. Also read the
    # setting once instead of converting to dict and looking it up twice.
    skip_rules = (
        crawler.settings.get("SPIDERMON_MONITOR_SKIPPING_RULES")
        if crawler is not None
        else None
    )
    if skip_rules:
        for monitor in self.monitors:
            monitor.skip_rules = skip_rules

monitors = [
ItemCountMonitor,
ItemValidationMonitor,
Expand Down
115 changes: 115 additions & 0 deletions tests/contrib/scrapy/monitors/test_monitor_skipping_rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import operator
import pytest
from spidermon.contrib.scrapy.monitors import ItemCountMonitor
from spidermon import settings
from scrapy.utils.test import get_crawler
from spidermon.contrib.scrapy.monitors.suites import SpiderCloseMonitorSuite


# Mirror of BaseScrapyMonitor.ops: maps rule operator symbols to comparison
# functions, used below to recompute whether a skip rule should have matched.
ops = {
    ">": operator.gt,
    ">=": operator.ge,
    "<": operator.lt,
    "<=": operator.le,
    "==": operator.eq,
    "!=": operator.ne,
}


def always_skip(monitor):
    """Custom skip rule that unconditionally skips the monitor."""
    return True


def never_skip(monitor):
    """Custom skip rule that never skips the monitor."""
    return False


# Minimal suite wired with a single monitor for the tests below.
# NOTE(review): class name should be PascalCase per PEP 8 (e.g. TestMonitorSuite),
# but renaming would touch every test referencing it.
class monitorSuite(SpiderCloseMonitorSuite):
    monitors = [ItemCountMonitor]


@pytest.mark.parametrize(
    "value,threshold,expected_status,rules",
    [
        (100, 100, settings.MONITOR.STATUS.SUCCESS, None),
        (1000, 1, settings.MONITOR.STATUS.SUCCESS, None),
        (1, 0, settings.MONITOR.STATUS.SUCCESS, None),
        (0, 10, None, {"Extracted Items Monitor": [["item_scraped_count", "<", 1]]}),
        (50, 100, None, {"Extracted Items Monitor": [["item_scraped_count", "<", 60]]}),
        (
            99,
            100,
            settings.MONITOR.STATUS.FAILURE,
            {"Extracted Items Monitor": [["item_scraped_count", "<", 1]]},
        ),
        (
            101,
            100,
            settings.MONITOR.STATUS.SUCCESS,
            {"Extracted Items Monitor": [["item_scraped_count", "<", 1]]},
        ),
    ],
)
def test_skipping_rule_on_stats_value(
    make_data, value, threshold, expected_status, rules
):
    """A monitor is skipped when a stat-based rule matches, and runs normally
    (producing the expected status) when no rule matches or none is configured.
    """
    data = make_data(
        {
            ItemCountMonitor.threshold_setting: threshold,
        }
    )

    # Named settings_dict to avoid shadowing the imported ``spidermon.settings``.
    settings_dict = {"SPIDERMON_MONITOR_SKIPPING_RULES": rules}
    crawler = get_crawler(settings_dict=settings_dict)
    new_suite = monitorSuite(crawler=crawler)

    runner = data.pop("runner")
    data["stats"][ItemCountMonitor.stat_name] = value
    runner.run(new_suite, **data)

    if rules:
        rule = rules["Extracted Items Monitor"][0]
        # (A stray, result-discarding duplicate of this call was removed.)
        if ops[rule[1]](value, rule[2]):  # Rule matched, so the monitor didn't run.
            assert runner.result.monitor_results == []
            return

    assert runner.result.monitor_results[0].status == expected_status


@pytest.mark.parametrize(
    "value,threshold,expected_status,rules",
    [
        (0, 10, None, {"Extracted Items Monitor": [always_skip]}),
        (
            50,
            100,
            settings.MONITOR.STATUS.FAILURE,
            {"Extracted Items Monitor": [never_skip]},
        ),
    ],
)
def test_skipping_rule_on_callable_function(
    make_data, value, threshold, expected_status, rules
):
    """A monitor is skipped when a callable rule returns True, and runs
    normally when the callable returns False.
    """
    data = make_data(
        {
            ItemCountMonitor.threshold_setting: threshold,
        }
    )

    # Named settings_dict to avoid shadowing the imported ``spidermon.settings``.
    settings_dict = {"SPIDERMON_MONITOR_SKIPPING_RULES": rules}
    crawler = get_crawler(settings_dict=settings_dict)
    new_suite = monitorSuite(crawler=crawler)

    runner = data.pop("runner")
    data["stats"][ItemCountMonitor.stat_name] = value
    runner.run(new_suite, **data)

    if rules:
        rule = rules["Extracted Items Monitor"][0]
        # Identity check is more robust than comparing __name__ strings.
        if rule is always_skip:
            assert runner.result.monitor_results == []
            return

    assert runner.result.monitor_results[0].status == expected_status

0 comments on commit aa76291

Please sign in to comment.