diff --git a/findspam.py b/findspam.py
index 3bb2a7dafd..c0fd0a9910 100644
--- a/findspam.py
+++ b/findspam.py
@@ -44,6 +44,9 @@
 PUNCTUATION_RATIO = 0.42
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
+# Maximum age in days for a linked YouTube video / Medium post to count as newly posted
+OLD_VIDEO_THRES = 5
+OLD_MEDIUM_POST_THRES = 7
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)
 COMMON_MALFORMED_PROTOCOLS = [
@@ -622,6 +625,104 @@ def mostly_img(s, site):
     return False, ""
 
 
+# Month abbreviations captured by the date regexes below, mapped to month numbers. A fixed
+# table keeps the date check independent of the process locale, unlike strftime("%b").
+MONTH_NUMBERS = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
+                 "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}
+
+
+def is_recent(date, now, thres):
+    """
+    Check whether a scraped publication date is at most `thres` days before `now`.
+
+    :param date: (month abbreviation, day, year) strings, e.g. ("Jan", "5", "2020")
+    :param now: datetime the age is measured against
+    :param thres: maximum age in days that still counts as recent
+    :return: True if the date is valid and not older than `thres` days, otherwise False;
+             dates later than `now` (e.g. time zone differences) also count as recent
+    """
+    try:
+        posted = datetime(int(date[2]), MONTH_NUMBERS[date[0]], int(date[1]))
+    except (KeyError, ValueError, OverflowError):
+        # Unknown month abbreviation or impossible calendar date: not a usable date
+        return False
+    # Subtract whole calendar dates so that month and year boundaries are handled,
+    # e.g. a post from Jan 30 is still recent on Feb 1.
+    return (now.date() - posted.date()).days <= thres
+
+
+def scrap_and_check(url_list, date_regex, thres, thing, timeout=10):
+    """
+    Fetch each URL and report the first page whose publication date is recent.
+
+    :param url_list: URLs to fetch; duplicates are fetched only once
+    :param date_regex: pattern capturing (month abbreviation, day, year) in the page source
+    :param thres: maximum age in days, as understood by is_recent
+    :param thing: label that starts the reason text, e.g. "Video"
+    :param timeout: seconds to wait for each HTTP response before giving up on that URL
+    :return: (True, reason) for the first recent page, otherwise (False, "")
+    """
+    now = datetime.now()
+    seen = set()
+    for link in url_list:
+        if link in seen:
+            continue
+        seen.add(link)
+        try:
+            # Without a timeout a stalled server would block this rule indefinitely
+            page = requests.get(link, timeout=timeout).text
+        except Exception:
+            # Best effort: the URLs come from post content, so any fetch failure
+            # just means this link cannot be checked.
+            continue
+        dates = regex.findall(date_regex, page)
+        # Only a page with exactly one date match is trusted; several are ambiguous
+        if len(dates) == 1 and is_recent(dates[0], now, thres):
+            month, day, year = dates[0]
+            return True, "{} is posted on {} {}, {}".format(thing, month, day, year)
+    return False, ""
+
+
+@create_rule("Newly posted youtube video")
+def new_video(s, site):
+    """
+    Look for links to YouTube videos published within the last OLD_VIDEO_THRES days.
+
+    :param s: text to search for YouTube links
+    :param site: unused
+    :return: the (matched, reason) tuple produced by scrap_and_check
+    """
+    # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
+    youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
+                                r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
+    youtube_links = ["https://youtu.be/" + x for x in youtube_ids]
+    return scrap_and_check(youtube_links,
+                           r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
+                           r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}',
+                           OLD_VIDEO_THRES,
+                           "Video")
+
+
+@create_rule("Newly posted medium post")
+def new_medium_post(s, site):
+    """
+    Look for links to Medium posts published within the last OLD_MEDIUM_POST_THRES days.
+
+    :param s: text to search for Medium links
+    :param site: unused
+    :return: the (matched, reason) tuple produced by scrap_and_check
+    """
+    medium_links_core = regex.findall(r"medium\.com\/@?[\w-]*+\/[\w-]*+", s)
+    medium_links = ["https://" + x for x in medium_links_core]
+    # \s+ accepts any whitespace between month and day. NOTE(review): the separator in the
+    # reviewed pattern was not a plain space - confirm against a live Medium page.
+    return scrap_and_check(medium_links,
+                           r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]?\s+' +
+                           r'(\d++), (\d++)<\/a>',
+                           OLD_MEDIUM_POST_THRES,
+                           "Medium post")
+
+
 # noinspection PyUnusedLocal,PyMissingTypeHints
 @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
 def has_repeating_characters(s, site):