Charcoal-SE · user12986714 · Sep 2, 2020 · Sep 2, 2020 · Sep 2, 2020 · Sep 2, 2020
diff --git a/findspam.py b/findspam.py
@@ -44,6 +44,8 @@
 PUNCTUATION_RATIO = 0.42
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
+# Age threshold, in days, passed to the date check for linked YouTube videos.
+OLD_VIDEO_THRES = 5
+# Age threshold, in days, passed to the date check for linked Medium posts.
+OLD_MEDIUM_POST_THRES = 7
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)
 COMMON_MALFORMED_PROTOCOLS = [
@@ -622,6 +624,49 @@ def mostly_img(s, site):
     return False, ""
 
 
+def is_recent(date, now, thres):
+    """
+    Tell whether a scraped publication date is at most ``thres`` days old.
+
+    :param date: Sequence ``(month, day, year)`` of strings, e.g. ``("Sep", "2", "2020")``;
+                 the month is a three-letter English abbreviation.
+    :param now: ``datetime`` the publication date is compared against.
+    :param thres: Maximum age in days that still counts as recent.
+    :returns: ``True`` if the date is no more than ``thres`` days before ``now`` (or at most
+              one day after it), otherwise ``False``. Dates that cannot be interpreted are
+              reported as not recent.
+    """
+    # Explicit table instead of strftime("%b")/strptime, whose month names depend on the locale.
+    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
+    try:
+        posted = datetime(int(date[2]), months.index(date[0]) + 1, int(date[1]))
+    except (ValueError, IndexError, TypeError):
+        # Unknown month, non-numeric field, impossible day, or malformed input.
+        return False
+    # Compare whole calendar days so that month and year boundaries are handled correctly
+    # (e.g. "Aug 30" checked on Sep 2 is three days old).
+    age = (now.date() - posted.date()).days
+    # Tolerate a date one day ahead of the local clock: the page may show it in another timezone.
+    return -1 <= age <= thres
+
+
+def scrap_and_check(url_list, date_regex, thres, thing, timeout=10):
+    """
+    Download each URL and report the first page whose publication date is recent.
+
+    :param url_list: URLs to download.
+    :param date_regex: Pattern with three groups (month abbreviation, day, year) that locates
+                       the publication date in the page source.
+    :param thres: Age threshold in days, passed on to is_recent().
+    :param thing: Name of the kind of content, used at the start of the reason text.
+    :param timeout: Seconds to wait on each HTTP request before giving up on that URL.
+    :returns: ``(True, reason)`` for the first page that has exactly one date match which
+              is_recent() accepts, otherwise ``(False, "")``.
+    """
+    now = datetime.now()
+    checked = set()
+    for link in url_list:
+        # A post often repeats the same link; download each distinct URL only once.
+        if link in checked:
+            continue
+        checked.add(link)
+        try:
+            # Without a timeout, requests waits indefinitely on an unresponsive server.
+            page = requests.get(link, timeout=timeout).text
+            dates = regex.findall(date_regex, page)
+            # Pages with zero or several matches are ambiguous and therefore skipped.
+            if len(dates) == 1 and is_recent(dates[0], now, thres):
+                found = dates[0]
+                return True, "{} is posted on {} {}, {}".format(thing, found[0], found[1], found[2])
+        except Exception:
+            # Deliberately best-effort: a page that cannot be fetched or parsed must not
+            # abort the check of the remaining links.
+            continue
+    return False, ""
+
+
+@create_rule("Newly posted youtube video")
+def new_video(s, site):
+    # Build a youtu.be URL for every video ID found in the text, then let scrap_and_check()
+    # decide, using the OLD_VIDEO_THRES threshold, whether any of those pages shows a
+    # recent upload date. Its (matched, reason) result is returned unchanged.
+
+    # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
+    id_pattern = (r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/?.*"
+                  r"(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?")
+    # Upload date as it is embedded in the page source: month abbreviation, day, year.
+    upload_date_pattern = (r'"dateText":{"simpleText":"'
+                           r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
+                           r'[a-z]? (\d++), (\d++)"}')
+    video_urls = []
+    for video_id in regex.findall(id_pattern, s):
+        video_urls.append("https://youtu.be/" + video_id)
+    return scrap_and_check(video_urls, upload_date_pattern, OLD_VIDEO_THRES, "Video")
+
+
+@create_rule("Newly posted medium post")
+def new_medium_post(s, site):
+    # Collect the Medium links found in the text and let scrap_and_check() decide, using the
+    # OLD_MEDIUM_POST_THRES threshold, whether any of those pages shows a recent publication
+    # date. Its (matched, reason) result is returned unchanged.
+    link_pattern = r"medium\.com\/@?[\w-]*+\/[\w-]*+"
+    # NOTE(review): the class list below looks auto-generated by Medium's front end, so this
+    # pattern presumably stops matching whenever their markup changes - confirm periodically.
+    publish_date_pattern = (r'<a class="bh bi at au av aw ax ay az ba fu bd bl bm" '
+                            r'rel="noopener" href="[^"]*+">'
+                            r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)<\/a>')
+    # The link pattern has no scheme, so prepend one to obtain a fetchable URL.
+    post_urls = list(map("https://{}".format, regex.findall(link_pattern, s)))
+    return scrap_and_check(post_urls, publish_date_pattern, OLD_MEDIUM_POST_THRES, "Medium post")
+
+
 # noinspection PyUnusedLocal,PyMissingTypeHints
 @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
 def has_repeating_characters(s, site):