From ca20c80497445ec1a5538b4cee388396326c96bf Mon Sep 17 00:00:00 2001 From: Thomas David Baker Date: Fri, 13 Sep 2024 21:15:56 -0700 Subject: [PATCH 1/3] Refactor forum scraper so it gets alllll the bugs --- modo_bugs/fetcher.py | 40 ++++++++++++++++++++++++++++++++------- modo_bugs/scrape_forum.py | 2 +- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/modo_bugs/fetcher.py b/modo_bugs/fetcher.py index f4b76a17d..a82d1ce76 100644 --- a/modo_bugs/fetcher.py +++ b/modo_bugs/fetcher.py @@ -11,6 +11,9 @@ from shared import configuration, fetch_tools, lazy +BUG_REPORTS_FORUM_BASE_URL = 'https://forums.mtgo.com' +BUG_REPORT_FORUM_BASE_PATH = '/index.php?forums/bug-reports.16/' + logger = logging.getLogger(__name__) @attrs.define @@ -138,7 +141,30 @@ def get_daybreak_label(url: str) -> str | None: return None -def get_forum_posts(url: str, all_pages: bool) -> list[ForumPost]: +def get_all_forum_posts() -> list[ForumPost]: + posts = [] + sections = get_section_urls() + for url in sections: + logger.info(f'Going to get all threads in section {url}') + posts += get_forum_posts(url) + return posts + +def get_section_urls() -> list[str]: + html = fetch_tools.fetch(BUG_REPORTS_FORUM_BASE_URL + BUG_REPORT_FORUM_BASE_PATH) + soup = BeautifulSoup(html, 'html.parser') + section_urls = [] + + for node in soup.find_all('a', class_='subNodeLink--forum'): + url = BUG_REPORTS_FORUM_BASE_URL + node['href'] + section_urls.append(url) + + for node in soup.find_all('div', class_='node--forum'): + url = BUG_REPORTS_FORUM_BASE_URL + node.find('h3', class_='node-title').find('a')['href'] + section_urls.append(url) + + return section_urls + +def get_forum_posts(url: str) -> list[ForumPost]: time.sleep(1) # Try not to get blocked by the Daybreak forums. 
html = fetch_tools.fetch(url) soup = BeautifulSoup(html, 'html.parser') @@ -150,17 +176,17 @@ def get_forum_posts(url: str, all_pages: bool) -> list[ForumPost]: # votes = post.find('span', class_='js-voteCount').text title = post.find('div', class_='structItem-title') t = title.find('a') - if t.attrs['href'].startswith('/index.php?forums/bug-reports.16'): + if t.attrs['href'].startswith('/index.php?forums'): label = t.text t = t.find_next_sibling('a') url = 'https://forums.mtgo.com' + t.attrs['href'] name = t.text posts.append(ForumPost(name, label, url)) - if all_pages: - next = soup.find('a', class_='pageNav-jump--next') - if next is not None: - url = 'https://forums.mtgo.com' + next.attrs['href'] - posts.extend(get_forum_posts(url, True)) + next = soup.find('a', class_='pageNav-jump--next') + if next is not None: + logger.info(f'Next page: {next.attrs["href"]}') + url = 'https://forums.mtgo.com' + next.attrs['href'] + posts.extend(get_forum_posts(url)) return posts def forum_to_discord(post: ForumPost) -> None: diff --git a/modo_bugs/scrape_forum.py b/modo_bugs/scrape_forum.py index a8de09716..1b0baab1f 100644 --- a/modo_bugs/scrape_forum.py +++ b/modo_bugs/scrape_forum.py @@ -13,7 +13,7 @@ def main() -> None: with open('bugs.json') as f: bugs = f.read() - posts = fetcher.get_forum_posts('https://forums.mtgo.com/index.php?forums/bug-reports.16/', True) + posts = fetcher.get_all_forum_posts() checked = [p.url for p in posts] bad = [] for p in posts: From a850890198af021a2d33bc985baaed4aed6afd80 Mon Sep 17 00:00:00 2001 From: Thomas David Baker Date: Fri, 13 Sep 2024 21:16:14 -0700 Subject: [PATCH 2/3] More output in janky bug script --- modo_bugs/untracked.py => untracked_bugs.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) rename modo_bugs/untracked.py => untracked_bugs.py (63%) diff --git a/modo_bugs/untracked.py b/untracked_bugs.py similarity index 63% rename from modo_bugs/untracked.py rename to untracked_bugs.py index ae6dbcc5a..54a6676c0 
100644 --- a/modo_bugs/untracked.py +++ b/untracked_bugs.py @@ -1,5 +1,8 @@ +from decksite.data import playability from shared import fetch_tools +# A little script to run to try and marry up our (modo-bugs) bugs with their (Daybreak MTGO bug forum) bugs + def main() -> None: their_bugs = fetch_tools.fetch_json('https://raw.githubusercontent.com/PennyDreadfulMTG/modo-bugs/master/forums.json') @@ -13,12 +16,14 @@ def main() -> None: print('Maybe\n', our_bug['description'], '\nis tracked by them as\n', their_bug['title']) print(their_bug['url']) print(our_bug['url'] + '\n') - print("= All of their bugs we aren't tracking:\n") + print(f"= All of their bugs we aren't tracking ({len(their_untracked_bugs)}):\n") for their_bug in their_untracked_bugs: print(f'[{their_bug["status"]}] {their_bug["title"]}\n{their_bug["url"]}\n') - print("= All of our bugs they aren't tracking:\n") + print(f"= All of our bugs they aren't tracking ({len(our_untracked_bugs)}):\n") + ranks = playability.rank() + our_untracked_bugs.sort(key=lambda bug: (not bug['pd_legal'], ranks.get(bug['card'], float('inf')) or float('inf'))) for our_bug in our_untracked_bugs: - print(f'[{our_bug["card"]}] {our_bug["description"]}\n{our_bug["url"]}\n') + print(f'[{our_bug["card"]}][{"LEGAL" if our_bug["pd_legal"] else "NOT LEGAL"}][Rank {ranks.get(our_bug["card"], float("inf"))}] {our_bug["description"]}\n{our_bug["url"]}\n') if __name__ == '__main__': From 35e7d3d3aea2438c3eec5704f65f3aaf3d137338 Mon Sep 17 00:00:00 2001 From: Thomas David Baker Date: Fri, 13 Sep 2024 21:38:16 -0700 Subject: [PATCH 3/3] Use some logging stuff instead of print I'll clean this up later by making shared.logger able to set log level.
--- untracked_bugs.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/untracked_bugs.py b/untracked_bugs.py index 54a6676c0..cae5f8130 100644 --- a/untracked_bugs.py +++ b/untracked_bugs.py @@ -1,29 +1,40 @@ +import logging + from decksite.data import playability from shared import fetch_tools -# A little script to run to try and marry up our (modo-bugs) bugs with their (Daybreak MTGO bug forum) bugs +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) +logger.addHandler(console_handler) +# A little script to run to try and marry up our (modo-bugs) bugs with their (Daybreak MTGO bug forum) bugs + def main() -> None: their_bugs = fetch_tools.fetch_json('https://raw.githubusercontent.com/PennyDreadfulMTG/modo-bugs/master/forums.json') their_untracked_bugs = [bug for bug in their_bugs.values() if not bug['tracked'] and bug['status'] not in ['Fixed', 'Not A Bug', 'No Fix Planned', 'Could Not Reproduce']] our_bugs = fetch_tools.fetch_json('https://raw.githubusercontent.com/PennyDreadfulMTG/modo-bugs/master/bugs.json') our_untracked_bugs = [bug for bug in our_bugs if not bug['support_thread']] - print('= Possible missing linkage:\n') + logger.info('= Possible missing linkage:\n') for our_bug in our_untracked_bugs: for their_bug in their_untracked_bugs: if our_bug['card'] in their_bug['title']: - print('Maybe\n', our_bug['description'], '\nis tracked by them as\n', their_bug['title']) - print(their_bug['url']) - print(our_bug['url'] + '\n') - print(f"= All of their bugs we aren't tracking ({len(their_untracked_bugs)}):\n") + logger.info('Maybe') + logger.info(our_bug['description']) + logger.info('is tracked by them as') + logger.info(their_bug['title']) + logger.info(their_bug['url']) + logger.info(our_bug['url'] + '\n') + logger.info(f"= All of their bugs we aren't tracking ({len(their_untracked_bugs)}):\n") for their_bug in
their_untracked_bugs: - print(f'[{their_bug["status"]}] {their_bug["title"]}\n{their_bug["url"]}\n') - print(f"= All of our bugs they aren't tracking ({len(our_untracked_bugs)}):\n") + logger.info(f'[{their_bug["status"]}] {their_bug["title"]}\n{their_bug["url"]}\n') + logger.info(f"= All of our bugs they aren't tracking ({len(our_untracked_bugs)}):\n") ranks = playability.rank() our_untracked_bugs.sort(key=lambda bug: (not bug['pd_legal'], ranks.get(bug['card'], float('inf')) or float('inf'))) for our_bug in our_untracked_bugs: - print(f'[{our_bug["card"]}][{"LEGAL" if our_bug["pd_legal"] else "NOT LEGAL"}][Rank {ranks.get(our_bug["card"], float("inf"))}] {our_bug["description"]}\n{our_bug["url"]}\n') + logger.info(f'[{our_bug["card"]}][{"LEGAL" if our_bug["pd_legal"] else "NOT LEGAL"}][Rank {ranks.get(our_bug["card"], float("inf"))}] {our_bug["description"]}\n{our_bug["url"]}\n') if __name__ == '__main__':