From 846a1417f0a6b8d5c90c0cfb7a6fa3aab08313e4 Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Fri, 14 Jun 2024 12:16:52 -0400 Subject: [PATCH] Made sync more tolerant of poorly configured webservers Treeinfo download will ignore results that look like HTML. Some webservers return 200 with an HTML error page rather than 404. closes #3599 (cherry picked from commit 298f3a9392472c6185ed869d00baddfb98f1c0f0) --- CHANGES/3599.bugfix | 1 + pulp_rpm/app/kickstart/treeinfo.py | 6 +++--- pulp_rpm/app/tasks/synchronizing.py | 17 +++++++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 CHANGES/3599.bugfix diff --git a/CHANGES/3599.bugfix b/CHANGES/3599.bugfix new file mode 100644 index 000000000..60c18bf71 --- /dev/null +++ b/CHANGES/3599.bugfix @@ -0,0 +1 @@ +Made sync more tolerant of poorly configured webservers. diff --git a/pulp_rpm/app/kickstart/treeinfo.py b/pulp_rpm/app/kickstart/treeinfo.py index c21f7074a..cc960443c 100644 --- a/pulp_rpm/app/kickstart/treeinfo.py +++ b/pulp_rpm/app/kickstart/treeinfo.py @@ -17,13 +17,13 @@ class PulpTreeInfo(TreeInfo): """ - def load(self, f): + def loads(self, s): """ - Load data from a file. + Load data from a string. """ try: - super().load(f) + super().loads(s) except MissingSectionHeaderError: raise TypeError(_("Treeinfo file should have INI format")) diff --git a/pulp_rpm/app/tasks/synchronizing.py b/pulp_rpm/app/tasks/synchronizing.py index c54ea63f9..7608c3be8 100644 --- a/pulp_rpm/app/tasks/synchronizing.py +++ b/pulp_rpm/app/tasks/synchronizing.py @@ -404,8 +404,9 @@ def get_treeinfo_data(remote, remote_url): namespaces = [".treeinfo", "treeinfo"] for namespace in namespaces: + treeinfo_url = urlpath_sanitize(remote_url, namespace) downloader = remote.get_downloader( - url=urlpath_sanitize(remote_url, namespace), + url=treeinfo_url, silence_errors_for_response_status_codes={403, 404}, ) @@ -415,7 +416,19 @@ def get_treeinfo_data(remote, remote_url): continue treeinfo = PulpTreeInfo() - treeinfo.load(f=result.path) + with open(result.path, "r") as f: + treeinfo_str = f.read() + # some impolitely configured webservers return HTTP 200 with an HTML error page + # when a resource isn't found, instead of returning an HTTP 404 code + if treeinfo_str.startswith("<"): + # in the event that the response looks like HTML rather than an INI file, + # let's just pretend it returned 404 + log.debug( + f"Server returned 200 for {treeinfo_url}, but the result looks like HTML" + " rather than treeinfo. Ignoring it." + ) + continue + treeinfo.loads(treeinfo_str) sha256 = result.artifact_attributes["sha256"] treeinfo_data = TreeinfoData(treeinfo.parsed_sections())