Commit 892d58a

Making CI pipeline faster by parallelizing the PDF checkers execution

Lucas-C committed Jul 26, 2023
1 parent 54d2eb0 · commit 892d58a
Showing 6 changed files with 104 additions and 64 deletions.
.github/workflows/continuous-integration-workflow.yml (8 changes: 4 additions & 4 deletions)

@@ -50,12 +50,12 @@ jobs:
           export PYTHONPATH=$PWD
           # Using Datalogics PDF Checker:
           scripts/install-pdfchecker.sh
-          find . -name '*.pdf' | xargs -n 1 scripts/pdfchecker.py
-          scripts/pdfchecker.py  # printing aggregated report
+          scripts/pdfchecker.py --process-all-test-pdf-files
+          scripts/pdfchecker.py --print-aggregated-report
           # Using VeraPDF:
           scripts/install-verapdf.sh
-          find . -name '*.pdf' | xargs -n 1 scripts/verapdf.py
-          scripts/verapdf.py  # printing aggregated report
+          scripts/verapdf.py --process-all-test-pdf-files
+          scripts/verapdf.py --print-aggregated-report
       - name: Running tests ☑
         env:
           CHECK_EXEC_TIME: ${{ matrix.python-version == '3.9' && 'test-enabled' || '' }}
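
Replacing the `find … | xargs -n 1` pipeline matters because that form launched a fresh Python interpreter (plus checker start-up) for every single PDF file. The new `--process-all-test-pdf-files` flag makes one process scan the test suite itself and spread the checks over all available CPUs, as shown in the `multiprocessing.Pool` sketch after the `scripts/checker_commons.py` diff below, while `--print-aggregated-report` takes over the role of the previous bare, argument-less call.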
CHANGELOG.md (2 changes: 1 addition & 1 deletion)

@@ -29,7 +29,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 - [`FPDF.image()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.image): allowing images path starting with `data` to be passed as input
 - text overflow is better handled by `FPDF.write()` & `FPDF.write_html()` - _cf._ [issue #847](https://github.com/PyFPDF/fpdf2/issues/847)
 - the initial text color is preserved when using `FPDF.write_html()` - _cf._ [issue #846](https://github.com/PyFPDF/fpdf2/issues/846)
-- handle superscript and subscript correctly when rendering `TextLine`- [Pull Request #862](https://github.com/PyFPDF/fpdf2/pull/862)
+- handle superscript and subscript correctly when rendering `TextLine`- thanks to @Tolker-KU - _cf._ [Pull Request #862](https://github.com/PyFPDF/fpdf2/pull/862)
 ### Deprecated
 - the `center` optional parameter of [`FPDF.cell()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.cell) is **no more** deprecated, as it allows for horizontal positioning, which is different from text alignment control with `align="C"`
fpdf/html.py (4 changes: 2 additions & 2 deletions)

@@ -266,9 +266,9 @@ def handle_data(self, data):
         if not data:
             return
         if "inserted" in self.td_th:
-            tag = self.td_th["tag"]
+            td_th_tag = self.td_th["tag"]
             raise NotImplementedError(
-                f"Unsupported nested HTML tags inside <{tag}> element"
+                f"Unsupported nested HTML tags inside <{td_th_tag}> element: <{self._tags_stack[-1]}>"
             )
         # We could potentially support nested <b> / <em> / <font> tags
         # by building a list of Fragment instances from the HTML cell content
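
For context, this error is raised by `FPDF.write_html()` when formatting tags are nested inside a table cell, which was unsupported at this point; the added placeholder now also names the offending nested tag. A hypothetical trigger, assuming a recent-enough fpdf2 where `write_html()` is a plain `FPDF` method (exact markup requirements and message wording vary across versions):

from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
try:
    # nested <b> inside <td> was not supported at this point
    pdf.write_html("<table><tr><td><b>bold cell</b></td></tr></table>")
except NotImplementedError as error:
    print(error)  # e.g. "Unsupported nested HTML tags inside <td> element: <b>"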
scripts/checker_commons.py (82 changes: 70 additions & 12 deletions)

@@ -1,8 +1,64 @@
-import json, sys
+import json, os, sys
 from collections import defaultdict
+from multiprocessing import cpu_count, Pool
 
+try:  # optional dependency to display a progress bar
+    from tqdm import tqdm
+
+    HIDE_STDERR = True
+except ImportError:
+    tqdm = lambda _, total: _
+    HIDE_STDERR = False
 
 
-def aggregate(pdf_filepath, report, aggregated_report_filepath):
+def main(checker_name, analyze_pdf_file, argv, checks_details_url):
+    if len(argv) != 2:
+        print(argv, file=sys.stderr)
+        print(
+            f"Exactly one argument must be passed to {checker_name}.py", file=sys.stderr
+        )
+        sys.exit(2)
+    elif argv[1] == "--print-aggregated-report":
+        print_aggregated_report(checker_name, checks_details_url)
+    elif argv[1] == "--process-all-test-pdf-files":
+        process_all_test_pdf_files(checker_name, analyze_pdf_file)
+    else:
+        print(analyze_pdf_file(argv[1]))
+
+
+def process_all_test_pdf_files(checker_name, analyze_pdf_file):
+    pdf_filepaths = [
+        entry.path
+        for entry in scantree("test")
+        if entry.is_file() and entry.name.endswith(".pdf")
+    ]
+    print(
+        f"Starting parallel execution of {checker_name} on {len(pdf_filepaths)} PDF files with {cpu_count()} CPUs"
+    )
+    with Pool(cpu_count()) as pool:
+        reports_per_pdf_filepath = {}
+        for pdf_filepath, report in tqdm(
+            pool.imap_unordered(analyze_pdf_file, pdf_filepaths),
+            total=len(pdf_filepaths),
+        ):
+            reports_per_pdf_filepath[pdf_filepath] = report
+    agg_report = aggregate(checker_name, reports_per_pdf_filepath)
+    print(
+        "Failures:", len(agg_report["failures"]), "Errors:", len(agg_report["errors"])
+    )
+
+
+def scantree(path):
+    """Recursively yield DirEntry objects for given directory."""
+    for entry in os.scandir(path):
+        if entry.is_dir():
+            yield from scantree(entry.path)
+        else:
+            yield entry
+
+
+def aggregate(checker_name, reports_per_pdf_filepath):
+    aggregated_report_filepath = f"{checker_name}-aggregated.json"
     agg_report = {
         "failures": defaultdict(list),
         "errors": defaultdict(list),
@@ -14,21 +70,23 @@ def aggregate(pdf_filepath, report, aggregated_report_filepath):
         agg_report["errors"].update(prev_agg_report["errors"])
     except FileNotFoundError:
         print("Initializing a new JSON file for the aggregated report")
+    report = list(reports_per_pdf_filepath.items())[0][1]
     if "version" in report:
         agg_report["version"] = report.pop("version")
-    if "failure" in report:
-        failure = report["failure"]
-        agg_report["failures"][failure].append(pdf_filepath)
-    else:
-        for error in report.get("errors", []):
-            agg_report["errors"][error].append(pdf_filepath)
+    for pdf_filepath, report in reports_per_pdf_filepath.items():
+        if "failure" in report:
+            agg_report["failures"][report["failure"]].append(pdf_filepath)
+        else:
+            for error in report.get("errors", ()):
+                agg_report["errors"][error].append(pdf_filepath)
     with open(aggregated_report_filepath, "w", encoding="utf8") as agg_file:
-        json.dump(agg_report, agg_file)
+        json.dump(agg_report, agg_file, indent=4)
+    return agg_report
 
 
-def print_aggregated_report(
-    aggregated_report_filepath, checks_details_url, ignore_whitelist_filepath
-):
+def print_aggregated_report(checker_name, checks_details_url):
+    aggregated_report_filepath = f"{checker_name}-aggregated.json"
+    ignore_whitelist_filepath = f"scripts/{checker_name}-ignore.json"
     with open(aggregated_report_filepath, encoding="utf8") as agg_file:
         agg_report = json.load(agg_file)
     if "version" in agg_report:
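
The heart of the speed-up is the `Pool.imap_unordered()` loop above: worker processes run `analyze_pdf_file()` concurrently, results are consumed as they complete, and `tqdm` (when installed) wraps the iterator to display a progress bar, while the `lambda` fallback simply hands the iterable back untouched. A minimal self-contained sketch of the same pattern, with a hypothetical `slow_check()` worker standing in for the real checker:

from multiprocessing import Pool, cpu_count

try:  # optional dependency: show a progress bar when tqdm is installed
    from tqdm import tqdm
except ImportError:  # no-op fallback that just returns the iterable
    tqdm = lambda iterable, total: iterable


def slow_check(filepath):
    # Stand-in for analyze_pdf_file(): it must return its input alongside
    # the result, because imap_unordered() yields in completion order.
    return filepath, {"errors": []}


if __name__ == "__main__":  # guard required by the multiprocessing "spawn" start method
    filepaths = [f"test/doc{n}.pdf" for n in range(10)]  # hypothetical inputs
    reports = {}
    with Pool(cpu_count()) as pool:
        for filepath, report in tqdm(
            pool.imap_unordered(slow_check, filepaths), total=len(filepaths)
        ):
            reports[filepath] = report
    print(len(reports), "reports collected")

`scantree()` is a small recursive `os.scandir()` helper; `pathlib.Path("test").rglob("*.pdf")` would be a rough one-line equivalent.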
scripts/pdfchecker.py (44 changes: 17 additions & 27 deletions)

@@ -4,36 +4,35 @@
 # Purpose of this script:
 # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample
 # * aggregate all checks performed in a concise summary
+# * parallelize the execution of this analysis on all PDF files
 # * allow to ignore some errors considered harmless, listed in pdfchecker-ignore.json
 
-# USAGE: ./pdfchecker.py [$pdf_filepath]
+# USAGE: ./pdfchecker.py [$pdf_filepath|--process-all-test-pdf-files|--print-aggregated-report]
 
 import sys
 from subprocess import check_output
 
-from scripts.checker_commons import aggregate, print_aggregated_report
+from scripts.checker_commons import main
 
-AGGREGATED_REPORT_FILEPATH = "pdfchecker-aggregated.json"
-IGNORE_WHITELIST_FILEPATH = "scripts/pdfchecker-ignore.json"
 CHECKS_DETAILS_URL = "https://dev.datalogics.com/pdf-checker/the-json-profile-file/description-of-json-profile-parameters/"
 UNPROCESSABLE_PDF_ERROR_LINE = "Unable to process document due to PDF Error"
 CHECKER_SUMMARY_END_LINE = "<<=CHECKER_SUMMARY_END=>>"
 
 
 def analyze_pdf_file(pdf_filepath):
-    output = check_output(
-        [
-            "PDF_Checker/pdfchecker",
-            "--profile",
-            "PDF_Checker/CheckerProfiles/everything.json",
-            "--input",
-            pdf_filepath,
-            "--password",
-            "fpdf2",
-        ]
-    ).decode()
-    report = parse_output(output)
-    aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
+    command = [
+        "PDF_Checker/pdfchecker",
+        "--profile",
+        "PDF_Checker/CheckerProfiles/everything.json",
+        "--input",
+        pdf_filepath,
+        "--password",
+        "fpdf2",
+    ]
+    # print(" ".join(command))
+    output = check_output(command).decode()
+    # print(output)
+    return pdf_filepath, parse_output(output)
 
 
 def parse_output(output):
@@ -106,13 +105,4 @@ def insert_indented(lines, node=None, depth=0, indent=0):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print_aggregated_report(
-            AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH
-        )
-    elif len(sys.argv) > 2:
-        print(sys.argv, file=sys.stderr)
-        print("Exactly one argument must be passed to pdfchecker.py", file=sys.stderr)
-        sys.exit(2)
-    else:
-        analyze_pdf_file(sys.argv[1])
+    main("pdfchecker", analyze_pdf_file, sys.argv, CHECKS_DETAILS_URL)
scripts/verapdf.py (28 changes: 10 additions & 18 deletions)

@@ -4,17 +4,16 @@
 # Purpose of this script:
 # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample
 # * aggregate all checks performed in a concise summary
+# * parallelize the execution of this analysis on all PDF files
 # * allow to ignore some errors considered harmless, listed in verapdf-ignore.json
 
-# USAGE: ./verapdf.py [$pdf_filepath]
+# USAGE: ./verapdf.py [$pdf_filepath|--process-all-test-pdf-files|--print-aggregated-report]
 
 import sys
-from subprocess import PIPE, run
+from subprocess import run, DEVNULL, PIPE
 
-from scripts.checker_commons import aggregate, print_aggregated_report
+from scripts.checker_commons import main, HIDE_STDERR
 
-AGGREGATED_REPORT_FILEPATH = "verapdf-aggregated.json"
-IGNORE_WHITELIST_FILEPATH = "scripts/verapdf-ignore.json"
 CHECKS_DETAILS_URL = "https://docs.verapdf.org/validation/"
 BAT_EXT = ".bat" if sys.platform in ("cygwin", "win32") else ""
 
@@ -28,9 +27,11 @@ def analyze_pdf_file(pdf_filepath):
         pdf_filepath,
     ]
     # print(" ".join(command))
-    output = run(command, check=False, stdout=PIPE).stdout.decode()
-    report = parse_output(output)
-    aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
+    output = run(
+        command, stdout=PIPE, stderr=DEVNULL if HIDE_STDERR else None
+    ).stdout.decode()
+    # print(output)
+    return pdf_filepath, parse_output(output)
 
 
 def parse_output(output):
@@ -46,13 +47,4 @@ def parse_output(output):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print_aggregated_report(
-            AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH
-        )
-    elif len(sys.argv) > 2:
-        print(sys.argv, file=sys.stderr)
-        print("Exactly one argument must be passed to verapdf.py", file=sys.stderr)
-        sys.exit(2)
-    else:
-        analyze_pdf_file(sys.argv[1])
+    main("verapdf", analyze_pdf_file, sys.argv, CHECKS_DETAILS_URL)
