diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml
index 6c968b4fa..304de47a7 100644
--- a/.github/workflows/continuous-integration-workflow.yml
+++ b/.github/workflows/continuous-integration-workflow.yml
@@ -50,12 +50,12 @@ jobs:
           export PYTHONPATH=$PWD
           # Using Datalogics PDF Checker:
           scripts/install-pdfchecker.sh
-          find . -name '*.pdf' | xargs -n 1 scripts/pdfchecker.py
-          scripts/pdfchecker.py  # printing aggregated report
+          scripts/pdfchecker.py --process-all-test-pdf-files
+          scripts/pdfchecker.py --print-aggregated-report
           # Using VeraPDF:
           scripts/install-verapdf.sh
-          find . -name '*.pdf' | xargs -n 1 scripts/verapdf.py
-          scripts/verapdf.py  # printing aggregated report
+          scripts/verapdf.py --process-all-test-pdf-files
+          scripts/verapdf.py --print-aggregated-report
       - name: Running tests ☑
         env:
           CHECK_EXEC_TIME: ${{ matrix.python-version == '3.9' && 'test-enabled' || '' }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9e2ace6b..f31808840 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,7 +29,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 - [`FPDF.image()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.image): allowing images path starting with `data` to be passed as input
 - text overflow is better handled by `FPDF.write()` & `FPDF.write_html()` - _cf._ [issue #847](https://github.com/PyFPDF/fpdf2/issues/847)
 - the initial text color is preserved when using `FPDF.write_html()` - _cf._ [issue #846](https://github.com/PyFPDF/fpdf2/issues/846)
-- handle superscript and subscript correctly when rendering `TextLine`- [Pull Request #862](https://github.com/PyFPDF/fpdf2/pull/862)
+- handle superscript and subscript correctly when rendering `TextLine`- thanks to @Tolker-KU - _cf._ [Pull Request #862](https://github.com/PyFPDF/fpdf2/pull/862)
 
 ### Deprecated
 - the `center` optional parameter of [`FPDF.cell()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.cell) is **no more** deprecated, as it allows for horizontal positioning, which is different from text alignment control with `align="C"`
diff --git a/fpdf/html.py b/fpdf/html.py
index 736705467..83b4f199d 100644
--- a/fpdf/html.py
+++ b/fpdf/html.py
@@ -266,9 +266,9 @@ def handle_data(self, data):
         if not data:
             return
         if "inserted" in self.td_th:
-            tag = self.td_th["tag"]
+            td_th_tag = self.td_th["tag"]
             raise NotImplementedError(
-                f"Unsupported nested HTML tags inside <{tag}> element"
+                f"Unsupported nested HTML tags inside <{td_th_tag}> element: <{self._tags_stack[-1]}>"
             )
         # We could potentially support nested <b> / <i> / <u> tags
         # by building a list of Fragment instances from the HTML cell content
diff --git a/scripts/checker_commons.py b/scripts/checker_commons.py
index e4ca73b2f..7b6bdaf95 100644
--- a/scripts/checker_commons.py
+++ b/scripts/checker_commons.py
@@ -1,8 +1,64 @@
-import json, sys
+import json, os, sys
 from collections import defaultdict
+from multiprocessing import cpu_count, Pool
 
+try:  # optional dependency to display a progress bar
+    from tqdm import tqdm
 
-def aggregate(pdf_filepath, report, aggregated_report_filepath):
+    HIDE_STDERR = True
+except ImportError:
+    tqdm = lambda _, total: _
+    HIDE_STDERR = False
+
+
+def main(checker_name, analyze_pdf_file, argv, checks_details_url):
+    if len(argv) != 2:
+        print(argv, file=sys.stderr)
+        print(
+            f"Exactly one argument must be passed to {checker_name}.py", file=sys.stderr
+        )
+        sys.exit(2)
+    elif argv[1] == "--print-aggregated-report":
+        print_aggregated_report(checker_name, checks_details_url)
+    elif argv[1] == "--process-all-test-pdf-files":
+        process_all_test_pdf_files(checker_name, analyze_pdf_file)
+    else:
+        print(analyze_pdf_file(argv[1]))
+
+
+def process_all_test_pdf_files(checker_name, analyze_pdf_file):
+    pdf_filepaths = [
+        entry.path
+        for entry in scantree("test")
+        if entry.is_file() and entry.name.endswith(".pdf")
+    ]
+    print(
+        f"Starting parallel execution of {checker_name} on {len(pdf_filepaths)} PDF files with {cpu_count()} CPUs"
+    )
+    with Pool(cpu_count()) as pool:
+        reports_per_pdf_filepath = {}
+        for pdf_filepath, report in tqdm(
+            pool.imap_unordered(analyze_pdf_file, pdf_filepaths),
+            total=len(pdf_filepaths),
+        ):
+            reports_per_pdf_filepath[pdf_filepath] = report
+    agg_report = aggregate(checker_name, reports_per_pdf_filepath)
+    print(
+        "Failures:", len(agg_report["failures"]), "Errors:", len(agg_report["errors"])
+    )
+
+
+def scantree(path):
+    """Recursively yield DirEntry objects for given directory."""
+    for entry in os.scandir(path):
+        if entry.is_dir():
+            yield from scantree(entry.path)
+        else:
+            yield entry
+
+
+def aggregate(checker_name, reports_per_pdf_filepath):
+    aggregated_report_filepath = f"{checker_name}-aggregated.json"
     agg_report = {
         "failures": defaultdict(list),
         "errors": defaultdict(list),
@@ -14,21 +70,23 @@ def aggregate(pdf_filepath, report, aggregated_report_filepath):
         agg_report["errors"].update(prev_agg_report["errors"])
     except FileNotFoundError:
         print("Initializing a new JSON file for the aggregated report")
+    report = list(reports_per_pdf_filepath.items())[0][1]
     if "version" in report:
         agg_report["version"] = report.pop("version")
-    if "failure" in report:
-        failure = report["failure"]
-        agg_report["failures"][failure].append(pdf_filepath)
-    else:
-        for error in report.get("errors", []):
-            agg_report["errors"][error].append(pdf_filepath)
+    for pdf_filepath, report in reports_per_pdf_filepath.items():
+        if "failure" in report:
+            agg_report["failures"][report["failure"]].append(pdf_filepath)
+        else:
+            for error in report.get("errors", ()):
+                agg_report["errors"][error].append(pdf_filepath)
     with open(aggregated_report_filepath, "w", encoding="utf8") as agg_file:
-        json.dump(agg_report, agg_file)
+        json.dump(agg_report, agg_file, indent=4)
+    return agg_report
 
 
-def print_aggregated_report(
-    aggregated_report_filepath, checks_details_url, ignore_whitelist_filepath
-):
+def print_aggregated_report(checker_name, checks_details_url):
+    aggregated_report_filepath = f"{checker_name}-aggregated.json"
+    ignore_whitelist_filepath = f"scripts/{checker_name}-ignore.json"
     with open(aggregated_report_filepath, encoding="utf8") as agg_file:
         agg_report = json.load(agg_file)
     if "version" in agg_report:
diff --git a/scripts/pdfchecker.py b/scripts/pdfchecker.py
index 1a3d06c36..e422380ef 100755
--- a/scripts/pdfchecker.py
+++ b/scripts/pdfchecker.py
@@ -4,36 +4,35 @@
 # Purpose of this script:
 # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample
 # * aggregate all checks performed in a concise summary
+# * parallelize the execution of this analysis on all PDF files
 # * allow to ignore some errors considered harmless, listed in pdfchecker-ignore.json
-# USAGE: ./pdfchecker.py [$pdf_filepath]
+# USAGE: ./pdfchecker.py [$pdf_filepath|--process-all-test-pdf-files|--print-aggregated-report]
 
 import sys
 from subprocess import check_output
 
-from scripts.checker_commons import aggregate, print_aggregated_report
+from scripts.checker_commons import main
 
-AGGREGATED_REPORT_FILEPATH = "pdfchecker-aggregated.json"
-IGNORE_WHITELIST_FILEPATH = "scripts/pdfchecker-ignore.json"
 CHECKS_DETAILS_URL = "https://dev.datalogics.com/pdf-checker/the-json-profile-file/description-of-json-profile-parameters/"
 UNPROCESSABLE_PDF_ERROR_LINE = "Unable to process document due to PDF Error"
 CHECKER_SUMMARY_END_LINE = "<<=CHECKER_SUMMARY_END=>>"
 
 
 def analyze_pdf_file(pdf_filepath):
-    output = check_output(
-        [
-            "PDF_Checker/pdfchecker",
-            "--profile",
-            "PDF_Checker/CheckerProfiles/everything.json",
-            "--input",
-            pdf_filepath,
-            "--password",
-            "fpdf2",
-        ]
-    ).decode()
-    report = parse_output(output)
-    aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
+    command = [
+        "PDF_Checker/pdfchecker",
+        "--profile",
+        "PDF_Checker/CheckerProfiles/everything.json",
+        "--input",
+        pdf_filepath,
+        "--password",
+        "fpdf2",
+    ]
+    # print(" ".join(command))
+    output = check_output(command).decode()
+    # print(output)
+    return pdf_filepath, parse_output(output)
 
 
 def parse_output(output):
@@ -106,13 +105,4 @@ def insert_indented(lines, node=None, depth=0, indent=0):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print_aggregated_report(
-            AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH
-        )
-    elif len(sys.argv) > 2:
-        print(sys.argv, file=sys.stderr)
-        print("Exactly one argument must be passed to pdfchecker.py", file=sys.stderr)
-        sys.exit(2)
-    else:
-        analyze_pdf_file(sys.argv[1])
+    main("pdfchecker", analyze_pdf_file, sys.argv, CHECKS_DETAILS_URL)
diff --git a/scripts/verapdf.py b/scripts/verapdf.py
index 73fcb5e66..1c72b9664 100755
--- a/scripts/verapdf.py
+++ b/scripts/verapdf.py
@@ -4,17 +4,16 @@
 # Purpose of this script:
 # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample
 # * aggregate all checks performed in a concise summary
+# * parallelize the execution of this analysis on all PDF files
 # * allow to ignore some errors considered harmless, listed in verapdf-ignore.json
-# USAGE: ./verapdf.py [$pdf_filepath]
+# USAGE: ./verapdf.py [$pdf_filepath|--process-all-test-pdf-files|--print-aggregated-report]
 
 import sys
-from subprocess import PIPE, run
+from subprocess import run, DEVNULL, PIPE
 
-from scripts.checker_commons import aggregate, print_aggregated_report
+from scripts.checker_commons import main, HIDE_STDERR
 
-AGGREGATED_REPORT_FILEPATH = "verapdf-aggregated.json"
-IGNORE_WHITELIST_FILEPATH = "scripts/verapdf-ignore.json"
 CHECKS_DETAILS_URL = "https://docs.verapdf.org/validation/"
 BAT_EXT = ".bat" if sys.platform in ("cygwin", "win32") else ""
 
@@ -28,9 +27,11 @@ def analyze_pdf_file(pdf_filepath):
         pdf_filepath,
     ]
     # print(" ".join(command))
-    output = run(command, check=False, stdout=PIPE).stdout.decode()
-    report = parse_output(output)
-    aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
+    output = run(
+        command, stdout=PIPE, stderr=DEVNULL if HIDE_STDERR else None
+    ).stdout.decode()
+    # print(output)
+    return pdf_filepath, parse_output(output)
 
 
 def parse_output(output):
@@ -46,13 +47,4 @@ def parse_output(output):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print_aggregated_report(
-            AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH
-        )
-    elif len(sys.argv) > 2:
-        print(sys.argv, file=sys.stderr)
-        print("Exactly one argument must be passed to verapdf.py", file=sys.stderr)
-        sys.exit(2)
-    else:
-        analyze_pdf_file(sys.argv[1])
+    main("verapdf", analyze_pdf_file, sys.argv, CHECKS_DETAILS_URL)
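
Implementation note: process_all_test_pdf_files() relies on each analyze_pdf_file() returning a (pdf_filepath, report) tuple, so that results coming back out of order from Pool.imap_unordered() can still be matched to the file they belong to. Below is a minimal standalone sketch of that pattern; the analyze() body and the file names are placeholders, not part of fpdf2:

from multiprocessing import Pool, cpu_count

try:  # same optional-dependency fallback as in checker_commons.py
    from tqdm import tqdm
except ImportError:
    tqdm = lambda iterable, total: iterable  # no-op stand-in for the progress bar


def analyze(filepath):
    # Placeholder for analyze_pdf_file(): run a checker, parse its output,
    # and return the input path alongside the parsed report.
    return filepath, {"errors": []}


if __name__ == "__main__":  # guard required where workers are spawned, e.g. Windows
    filepaths = [f"doc{i}.pdf" for i in range(16)]  # hypothetical inputs
    reports = {}
    with Pool(cpu_count()) as pool:
        # imap_unordered yields results as soon as any worker finishes;
        # the (key, result) tuples let us rebuild a dict despite the ordering.
        for filepath, report in tqdm(
            pool.imap_unordered(analyze, filepaths), total=len(filepaths)
        ):
            reports[filepath] = report
    print(f"{len(reports)} reports collected")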
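
For reference, the aggregated report that aggregate() now dumps with indent=4 maps each failure or error label to the list of PDF files that triggered it, plus an optional checker version. Roughly, with all entries below hypothetical:

{
    "failures": {"Unable to process document due to PDF Error": ["test/some.pdf"]},
    "errors": {"some-check-name": ["test/a.pdf", "test/b.pdf"]},
    "version": "1.2.3"
}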