Commit 892d58a

Making CI pipeline faster by parallelizing the PDF checkers execution

Lucas-C committed Jul 26, 2023
1 parent 54d2eb0 · commit 892d58a
Showing 6 changed files with 104 additions and 64 deletions.
.github/workflows/continuous-integration-workflow.yml (8 changes: 4 additions & 4 deletions)

@@ -50,12 +50,12 @@ jobs:
           export PYTHONPATH=$PWD
           # Using Datalogics PDF Checker:
           scripts/install-pdfchecker.sh
-          find . -name '*.pdf' | xargs -n 1 scripts/pdfchecker.py
-          scripts/pdfchecker.py  # printing aggregated report
+          scripts/pdfchecker.py --process-all-test-pdf-files
+          scripts/pdfchecker.py --print-aggregated-report
           # Using VeraPDF:
           scripts/install-verapdf.sh
-          find . -name '*.pdf' | xargs -n 1 scripts/verapdf.py
-          scripts/verapdf.py  # printing aggregated report
+          scripts/verapdf.py --process-all-test-pdf-files
+          scripts/verapdf.py --print-aggregated-report
       - name: Running tests ☑
         env:
           CHECK_EXEC_TIME: ${{ matrix.python-version == '3.9' && 'test-enabled' || '' }}
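
Replacing the `find … | xargs -n 1` pipeline matters because that form launched a fresh Python interpreter (plus checker start-up) for every single PDF file. The new `--process-all-test-pdf-files` flag makes one process scan the test suite itself and spread the checks over all available CPUs, as shown in the `multiprocessing.Pool` sketch after the `scripts/checker_commons.py` diff below, while `--print-aggregated-report` takes over the role of the previous bare, argument-less call.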
CHANGELOG.md (2 changes: 1 addition & 1 deletion)

@@ -29,7 +29,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 - [`FPDF.image()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.image): allowing images path starting with `data` to be passed as input
 - text overflow is better handled by `FPDF.write()` & `FPDF.write_html()` - _cf._ [issue #847](https://github.com/PyFPDF/fpdf2/issues/847)
 - the initial text color is preserved when using `FPDF.write_html()` - _cf._ [issue #846](https://github.com/PyFPDF/fpdf2/issues/846)
-- handle superscript and subscript correctly when rendering `TextLine`- [Pull Request #862](https://github.com/PyFPDF/fpdf2/pull/862)
+- handle superscript and subscript correctly when rendering `TextLine`- thanks to @Tolker-KU - _cf._ [Pull Request #862](https://github.com/PyFPDF/fpdf2/pull/862)
 ### Deprecated
 - the `center` optional parameter of [`FPDF.cell()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.cell) is **no more** deprecated, as it allows for horizontal positioning, which is different from text alignment control with `align="C"`
fpdf/html.py (4 changes: 2 additions & 2 deletions)

@@ -266,9 +266,9 @@ def handle_data(self, data):
         if not data:
             return
         if "inserted" in self.td_th:
-            tag = self.td_th["tag"]
+            td_th_tag = self.td_th["tag"]
             raise NotImplementedError(
-                f"Unsupported nested HTML tags inside <{tag}> element"
+                f"Unsupported nested HTML tags inside <{td_th_tag}> element: <{self._tags_stack[-1]}>"
             )
         # We could potentially support nested <b> / <em> / <font> tags
         # by building a list of Fragment instances from the HTML cell content
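
For context, this error is raised by `FPDF.write_html()` when formatting tags are nested inside a table cell, which was unsupported at this point; the added placeholder now also names the offending nested tag. A hypothetical trigger, assuming a recent-enough fpdf2 where `write_html()` is a plain `FPDF` method (exact markup requirements and message wording vary across versions):

from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
try:
    # nested <b> inside <td> was not supported at this point
    pdf.write_html("<table><tr><td><b>bold cell</b></td></tr></table>")
except NotImplementedError as error:
    print(error)  # e.g. "Unsupported nested HTML tags inside <td> element: <b>"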
scripts/checker_commons.py (82 changes: 70 additions & 12 deletions)

@@ -1,8 +1,64 @@
-import json, sys
+import json, os, sys
 from collections import defaultdict
+from multiprocessing import cpu_count, Pool
 
+try:  # optional dependency to display a progress bar
+    from tqdm import tqdm
+
+    HIDE_STDERR = True
+except ImportError:
+    tqdm = lambda _, total: _
+    HIDE_STDERR = False
 
 
-def aggregate(pdf_filepath, report, aggregated_report_filepath):
+def main(checker_name, analyze_pdf_file, argv, checks_details_url):
+    if len(argv) != 2:
+        print(argv, file=sys.stderr)
+        print(
+            f"Exactly one argument must be passed to {checker_name}.py", file=sys.stderr
+        )
+        sys.exit(2)
+    elif argv[1] == "--print-aggregated-report":
+        print_aggregated_report(checker_name, checks_details_url)
+    elif argv[1] == "--process-all-test-pdf-files":
+        process_all_test_pdf_files(checker_name, analyze_pdf_file)
+    else:
+        print(analyze_pdf_file(argv[1]))
+
+
+def process_all_test_pdf_files(checker_name, analyze_pdf_file):
+    pdf_filepaths = [
+        entry.path
+        for entry in scantree("test")
+        if entry.is_file() and entry.name.endswith(".pdf")
+    ]
+    print(
+        f"Starting parallel execution of {checker_name} on {len(pdf_filepaths)} PDF files with {cpu_count()} CPUs"
+    )
+    with Pool(cpu_count()) as pool:
+        reports_per_pdf_filepath = {}
+        for pdf_filepath, report in tqdm(
+            pool.imap_unordered(analyze_pdf_file, pdf_filepaths),
+            total=len(pdf_filepaths),
+        ):
+            reports_per_pdf_filepath[pdf_filepath] = report
+    agg_report = aggregate(checker_name, reports_per_pdf_filepath)
+    print(
+        "Failures:", len(agg_report["failures"]), "Errors:", len(agg_report["errors"])
+    )
+
+
+def scantree(path):
+    """Recursively yield DirEntry objects for given directory."""
+    for entry in os.scandir(path):
+        if entry.is_dir():
+            yield from scantree(entry.path)
+        else:
+            yield entry
+
+
+def aggregate(checker_name, reports_per_pdf_filepath):
+    aggregated_report_filepath = f"{checker_name}-aggregated.json"
     agg_report = {
         "failures": defaultdict(list),
         "errors": defaultdict(list),
@@ -14,21 +70,23 @@ def aggregate(pdf_filepath, report, aggregated_report_filepath):
         agg_report["errors"].update(prev_agg_report["errors"])
     except FileNotFoundError:
         print("Initializing a new JSON file for the aggregated report")
+    report = list(reports_per_pdf_filepath.items())[0][1]
     if "version" in report:
         agg_report["version"] = report.pop("version")
-    if "failure" in report:
-        failure = report["failure"]
-        agg_report["failures"][failure].append(pdf_filepath)
-    else:
-        for error in report.get("errors", []):
-            agg_report["errors"][error].append(pdf_filepath)
+    for pdf_filepath, report in reports_per_pdf_filepath.items():
+        if "failure" in report:
+            agg_report["failures"][report["failure"]].append(pdf_filepath)
+        else:
+            for error in report.get("errors", ()):
+                agg_report["errors"][error].append(pdf_filepath)
     with open(aggregated_report_filepath, "w", encoding="utf8") as agg_file:
-        json.dump(agg_report, agg_file)
+        json.dump(agg_report, agg_file, indent=4)
+    return agg_report
 
 
-def print_aggregated_report(
-    aggregated_report_filepath, checks_details_url, ignore_whitelist_filepath
-):
+def print_aggregated_report(checker_name, checks_details_url):
+    aggregated_report_filepath = f"{checker_name}-aggregated.json"
+    ignore_whitelist_filepath = f"scripts/{checker_name}-ignore.json"
     with open(aggregated_report_filepath, encoding="utf8") as agg_file:
         agg_report = json.load(agg_file)
     if "version" in agg_report:
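
The heart of the speed-up is the `Pool.imap_unordered()` loop above: worker processes run `analyze_pdf_file()` concurrently, results are consumed as they complete, and `tqdm` (when installed) wraps the iterator to display a progress bar, while the `lambda` fallback simply hands the iterable back untouched. A minimal self-contained sketch of the same pattern, with a hypothetical `slow_check()` worker standing in for the real checker:

from multiprocessing import Pool, cpu_count

try:  # optional dependency: show a progress bar when tqdm is installed
    from tqdm import tqdm
except ImportError:  # no-op fallback that just returns the iterable
    tqdm = lambda iterable, total: iterable


def slow_check(filepath):
    # Stand-in for analyze_pdf_file(): it must return its input alongside
    # the result, because imap_unordered() yields in completion order.
    return filepath, {"errors": []}


if __name__ == "__main__":  # guard required by the multiprocessing "spawn" start method
    filepaths = [f"test/doc{n}.pdf" for n in range(10)]  # hypothetical inputs
    reports = {}
    with Pool(cpu_count()) as pool:
        for filepath, report in tqdm(
            pool.imap_unordered(slow_check, filepaths), total=len(filepaths)
        ):
            reports[filepath] = report
    print(len(reports), "reports collected")

`scantree()` is a small recursive `os.scandir()` helper; `pathlib.Path("test").rglob("*.pdf")` would be a rough one-line equivalent.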
scripts/pdfchecker.py (44 changes: 17 additions & 27 deletions)

@@ -4,36 +4,35 @@
 # Purpose of this script:
 # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample
 # * aggregate all checks performed in a concise summary
+# * parallelize the execution of this analysis on all PDF files
 # * allow to ignore some errors considered harmless, listed in pdfchecker-ignore.json
 
-# USAGE: ./pdfchecker.py [$pdf_filepath]
+# USAGE: ./pdfchecker.py [$pdf_filepath|--process-all-test-pdf-files|--print-aggregated-report]
 
 import sys
 from subprocess import check_output
 
-from scripts.checker_commons import aggregate, print_aggregated_report
+from scripts.checker_commons import main
 
-AGGREGATED_REPORT_FILEPATH = "pdfchecker-aggregated.json"
-IGNORE_WHITELIST_FILEPATH = "scripts/pdfchecker-ignore.json"
 CHECKS_DETAILS_URL = "https://dev.datalogics.com/pdf-checker/the-json-profile-file/description-of-json-profile-parameters/"
 UNPROCESSABLE_PDF_ERROR_LINE = "Unable to process document due to PDF Error"
 CHECKER_SUMMARY_END_LINE = "<<=CHECKER_SUMMARY_END=>>"
 
 
 def analyze_pdf_file(pdf_filepath):
-    output = check_output(
-        [
-            "PDF_Checker/pdfchecker",
-            "--profile",
-            "PDF_Checker/CheckerProfiles/everything.json",
-            "--input",
-            pdf_filepath,
-            "--password",
-            "fpdf2",
-        ]
-    ).decode()
-    report = parse_output(output)
-    aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
+    command = [
+        "PDF_Checker/pdfchecker",
+        "--profile",
+        "PDF_Checker/CheckerProfiles/everything.json",
+        "--input",
+        pdf_filepath,
+        "--password",
+        "fpdf2",
+    ]
+    # print(" ".join(command))
+    output = check_output(command).decode()
+    # print(output)
+    return pdf_filepath, parse_output(output)
 
 
 def parse_output(output):
@@ -106,13 +105,4 @@ def insert_indented(lines, node=None, depth=0, indent=0):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print_aggregated_report(
-            AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH
-        )
-    elif len(sys.argv) > 2:
-        print(sys.argv, file=sys.stderr)
-        print("Exactly one argument must be passed to pdfchecker.py", file=sys.stderr)
-        sys.exit(2)
-    else:
-        analyze_pdf_file(sys.argv[1])
+    main("pdfchecker", analyze_pdf_file, sys.argv, CHECKS_DETAILS_URL)
scripts/verapdf.py (28 changes: 10 additions & 18 deletions)

@@ -4,17 +4,16 @@
 # Purpose of this script:
 # * abort the validation pipeline with a non-zero error code if any check fails on a PDF sample
 # * aggregate all checks performed in a concise summary
+# * parallelize the execution of this analysis on all PDF files
 # * allow to ignore some errors considered harmless, listed in verapdf-ignore.json
 
-# USAGE: ./verapdf.py [$pdf_filepath]
+# USAGE: ./verapdf.py [$pdf_filepath|--process-all-test-pdf-files|--print-aggregated-report]
 
 import sys
-from subprocess import PIPE, run
+from subprocess import run, DEVNULL, PIPE
 
-from scripts.checker_commons import aggregate, print_aggregated_report
+from scripts.checker_commons import main, HIDE_STDERR
 
-AGGREGATED_REPORT_FILEPATH = "verapdf-aggregated.json"
-IGNORE_WHITELIST_FILEPATH = "scripts/verapdf-ignore.json"
 CHECKS_DETAILS_URL = "https://docs.verapdf.org/validation/"
 BAT_EXT = ".bat" if sys.platform in ("cygwin", "win32") else ""
 
@@ -28,9 +27,11 @@ def analyze_pdf_file(pdf_filepath):
         pdf_filepath,
     ]
     # print(" ".join(command))
-    output = run(command, check=False, stdout=PIPE).stdout.decode()
-    report = parse_output(output)
-    aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
+    output = run(
+        command, stdout=PIPE, stderr=DEVNULL if HIDE_STDERR else None
+    ).stdout.decode()
+    # print(output)
+    return pdf_filepath, parse_output(output)
 
 
 def parse_output(output):
@@ -46,13 +47,4 @@ def parse_output(output):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print_aggregated_report(
-            AGGREGATED_REPORT_FILEPATH, CHECKS_DETAILS_URL, IGNORE_WHITELIST_FILEPATH
-        )
-    elif len(sys.argv) > 2:
-        print(sys.argv, file=sys.stderr)
-        print("Exactly one argument must be passed to verapdf.py", file=sys.stderr)
-        sys.exit(2)
-    else:
-        analyze_pdf_file(sys.argv[1])
+    main("verapdf", analyze_pdf_file, sys.argv, CHECKS_DETAILS_URL)
