From 6559eca58dd8cc3d3c750596f39f169f09c19cb1 Mon Sep 17 00:00:00 2001
From: Alex
Date: Thu, 25 Apr 2024 11:23:39 +0200
Subject: [PATCH] Windows specific prefix check (magick)

On Windows, invoke ImageMagick through the "magick" launcher
("magick convert ..."); on every other platform keep calling "convert"
directly. The dependency check looks up whichever executable is used.

Also normalises string quoting in the touched code and corrects the
assertion messages for the 'f' and 'l' area keys, which both reported
"Area r details missing".
---
Review notes:
- Windows is detected with platform.system() == "Windows".
  platform.platform() returns strings such as "Windows-10-..." and never
  starts with "win32", so a startswith("win32") test on it cannot match.
- The command prefix is a list, so the non-Windows command line is
  "convert -units ..." and not "convert convert -units ...".
- This patch was re-flowed from a whitespace-collapsed copy; context-line
  indentation should be checked against the target file before applying.
  The blob id on the "index" line predates these corrections.

 src/invoice2data/input/tesseract.py | 60 +++++++++++++++++------------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
index 7bea12dd..4e245a71 100644
--- a/src/invoice2data/input/tesseract.py
+++ b/src/invoice2data/input/tesseract.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-
+import platform
 import shutil
 import tempfile
 import mimetypes
@@ -32,17 +32,21 @@ def to_text(path: str, area_details: dict = None):
     """
 
     # Check for dependencies. Needs Tesseract and Imagemagick installed.
+    # On Windows go through the "magick" launcher; a bare "convert" there
+    # may resolve to the unrelated system convert.exe.
+    if platform.system() == "Windows":
+        convert_command = ["magick", "convert"]
+    else:
+        convert_command = ["convert"]
     if not shutil.which("tesseract"):
         raise EnvironmentError("tesseract not installed.")
-    if not shutil.which("convert"):
+    if not shutil.which(convert_command[0]):
         raise EnvironmentError("imagemagick not installed.")
 
     language = get_languages()
     logger.debug("tesseract language arg is, %s", language)
     timeout = 180
-
     # convert the (multi-page) pdf file to a 300dpi png
-    convert = [
-        "convert",
+    convert = convert_command + [
         "-units",
         "PixelsPerInch",
@@ -90,7 +94,7 @@ def to_text(path: str, area_details: dict = None):
         tess_input,
         TMP_FOLDER + filename,
         "pdf",
-        "txt"
+        "txt",
     ]
 
     logger.debug("Calling tesseract with args, %s", tess_cmd)
@@ -112,24 +116,31 @@ def to_text(path: str, area_details: dict = None):
     if area_details is not None:
         # An area was specified
         # Validate the required keys were provided
-        assert 'f' in area_details, 'Area r details missing'
-        assert 'l' in area_details, 'Area r details missing'
-        assert 'r' in area_details, 'Area r details missing'
-        assert 'x' in area_details, 'Area x details missing'
-        assert 'y' in area_details, 'Area y details missing'
-        assert 'W' in area_details, 'Area W details missing'
-        assert 'H' in area_details, 'Area H details missing'
+        assert "f" in area_details, "Area f details missing"
+        assert "l" in area_details, "Area l details missing"
+        assert "r" in area_details, "Area r details missing"
+        assert "x" in area_details, "Area x details missing"
+        assert "y" in area_details, "Area y details missing"
+        assert "W" in area_details, "Area W details missing"
+        assert "H" in area_details, "Area H details missing"
         # Convert all of the values to strings
         for key in area_details.keys():
             area_details[key] = str(area_details[key])
         pdftotext_cmd += [
-            '-f', area_details['f'],
-            '-l', area_details['l'],
-            '-r', area_details['r'],
-            '-x', area_details['x'],
-            '-y', area_details['y'],
-            '-W', area_details['W'],
-            '-H', area_details['H'],
+            "-f",
+            area_details["f"],
+            "-l",
+            area_details["l"],
+            "-r",
+            area_details["r"],
+            "-x",
+            area_details["x"],
+            "-y",
+            area_details["y"],
+            "-W",
+            area_details["W"],
+            "-H",
+            area_details["H"],
         ]
     pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]
 
@@ -142,7 +153,7 @@ def to_text(path: str, area_details: dict = None):
     except TimeoutExpired:
         p3.kill()
         logger.warning("pdftotext took too long - skipping")
-    return extracted_str.decode('utf-8')
+    return extracted_str.decode("utf-8")
 
 
 def get_languages():
@@ -154,8 +165,9 @@ def lang_error(output):
             "-----------\n"
         )
         return
+
     logger.debug("get lang called")
-    args_tess = ['tesseract', '--list-langs']
+    args_tess = ["tesseract", "--list-langs"]
     try:
         proc = run(
             args_tess,
@@ -169,8 +181,8 @@ def lang_error(output):
         raise EnvironmentError(lang_error(e.output)) from e
 
     for line in output.splitlines():
-        if line.startswith('Error'):
+        if line.startswith("Error"):
             raise EnvironmentError(lang_error(output))
     _header, *rest = output.splitlines()
     langlist = {lang.strip() for lang in rest}
-    return '+'.join(map(str, langlist))
+    return "+".join(map(str, langlist))