From 6559eca58dd8cc3d3c750596f39f169f09c19cb1 Mon Sep 17 00:00:00 2001
From: Alex <alex@alexandermartynek.dev>
Date: Thu, 25 Apr 2024 11:23:39 +0200
Subject: [PATCH] Windows-specific prefix check (magick)

---
 src/invoice2data/input/tesseract.py | 58 +++++++++++++++++------------
 1 file changed, 35 insertions(+), 23 deletions(-)

diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
index 7bea12dd..4e245a71 100644
--- a/src/invoice2data/input/tesseract.py
+++ b/src/invoice2data/input/tesseract.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-
+import platform
 import shutil
 import tempfile
 import mimetypes
@@ -32,17 +32,21 @@ def to_text(path: str, area_details: dict = None):
     """
 
     # Check for dependencies. Needs Tesseract and Imagemagick installed.
+    # ImageMagick is invoked as "magick convert ..." on Windows and as plain
+    # "convert ..." everywhere else, so only Windows gets the extra prefix.
+    # platform.system() reports "Windows" there; platform.platform() does not.
+    on_windows = platform.system() == "Windows"
+    imagemagick_executable = "magick" if on_windows else "convert"
     if not shutil.which("tesseract"):
         raise EnvironmentError("tesseract not installed.")
-    if not shutil.which("convert"):
+    if not shutil.which(imagemagick_executable):
         raise EnvironmentError("imagemagick not installed.")
 
     language = get_languages()
     logger.debug("tesseract language arg is, %s", language)
     timeout = 180
-
     # convert the (multi-page) pdf file to a 300dpi png
-    convert = [
+    convert = (["magick"] if on_windows else []) + [
         "convert",
         "-units",
         "PixelsPerInch",
@@ -90,7 +94,7 @@ def to_text(path: str, area_details: dict = None):
         tess_input,
         TMP_FOLDER + filename,
         "pdf",
-        "txt"
+        "txt",
     ]
 
     logger.debug("Calling tesseract with args, %s", tess_cmd)
@@ -112,24 +116,31 @@ def to_text(path: str, area_details: dict = None):
     if area_details is not None:
         # An area was specified
         # Validate the required keys were provided
-        assert 'f' in area_details, 'Area r details missing'
-        assert 'l' in area_details, 'Area r details missing'
-        assert 'r' in area_details, 'Area r details missing'
-        assert 'x' in area_details, 'Area x details missing'
-        assert 'y' in area_details, 'Area y details missing'
-        assert 'W' in area_details, 'Area W details missing'
-        assert 'H' in area_details, 'Area H details missing'
+        assert "f" in area_details, "Area f details missing"
+        assert "l" in area_details, "Area l details missing"
+        assert "r" in area_details, "Area r details missing"
+        assert "x" in area_details, "Area x details missing"
+        assert "y" in area_details, "Area y details missing"
+        assert "W" in area_details, "Area W details missing"
+        assert "H" in area_details, "Area H details missing"
         # Convert all of the values to strings
         for key in area_details.keys():
             area_details[key] = str(area_details[key])
         pdftotext_cmd += [
-            '-f', area_details['f'],
-            '-l', area_details['l'],
-            '-r', area_details['r'],
-            '-x', area_details['x'],
-            '-y', area_details['y'],
-            '-W', area_details['W'],
-            '-H', area_details['H'],
+            "-f",
+            area_details["f"],
+            "-l",
+            area_details["l"],
+            "-r",
+            area_details["r"],
+            "-x",
+            area_details["x"],
+            "-y",
+            area_details["y"],
+            "-W",
+            area_details["W"],
+            "-H",
+            area_details["H"],
         ]
     pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]
 
@@ -142,7 +153,7 @@ def to_text(path: str, area_details: dict = None):
     except TimeoutExpired:
         p3.kill()
         logger.warning("pdftotext took too long - skipping")
-    return extracted_str.decode('utf-8')
+    return extracted_str.decode("utf-8")
 
 
 def get_languages():
@@ -154,8 +165,9 @@ def lang_error(output):
             "-----------\n"
         )
         return
+
     logger.debug("get lang called")
-    args_tess = ['tesseract', '--list-langs']
+    args_tess = ["tesseract", "--list-langs"]
     try:
         proc = run(
             args_tess,
@@ -169,8 +181,8 @@ def lang_error(output):
         raise EnvironmentError(lang_error(e.output)) from e
 
     for line in output.splitlines():
-        if line.startswith('Error'):
+        if line.startswith("Error"):
             raise EnvironmentError(lang_error(output))
     _header, *rest = output.splitlines()
     langlist = {lang.strip() for lang in rest}
-    return '+'.join(map(str, langlist))
+    return "+".join(map(str, langlist))