Skip to content

Commit

Permalink
Windows specific prefix check (magick)
Browse files: browse the repository at this point in the history
  • Loading branch information
alexm96 committed Apr 25, 2024
1 parent 925953f commit 6559eca
Showing 1 changed file with 35 additions and 23 deletions.
58 changes: 35 additions & 23 deletions src/invoice2data/input/tesseract.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-

import platform
import shutil
import tempfile
import mimetypes
Expand Down Expand Up @@ -32,17 +32,21 @@ def to_text(path: str, area_details: dict = None):
"""

# Check for dependencies. Needs Tesseract and ImageMagick installed.
current_platform = platform.platform()
if current_platform.startswith("win32"):
convert_command_prefix = "magick"
else:
convert_command_prefix = "convert"
if not shutil.which("tesseract"):
raise EnvironmentError("tesseract not installed.")
if not shutil.which("convert"):
if not shutil.which(convert_command_prefix):
raise EnvironmentError("imagemagick not installed.")

language = get_languages()
logger.debug("tesseract language arg is, %s", language)
timeout = 180

# convert the (multi-page) pdf file to a 300dpi png
convert = [
convert = [convert_command_prefix] + [
"convert",
"-units",
"PixelsPerInch",
Expand Down Expand Up @@ -90,7 +94,7 @@ def to_text(path: str, area_details: dict = None):
tess_input,
TMP_FOLDER + filename,
"pdf",
"txt"
"txt",
]

logger.debug("Calling tesseract with args, %s", tess_cmd)
Expand All @@ -112,24 +116,31 @@ def to_text(path: str, area_details: dict = None):
if area_details is not None:
# An area was specified
# Validate the required keys were provided
assert 'f' in area_details, 'Area r details missing'
assert 'l' in area_details, 'Area r details missing'
assert 'r' in area_details, 'Area r details missing'
assert 'x' in area_details, 'Area x details missing'
assert 'y' in area_details, 'Area y details missing'
assert 'W' in area_details, 'Area W details missing'
assert 'H' in area_details, 'Area H details missing'
assert "f" in area_details, "Area r details missing"
assert "l" in area_details, "Area r details missing"
assert "r" in area_details, "Area r details missing"
assert "x" in area_details, "Area x details missing"
assert "y" in area_details, "Area y details missing"
assert "W" in area_details, "Area W details missing"
assert "H" in area_details, "Area H details missing"
# Convert all of the values to strings
for key in area_details.keys():
area_details[key] = str(area_details[key])
pdftotext_cmd += [
'-f', area_details['f'],
'-l', area_details['l'],
'-r', area_details['r'],
'-x', area_details['x'],
'-y', area_details['y'],
'-W', area_details['W'],
'-H', area_details['H'],
"-f",
area_details["f"],
"-l",
area_details["l"],
"-r",
area_details["r"],
"-x",
area_details["x"],
"-y",
area_details["y"],
"-W",
area_details["W"],
"-H",
area_details["H"],
]
pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]

Expand All @@ -142,7 +153,7 @@ def to_text(path: str, area_details: dict = None):
except TimeoutExpired:
p3.kill()
logger.warning("pdftotext took too long - skipping")
return extracted_str.decode('utf-8')
return extracted_str.decode("utf-8")


def get_languages():
Expand All @@ -154,8 +165,9 @@ def lang_error(output):
"-----------\n"
)
return

logger.debug("get lang called")
args_tess = ['tesseract', '--list-langs']
args_tess = ["tesseract", "--list-langs"]
try:
proc = run(
args_tess,
Expand All @@ -169,8 +181,8 @@ def lang_error(output):
raise EnvironmentError(lang_error(e.output)) from e

for line in output.splitlines():
if line.startswith('Error'):
if line.startswith("Error"):
raise EnvironmentError(lang_error(output))
_header, *rest = output.splitlines()
langlist = {lang.strip() for lang in rest}
return '+'.join(map(str, langlist))
return "+".join(map(str, langlist))

0 comments on commit 6559eca

Please sign in to comment.