Area support for tesseract

This allows one to specify the page(s) and coordinates of the area to be extracted.
invoice-x · Feb 20, 2023 · 7f7280e · 7f7280e
1 parent 40f1f92
commit 7f7280e
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 5 deletions.
diff --git a/src/invoice2data/extract/invoice_template.py b/src/invoice2data/extract/invoice_template.py
@@ -201,8 +201,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O
                     # Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
                     optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
                     # Log the text
-                    logger.debug("START pdftotext area result ===========================")
-                    logger.debug(optimized_str_area)
+                    logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
                     logger.debug("END pdftotext area result =============================")
                     optimized_str_for_parser = optimized_str_area
                 else:

diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
@@ -13,13 +13,16 @@
 logger = logging.getLogger(__name__)
 
 
-def to_text(path):
+def to_text(path: str, area_details: dict = None):
     """Wraps Tesseract OCR with auto language model.
 
     Parameters
     ----------
     path : str
         path of electronic invoice in PDF, JPG or PNG format
+    area_details : dictionary
+        of the format {f: int, l: int, r: int, x: int, y: int, W: int, H: int}
+        used when extracting an area of the pdf rather than the whole document
 
     Returns
     -------
@@ -105,9 +108,30 @@ def to_text(path):
         "-layout",
         "-enc",
         "UTF-8",
-        TMP_FOLDER + filename + ".pdf",
-        "-",
     ]
+    if area_details is not None:
+        # An area was specified
+        # Validate the required keys were provided
+        assert 'f' in area_details, 'Area r details missing'
+        assert 'l' in area_details, 'Area r details missing'
+        assert 'r' in area_details, 'Area r details missing'
+        assert 'x' in area_details, 'Area x details missing'
+        assert 'y' in area_details, 'Area y details missing'
+        assert 'W' in area_details, 'Area W details missing'
+        assert 'H' in area_details, 'Area H details missing'
+        # Convert all of the values to strings
+        for key in area_details.keys():
+            area_details[key] = str(area_details[key])
+        pdftotext_cmd += [
+            '-f', area_details['f'],
+            '-l', area_details['l'],
+            '-r', area_details['r'],
+            '-x', area_details['x'],
+            '-y', area_details['y'],
+            '-W', area_details['W'],
+            '-H', area_details['H'],
+        ]
+    pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]
 
     logger.debug("Calling pdfttext with, %s", pdftotext_cmd)
     p3 = Popen(pdftotext_cmd, stdin=p2.stdout, stdout=PIPE)