Skip to content

Commit

Permalink
Area support for tesseract
Browse files Browse the repository at this point in the history
 This allows one to specify the page(s) and coordinates of the area to be extracted.
  • Loading branch information
bosd committed Feb 20, 2023
1 parent 40f1f92 commit 7f7280e
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 5 deletions.
3 changes: 1 addition & 2 deletions src/invoice2data/extract/invoice_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O
# Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
# Log the text
logger.debug("START pdftotext area result ===========================")
logger.debug(optimized_str_area)
logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
logger.debug("END pdftotext area result =============================")
optimized_str_for_parser = optimized_str_area
else:
Expand Down
30 changes: 27 additions & 3 deletions src/invoice2data/input/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
logger = logging.getLogger(__name__)


def to_text(path):
def to_text(path: str, area_details: dict = None):
"""Wraps Tesseract OCR with auto language model.
Parameters
----------
path : str
path of electronic invoice in PDF, JPG or PNG format
area_details : dictionary
of the format {f: int, l: int, r: int, x: int, y: int, W: int, H: int}
used when extracting an area of the pdf rather than the whole document
Returns
-------
Expand Down Expand Up @@ -105,9 +108,30 @@ def to_text(path):
"-layout",
"-enc",
"UTF-8",
TMP_FOLDER + filename + ".pdf",
"-",
]
if area_details is not None:
# An area was specified
# Validate the required keys were provided
assert 'f' in area_details, 'Area r details missing'
assert 'l' in area_details, 'Area r details missing'
assert 'r' in area_details, 'Area r details missing'
assert 'x' in area_details, 'Area x details missing'
assert 'y' in area_details, 'Area y details missing'
assert 'W' in area_details, 'Area W details missing'
assert 'H' in area_details, 'Area H details missing'
# Convert all of the values to strings
for key in area_details.keys():
area_details[key] = str(area_details[key])
pdftotext_cmd += [
'-f', area_details['f'],
'-l', area_details['l'],
'-r', area_details['r'],
'-x', area_details['x'],
'-y', area_details['y'],
'-W', area_details['W'],
'-H', area_details['H'],
]
pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]

logger.debug("Calling pdfttext with, %s", pdftotext_cmd)
p3 = Popen(pdftotext_cmd, stdin=p2.stdout, stdout=PIPE)
Expand Down

0 comments on commit 7f7280e

Please sign in to comment.