Skip to content

Commit

Permalink
Lines: support multiple RegEx-es in lines entries
Browse files Browse the repository at this point in the history
So far "first_line", "line" and "last_line" could each contain only a
single RegEx. Some invoices have lines that use more than one format. To
simplify parsing them, allow all 3 entries to contain a list of RegEx-es.

Example:
fields:
  lines:
    parser: lines
    start: Item\s+Discount\s+Price$
    end: \s+Total
    line:
      - Items group:\s+(?P<group>.+)
      - (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+)

Signed-off-by: Rafał Miłecki <[email protected]>
  • Loading branch information
Rafał Miłecki authored and bosd committed Oct 21, 2022
1 parent e63570f commit 81d4cfd
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions src/invoice2data/extract/parsers/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@
DEFAULT_OPTIONS = {"line_separator": r"\n"}


def parse_line(patterns, line):
    """Return the first regex match found in *line*, or ``None``.

    Args:
        patterns: A single regular expression, or a list/tuple of regular
            expressions. A lone pattern is treated as a one-element list.
        line: The text to search.

    Returns:
        The match object produced by ``re.search`` for the first pattern
        (in the given order) that matches *line*, or ``None`` when no
        pattern matches or *patterns* is an empty sequence.
    """
    # Normalise to a sequence so single-pattern and multi-pattern settings
    # share one code path. Tuples are accepted alongside lists.
    if not isinstance(patterns, (list, tuple)):
        patterns = [patterns]
    for pattern in patterns:
        match = re.search(pattern, line)
        if match:
            # Order matters: earlier patterns take precedence over later ones.
            return match
    return None


def parse(template, field, _settings, content):
"""Try to extract lines from the invoice"""

Expand Down Expand Up @@ -57,7 +66,7 @@ def parse(template, field, _settings, content):
continue
if "first_line" in settings:
# Check if the current lines the first_line pattern
match = re.search(settings["first_line"], line)
match = parse_line(settings["first_line"], line)
if match:
# The line matches the first_line pattern so append current row to output
# then assign a new current_row
Expand All @@ -75,7 +84,7 @@ def parse(template, field, _settings, content):
# If last_line was provided, check that
if "last_line" in settings:
# last_line pattern provided, so check if the current line is that line
match = re.search(settings["last_line"], line)
match = parse_line(settings["last_line"], line)
if match:
# This is the last_line, so parse all lines thus far,
# append to output,
Expand All @@ -101,7 +110,7 @@ def parse(template, field, _settings, content):
logger.debug("skip_line match on *%s*", line)
continue
# If none of those have continued the loop, check if this is just a normal line
match = re.search(settings["line"], line)
match = parse_line(settings["line"], line)
if match:
# This is one of the lines between first_line and last_line
# Parse the data and add it to the current_row
Expand Down

0 comments on commit 81d4cfd

Please sign in to comment.