From 81d4cfdc7173eb36eec931a8e4fbd58e54d60ac5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sat, 1 Oct 2022 23:27:36 +0200 Subject: [PATCH] Lines: support multiple RegEx-es in lines entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far "first_line", "line" and "last_line" could contain a single RegEx only. Some invoices have lines that use more than one format. To simplify parsing them, allow all 3 entries to contain a list of RegEx-es. Example: fields: lines: parser: lines start: Item\s+Discount\s+Price$ end: \s+Total line: - Items group:\s+(?P.+) - (?P.+)\s+(?P\d+.\d+)\s+(?P\d+\d+) Signed-off-by: Rafał Miłecki --- src/invoice2data/extract/parsers/lines.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py index 03c106e5..bf2c8c65 100644 --- a/src/invoice2data/extract/parsers/lines.py +++ b/src/invoice2data/extract/parsers/lines.py @@ -12,6 +12,15 @@ DEFAULT_OPTIONS = {"line_separator": r"\n"} +def parse_line(patterns, line): + patterns = patterns if isinstance(patterns, list) else [patterns] + for pattern in patterns: + match = re.search(pattern, line) + if match: + return match + return None + + def parse(template, field, _settings, content): """Try to extract lines from the invoice""" @@ -57,7 +66,7 @@ def parse(template, field, _settings, content): continue if "first_line" in settings: # Check if the current lines the first_line pattern - match = re.search(settings["first_line"], line) + match = parse_line(settings["first_line"], line) if match: # The line matches the first_line pattern so append current row to output # then assign a new current_row @@ -75,7 +84,7 @@ def parse(template, field, _settings, content): # If last_line was provided, check that if "last_line" in settings: # last_line pattern provided, so check if the current line is that line - match = 
re.search(settings["last_line"], line) + match = parse_line(settings["last_line"], line) if match: # This is the last_line, so parse all lines thus far, # append to output, @@ -101,7 +110,7 @@ def parse(template, field, _settings, content): logger.debug("skip_line match on *%s*", line) continue # If none of those have continued the loop, check if this is just a normal line - match = re.search(settings["line"], line) + match = parse_line(settings["line"], line) if match: # This is one of the lines between first_line and last_line # Parse the data and add it to the current_row