diff --git a/TUTORIAL.md b/TUTORIAL.md index 523961bc..822f0a59 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -130,6 +130,10 @@ This parser allows parsing selected invoice section as a set of lines sharing some pattern. Those can be e.g. invoice items (good or services) or VAT rates. +Some companies may use multiple formats for their line-based data. In +such cases multiple sets of parsing regexes can be added to the `rules`. +Results from multiple `rules` get merged into a single array. + It replaces `lines` plugin and should be preferred over it. It allows reusing in multiple `fields`. @@ -142,6 +146,17 @@ Example for `fields`: end: \s+Total line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+) + fields: + lines: + parser: lines + rules: + - start: Item\s+Discount\s+Price$ + end: \s+Total + line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+) + - start: Item\s+Price$ + end: \s+Total + line: (?P<description>.+)\s+(?P<price>\d+\d+) + ### Legacy regexes For non-text fields, the name of the field is important: diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py index 03c106e5..cec288a9 100644 --- a/src/invoice2data/extract/parsers/lines.py +++ b/src/invoice2data/extract/parsers/lines.py @@ -12,12 +12,12 @@ DEFAULT_OPTIONS = {"line_separator": r"\n"} -def parse(template, field, _settings, content): +def parse_by_rule(field, rule, content): """Try to extract lines from the invoice""" # First apply default options. 
settings = DEFAULT_OPTIONS.copy() - settings.update(_settings) + settings.update(rule) # Validate settings assert "start" in settings, "Lines start regex missing" @@ -113,6 +113,24 @@ def parse(template, field, _settings, content): # All lines processed, so append whatever the final current_row was to output lines.append(current_row) + return lines + + +def parse(template, field, settings, content): + if "rules" in settings: + # One field can have multiple sets of line-parsing rules + rules = settings['rules'] + else: + # Original syntax stored line-parsing rules in top field YAML object + keys = ('start', 'end', 'line', 'first_line', 'last_line', 'skip_line') + rules = [{k: v for k, v in settings.items() if k in keys}] + + lines = [] + for rule in rules: + new_lines = parse_by_rule(field, rule, content) + if new_lines is not None: + lines += new_lines + types = settings.get("types", []) for row in lines: for name in row.keys(): diff --git a/src/invoice2data/extract/templates/com/com.amazon.aws.yml b/src/invoice2data/extract/templates/com/com.amazon.aws.yml index ced8600e..f7c793e8 100644 --- a/src/invoice2data/extract/templates/com/com.amazon.aws.yml +++ b/src/invoice2data/extract/templates/com/com.amazon.aws.yml @@ -7,16 +7,18 @@ fields: invoice_number: Invoice Number:\s+(\d+) partner_name: (Amazon Web Services, Inc\.) static_partner_website: aws.amazon.com + lines: + parser: lines + rules: + - start: Detail + end: \* May include estimated US sales tax + first_line: ^ (?P\w+.*)\$(?P\d+\.\d+) + line: (.*)\$(\d+\.\d+) + last_line: VAT \*\* keywords: - Amazon Web Services - $ - Invoice -lines: - start: Detail - end: \* May include estimated US sales tax - first_line: ^ (?P\w+.*)\$(?P\d+\.\d+) - line: (.*)\$(\d+\.\d+) - last_line: VAT \*\* options: currency: USD date_formats: