diff --git a/TUTORIAL.md b/TUTORIAL.md index 9b3ac22c..475b81f6 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -136,6 +136,10 @@ This parser allows parsing selected invoice section as a set of lines sharing some pattern. Those can be e.g. invoice items (good or services) or VAT rates. +Some companies may use multiple formats for their line-based data. In +such cases multiple sets of parsing regexes can be added to the `rules`. +Results from multiple `rules` get merged into a single array. + It replaces `lines` plugin and should be preferred over it. It allows reusing in multiple `fields`. @@ -148,6 +152,17 @@ Example for `fields`: end: \s+Total line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+) + fields: + lines: + parser: lines + rules: + - start: Item\s+Discount\s+Price$ + end: \s+Total + line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+) + - start: Item\s+Price$ + end: \s+Total + line: (?P<description>.+)\s+(?P<price>\d+\d+) + ### Legacy regexes For non-text fields, the name of the field is important: diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py index 1c830126..2c096881 100644 --- a/src/invoice2data/extract/parsers/lines.py +++ b/src/invoice2data/extract/parsers/lines.py @@ -117,10 +117,10 @@ def parse_block(template, field, settings, content): return lines -def parse(template, field, _settings, content): +def parse_by_rule(template, field, rule, content): # First apply default options. 
def parse(template, field, settings, content):
    """Parse line-based data for ``field`` and merge the output of every rule.

    A field may describe its line-parsing regexes in two ways:

    * New syntax: a ``rules`` entry holding a list of rule mappings, each
      with its own ``start``/``end``/``line`` (and optional) regexes.
    * Original syntax: the regexes live directly in the field mapping,
      which is then treated as a single rule.

    Args:
        template: Template object, forwarded untouched to ``parse_by_rule``.
        field: Name of the field being parsed, forwarded to ``parse_by_rule``.
        settings: Field mapping taken from the template.
        content: Text to extract the lines from.

    Returns:
        A list with the lines produced by all rules, in rule order. The list
        is empty when there are no rules or no rule produced any lines.
    """
    if "rules" in settings:
        # One field can have multiple sets of line-parsing rules.
        rules = settings["rules"]
        if rules is None:
            # An empty `rules:` key loads as None; treat it as "no rules"
            # rather than failing with an opaque "not iterable" TypeError.
            rules = []
        elif isinstance(rules, dict):
            # A lone mapping is a single rule; iterating it directly would
            # hand its keys (plain strings) to parse_by_rule.
            rules = [rules]
    else:
        # Original syntax stored line-parsing rules in top field YAML object.
        # Forward the whole mapping, as the pre-`rules` implementation did:
        # parse_by_rule layers the rule over DEFAULT_OPTIONS, so filtering
        # keys here would silently drop field-level overrides of defaults.
        rules = [settings]

    lines = []
    for rule in rules:
        new_lines = parse_by_rule(template, field, rule, content)
        # A rule may yield None instead of a list; skip it in that case.
        if new_lines is not None:
            lines.extend(new_lines)

    return lines
Ostrich Lines end + + +No Angle [°] Length [cm] +1 30 30 +2 45 40 +3 90 60 +Count: 3 + +No Length [cm] Angle [°] +4 80 135 +5 100 180 +Count: 2 diff --git a/tests/custom/templates/lines-multiple-patterns.yml b/tests/custom/templates/lines-multiple-patterns.yml index 8cfe6912..bd05e97f 100644 --- a/tests/custom/templates/lines-multiple-patterns.yml +++ b/tests/custom/templates/lines-multiple-patterns.yml @@ -26,6 +26,23 @@ fields: - ^Subgroup:\s*(?P.+)$ types: pos: int + dimensions: + parser: lines + rules: + - start: No.*Angle.*Length + end: Count + line: ^(?P<pos>\d+)\s+(?P<angle>\d+)\s+(?P<length>\d+)$ + types: + pos: int + angle: int + length: int + - start: No.*Length.*Angle + end: Count + line: ^(?P<pos>\d+)\s+(?P<length>\d+)\s+(?P<angle>\d+)$ + types: + pos: int + angle: int + length: int options: currency: EUR date_formats: