Add possibility to parse multiple line definitions #378

Closed
wants to merge 15 commits into from

51 changes: 49 additions & 2 deletions TUTORIAL.md
@@ -130,8 +130,7 @@ This parser allows parsing a selected invoice section as a set of lines
sharing some pattern. Those can be e.g. invoice items (goods or services)
or VAT rates.

It replaces `lines` plugin and should be preferred over it. It allows
reusing in multiple `fields`.
It allows reusing in multiple `fields`.

Example for `fields`:

@@ -217,6 +216,54 @@ one doesn't match either, this line is ignored. This implies that you
need to take care that the `first_line` regex is the most specific one,
and `line` the least specific.
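
For example, a minimal sketch (column headers and field names are made up for illustration) of a group where `first_line` is anchored to the numbered item row while `line` deliberately stays generic:

    lines:
      start: Pos\s+Description\s+Amount
      end: Subtotal
      first_line: '(?P<pos>\d+)\s+(?P<desc>.+?)\s+(?P<amount>\d+\.\d{2})$'
      line: '(?P<desc>.+)'

Because `first_line` requires both a leading position number and a trailing amount, only genuine item rows start a new entry; every other line falls through to the generic `line` pattern.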

It is possible to use multiple line groups. Each group is applied
independently, and the rows it captures are appended to the same `lines`
output. The syntax is:

lines:
  - start: Item\s+Discount\s+Price$
    end: \s+Total
    line: (?P<line_note>(FOOD))
  - start: Item\s+Discount\s+Price$
    end: \s+Total
    line: (?P<description>(.+))\s+(?P<price>\d+\d+)

It is also possible to concatenate multi-line tags.
This is very useful when the description of the invoiced item spans multiple lines.

This can be done by defining the same capturing group multiple times.
In the example below the `item` values are concatenated.
If the invoice looks like:


Service A 12 $10.00 $120.00
Description: Repair
Notes: Replaced capacitor
Parts: 1 x cap_a
Tax: 0.2%


The template is:

lines:
  - start: Item\s+Quantity\s+Rate\s+Amount
    end: Subtotal
    first_line: 'Service (?P<item>\w)\s+(?P<qty>\S+)\s+.(?P<unitprice>\d+.\d{2})\s+.?(?P<linetotal>\d+.\d{2})'
    line: '(?P<item>.*)'
    skip_line: ['Pino \w'] # 'Description:', "Notes:",
    last_line: '(?P<desc>Parts:.*)'
    types:
      qty: int
      unitprice: float
      linetotal: float



Output:

'lines': [{'item': 'A\nDescription: Repair\nNotes: Replaced capacitor', 'qty': 12, 'unitprice': 10.0, 'linetotal': 120.0, 'desc': 'Parts: 1 x cap_a'}]
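
Note that `skip_line` in the template above only contains the placeholder pattern `Pino \w`, which never matches, so the Description and Notes rows are folded into `item`. As a hedged variant (not part of this PR's example), listing the real prefixes would drop those rows instead:

    skip_line: ['Description:', 'Notes:']

With that change only the `Service A ...` first line and the `Parts:` last line would contribute data, so `item` would stay just 'A'.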



### Tables

The `tables` plugin allows you to parse table-oriented fields that have
278 changes: 183 additions & 95 deletions src/invoice2data/extract/parsers/lines.py
@@ -4,127 +4,215 @@
Initial work and maintenance by Holger Brunn @hbrunn
"""


import re
import logging

logger = logging.getLogger(__name__)

DEFAULT_OPTIONS = {"line_separator": r"\n"}
# Initialize the settings
# by setting the first_line_found boolean to False.
# This indicates that we are looking for the first_line pattern.
DEFAULT_OPTIONS = {"line_separator": r"\n", "first_line_found": False}


def parse(template, field, _settings, content):
"""Try to extract lines from the invoice"""

# First apply default options.
settings = DEFAULT_OPTIONS.copy()
settings.update(_settings)

# Validate settings
assert "start" in settings, "Lines start regex missing"
assert "end" in settings, "Lines end regex missing"
assert "line" in settings, "Line regex missing"

start = re.search(settings["start"], content)
end = re.search(settings["end"], content)
if not start or not end:
logger.warning(f"No lines found. Start match: {start}. End match: {end}")
return
content = content[start.end() : end.start()]
# logger.debug("_settings is *%s* for debugging", _settings)
lines = []
current_row = {}

# We assume that structured line fields may either be individual lines or
# they may be main line items with descriptions or details following beneath.
# Using the first_line, line, last_line parameters we are able to capture this data.
# first_line - The main line item
# line - The detail lines, typically indented from the main lines
# last_line - The final line of the indented lines
# The code below will ignore both line and last_line until it finds first_line.
# It will then switch to extracting line patterns until it either reaches last_line
# or it reaches another first_line.

# As first_line and last_line are optional, if neither were provided,
# set the first_line to be the provided line parameter.
# In this way the code will simply loop through and extract the lines as expected.
if "first_line" not in settings and "last_line" not in settings:
settings["first_line"] = settings["line"]
# As we enter the loop, we set the boolean for first_line being found to False.
# This indicates that we are looking for the first_line pattern.
first_line_found = False
for line in re.split(settings["line_separator"], content):
# If the line is empty, skip it
if not line.strip("").strip("\n").strip("\r") or not line:
current_row = []
plugin_settings = DEFAULT_OPTIONS.copy()
indexno = -1

# Backward compatibility: if someone tries to pass something other than a list
if type(_settings) != list:
li = []
li.append(_settings)
_settings = li

for setting in _settings:
logger.debug(" Settingcheck: is \n *%s*", setting)
# As first_line and last_line are optional, if neither were provided,
# set the first_line to be the provided line parameter.
# In this way the code will simply loop through and extract the lines as expected.

if "first_line" not in setting and "last_line" not in setting:
logger.debug("temp Setting : is \n *%s*", setting)
setting["first_line"] = setting["line"]
# this does not seem to work, but it should
# Move this code outside of the line section

# Set the setting number to act as an index into the current_row list
indexno += 1
setting["index"] = indexno
logger.debug("Setting %s: is \n *%s*", indexno, setting)
current_row.append("")
plugin_settings.update(setting)

# First apply default options.
plugin_settings.update(setting)
setting = plugin_settings

# Validate settings
assert "start" in setting, "Lines start regex missing"
assert "end" in setting, "Lines end regex missing"
assert "line" in setting, "Line regex missing"

start = re.search(setting["start"], content)
end = re.search(setting["end"], content)
if not start or not end:
logger.warning(f"No lines found. Start match: {start}. End match: {end}")
continue
if "first_line" in settings:
# Check if the current line matches the first_line pattern
match = re.search(settings["first_line"], line)
if match:
# The line matches the first_line pattern so append current row to output
# then assign a new current_row
if current_row:
lines.append(current_row)
current_row = {field: value.strip() if value else "" for field, value in match.groupdict().items()}
# Flip first_line_found boolean as first_line has been found
# This will allow last_line and line to be matched on below
first_line_found = True
continue
# If the first_line has not yet been found, do not look for line or last_line
# Just continue to the next line
if first_line_found is False:

for line_content in re.split(plugin_settings["line_separator"], content):

# not sure if code below should stay.
# match_emptyline = re.fullmatch("^\s+\n?\r?$",line_content) # .strip("^\s+$\n?\r?")
# if match_emptyline:
# logger.debug(f"Empty line found, skipping")
# continue

# If the line is empty, skip it
if not line_content.strip("").strip("\n").strip("\r") or not line_content:
continue
# If last_line was provided, check that
if "last_line" in settings:
# last_line pattern provided, so check if the current line is that line
match = re.search(settings["last_line"], line)
if match:
# This is the last_line, so parse all lines thus far,
# append to output,
# and reset current_row
current_row = parse_current_row(match, current_row)
if current_row:
lines.append(current_row)
current_row = {}
# Flip first_line_found boolean to look for first_line again on next loop
first_line_found = False
# added .strip("^\s+$")
logger.debug("Parsing line *%s*", line_content)
# Loop through the settings from the template file,
# as we want the output in the same order as the invoice file.
for setting in _settings:

if "first_line_found" not in setting:
setting["first_line_found"] = False

# Strip the content down to the items between start and end tag of this setting
start = re.search(setting["start"], content)
end = re.search(setting["end"], content)
if not start or not end:
logger.warning(f"No content found between:\n Start match: {start}.\n End match: {end}")
continue
# Next we see if this is a line that should be skipped
if "skip_line" in settings:
# If skip_line was provided, check for a match now
if isinstance(settings["skip_line"], list):
# Accepts a list
skip_line_results = [re.search(x, line) for x in settings["skip_line"]]

content_of_setting = content[start.end() : end.start()]

# Search whether the current line is in the content.
# If the current line is not between the start and end tags,
# continue to the next setting in the for loop.
content_is_between_start_end = re.search(re.escape(line_content), content_of_setting)
if content_is_between_start_end:
logger.debug("Setting %s: This line is between start and end tag", setting["index"])
else:
# Or a simple string
skip_line_results = [re.search(settings["skip_line"], line)]
if any(skip_line_results):
# There was at least one match to a skip_line
logger.debug("skip_line match on *%s*", line)
continue
# If none of those have continued the loop, check if this is just a normal line
match = re.search(settings["line"], line)
if match:
# This is one of the lines between first_line and last_line
# Parse the data and add it to the current_row
current_row = parse_current_row(match, current_row)
continue
# If the line doesn't match anything, log and continue to next line
logger.debug("ignoring *%s* because it doesn't match anything", line)
if current_row:
# All lines processed, so append whatever the final current_row was to output
lines.append(current_row)

types = settings.get("types", [])
# We assume that structured line fields may either be individual lines or
# they may be main line items with descriptions or details following beneath.
# Using the first_line, line, last_line parameters we are able to capture this data.
# first_line - The main line item
# line - The detail lines
# last_line - The final line
# The code below will ignore both line and last_line until it finds first_line.
# It will then switch to extracting line patterns until it either reaches last_line
# or it reaches another first_line.

# Here we loop through the settings
if "first_line" in setting:
# Check if the current line matches the first_line pattern
match = re.search(setting["first_line"], line_content)
if match:
# If the previous match of this setting was also a first_line, write to output
if setting['first_line_found'] and current_row[setting["index"]]:
lines.append(current_row[setting["index"]])
logger.debug(
"Setting %s: another first_line is found, so all lines processed, result: %s",
setting["index"],
current_row[setting["index"]]
)
current_row[setting["index"]] = {}
setting["first_line_found"] = False
# The line matches the first_line pattern
logger.debug("Setting %s: first_line matched", setting["index"])
logger.debug("Setting %s: converting first_line to data:", setting["index"])
# on first_line always begin a new output line.
current_row[setting["index"]] = {}
current_row[setting["index"]] = parse_current_row(match, current_row[setting["index"]])
# Flip first_line_found boolean as first_line has been found
# This will allow last_line and line to be matched
setting["first_line_found"] = True
continue

# If the first_line has not yet been found, do not look for line or last_line
# Just continue to the next line
if not setting['first_line_found']:
logger.debug("Setting %s: Skipping because first_line is not found!", setting["index"])
continue

# Next we see if this is a line that should be skipped
if "skip_line" in setting:
# If skip_line was provided, check for a match now
if isinstance(setting["skip_line"], list):
# Accepts a list
skip_line_results = [re.search(x, line_content) for x in setting["skip_line"]]
else:
# Or a simple string
skip_line_results = [re.search(setting["skip_line"], line_content)]
if any(skip_line_results):
# There was at least one match to a skip_line
logger.debug("skip_line match on *%s*", line_content)
continue

# If last_line was provided, check that
if "last_line" in setting:
# last_line pattern provided, so check if the current line is that line
match = re.search(setting["last_line"], line_content)
if match:
# This is the last_line, so parse all lines thus far,
# append the values to the lines output.
# and reset current_row
logger.debug("Setting %s: converting last_line to data:", setting["index"])
current_row[setting["index"]] = parse_current_row(match, current_row[setting["index"]])
if current_row[setting["index"]]:
lines.append(current_row[setting["index"]])
logger.debug(
"Setting %s: last_line found, assembled result:\n *%s*",
setting["index"], current_row[setting["index"]]
)
current_row[setting["index"]] = {}
# Flip first_line_found boolean to look for first_line again on next loop
setting["first_line_found"] = False
continue

# If none of those have continued the loop, check if this is just a normal line
match = re.search(setting["line"], line_content)
if match:
# This is one of the lines between first_line and last_line
# Parse the data and add it to the current_row
logger.debug("Setting %s: converting line to data:", setting["index"])
current_row[setting["index"]] = parse_current_row(match, current_row[setting["index"]])
continue

# All lines processed, so append whatever the final current_row was to output
if current_row[setting["index"]]:
lines.append(current_row[setting["index"]])
logger.debug(
"Setting %s: all lines processed, result: %s", setting["index"],
current_row[setting["index"]]
)
current_row[setting["index"]] = {}
# else:
# If the line doesn't match anything, log and continue to next line
# logger.debug("Setting %s: ignoring the line because it doesn't match anything", setting["index"])

types = setting.get("types", [])
for row in lines:
for name in row.keys():
if name in types:
row[name] = template.coerce_type(row[name], types[name])

return lines


def parse_current_row(match, current_row):
# Parse the current row data,
# appending each captured value to its key
for field, value in match.groupdict().items():
logger.debug("result: {'%s': '%s'} ", field, value)
current_row[field] = "%s%s%s" % (
current_row.get(field, ""),
current_row.get(field, "") and "\n" or "",
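
The diff above is truncated here, but the multi-line concatenation described in the tutorial comes from `parse_current_row` joining repeated capture groups with a newline. A standalone sketch of that idea, assuming the truncated line simply appends the stripped group value (this is not a verbatim copy of the PR's code):

import re

def parse_current_row(match, current_row):
    # Append each captured group to current_row,
    # joining repeated captures of the same group with a newline.
    for field, value in match.groupdict().items():
        existing = current_row.get(field, "")
        new_value = (value or "").strip()
        current_row[field] = existing + ("\n" if existing else "") + new_value
    return current_row

row = {}
for text in ["Service A", "Description: Repair", "Notes: Replaced capacitor"]:
    row = parse_current_row(re.search(r"(?P<item>.*)", text), row)

print(row["item"])
# Service A
# Description: Repair
# Notes: Replaced capacitor
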
10 changes: 8 additions & 2 deletions tests/compare/QualityHosting.json
@@ -41,13 +41,19 @@
{
"pos": "6",
"qty": 1.0,
"desc": "Small Business QualityExchange 2010\nGrundgebühr pro Einheit\nDienst: OUDJQ_jauernik\n01.05.14-31.05.14\nQualityHosting AG - Uferweg 40-42 - D-63571 Gelnhausen\niViveLabs Ltd.\n93B Sai Yu Chung\nYuen Long, N.T.\nHong Kong\nPos. Menge Beschreibung Rabatt % VK-Preis Zeilenbetrag\nOhne Ohne MwSt.\nMwSt.",
"desc": "Small Business QualityExchange 2010\nGrundgebühr pro Einheit\nDienst: OUDJQ_jauernik\n01.05.14-31.05.14",
"price": 5.39
},
{
"desc": "QualityHosting AG - Uferweg 40-42 - D-63571 Gelnhausen\niViveLabs Ltd.\n93B Sai Yu Chung\nYuen Long, N.T.\nHong Kong"
},
{
"desc": "Pos. Menge Beschreibung Rabatt % VK-Preis Zeilenbetrag\nOhne Ohne MwSt.\nMwSt."
},
{
"pos": "7",
"qty": 1.0,
"desc": "Small Business StandardExchange 2010\nGrundgebühr pro Einheit\nDienst: OUDJQ_office\n01.05.14-31.05.14\n",
"desc": "Small Business StandardExchange 2010\nGrundgebühr pro Einheit\nDienst: OUDJQ_office\n01.05.14-31.05.14",
"price": 3.89
}
],