From b1cdfb70ec71ff10bf1148affed2c8c228c391e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 30 Sep 2022 23:54:39 +0200 Subject: [PATCH] Add "priority" support for templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of multiple templates matching a given invoice - choose the one with the highest "priority" value. To provide proper support for prioritizing AND existing templates (backward compatibility) the default value 5 is assumed in case the "priority" property is missing. This feature can be used for writing more generic as well as more specific templates. So far all templates were assumed to be company-specific. With this change we can have: 1. Invoice-generating software specific templates 2. In-company varying templates This feature may be very useful for: 1. Countries with just a few very popular accounting software applications 2. Big companies with multiple departments adding some invoice details Signed-off-by: Rafał Miłecki --- TUTORIAL.md | 15 +++++++++++++++ src/invoice2data/extract/loader.py | 3 +++ src/invoice2data/main.py | 26 +++++++++++--------------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/TUTORIAL.md b/TUTORIAL.md index 9b3ac22c..f1852898 100644 --- a/TUTORIAL.md +++ b/TUTORIAL.md @@ -301,6 +301,21 @@ options and their defaults are: different fields, you can supply a list here. The extraction will fail if not all fields are matched. +### Priority + +In case of multiple templates matching a single invoice, the one with the +highest priority will be used. Default `priority` value (assigned if +missing) is 5. + +This property needs to be specified only when designing some generic or +very specific templates. 
+ +Suggested values: + +- 0-4: accounting/invoice software specific template +- 5: company specific template +- 6-10: company department/unit specific template + ### Example of template using most options issuer: Free Mobile diff --git a/src/invoice2data/extract/loader.py b/src/invoice2data/extract/loader.py index 95418b49..f8ad030d 100644 --- a/src/invoice2data/extract/loader.py +++ b/src/invoice2data/extract/loader.py @@ -107,6 +107,9 @@ def read_templates(folder=None): elif type(tpl["exclude_keywords"]) is not list: tpl["exclude_keywords"] = [tpl["exclude_keywords"]] + if 'priority' not in tpl.keys(): + tpl['priority'] = 5 + output.append(InvoiceTemplate(tpl)) logger.info("Loaded %d templates from %s", len(output), folder) diff --git a/src/invoice2data/main.py b/src/invoice2data/main.py index c322b179..f750975a 100644 --- a/src/invoice2data/main.py +++ b/src/invoice2data/main.py @@ -79,10 +79,6 @@ def extract_data(invoicefile, templates=None, input_module=None): 'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'} """ - if templates is None: - templates = read_templates() - - # print(templates[0]) if input_module is None: if invoicefile.lower().endswith('.txt'): @@ -98,18 +94,18 @@ def extract_data(invoicefile, templates=None, input_module=None): logger.debug("START pdftotext result ===========================\n" + extracted_str) logger.debug("END pdftotext result =============================") - for t in templates: - optimized_str = t.prepare_input(extracted_str) - - if t.matches_input(optimized_str): - logger.info("Using %s template", t["template_name"]) - # Call extract with entire text and the invoicefile path - # The path is used if an area is called as a field option - return t.extract(optimized_str, invoicefile, input_module) - - logger.error("No template for %s", invoicefile) - return False + if templates is None: + templates = read_templates() + templates = filter(lambda t: t.matches_input(t.prepare_input(extracted_str)), templates) + templates = 
sorted(templates, key=lambda k: k['priority'], reverse=True) + if not templates: + logger.error("No template for %s", invoicefile) + return False + t = templates[0] + logger.info("Using %s template", t["template_name"]) + optimized_str = t.prepare_input(extracted_str) + return t.extract(optimized_str, invoicefile, input_module) def create_parser(): """Returns argument parser """