From 6bbe40ce63a20529de5d492d35db9f5b1c4f4fbf Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 30 Jul 2024 17:28:51 +0200 Subject: [PATCH] allow multiword list entities --- orangecontrib/storynavigation/modules/settinganalysis.py | 9 +++++---- .../storynavigation/resources/dutch_entities.csv | 4 ++++ .../storynavigation/resources/english_entities.csv | 0 3 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 orangecontrib/storynavigation/resources/dutch_entities.csv create mode 100644 orangecontrib/storynavigation/resources/english_entities.csv diff --git a/orangecontrib/storynavigation/modules/settinganalysis.py b/orangecontrib/storynavigation/modules/settinganalysis.py index 755c051..23b913f 100644 --- a/orangecontrib/storynavigation/modules/settinganalysis.py +++ b/orangecontrib/storynavigation/modules/settinganalysis.py @@ -70,14 +70,15 @@ def __process_texts(self, nlp, text_tuples, callback=None): def __analyze_text_with_list(self, text, nlp, entity_list): matcher = Matcher(nlp.vocab) for entity_group in self.ENTITY_GROUPS: - patterns = [[{"ORTH": entity_text}] + patterns = [[{"lower": entity_token} for entity_token in entity_text.lower().split()] for entity_label, entity_text in entity_list if entity_label in entity_group] matcher.add(entity_group[0], patterns) tokens = nlp(text) - return { tokens[m[1]].idx: {"text": tokens[m[1]].text, - "label_": nlp.vocab.strings[m[0]]} - for m in matcher(tokens) } # presumes list entities contain 1 token + return {tokens[m[1]].idx: { + "text": " ".join([tokens[token_id].text for token_id in range(m[1], m[2])]), + "label_": nlp.vocab.strings[m[0]] + } for m in matcher(tokens)} def __process_text(self, text_id, text, nlp): diff --git a/orangecontrib/storynavigation/resources/dutch_entities.csv b/orangecontrib/storynavigation/resources/dutch_entities.csv new file mode 100644 index 0000000..7166d21 --- /dev/null +++ b/orangecontrib/storynavigation/resources/dutch_entities.csv @@ -0,0 +1,4 @@ +EVENT,coronacrisis +EVENT,corona-crisis +EVENT,corona crisis +EVENT,Koningsdag diff --git a/orangecontrib/storynavigation/resources/english_entities.csv b/orangecontrib/storynavigation/resources/english_entities.csv new file mode 100644 index 0000000..e69de29