Skip to content

Commit

Permalink
allow multiword list entities
Browse files (browse the repository at this point in the history)
  • Loading branch information
eriktks committed Jul 30, 2024
1 parent 956de95 commit 6bbe40c
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 4 deletions.
9 changes: 5 additions & 4 deletions orangecontrib/storynavigation/modules/settinganalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,15 @@ def __process_texts(self, nlp, text_tuples, callback=None):
def __analyze_text_with_list(self, text, nlp, entity_list):
matcher = Matcher(nlp.vocab)
for entity_group in self.ENTITY_GROUPS:
patterns = [[{"ORTH": entity_text}]
patterns = [[{"lower": entity_token} for entity_token in entity_text.lower().split()]
for entity_label, entity_text in entity_list
if entity_label in entity_group]
matcher.add(entity_group[0], patterns)
tokens = nlp(text)
return { tokens[m[1]].idx: {"text": tokens[m[1]].text,
"label_": nlp.vocab.strings[m[0]]}
for m in matcher(tokens) } # presumes list entities contain 1 token
return {tokens[m[1]].idx: {
"text": " ".join([tokens[token_id].text for token_id in range(m[1], m[2])]),
"label_": nlp.vocab.strings[m[0]]
} for m in matcher(tokens)}


def __process_text(self, text_id, text, nlp):
Expand Down
4 changes: 4 additions & 0 deletions orangecontrib/storynavigation/resources/dutch_entities.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
EVENT,coronacrisis
EVENT,corona-crisis
EVENT,corona crisis
EVENT,Koningsdag
Empty file.

0 comments on commit 6bbe40c

Please sign in to comment.