From 8c0659781c33f45be55121e226ffc6b831768de0 Mon Sep 17 00:00:00 2001
From: Juanjo Diaz
Date: Thu, 15 Jun 2023 11:13:53 +0200
Subject: [PATCH] fix: issue with tokens that match prefixes (#106)

---
 simplemma/strategies/prefix_decomposition.py | 3 +++
 tests/strategies/test_strategies.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/simplemma/strategies/prefix_decomposition.py b/simplemma/strategies/prefix_decomposition.py
index 62407a0..78300b4 100644
--- a/simplemma/strategies/prefix_decomposition.py
+++ b/simplemma/strategies/prefix_decomposition.py
@@ -85,6 +85,9 @@ def get_lemma(self, token: str, lang: str) -> Optional[str]:
             return None
 
         prefix = prefix_match[1]
+        if prefix == token:
+            return None
+
         subword = self._dictionary_lookup.get_lemma(token[len(prefix) :], lang)
         if subword is None:
             return None
diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py
index 5d5882a..a427c7d 100644
--- a/tests/strategies/test_strategies.py
+++ b/tests/strategies/test_strategies.py
@@ -4,6 +4,7 @@
     DefaultStrategy,
     GreedyDictionaryLookupStrategy,
     AffixDecompositionStrategy,
+    PrefixDecompositionStrategy,
 )
 
 
@@ -51,3 +52,5 @@ def test_search() -> None:
     )
 
     assert AffixDecompositionStrategy(greedy=True).get_lemma("ccc", "de") is None
+
+    assert PrefixDecompositionStrategy().get_lemma("auf", "de") is None