feat: use cached object for legacy functions (#108)
juanjoDiaz committed Aug 2, 2023
1 parent 8c06597 commit fa1d964
Showing 2 changed files with 112 additions and 105 deletions.
178 changes: 91 additions & 87 deletions simplemma/lemmatizer.py
@@ -19,6 +19,7 @@
LemmatizationStrategy,
DefaultStrategy,
DictionaryLookupStrategy,
DefaultDictionaryFactory,
LemmatizationFallbackStrategy,
ToLowercaseFallbackStrategy,
)
@@ -45,93 +46,6 @@ def _control_input_type(token: Any) -> None:
raise ValueError("Wrong input type: empty string")


def is_known(
token: str,
lang: Union[str, Tuple[str, ...]],
) -> bool:
"""Check if a token is known in the specified language(s).
Args:
token: The token to check.
lang: The language or languages to check in.
Returns:
bool: True if the token is known, False otherwise.
"""

return Lemmatizer().is_known(token, lang)


def lemmatize(
token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> str:
"""Lemmatize a token in the specified language(s).
Args:
token: The token to lemmatize.
lang: The language or languages for lemmatization.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
Returns:
str: The lemmatized form of the token.
"""
return Lemmatizer(
lemmatization_strategy=DefaultStrategy(greedy),
).lemmatize(token, lang)


def text_lemmatizer(
text: str,
lang: Union[str, Tuple[str, ...]],
greedy: bool = False,
tokenizer: Tokenizer = RegexTokenizer(),
) -> List[str]:
"""Lemmatize a text in the specified language(s).
Args:
text: The text to lemmatize.
lang: The language or languages for lemmatization.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
tokenizer: The tokenizer to use (default: RegexTokenizer()).
Returns:
List[str]: The list of lemmatized tokens.
"""

return list(
lemma_iterator(
text,
lang,
greedy,
tokenizer=tokenizer,
)
)


def lemma_iterator(
text: str,
lang: Union[str, Tuple[str, ...]],
greedy: bool = False,
tokenizer: Tokenizer = RegexTokenizer(),
) -> Iterator[str]:
"""Iterate over lemmatized tokens in a text.
Args:
text: The text to iterate over.
lang: The language or languages for lemmatization.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
tokenizer: The tokenizer to use (default: RegexTokenizer()).
Yields:
str: The lemmatized tokens in the text.
"""

return Lemmatizer(
tokenizer=tokenizer,
lemmatization_strategy=DefaultStrategy(greedy),
).get_lemmas_in_text(text, lang)


class Lemmatizer:
"""Lemmatizer class for performing token lemmatization.
@@ -243,3 +157,93 @@ def get_lemmas_in_text(
for token in self._tokenizer.split_text(text):
yield self.lemmatize(token.lower() if initial else token, lang)
initial = token in PUNCTUATION


# From here down are the legacy functions (pre-1.0)

_legacy_dictionary_factory = DefaultDictionaryFactory()
_legacy_lemmatizer = Lemmatizer(
lemmatization_strategy=DefaultStrategy(
dictionary_factory=_legacy_dictionary_factory
)
)
_legacy_greedy_lemmatizer = Lemmatizer(
lemmatization_strategy=DefaultStrategy(
greedy=True, dictionary_factory=_legacy_dictionary_factory
)
)
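
The three module-level objects above are created once at import time, so every legacy helper below shares one dictionary cache instead of constructing a new Lemmatizer per call, as the removed pre-1.0 definitions did. A minimal usage sketch, not part of this diff, assuming the top-level simplemma package re-exports the legacy helpers as in released versions:

# Hypothetical example: both calls route through the shared _legacy_lemmatizer
# defined above, so the English dictionary is loaded once and then reused.
from simplemma import lemmatize

print(lemmatize("masks", "en"))   # expected to return "mask"
print(lemmatize("talked", "en"))  # second call reuses the cached dictionary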


def is_known(
token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> bool:
"""Check if a token is known in the specified language(s).
Args:
token: The token to check.
lang: The language or languages to check in.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
Returns:
bool: True if the token is known, False otherwise.
"""
lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
return lemmatizer.is_known(token, lang)


def lemmatize(
token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> str:
"""Lemmatize a token in the specified language(s).
Args:
token: The token to lemmatize.
lang: The language or languages for lemmatization.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
Returns:
str: The lemmatized form of the token.
"""
lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
return lemmatizer.lemmatize(token, lang)


def text_lemmatizer(
text: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> List[str]:
"""Lemmatize a text in the specified language(s).
Args:
text: The text to lemmatize.
lang: The language or languages for lemmatization.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
Returns:
List[str]: The list of lemmatized tokens.
"""

return list(
lemma_iterator(
text,
lang,
greedy,
)
)


def lemma_iterator(
text: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
) -> Iterator[str]:
"""Iterate over lemmatized tokens in a text.
Args:
text: The text to iterate over.
lang: The language or languages for lemmatization.
greedy: A flag indicating whether to use greedy lemmatization (default: False).
Yields:
str: The lemmatized tokens in the text.
"""
lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
return lemmatizer.get_lemmas_in_text(text, lang)
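
For comparison, the greedy legacy path above is equivalent to building the class-based API directly with a shared dictionary factory. This is a hedged sketch, not part of the commit; the import paths are inferred from the names visible in this diff and may differ in the actual package layout:

from simplemma.lemmatizer import Lemmatizer
from simplemma.strategies import DefaultDictionaryFactory, DefaultStrategy

# One factory shared across lemmatizers, mirroring _legacy_dictionary_factory.
shared_factory = DefaultDictionaryFactory()
greedy_lemmatizer = Lemmatizer(
    lemmatization_strategy=DefaultStrategy(
        greedy=True, dictionary_factory=shared_factory
    )
)
# Greedy lookup tries harder to resolve inflected forms.
print(greedy_lemmatizer.lemmatize("angekündigten", "de"))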
39 changes: 21 additions & 18 deletions simplemma/tokenizer.py
@@ -32,24 +32,6 @@
)


def simple_tokenizer(text: str, splitting_regex: Pattern[str] = TOKREGEX) -> List[str]:
"""
Simple regular expression tokenizer.
This function takes a string as input and returns a list of tokens.
Args:
text (str): The input text to tokenize.
splitting_regex (Pattern[str], optional): The regular expression pattern used for tokenization.
Defaults to `TOKREGEX`.
Returns:
List[str]: The list of tokens extracted from the input text.
"""
return list(RegexTokenizer(splitting_regex).split_text(text))


class Tokenizer(Protocol):
"""
Abstract base class for tokenizers.
@@ -109,3 +91,24 @@ def split_text(self, text: str) -> Iterator[str]:
"""
return (match[0] for match in self._splitting_regex.finditer(text))


_legacy_tokenizer = RegexTokenizer()


def simple_tokenizer(text: str) -> List[str]:
"""
Simple regular expression tokenizer based on the default TOKREGEX pattern.
This function takes a string as input and returns a list of tokens.
Args:
text (str): The input text to tokenize.
Returns:
List[str]: The list of tokens extracted from the input text.
"""
return list(_legacy_tokenizer.split_text(text))
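
A short usage sketch, not part of this diff: because simple_tokenizer now just materializes _legacy_tokenizer.split_text, it should return the same tokens as a directly constructed RegexTokenizer with the default pattern.

from simplemma.tokenizer import RegexTokenizer, simple_tokenizer

text = "A sample sentence, with punctuation."
# Both paths use the default TOKREGEX, so the results match; the exact token
# boundaries depend on that pattern.
assert simple_tokenizer(text) == list(RegexTokenizer().split_text(text))
print(simple_tokenizer(text))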
