feat: better evaluation scripts
juanjoDiaz committed Sep 15, 2023
1 parent 75561b6 commit 81f8c98
Showing 11 changed files with 259 additions and 169 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/tests.yml
@@ -69,6 +69,10 @@ jobs:
         if: matrix.python-version != '3.6' && matrix.python-version != '3.7'
         run: pip install -r requirements-dev.txt
 
+      - name: Install training dependencies
+        if: matrix.python-version != '3.6' && matrix.python-version != '3.7'
+        run: pip install -r training/requirements.txt
+
       - name: Install dependencies (legacy versions)
         if: matrix.python-version == '3.6' || matrix.python-version == '3.7'
         run: |
1 change: 1 addition & 0 deletions .gitignore
@@ -116,3 +116,4 @@ Makefile
 
 # eval
 UD/
+training/**/data
13 changes: 0 additions & 13 deletions eval/README.rst

This file was deleted.

1 change: 0 additions & 1 deletion eval/eval-requirements.txt

This file was deleted.

134 changes: 0 additions & 134 deletions eval/udscore.py

This file was deleted.

20 changes: 8 additions & 12 deletions simplemma/language_detector.py
@@ -198,18 +198,14 @@ def proportion_in_target_languages(
         Returns:
             float: The proportion of text in the target language(s).
         """
-        tokens = self._token_sampler.sample_text(text)
-        if len(tokens) == 0:
-            return 0
-
-        in_target = 0
-        for token in tokens:
-            for lang_code in self._lang:
-                candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
-                if candidate is not None:
-                    in_target += 1
-                    break
-        return in_target / len(tokens)
+        return sum(
+            percentage
+            for (
+                lang_code,
+                percentage,
+            ) in self.proportion_in_each_language(text).items()
+            if lang_code != "unk"
+        )
 
     def main_language(
         self,
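
The rewritten method now delegates to ``proportion_in_each_language`` and sums every share that is not attributed to "unk". A minimal sketch of that equivalence, assuming ``LanguageDetector`` can be imported from ``simplemma.language_detector`` (the module path shown above) and that ``proportion_in_each_language`` returns a mapping of language codes to shares:

from simplemma.language_detector import LanguageDetector

detector = LanguageDetector(lang=("en", "de"))
text = "It was a true gift"

# Everything not classified as "unk" counts towards the target languages.
by_language = detector.proportion_in_each_language(text)
in_target = sum(share for code, share in by_language.items() if code != "unk")

assert in_target == detector.proportion_in_target_languages(text)
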
9 changes: 0 additions & 9 deletions tests/test_language_detector.py
@@ -108,15 +108,6 @@ def test_in_target_language() -> None:
         == 1.0
     )
 
-    langs = ("en", "de")
-    text = "It was a true gift"
-    assert (
-        LanguageDetector(lang=langs).proportion_in_target_languages(text)
-        == in_target_language(text, lang=langs)
-        == 1.0
-    )
-    in_target_language("It was a true gift", lang=("en", "de"))
-
 
 def test_main_language():
     text = "Dieser Satz ist auf Deutsch."
15 changes: 15 additions & 0 deletions training/README.rst
@@ -0,0 +1,15 @@
Instructions to run the evaluation
----------------------------------

The scores are calculated on `Universal Dependencies <https://universaldependencies.org/>`_ treebanks, on single-word tokens (including some contractions but not merged prepositions). They can be reproduced with the following steps:

1. Install the evaluation dependencies (``pip install -r training/requirements.txt``)
2. Update ``DATA_URL`` in ``training/download-eval-data.py`` to point to the latest treebanks archive from `Universal Dependencies <https://universaldependencies.org/#download>`_ (or the version that you wish to use).
3. Run ``python3 training/download-eval-data.py``, which will:

   1. Download the archive
   2. Extract the relevant data (the language and, if applicable, a specific treebank; see the notes in the results table)
   3. Concatenate the train, dev and test data into a single file (e.g. ``cat de_gsd*.conllu > de-gsd-all.conllu``)
   4. Store the files at the expected location (``training/data/UD/``)

4. Run the evaluation script from the repository root, e.g. ``python3 training/evaluate_simplema.py``
5. The results are stored at ``training/data/results/results_summary.csv``. The errors for each dataset are also written to a CSV file under the ``data/results`` folder.
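
Both the summary and the per-dataset error files are plain CSV, so they can be inspected with the standard library. A minimal sketch assuming only the summary path named in step 5; the column layout is not shown in this commit:

import csv

# Print every row of the evaluation summary; adjust the path if you run
# the scripts from a different working directory.
with open("training/data/results/results_summary.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row)
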
71 changes: 71 additions & 0 deletions training/download-eval-data.py
@@ -0,0 +1,71 @@
from typing import Iterable, List, Tuple
from os import mkdir, path, scandir
import re
import logging
import tarfile
import requests
from glob import glob

from simplemma.strategies.dictionaries.dictionary_factory import SUPPORTED_LANGUAGES

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Pinned to UD v2.12; update this URL to evaluate against a newer release.
DATA_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y"
DATA_FOLDER = path.join(path.dirname(__file__), "data")
DATA_FILE = path.join(DATA_FOLDER, "ud-treebanks.tgz")
CLEAN_DATA_FOLDER = path.join(DATA_FOLDER, "UD")


def get_dirs(folder: str) -> List[str]:
    return [entry.name for entry in scandir(folder) if entry.is_dir()]


def get_files(folder: str) -> List[str]:
    return [entry.name for entry in scandir(folder) if entry.is_file()]


def get_relevant_language_data_folders(
    data_folder: str,
) -> Iterable[Tuple[str, str, str]]:
    for lang_folder in get_dirs(data_folder):
        lang_data_folder = path.join(data_folder, lang_folder)
        conllu_file = glob(path.join(lang_data_folder, "*.conllu"))[0]
        # File names look like "<lang>_<treebank>-ud-<split>.conllu",
        # so the capture group yields the dataset name, e.g. "de_gsd".
        matches_files = re.search("^.*/(.*)-ud.*$", conllu_file)
        if matches_files is not None:
            dataset_name = matches_files.groups()[0]
            lang = dataset_name.split("_")[0]

            if lang in SUPPORTED_LANGUAGES:
                yield (lang, dataset_name, lang_data_folder)


if path.exists(DATA_FOLDER) or path.exists(CLEAN_DATA_FOLDER):
    raise Exception(
        "Data folder seems to be already present. Delete it before creating new data."
    )

mkdir(DATA_FOLDER)
mkdir(CLEAN_DATA_FOLDER)

log.info("Downloading evaluation data...")
response = requests.get(DATA_URL)
with open(DATA_FILE, "wb") as archive:
    archive.write(response.content)

log.info("Uncompressing evaluation data...")
with tarfile.open(DATA_FILE) as tar:
    tar.extractall(DATA_FOLDER)
uncompressed_data_folder = glob(path.join(DATA_FOLDER, "ud-treebanks-*"))[0]

log.info("Filtering files...")
for lang, dataset_name, dataset_folder in get_relevant_language_data_folders(
    uncompressed_data_folder
):
    log.info(f"{lang} - {dataset_folder}")
    # Concatenate the train, dev and test data into a single file
    # (e.g. ``cat de_gsd*.conllu > de-gsd-all.conllu``)
    lang_clean_data_file = path.join(CLEAN_DATA_FOLDER, f"{dataset_name}.conllu")
    log.debug(f"Processing data for {dataset_name}")
    with open(lang_clean_data_file, "w") as outfile:
        for file in glob(path.join(dataset_folder, "*.conllu")):
            with open(file) as infile:
                for line in infile:
                    outfile.write(line)
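
For reference, the dataset-name extraction above turns a treebank file path into a dataset name and a language code. A small illustration with a hypothetical file name, mirroring the regular expression in the script:

import re

# "UD_German-GSD/de_gsd-ud-train.conllu" -> dataset "de_gsd" -> language "de"
match = re.search("^.*/(.*)-ud.*$", "UD_German-GSD/de_gsd-ud-train.conllu")
assert match is not None
dataset_name = match.groups()[0]  # "de_gsd"
lang = dataset_name.split("_")[0]  # "de"
print(dataset_name, lang)
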