diff --git a/examples/textbooks_A2YN/labeler_filtering.py b/examples/textbooks_A2YN/labeler_filtering.py
new file mode 100644
index 0000000..89dd14a
--- /dev/null
+++ b/examples/textbooks_A2YN/labeler_filtering.py
@@ -0,0 +1,29 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer, pipeline
+
+MODEL_NAME = "CarperAI/code_edu_classifier_multi_lang"
+TOKENIZER_NAME = "bigcode/starencoder"
+
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
+tokenizer.pad_token = tokenizer.eos_token
+pipe = pipeline(
+    "text-classification", model=MODEL_NAME, tokenizer=tokenizer, device="cuda:0"
+)
+data_dir = ""
+languages = ["python", "java", "javascript", "go", "c", "cpp"]
+
+def label_batch(x):
+    """Classify a batch of code samples, collecting labels and scores."""
+    labels = []
+    scores = []
+    for pred in pipe(x["content"], truncation=True, padding="max_length", max_length=1024, batch_size=256):
+        labels.append(pred["label"])
+        scores.append(pred["score"])
+    return {"label": labels, "score": scores}
+
+for lang in languages:
+    ds = load_dataset("parquet", data_dir=f"{data_dir}/{lang}", split="train")
+    print(f"Loaded {lang} dataset with {len(ds)} examples")
+    ds = ds.map(label_batch, batched=True, batch_size=256)
+    # to_parquet expects a file path, not a directory
+    ds.to_parquet(f"{data_dir}/{lang}_labeled.parquet")
diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
index 4222d74..ae34b0b 100644
--- a/nbs/00_core.ipynb
+++ b/nbs/00_core.ipynb
@@ -496,6 +496,10 @@
     "        \"score\": [l[\"score\"] for l in predicted],\n",
     "    }\n",
     "\n",
+    "\n",
+    "    # TODO: first label the dataset with a score for every example,\n",
+    "    # then split it into subsets/configs so that users can specify which one they want\n",
+    "\n",
     "    # Label the dataset\n",
     "    dataset = dataset.map(\n",
     "        lambda x: label(x[text_column]),\n",
diff --git a/treasure_trove/core.py b/treasure_trove/core.py
index ac6806b..1f51701 100644
--- a/treasure_trove/core.py
+++ b/treasure_trove/core.py
@@ -176,6 +176,10 @@ def label(x):
         "score": [l["score"] for l in predicted],
     }
 
+
+    # TODO: first label the dataset with a score for every example,
+    # then split it into subsets/configs so that users can specify which one they want
+
     # Label the dataset
     dataset = dataset.map(
         lambda x: label(x[text_column]),
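
The TODO repeated in both hunks describes the intended follow-up: label everything with a score first, then split the labeled data into subsets/configs that users can choose between. A minimal sketch of that second step, assuming hypothetical score thresholds, label semantics, and output names (none of which are pinned down in this diff):

from datasets import load_dataset

# Load one language's labeled output from the script above.
labeled = load_dataset("parquet", data_files="python_labeled.parquet", split="train")

# Hypothetical cutoffs; a real split would likely also consult x["label"],
# since "score" is the confidence of the predicted label.
subsets = {
    "high_quality": labeled.filter(lambda x: x["score"] >= 0.9),
    "medium_quality": labeled.filter(lambda x: 0.5 <= x["score"] < 0.9),
    "low_quality": labeled.filter(lambda x: x["score"] < 0.5),
}

# Write each subset out; these could later be exposed as named dataset configs.
for name, subset in subsets.items():
    subset.to_parquet(f"python_{name}.parquet")

Keeping the labeling and splitting passes separate means the expensive classifier run happens once, and the thresholds can be re-tuned without re-labeling.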