
Commit

Add todo comment
ncoop57 committed Aug 27, 2023
1 parent eb3f8c5 commit 2442d34
Showing 3 changed files with 36 additions and 0 deletions.
28 changes: 28 additions & 0 deletions examples/textbooks_A2YN/labeler_filtering.py
@@ -0,0 +1,28 @@
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer

MODEL_NAME = "CarperAI/code_edu_classifier_multi_lang"
TOKENIZER_NAME = "bigcode/starencoder"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Ensure a pad token is set by reusing the EOS token.
tokenizer.pad_token = tokenizer.eos_token
pipe = pipeline(
    "text-classification", model=MODEL_NAME, tokenizer=tokenizer, device="cuda:0"
)
data_dir = "<path to data>"
languages = ["python", "java", "javascript", "go", "c", "cpp"]
tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 1024}


def func(x):
    # Classify a batch of code samples and collect the predicted label and score for each.
    labels = []
    scores = []
    for i in pipe(x["content"], truncation=True, padding="max_length", max_length=1024, batch_size=256):
        labels.append(i["label"])
        scores.append(i["score"])
    return {"label": labels, "score": scores}


for lang in languages:
    ds = load_dataset("parquet", data_dir=f"{data_dir}/{lang}", split="train")
    print(f"Loaded {lang} dataset with {len(ds)} examples")
    ds = ds.map(lambda x: func(x), batched=True, batch_size=256)
    # Dataset.to_parquet writes a single file, so give it a file path rather than a directory.
    ds.to_parquet(f"{data_dir}/{lang}_labeled.parquet")
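
The script above only labels each language split; the filtering pass suggested by the file name would run over the labeled parquet output afterwards. A minimal sketch of that step, assuming the labeled files written above plus an illustrative label name and score threshold (the real label names depend on what CarperAI/code_edu_classifier_multi_lang emits):

from datasets import load_dataset

data_dir = "<path to data>"
languages = ["python", "java", "javascript", "go", "c", "cpp"]
KEEP_LABEL = "high quality"  # hypothetical label name; check the classifier's actual labels
MIN_SCORE = 0.9              # illustrative threshold

for lang in languages:
    ds = load_dataset("parquet", data_files=f"{data_dir}/{lang}_labeled.parquet", split="train")
    # Keep only examples the classifier labeled as the desired class with enough confidence.
    kept = ds.filter(lambda x: x["label"] == KEEP_LABEL and x["score"] >= MIN_SCORE)
    print(f"{lang}: kept {len(kept)} of {len(ds)} examples")
    kept.to_parquet(f"{data_dir}/{lang}_filtered.parquet")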
4 changes: 4 additions & 0 deletions nbs/00_core.ipynb
@@ -496,6 +496,10 @@
" \"score\": [l[\"score\"] for l in predicted],\n",
" }\n",
"\n",
"\n",
" # TODO: first just label the dataset with scores and everything\n",
" # then just split the dataset into the number of subsets and configs so that people can specify which one they want\n",
"\n",
" # Label the dataset\n",
" dataset = dataset.map(\n",
" lambda x: label(x[text_column]),\n",
4 changes: 4 additions & 0 deletions treasure_trove/core.py
@@ -176,6 +176,10 @@ def label(x):
"score": [l["score"] for l in predicted],
}


# TODO: first just label the dataset with scores and everything
# then just split the dataset into the number of subsets and configs so that people can specify which one they want

# Label the dataset
dataset = dataset.map(
lambda x: label(x[text_column]),
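
The TODO added in both files describes a two-step plan: label the whole dataset with scores first, then split it into subsets/configs that users can select. A rough sketch of the splitting step, assuming a labeled dataset with a "label" column and hypothetical subset names derived from the labels:

from datasets import DatasetDict, load_dataset

labeled = load_dataset("parquet", data_files="<path to labeled parquet>", split="train")

# Build one subset per predicted label; the subset names here are derived
# from whatever labels the classifier actually produced.
subsets = DatasetDict(
    {
        label.replace(" ", "_"): labeled.filter(lambda x, l=label: x["label"] == l)
        for label in sorted(set(labeled["label"]))
    }
)

# Each subset could then be published as its own config, for example:
# subsets["high_quality"].push_to_hub("<repo id>", config_name="high_quality")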
