
Commit

Add todo comment
ncoop57 committed Aug 27, 2023
1 parent eb3f8c5 commit 2442d34
Showing 3 changed files with 36 additions and 0 deletions.
28 changes: 28 additions & 0 deletions examples/textbooks_A2YN/labeler_filtering.py
@@ -0,0 +1,28 @@
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer

MODEL_NAME = "CarperAI/code_edu_classifier_multi_lang"
TOKENIZER_NAME = "bigcode/starencoder"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Ensure a pad token is set by reusing the EOS token.
tokenizer.pad_token = tokenizer.eos_token
pipe = pipeline(
    "text-classification", model=MODEL_NAME, tokenizer=tokenizer, device="cuda:0"
)
data_dir = "<path to data>"
languages = ["python", "java", "javascript", "go", "c", "cpp"]
tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 1024}


def func(x):
    # Classify a batch of code samples and collect the predicted label and score for each.
    labels = []
    scores = []
    for i in pipe(x["content"], truncation=True, padding="max_length", max_length=1024, batch_size=256):
        labels.append(i["label"])
        scores.append(i["score"])
    return {"label": labels, "score": scores}


for lang in languages:
    ds = load_dataset("parquet", data_dir=f"{data_dir}/{lang}", split="train")
    print(f"Loaded {lang} dataset with {len(ds)} examples")
    ds = ds.map(lambda x: func(x), batched=True, batch_size=256)
    # Dataset.to_parquet writes a single file, so give it a file path rather than a directory.
    ds.to_parquet(f"{data_dir}/{lang}_labeled.parquet")
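
The script above only labels each language split; the filtering pass suggested by the file name would run over the labeled parquet output afterwards. A minimal sketch of that step, assuming the labeled files written above plus an illustrative label name and score threshold (the real label names depend on what CarperAI/code_edu_classifier_multi_lang emits):

from datasets import load_dataset

data_dir = "<path to data>"
languages = ["python", "java", "javascript", "go", "c", "cpp"]
KEEP_LABEL = "high quality"  # hypothetical label name; check the classifier's actual labels
MIN_SCORE = 0.9              # illustrative threshold

for lang in languages:
    ds = load_dataset("parquet", data_files=f"{data_dir}/{lang}_labeled.parquet", split="train")
    # Keep only examples the classifier labeled as the desired class with enough confidence.
    kept = ds.filter(lambda x: x["label"] == KEEP_LABEL and x["score"] >= MIN_SCORE)
    print(f"{lang}: kept {len(kept)} of {len(ds)} examples")
    kept.to_parquet(f"{data_dir}/{lang}_filtered.parquet")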
4 changes: 4 additions & 0 deletions nbs/00_core.ipynb
@@ -496,6 +496,10 @@
" \"score\": [l[\"score\"] for l in predicted],\n",
" }\n",
"\n",
"\n",
" # TODO: first just label the dataset with scores and everything\n",
" # then just split the dataset into the number of subsets and configs so that people can specify which one they want\n",
"\n",
" # Label the dataset\n",
" dataset = dataset.map(\n",
" lambda x: label(x[text_column]),\n",
4 changes: 4 additions & 0 deletions treasure_trove/core.py
@@ -176,6 +176,10 @@ def label(x):
"score": [l["score"] for l in predicted],
}


# TODO: first just label the dataset with scores and everything
# then just split the dataset into the number of subsets and configs so that people can specify which one they want

# Label the dataset
dataset = dataset.map(
lambda x: label(x[text_column]),
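
The TODO added in both files describes a two-step plan: label the whole dataset with scores first, then split it into subsets/configs that users can select. A rough sketch of the splitting step, assuming a labeled dataset with a "label" column and hypothetical subset names derived from the labels:

from datasets import DatasetDict, load_dataset

labeled = load_dataset("parquet", data_files="<path to labeled parquet>", split="train")

# Build one subset per predicted label; the subset names here are derived
# from whatever labels the classifier actually produced.
subsets = DatasetDict(
    {
        label.replace(" ", "_"): labeled.filter(lambda x, l=label: x["label"] == l)
        for label in sorted(set(labeled["label"]))
    }
)

# Each subset could then be published as its own config, for example:
# subsets["high_quality"].push_to_hub("<repo id>", config_name="high_quality")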
