
Commit

integrating to core.py
TheodoreGalanos committed Jul 8, 2023
1 parent 111ea6e commit 5d8dd1f
Showing 2 changed files with 86 additions and 16 deletions.
35 changes: 21 additions & 14 deletions examples/textbooks_A2YN/gpt_labeling_sk.py
@@ -3,7 +3,7 @@
import ast
from datasets import concatenate_datasets, load_dataset
from squeakily.helpers import LLMLabeler
from treasure_trove.core import label_dataset
from treasure_trove.core import label_dataset, label_dataset_sk, sk_function

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -19,22 +19,29 @@
)

skills_dir = os.getcwd() + "/skills"
annotation_skills = kernel.import_semantic_skill_from_directory(skills_dir, "annotation")
annotation_function = annotation_skills["gpt_labeling"]
skill_category = "annotation"
func_name = "gpt_labeling"
skfunction = sk_function(skills_dir, skill_category, func_name)
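# Assumption (the skill files are not part of this commit): import_semantic_skill_from_directory
# expects a Semantic Kernel skill layout on disk, roughly skills/annotation/gpt_labeling/
# containing skprompt.txt (the prompt template, presumably referencing a {{$code}} variable)
# and config.json (completion settings).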

annotation_context = kernel.create_new_context()
skfunction_context = kernel.create_new_context()

# Single example without a function; should be integrated with label_dataset and classify
labels = ["high quality", "medium quality", "low quality"]
languages = ["python", "go", "java", "javascript", "c", "c++"]
subsets = []
for lang in languages:
annotation_context['code']
ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"]
ds = label_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"]
sample = 50 / len(ds)
ds.shuffle(seed=115).select(range(int(len(ds) * sample)))
evaluation = annotation_function(context=annotation_context)
evaluation = re.sub(r'\s+', ' ', evaluation.result).replace('\n', '')
eval_ast = ast.literal_eval(evaluation)
rationale = eval_ast['rationale']
label = eval_ast['evaluation']
#....
subset = label_dataset_sk(ds, "content", labels, skfunction, sample=sample, num_workers=8)
new_column = [lang] * len(subset)
subset = subset.add_column("language", new_column)
subsets.append(subset)

labeled_ds = concatenate_datasets(subsets)

# upload to huggingface
labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True)

# print number of each class
print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}")
print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}")
print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}")
67 changes: 65 additions & 2 deletions treasure_trove/core.py
@@ -6,8 +6,10 @@
# %% ../nbs/00_core.ipynb 2
import evaluate
import time

import re
import numpy as np
import ast
import semantic_kernel as sk

from transformers import (
AutoModelForSequenceClassification,
@@ -16,6 +18,22 @@
Trainer,
)

# Loads a semantic function from a local skills directory.
def sk_function(skills_dir: str, skill_category: str, func_name: str):
kernel = sk.Kernel()
skills = kernel.import_semantic_skill_from_directory(skills_dir, skill_category)
function = skills[func_name]
return function

# Runs the annotation function on a code snippet and parses out its rationale and label.
def sk_code_eval(code: str, skfunction):
kernel = sk.Kernel()
function_context = kernel.create_new_context()
function_context['code'] = code
code_eval = skfunction(context=function_context)
code_eval = re.sub(r'\s+', ' ', code_eval.result).replace('\n', '')
code_eval = ast.literal_eval(code_eval)
rationale, label = code_eval['rationale'], code_eval['evaluation']
return rationale, label
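# Assumed shape of the skill's raw output (the prompt itself is not part of this diff):
# a Python dict literal such as
#   "{'rationale': 'Concise, well-documented code.', 'evaluation': 'high quality'}"
# ast.literal_eval parses it into a dict, and 'evaluation' must be one of the labels
# passed to classify_sk below so that labels.index(label) resolves to a class id.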

# %% ../nbs/00_core.ipynb 4
def classify(x, labels, llm_labeler, max_failures=5, default_label=0):
failures = 0
@@ -31,10 +49,55 @@ def classify(x, labels, llm_labeler, max_failures=5, default_label=0):
pass
if failures == max_failures:
return default_label

def classify_sk(x, labels, skfunction, max_failures=5, default_label=0, default_rationale=None):
failures = 0
while failures < max_failures:
try:
rationale, label = sk_code_eval(x, skfunction)
label = labels.index(label)
time.sleep(1)
return {"label": label, "rationale": rationale}
except Exception as e:
failures += 1
print(e)
time.sleep(1)
pass
if failures == max_failures:
return {"label": default_label, "rationale": default_rationale}

def label_dataset_sk(
dataset, text_column, labels, skfunction, sample=0.1, num_workers=4, max_chars=4096
):
"""
Filters a dataset using a labeler model.
Args:
dataset (datasets.Dataset): Dataset to filter
text_column (str): Name of the column containing the text to classify
labels (List[str]): List of labels
skfunction (Any): The semantic kernel annotation function
sample (float): The fraction of the dataset to label and use for filtering
batch_size (int): Batch size for labeling
num_workers (int): Number of workers for labeling
max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors)
"""

# Get a subset of the dataset
subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample)))

# Label the subset
subset = subset.map(
lambda x: classify_sk(x[text_column][:max_chars], labels, skfunction),
batched=False,
num_proc=num_workers,
)

return subset
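# Minimal usage sketch (hypothetical names; assumes a kernel configured with an OpenAI
# connector and a skills/annotation/gpt_labeling skill, as in the example script), given a
# datasets.Dataset `ds` with a "content" column:
#
#   labels = ["high quality", "medium quality", "low quality"]
#   fn = sk_function("./skills", "annotation", "gpt_labeling")
#   labeled = label_dataset_sk(ds, "content", labels, fn, sample=0.1, num_workers=4)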

# %% ../nbs/00_core.ipynb 5
def label_dataset(
dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096
dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4096
):
"""
Filters a dataset using a labeler model.

