mixeval evaluator #106

Open

wants to merge 11 commits into main
205 changes: 205 additions & 0 deletions mttl/evaluators/mixeval.py
@@ -0,0 +1,205 @@
import json
import os
import shutil
import threading

from mttl.models.base_model import BaseExpertModel

try:
    from mix_eval.api.registry import register_model
    from mix_eval.evaluate import compute_metrics_p, eval, parse_args
    from mix_eval.models.base import ChatModel

    mixeval_available = True

except ImportError:
    mixeval_available = False
    # stand-ins so this module can still be imported when mix_eval is not installed
    register_model = lambda x: x
    ChatModel = object


from copy import deepcopy
from dataclasses import dataclass

import torch
from transformers import AutoTokenizer

from mttl.datamodule.utils import get_tokenizer_with_args
from mttl.evaluators.base import GenerativeEvaluator
from mttl.models.expert_model import MultiExpertModel, MultiExpertModelConfig
from mttl.models.library.expert_library import ExpertLibrary


@dataclass
class MixEvalConfig:
    batch_size: int = 8
    model_name: str = "mix_eval_expert_adapter"
    benchmark: str = "mixeval_hard"
    data_path: str = None
    free_form_parser: str = "model"
    multi_choice_parser: str = "model"
    multichoice_judge: str = "gpt-4o-mini"
    freeform_judge: str = "gpt-4o-mini"
    extract_base_model_response: bool = False
    compute_score_from_judged_file: bool = False
    version: str = "2024-08-11"
    split: str = None
    output_dir: str = None
    verbose: bool = False
Collaborator review comment:
I guess we still need an --api_base_url
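A minimal sketch of what that could look like in MixEvalConfig (the field name, its default, and whether mix_eval's judge clients actually consume it are assumptions, not part of this diff):

    # hypothetical addition (not in this PR): endpoint override for the judge API calls
    api_base_url: str = None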

    api_parallel_num: int = 10


@register_model("mix_eval_expert_adapter")
class MultiExpertAdapter(ChatModel):
    # model context is used to inject model into the class
    model_context = threading.local()

    def chunk_generate(
        self,
        inputs,
        model,
        tok,
        max_tokens: int,
        sliding_window: int = 128 * 1024,
        chunk_size: int = 2500,
        verbose: bool = False,
        chunked: bool = False,
        **kwargs,
    ):
        if chunked:
            raise ValueError("Chunked is not supported.")

        with torch.no_grad():
            input_ids = inputs.input_ids  # (b, n)
            attention_mask = inputs.attention_mask  # (b, n)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_tokens,
                **kwargs,
            )
            generated_ids = [
                output_ids[len(in_ids) :] for in_ids, output_ids in zip(input_ids, outputs)
            ]
            responses = tok.batch_decode(generated_ids, skip_special_tokens=True)
        return responses

    def __init__(self, args):
        super().__init__(args)

        self.model: BaseExpertModel = self.model_context.model
        self.tokenizer = get_tokenizer_with_args(
            model_name=self.model.base_model_name_or_path,
            model_family="gpt",
            padding_side="left",
            truncation_side="left",
            for_generation=True,
        )

        self.SYSTEM_MESSAGE = {
            "role": "system",
            "content": "You are a helpful assistant.",
        }  # set to None if no system message
        self.USER_MESSAGE_TEMPLATE = lambda x: {"role": "user", "content": x}
        self.ASSISTANT_MESSAGE_TEMPLATE = lambda x: {"role": "assistant", "content": x}

        self.model_max_len = self.model.max_position_embeddings
        self.max_input_length_closeend = (
            min(self.model_max_len, self.max_input_length)
            - self.closeended_max_new_tokens
        )
        self.max_input_length_openend = (
            min(self.model_max_len, self.max_input_length)
            - self.openended_max_new_tokens
        )

class MixEvalEvaluator(GenerativeEvaluator):
    def __init__(self, config: MixEvalConfig = None):
        super().__init__(config=config or MixEvalConfig())

        if not mixeval_available:
            raise ValueError(
                "MixEval is not installed. Please install it using `pip install mix-eval`."
            )

        self.download_data()

    def download_data(self):
        import subprocess

        import mix_eval

        repo_url = "https://github.com/Psycoy/MixEval.git"
        data_folder = "mix_eval/data"
        temp_dir = "/tmp/mixeval_repo"
        target_dir = os.path.join(os.path.dirname(mix_eval.__file__), "data")

        self.config.data_path = target_dir

        if os.path.exists(target_dir):
            return

        # Clone the repository
        subprocess.run(["git", "clone", repo_url, temp_dir], check=True)

        # Copy the data folder to the target directory
        shutil.copytree(
            os.path.join(temp_dir, data_folder), target_dir, dirs_exist_ok=True
        )

        # Clean up the temporary directory
        shutil.rmtree(temp_dir)

    def evaluate(
        self,
        model,
        split=None,
        output_path=None,
        verbose=False,
        recompute=False,
        **kwargs,
    ):
        from mix_eval.compute_metrics import AVAILABLE_MODELS

        # inject model into MultiExpertAdapter
        MultiExpertAdapter.model_context.model = model

        # propagate runtime options into the config
        self.config.verbose = verbose

        if split is not None:
            self.config.split = split

        if output_path is not None:
            self.config.output_dir = output_path
        else:
            raise ValueError("Output path is required for evaluation.")

        if recompute:
            shutil.rmtree(self.config.output_dir, ignore_errors=True)

        eval(self.config)

        # for some reason, available models is filled by hand rather than by the decorator, /shrug
        AVAILABLE_MODELS[self.config.model_name] = "MultiExpertAdapter"
        compute_metrics_p(self.config)

        with open(os.path.join(self.config.output_dir, "score.json"), "r") as f:
            score = json.load(f)
        return score[self.config.model_name]["overall score (final score)"]


if __name__ == "__main__":
    from mttl.models.containers.selectors import ArrowSelector, ArrowSelectorConfig
    from mttl.models.library.library_transforms import ArrowConfig, ArrowTransform

    model = MultiExpertModel.from_pretrained_library(
        "sordonia/Phi-3.5-mini-instruct-28Aug",
        device_map="cuda:0",
        attn_implementation="flash_attention_2",
        selector_config=ArrowSelectorConfig(top_k=2),
    )
    MixEvalEvaluator().evaluate(
        model, output_path="/tmp/mixeval_phi_3.5_arrow/", verbose=True, recompute=True
    )
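For callers that want the score programmatically: evaluate() returns the "overall score (final score)" entry from the judged score.json. A minimal usage sketch, assuming `model` is an already-loaded expert model as in the block above, that the output path is a placeholder, and that the judge API credentials mix_eval expects are configured:

evaluator = MixEvalEvaluator(MixEvalConfig(benchmark="mixeval_hard"))
overall = evaluator.evaluate(model, output_path="/tmp/mixeval_scores/")
print(f"MixEval overall score: {overall}")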
17 changes: 17 additions & 0 deletions mttl/models/base_model.py
@@ -5,6 +5,7 @@

import torch
from huggingface_hub import hf_hub_download
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput

from mttl.logging import logger
@@ -64,6 +65,10 @@ def __init__(
            if model_object is None
            else model_object
        )
        if not isinstance(self.model, PreTrainedModel):
            raise ValueError(
                f"Model is not a subclass of PreTrainedModel. Got {type(self.model)}."
            )

        if model_object:
            logger.warning(
@@ -73,6 +78,18 @@
        self.config = config
        self.loading_kwargs = loading_kwargs

    @property
    def base_model_name_or_path(self) -> str:
        return self.config.base_model

    @property
    def max_position_embeddings(self) -> int:
        return self.base_model.config.max_position_embeddings

    @property
    def base_model(self) -> PreTrainedModel:
        return self.model

    def _delete_non_trainable_params(
        self, state_dict: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:
15 changes: 13 additions & 2 deletions mttl/models/containers/selectors/per_token_selector.py
@@ -106,7 +106,11 @@ def _log_angle(self, angle):
        else:
            mean_angle = angle.mean()

        task = self.routing_infos.task_names[0]
        task_names = self.routing_infos.task_names
        if task_names is None:
            return

        task = task_names[0]

        to_store = {"angle": mean_angle.item()}
        self.metric_logger.update(prefix=f"task_{task}", value_dict=to_store)
@@ -126,7 +130,11 @@ def _log_entropy(self, logits):
        else:
            mean_entropy = entropy.mean()

        task = self.routing_infos.task_names[0]
        task_names = self.routing_infos.task_names
        if task_names is None:
            return

        task = task_names[0]

        to_store = {"ent_routing": mean_entropy.item()}
        self.metric_logger.update(prefix=f"task_{task}", value_dict=to_store)
@@ -139,7 +147,10 @@ def _log_entropy(self, logits):
    def _maybe_log_in_dist(self, logits):
        probs = F.softmax(logits, dim=-1)
        bs, seq_len, _ = probs.size()

        task_names = self.routing_infos.task_names
        if task_names is None:
            return

        if all([t in self.task_to_expert_name for t in task_names]):
            expert_names = [self.task_to_expert_name[t] for t in task_names]
13 changes: 6 additions & 7 deletions mttl/models/packed_attention_monkey_patch.py
@@ -48,11 +48,11 @@ def flash_attn_varlen_func_wrapper(
    causal,
    **flash_kwargs,
):
    if query_states.shape != key_states.shape:
        raise ValueError("q and k must have the same shape")

    context = InfoContainer.get()
    if context is not None and context.routing_infos.packed_seq_lens is not None:
        if query_states.shape != key_states.shape:
            raise ValueError("q and k must have the same shape")

        warn_once(
            "\n\n\n\nUsing the Flash Attention 2 Sequence Packing Wrapper\n\n\n\n"
        )
@@ -89,15 +89,14 @@ def flash_attn_func_wrapper(
    deterministic=False,
    return_attn_probs=False,
):

    if q.shape != k.shape:
        raise ValueError("q and k must have the same shape")

    # assert there are no padding tokens if we get here
    context = InfoContainer.get()
    assert (context.routing_infos.attention_mask == 1).all()  # no padding tokens

    if context.routing_infos.packed_seq_lens is not None:
        if q.shape != k.shape:
            raise ValueError("q and k must have the same shape")

        cu_seqlens_q = cu_seqlens_k = context.routing_infos.packed_seq_lens
        max_seqlen_q = max_seqlen_k = context.routing_infos.seq_lens.max().item()
        q, k, v = q.flatten(0, 1), k.flatten(0, 1), v.flatten(0, 1)