Eval/bugfix/optional eval inputs #37425
Open · wants to merge 9 commits into base: main
@@ -487,21 +487,22 @@ def _evaluate( # pylint: disable=too-many-locals
)

trace_destination = pf_client._config.get_trace_destination()

target_run = None

target_generated_columns = set()

# Create default configuration for evaluators that directly maps
# input data names to keyword inputs of the same name in the evaluators.
if not evaluator_config:
evaluator_config = {}
if "default" not in evaluator_config:
evaluator_config["default"] = {}

# If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
if data is not None and target is not None:
input_data_df, target_generated_columns, target_run = _apply_target_to_data(
target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
)

# Make sure the default is always in the configuration.
if not evaluator_config:
evaluator_config = {}
if "default" not in evaluator_config:
evaluator_config["default"] = {}

for evaluator_name, mapping in evaluator_config.items():
mapped_to_values = set(mapping.values())
for col in target_generated_columns:
@@ -518,6 +519,16 @@ def _evaluate( # pylint: disable=too-many-locals
# everything we need for evaluators.
_validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)

# Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
# via target mapping.
# If both the data and the output dictionary of the target function
# have the same column, then the target function value is used.
if input_data_df is not None:
for col in input_data_df.columns:
# Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
# Also ignore columns that are already in config, since they've been covered by target mapping.
if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
evaluator_config["default"][col] = f"${{data.{col}}}"
# Batch Run
evaluators_info = {}
use_pf_client = kwargs.get("_use_pf_client", True)
@@ -586,7 +597,6 @@ def _evaluate( # pylint: disable=too-many-locals
result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
metrics = _aggregate_metrics(evaluators_result_df, evaluators)
metrics.update(evaluators_metric)

studio_url = _log_metrics_and_instance_results(
metrics,
result_df,
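For illustration only (not part of the diff): with the change above, when the caller passes no evaluator_config and no target, every input data column is mapped 1-1 into the "default" configuration using the "${data.<column>}" reference built in the new loop; columns prefixed with "__outputs." (produced by a target) and columns already mapped are skipped. For the questions_answers_basic.jsonl file added below, the resulting default mapping would look roughly like this sketch (the dict shape is an approximation, not the library's exact internal state):

# Sketch of the implicit default mapping for questions_answers_basic.jsonl
# when no explicit evaluator_config and no target are supplied.
evaluator_config = {
    "default": {
        "question": "${data.question}",  # added by the new 1-1 data mapping
        "answer": "${data.answer}",      # added by the new 1-1 data mapping
    }
}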
@@ -0,0 +1,3 @@
{"question":"How long is flight from Earth to LV-426?","answer":"There is nothing good there."}
{"question":"Why there is no central heating on the street?","answer":"There is no central heating on the streets today, but it will be, I promise."}
{"question":"Why these questions are so strange?","answer":"The life is strange..."}
113 changes: 113 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -67,6 +67,10 @@ def questions_wrong_file():
def questions_answers_file():
return _get_file("questions_answers.jsonl")

@pytest.fixture
def questions_answers_basic_file():
return _get_file("questions_answers_basic.jsonl")


def _target_fn(query):
"""An example target function."""
@@ -89,6 +93,15 @@ def _target_fn2(query):
response["query"] = f"The query is as follows: {query}"
return response

def _new_answer_target():
return {"answer": "new answer"}

def _question_override_target(question):
return {"question": "new question"}

def _question_answer_override_target(question, answer):
return {"question": "new question", "answer": "new answer"}


@pytest.mark.usefixtures("mock_model_config")
@pytest.mark.unittest
@@ -507,3 +520,103 @@ def test_general_aggregation(self):
assert aggregation["thing.metric"] == 3
assert aggregation["other_thing.other_meteric"] == -3
assert aggregation["final_thing.final_metric"] == 0.4

@pytest.mark.parametrize("use_pf_client", [True, False])
def test_optional_inputs_with_data(self, questions_file, questions_answers_basic_file, use_pf_client):
from test_evaluators.test_inputs_evaluators import (
NonOptionalEval,
HalfOptionalEval,
OptionalEval,
NoInputEval
)

# All variants work when both inputs are present in the data
results = evaluate(
data=questions_answers_basic_file,
evaluators={
"non": NonOptionalEval(),
"half": HalfOptionalEval(),
"opt": OptionalEval(),
"no": NoInputEval()
},
_use_pf_client=use_pf_client
) # type: ignore

first_row = results["rows"][0]
assert first_row["outputs.non.non_score"] == 0
assert first_row["outputs.half.half_score"] == 1
assert first_row["outputs.opt.opt_score"] == 3
# CodeClient doesn't support evaluators that take no inputs.
if use_pf_client:
assert first_row["outputs.no.no_score"] == 0

# The variant without default parameter values fails when only one input column is available
with pytest.raises(ValueError) as exc_info:
evaluate(
data=questions_file,
evaluators={
"non": NonOptionalEval(),
},
_use_pf_client=use_pf_client
) # type: ignore
assert str(exc_info.value) == "Missing required inputs for evaluator non : ['answer']."

# Variants with a default answer work when only the question is provided
only_question_results = evaluate(
data=questions_file,
evaluators={
"half": HalfOptionalEval(),
"opt": OptionalEval(),
"no": NoInputEval()
},
_use_pf_client=use_pf_client
) # type: ignore

first_row_2 = only_question_results["rows"][0]
assert first_row_2["outputs.half.half_score"] == 0
assert first_row_2["outputs.opt.opt_score"] == 1
if use_pf_client:
assert first_row_2["outputs.no.no_score"] == 0

@pytest.mark.parametrize("use_pf_client", [True, False])
def test_optional_inputs_with_target(self, questions_file, questions_answers_basic_file, use_pf_client):
from test_evaluators.test_inputs_evaluators import EchoEval

# Check that target overrides default inputs
target_answer_results = evaluate(
data=questions_file,
target=_new_answer_target,
evaluators={
"echo": EchoEval()
},
_use_pf_client=use_pf_client
) # type: ignore

assert target_answer_results['rows'][0]['outputs.echo.echo_question'] == 'How long is flight from Earth to LV-426?'
assert target_answer_results['rows'][0]['outputs.echo.echo_answer'] == 'new answer'

# Check that the target replaces inputs from the data (i.e. if both the data and the target
# produce the same column, the target's output is sent to the evaluator).
question_override_results = evaluate(
data=questions_answers_basic_file,
target=_question_override_target,
evaluators={
"echo": EchoEval()
},
_use_pf_client=use_pf_client
) # type: ignore

assert question_override_results['rows'][0]['outputs.echo.echo_question'] == "new question"
assert question_override_results['rows'][0]['outputs.echo.echo_answer'] == 'There is nothing good there.'

# Check that target can replace default and data inputs at the same time.
double_override_results = evaluate(
data=questions_answers_basic_file,
target=_question_answer_override_target,
evaluators={
"echo": EchoEval()
},
_use_pf_client=use_pf_client
) # type: ignore
assert double_override_results['rows'][0]['outputs.echo.echo_question'] == "new question"
assert double_override_results['rows'][0]['outputs.echo.echo_answer'] == "new answer"
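
For illustration only (mirrors the target-override tests above): a rough shape of one intermediate row after _question_override_target runs over questions_answers_basic.jsonl. Target outputs are stored under "__outputs.<column>" (see the comment in _evaluate) and take precedence over a plain data column of the same name; the prefix and precedence rule come from the diff, while the exact dataframe layout is an assumption.

# Hypothetical first row of the intermediate dataframe (illustration only).
row = {
    "question": "How long is flight from Earth to LV-426?",  # from the data file
    "answer": "There is nothing good there.",                 # from the data file
    "__outputs.question": "new question",                      # produced by the target
}
# EchoEval therefore receives question="new question" (target wins) and
# answer="There is nothing good there." (falls back to the data column),
# matching the assertions in test_optional_inputs_with_target.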
@@ -0,0 +1,44 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# A collection of very simple evaluators designed to test column mappings
# (aka proper data file -> __call__ input mapping).

class NonOptionalEval():
def __init__(self):
pass

def __call__(self, question, answer):
return {"non_score": 0}

class HalfOptionalEval():
def __init__(self):
pass

def __call__(self, question, *, answer = "default"):
return {"half_score": 0 if answer == "default" else 1}


class OptionalEval():
def __init__(self):
pass

def __call__(self, *, question = "default", answer = "default"):
return {"opt_score": (0 if question == "default" else 1) + (0 if answer == "default" else 2)}

class NoInputEval():
def __init__(self):
pass

def __call__(self):
return {"no_score": 0}

class EchoEval():
def __init__(self):
pass

def __call__(self, *, question = "default", answer = "default"):
return {"echo_question": question, "echo_answer": answer}


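For illustration only (mirrors the unit tests above): with these evaluators, the implicit default mapping feeds each data column into the matching keyword argument of __call__, and each evaluator's returned keys surface as "outputs.<evaluator>.<key>" columns in the result rows. A minimal sketch, assuming the public evaluate entry point of azure-ai-evaluation and EchoEval from the file above:

from azure.ai.evaluation import evaluate

# EchoEval simply returns its inputs, so the echoed values show which
# columns were mapped in from the data file.
results = evaluate(
    data="questions_answers_basic.jsonl",
    evaluators={"echo": EchoEval()},
)
first_row = results["rows"][0]
# With no target, both columns come straight from the data file, e.g.:
#   first_row["outputs.echo.echo_question"] -> "How long is flight from Earth to LV-426?"
#   first_row["outputs.echo.echo_answer"]   -> "There is nothing good there."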