From 2725d9283bcc320e411b9d0b710627707ac3a4a6 Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Tue, 17 Sep 2024 11:33:13 -0400
Subject: [PATCH 1/6] default evaluate column mapping

---
 .../azure/ai/evaluation/evaluate/_evaluate.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
index 653d352d6084..4ee04bd59b69 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
@@ -488,21 +488,23 @@ def _evaluate( # pylint: disable=too-many-locals
     )
     trace_destination = pf_client._config.get_trace_destination()
-    target_run = None
-    target_generated_columns = set()
+
+    # Create default configuration for evaluators that directly maps
+    # input data names to keyword inputs of the same name in the evaluators.
+    if not evaluator_config:
+        evaluator_config = {}
+    if "default" not in evaluator_config:
+        evaluator_config["default"] = {}
+    for col in input_data_df.columns:
+        evaluator_config["default"][col] = f"${{data.{col}}}"
+
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        # Make sure, the default is always in the configuration.
-        if not evaluator_config:
-            evaluator_config = {}
-        if "default" not in evaluator_config:
-            evaluator_config["default"] = {}
-
         for evaluator_name, mapping in evaluator_config.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:

From 46db6d0f930742f2a9653779930484d7407ce9f1 Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Tue, 17 Sep 2024 13:09:00 -0400
Subject: [PATCH 2/6] auto column mapping in evaluate

---
 .../azure/ai/evaluation/evaluate/_evaluate.py |  1 -
 .../data/questions_answers_basic.jsonl        |  3 ++
 .../tests/unittests/test_evaluate.py          | 50 +++++++++++++++++++
 .../test_evaluators/test_inputs_evaluators.py | 29 +++++++++++
 4 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/data/questions_answers_basic.jsonl
 create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
index 4ee04bd59b69..21e1c3f838ae 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
@@ -589,7 +589,6 @@ def _evaluate( # pylint: disable=too-many-locals
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/questions_answers_basic.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/questions_answers_basic.jsonl
new file mode 100644
index 000000000000..c12881badec2
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/questions_answers_basic.jsonl
@@ -0,0 +1,3 @@
+{"question":"How long is flight from Earth to LV-426?","answer":"There is nothing good there."}
+{"question":"Why there is no central heating on the street?","answer":"There is no central heating on the streets today, but it will be, I promise."}
+{"question":"Why these questions are so strange?","answer":"The life is strange..."}
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 9562bf03d995..c5ea5e74043f 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -8,6 +8,7 @@
 import pytest
 from pandas.testing import assert_frame_equal
 from promptflow.client import PFClient
+from sympy import im
 
 from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME
 from azure.ai.evaluation.evaluate import evaluate
@@ -67,6 +68,10 @@ def questions_wrong_file():
 def questions_answers_file():
     return _get_file("questions_answers.jsonl")
 
+@pytest.fixture
+def questions_answers_basic_file():
+    return _get_file("questions_answers_basic.jsonl")
+
 
 def _target_fn(question):
     """An example target function."""
@@ -507,3 +512,48 @@ def test_general_aggregation(self):
         assert aggregation["thing.metric"] == 3
         assert aggregation["other_thing.other_meteric"] == -3
         assert aggregation["final_thing.final_metric"] == 0.4
+
+    @pytest.mark.parametrize("use_pf_client", [True])
+    def test_optional_inputs(self, questions_file, questions_answers_basic_file, use_pf_client,):
+        from test_evaluators.test_inputs_evaluators import NonOptionalEval, HalfOptionalEval, OptionalEval
+
+        # All variants work with both keyworded inputs
+        results = evaluate(
+            data=questions_answers_basic_file,
+            evaluators={
+                "non": NonOptionalEval(),
+                "half": HalfOptionalEval(),
+                "opt": OptionalEval()
+            },
+            _use_pf_client=use_pf_client
+        )
+
+        first_row = results["rows"][0]
+        assert first_row["outputs.non.non_score"] == 0
+        assert first_row["outputs.half.half_score"] == 1
+        assert first_row["outputs.opt.opt_score"] == 3
+
+        # Variant with no default inputs fails on single input
+        with pytest.raises(ValueError) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={
+                    "non": NonOptionalEval(),
+                },
+                _use_pf_client=use_pf_client
+            )
+        assert exc_info._excinfo[1].__str__() == "Missing required inputs for evaluator non : ['answer']."
+
+        # Variants with default answer work when only question is inputted
+        only_question_results = evaluate(
+            data=questions_file,
+            evaluators={
+                "half": HalfOptionalEval(),
+                "opt": OptionalEval()
+            },
+            _use_pf_client=use_pf_client
+        )
+
+        first_row_2 = only_question_results["rows"][0]
+        assert first_row_2["outputs.half.half_score"] == 0
+        assert first_row_2["outputs.opt.opt_score"] == 1
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
new file mode 100644
index 000000000000..f31609644503
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
@@ -0,0 +1,29 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# A collection of very simple evaluators designed to test column mappings.
+# (aka proper data file -> __call__ input mapping)
+
+class NonOptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, question, answer):
+        return {"non_score": 0}
+
+
+class HalfOptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, question, answer = "default"):
+        return {"half_score": 0 if answer == "default" else 1}
+
+
+class OptionalEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, question = "default", answer = "default"):
+        return {"opt_score": (0 if question == "default" else 1) + (0 if answer == "default" else 2)}

From 221b5dfae745c7d0a756ebf044757e8d4ab1d754 Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Tue, 17 Sep 2024 13:20:10 -0400
Subject: [PATCH 3/6] add * to call signatures

---
 .../tests/unittests/test_evaluators/test_inputs_evaluators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
index f31609644503..01839f112e6e 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
@@ -17,7 +17,7 @@ class HalfOptionalEval():
     def __init__(self):
         pass
 
-    def __call__(self, question, answer = "default"):
+    def __call__(self, question, *, answer = "default"):
         return {"half_score": 0 if answer == "default" else 1}
 
 
@@ -25,5 +25,5 @@ class OptionalEval():
     def __init__(self):
         pass
 
-    def __call__(self, question = "default", answer = "default"):
+    def __call__(self, *, question = "default", answer = "default"):
         return {"opt_score": (0 if question == "default" else 1) + (0 if answer == "default" else 2)}

From a01e0fe13396f53b2e862f6b38db060dc51e30f8 Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Thu, 19 Sep 2024 12:43:30 -0400
Subject: [PATCH 4/6] account for target

---
 .../azure/ai/evaluation/evaluate/_evaluate.py | 13 ++-
 .../tests/unittests/test_evaluate.py          | 86 ++++++++++++++++---
 .../test_evaluators/test_inputs_evaluators.py | 17 +++-
 3 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
index 21e1c3f838ae..4c33de8b57c0 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/evaluate/_evaluate.py
@@ -497,9 +497,8 @@ def _evaluate( # pylint: disable=too-many-locals
         evaluator_config = {}
     if "default" not in evaluator_config:
         evaluator_config["default"] = {}
-    for col in input_data_df.columns:
-        evaluator_config["default"][col] = f"${{data.{col}}}"
 
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
@@ -521,6 +520,16 @@ def _evaluate( # pylint: disable=too-many-locals
     # everything we need for evaluators.
     _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
 
+    # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
+    # via target mapping.
+    # If both the data and the output dictionary of the target function
+    # have the same column, then the target function value is used.
+    if input_data_df is not None:
+        for col in input_data_df.columns:
+            # Ignore columns added by target mapping. These are formatted as "__outputs."
+            # Also ignore columns that are already in config, since they've been covered by target mapping.
+            if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+                evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index c5ea5e74043f..11f8fd41d704 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -94,6 +94,15 @@ def _target_fn2(question):
     response["question"] = f"The question is as follows: {question}"
     return response
 
+def _new_answer_target():
+    return {"answer": "new answer"}
+
+def _question_override_target(question):
+    return {"question": "new question"}
+
+def _question_answer_override_target(question, answer):
+    return {"question": "new question", "answer": "new answer"}
+
 
 @pytest.mark.usefixtures("mock_model_config")
 @pytest.mark.unittest
@@ -513,25 +522,34 @@ def test_general_aggregation(self):
         assert aggregation["other_thing.other_meteric"] == -3
         assert aggregation["final_thing.final_metric"] == 0.4
 
-    @pytest.mark.parametrize("use_pf_client", [True])
-    def test_optional_inputs(self, questions_file, questions_answers_basic_file, use_pf_client,):
-        from test_evaluators.test_inputs_evaluators import NonOptionalEval, HalfOptionalEval, OptionalEval
-
+    @pytest.mark.parametrize("use_pf_client", [True, False])
+    def test_optional_inputs_with_data(self, questions_file, questions_answers_basic_file, use_pf_client):
+        from test_evaluators.test_inputs_evaluators import (
+            NonOptionalEval,
+            HalfOptionalEval,
+            OptionalEval,
+            NoInputEval
+        )
+
         # All variants work with both keyworded inputs
         results = evaluate(
             data=questions_answers_basic_file,
             evaluators={
                 "non": NonOptionalEval(),
                 "half": HalfOptionalEval(),
-                "opt": OptionalEval()
+                "opt": OptionalEval(),
+                "no": NoInputEval()
             },
             _use_pf_client=use_pf_client
-        )
+        ) # type: ignore
 
         first_row = results["rows"][0]
         assert first_row["outputs.non.non_score"] == 0
         assert first_row["outputs.half.half_score"] == 1
         assert first_row["outputs.opt.opt_score"] == 3
+        # CodeClient doesn't like no-input evals.
+        if use_pf_client:
+            assert first_row["outputs.no.no_score"] == 0
 
         # Variant with no default inputs fails on single input
         with pytest.raises(ValueError) as exc_info:
@@ -541,19 +559,65 @@ def test_optional_inputs(self, questions_file, questions_answers_basic_file, use_pf_client,):
                     "non": NonOptionalEval(),
                 },
                 _use_pf_client=use_pf_client
-            )
-        assert exc_info._excinfo[1].__str__() == "Missing required inputs for evaluator non : ['answer']."
+            ) # type: ignore
+        assert exc_info._excinfo[1].__str__() == "Missing required inputs for evaluator non : ['answer']." # type: ignore
 
         # Variants with default answer work when only question is inputted
         only_question_results = evaluate(
             data=questions_file,
             evaluators={
                 "half": HalfOptionalEval(),
-                "opt": OptionalEval()
+                "opt": OptionalEval(),
+                "no": NoInputEval()
             },
             _use_pf_client=use_pf_client
-        )
-
+        ) # type: ignore
+
         first_row_2 = only_question_results["rows"][0]
         assert first_row_2["outputs.half.half_score"] == 0
         assert first_row_2["outputs.opt.opt_score"] == 1
+        if use_pf_client:
+            assert first_row["outputs.no.no_score"] == 0
+
+    @pytest.mark.parametrize("use_pf_client", [True])
+    def test_optional_inputs_with_target(self, questions_file, questions_answers_basic_file, use_pf_client):
+        from test_evaluators.test_inputs_evaluators import EchoEval
+
+        # Check that target overrides default inputs
+        target_answer_results = evaluate(
+            data=questions_file,
+            target=_new_answer_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        ) # type: ignore
+
+        assert target_answer_results['rows'][0]['outputs.echo.echo_question'] == 'How long is flight from Earth to LV-426?'
+        assert target_answer_results['rows'][0]['outputs.echo.echo_answer'] == 'new answer'
+
+        # Check that target replaces inputs from data (I.E. if both data and target have same output
+        # the target output is sent to the evaluator.)
+        question_override_results = evaluate(
+            data=questions_answers_basic_file,
+            target=_question_override_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        ) # type: ignore
+
+        assert question_override_results['rows'][0]['outputs.echo.echo_question'] == "new question"
+        assert question_override_results['rows'][0]['outputs.echo.echo_answer'] == 'There is nothing good there.'
+
+        # Check that target can replace default and data inputs at the same time.
+        double_override_results = evaluate(
+            data=questions_answers_basic_file,
+            target=_question_answer_override_target,
+            evaluators={
+                "echo": EchoEval()
+            },
+            _use_pf_client=use_pf_client
+        ) # type: ignore
+        assert double_override_results['rows'][0]['outputs.echo.echo_question'] == "new question"
+        assert double_override_results['rows'][0]['outputs.echo.echo_answer'] == "new answer"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
index 01839f112e6e..82222eb916af 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluators/test_inputs_evaluators.py
@@ -12,7 +12,6 @@ def __init__(self):
     def __call__(self, question, answer):
         return {"non_score": 0}
 
-
 class HalfOptionalEval():
     def __init__(self):
         pass
@@ -27,3 +26,19 @@ def __init__(self):
 
     def __call__(self, *, question = "default", answer = "default"):
         return {"opt_score": (0 if question == "default" else 1) + (0 if answer == "default" else 2)}
+
+class NoInputEval():
+    def __init__(self):
+        pass
+
+    def __call__(self):
+        return {"no_score": 0}
+
+class EchoEval():
+    def __init__(self):
+        pass
+
+    def __call__(self, *, question = "default", answer = "default"):
+        return {"echo_question": question, "echo_answer": answer}
+
+

From 795bce32f843aae11826382d905549687d9403ec Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Thu, 19 Sep 2024 13:05:08 -0400
Subject: [PATCH 5/6] remove accidental import

---
 .../azure-ai-evaluation/tests/unittests/test_evaluate.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 46d16bb7926c..070a3b2a2fd8 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -8,7 +8,6 @@
 import pytest
 from pandas.testing import assert_frame_equal
 from promptflow.client import PFClient
-from sympy import im
 
 from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME
 from azure.ai.evaluation._evaluate._evaluate import (

From 65c85320cab05a9302fbda6f783ca49bb770be5d Mon Sep 17 00:00:00 2001
From: Miles Holland
Date: Thu, 19 Sep 2024 15:43:11 -0400
Subject: [PATCH 6/6] parameterize

---
 .../azure-ai-evaluation/tests/unittests/test_evaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 070a3b2a2fd8..6fe4e4cd01ba 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -578,7 +578,7 @@ def test_optional_inputs_with_data(self, questions_file, questions_answers_basic
 
-    @pytest.mark.parametrize("use_pf_client", [True])
+    @pytest.mark.parametrize("use_pf_client", [True, False])
     def test_optional_inputs_with_target(self, questions_file, questions_answers_basic_file, use_pf_client):
         from test_evaluators.test_inputs_evaluators import EchoEval
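
Below is a minimal usage sketch of the behavior this series adds, for reviewers who want to try it locally. It is not part of any patch above: the data file name is assumed, the import path follows the test files in PATCH 2 (later package layouts may expose evaluate from a different module), and EchoEval mirrors the helper evaluator added in test_inputs_evaluators.py.

# Sketch only -- assumes a local "questions_answers.jsonl" with "question" and
# "answer" columns, and the import path used by the tests in PATCH 2.
from azure.ai.evaluation.evaluate import evaluate


class EchoEval:
    # Same shape as the EchoEval test helper added in PATCH 4.
    def __call__(self, *, question="default", answer="default"):
        return {"echo_question": question, "echo_answer": answer}


def answer_target(question):
    # Target outputs override data columns of the same name ("answer" here).
    return {"answer": "generated answer"}


# With no evaluator_config, evaluate() now builds the default 1-1 mapping
# {"default": {"question": "${data.question}", "answer": "${data.answer}"}}
# (PATCH 1), and target-generated columns take precedence over data columns
# of the same name (PATCH 4).
results = evaluate(
    data="questions_answers.jsonl",
    target=answer_target,
    evaluators={"echo": EchoEval()},
)
print(results["rows"][0]["outputs.echo.echo_question"])  # original data column
print(results["rows"][0]["outputs.echo.echo_answer"])    # "generated answer"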