-
Notifications
You must be signed in to change notification settings - Fork 833
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/add eci and ip simulators (#3688)
Adds 2 new simulators and evaluators. One for Protected Materials, and an internal one for ECI. Tests for all new systems are included, *but are somewhat incomplete until some backend questions are answered*.
- Loading branch information
1 parent
8546ad9
commit acdc2f4
Showing
16 changed files
with
268,189 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
90 changes: 90 additions & 0 deletions
90
src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# --------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# --------------------------------------------------------- | ||
from promptflow._utils.async_utils import async_run_allowing_running_loop | ||
from promptflow.evals._common.constants import _InternalEvaluationMetrics | ||
from promptflow.evals._common.rai_service import evaluate_with_rai_service | ||
|
||
|
||
class _AsyncECIEvaluator:
    """Async implementation backing ``ECIEvaluator``.

    Holds the Azure AI project scope and credential, and forwards each
    question/answer pair to the shared RAI service for ECI annotation.
    """

    def __init__(self, project_scope: dict, credential=None):
        self._project_scope = project_scope
        self._credential = credential

    async def __call__(self, *, question: str, answer: str, **kwargs):
        """Validate the inputs, then score them with the ECI metric.

        :keyword question: The question to be evaluated.
        :paramtype question: str
        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :return: The raw result from the RAI service.
        :raises ValueError: If either input is missing, blank, or the
            literal string "None".
        """
        # Guard clause: reject empty/whitespace-only inputs and the literal
        # string "None" before touching the service. Raising here means that
        # plain execution past this point signifies valid inputs.
        for text in (question, answer):
            if not text or not text.strip() or text == "None":
                raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

        # Delegate scoring to the shared RAI service helper, tagged with the
        # internal ECI metric.
        return await evaluate_with_rai_service(
            metric_name=_InternalEvaluationMetrics.ECI,
            question=question,
            answer=answer,
            project_scope=self._project_scope,
            credential=self._credential,
        )
|
||
|
||
class ECIEvaluator:
    """
    Initialize an ECI evaluator to evaluate ECI based on the following guidelines:
    Detects whether ECI is present without a disclaimer in the AI system's response.
    After the response is annotated by GPT using the binary annotation prompt
    (which checks for disclaimer or authoritative sources),
    we now want to check, if there was a disclaimer, the language should be exactly
    "AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search."
    Outputs True or False with AI-generated reasoning.

    :param project_scope: The scope of the Azure AI project.
        It contains subscription id, resource group, and project name.
    :type project_scope: dict
    :param credential: The credential for connecting to Azure AI project.
    :type credential: ~azure.core.credentials.TokenCredential
    :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
    :rtype: Dict[str, str]

    **Usage**

    .. code-block:: python

        project_scope = {
            "subscription_id": "<subscription_id>",
            "resource_group_name": "<resource_group_name>",
            "project_name": "<project_name>",
        }
        eval_fn = ECIEvaluator(project_scope)
        result = eval_fn(question="What is the capital of France?", answer="Paris.")

    **Output format**

    .. code-block:: python

        {
            "label": "False",
            "reasoning": "Some reason."
        }
    """

    def __init__(self, project_scope: dict, credential=None) -> None:
        # All real work happens in the async twin; this class is a thin
        # synchronous facade over it.
        self._async_evaluator = _AsyncECIEvaluator(project_scope, credential)

    def __call__(self, *, question: str, answer: str, **kwargs):
        """
        Evaluates ECI content.

        :keyword question: The question to be evaluated.
        :paramtype question: str
        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :return: The ECI score.
        :rtype: dict
        """
        # Bridge sync callers onto the async evaluator, even when an event
        # loop is already running.
        return async_run_allowing_running_loop(
            self._async_evaluator,
            question=question,
            answer=answer,
            **kwargs,
        )

    def _to_async(self):
        # Expose the underlying async evaluator for async-aware callers.
        return self._async_evaluator
5 changes: 5 additions & 0 deletions
5
src/promptflow-evals/promptflow/evals/evaluators/_protected_materials/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
"""Public entry point for the Protected Materials evaluator sub-package."""
from ._protected_materials import ProtectedMaterialsEvaluator

# Explicitly declare the public API exported by this sub-package.
__all__ = [
    "ProtectedMaterialsEvaluator",
]
95 changes: 95 additions & 0 deletions
95
...promptflow-evals/promptflow/evals/evaluators/_protected_materials/_protected_materials.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service


class _AsyncProtectedMaterialsEvaluator:
    """Async implementation backing ``ProtectedMaterialsEvaluator``."""

    def __init__(self, project_scope: dict, credential=None):
        self._project_scope = project_scope
        self._credential = credential

    async def __call__(self, *, question: str, answer: str, **kwargs):
        """
        Evaluates content according to this evaluator's metric.

        :keyword question: The question to be evaluated.
        :paramtype question: str
        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :return: The evaluation score computation based on the Content Safety metric (self.metric).
        :rtype: Any
        :raises ValueError: If either input is missing, blank, or the
            literal string "None".
        """

        def _usable(text):
            # A value is usable when it is a non-empty, non-blank string that
            # is not the literal "None".
            return bool(text) and bool(text.strip()) and text != "None"

        # Validate before calling out; execution past this point signifies
        # valid inputs.
        if not (_usable(question) and _usable(answer)):
            raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

        # Hand off scoring to the shared RAI service with the protected
        # material metric.
        return await evaluate_with_rai_service(
            metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
            question=question,
            answer=answer,
            project_scope=self._project_scope,
            credential=self._credential,
        )
|
||
|
||
class ProtectedMaterialsEvaluator:
    """
    Initialize a protected materials evaluator to detect whether protected material
    is present in your AI system's response. Outputs True or False with AI-generated reasoning.

    :param project_scope: The scope of the Azure AI project.
        It contains subscription id, resource group, and project name.
    :type project_scope: dict
    :param credential: The credential for connecting to Azure AI project.
    :type credential: ~azure.core.credentials.TokenCredential
    :return: Whether or not protected material was found in the response, with AI-generated reasoning.
    :rtype: Dict[str, str]

    **Usage**

    .. code-block:: python

        project_scope = {
            "subscription_id": "<subscription_id>",
            "resource_group_name": "<resource_group_name>",
            "project_name": "<project_name>",
        }
        eval_fn = ProtectedMaterialsEvaluator(project_scope)
        result = eval_fn(question="What is the capital of France?", answer="Paris.")

    **Output format**

    .. code-block:: python

        {
            "label": "False",
            "reasoning": "This question does not contain any protected material."
        }
    """

    # Annotated `-> None` for consistency with the sibling ECIEvaluator.__init__.
    def __init__(self, project_scope: dict, credential=None) -> None:
        # Delegate all evaluation work to the async implementation.
        self._async_evaluator = _AsyncProtectedMaterialsEvaluator(project_scope, credential)

    def __call__(self, *, question: str, answer: str, **kwargs):
        """
        Evaluates protected materials content.

        :keyword question: The question to be evaluated.
        :paramtype question: str
        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :return: A dictionary containing a boolean label and reasoning.
        :rtype: dict
        :raises ValueError: If either input is empty, whitespace-only, or the
            literal string "None" (raised by the underlying async evaluator).
        """
        # Run the async evaluator from synchronous code, tolerating an
        # already-running event loop.
        return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

    def _to_async(self):
        # Expose the underlying async evaluator for async-aware callers.
        return self._async_evaluator
Oops, something went wrong.