Feature/add eci and ip simulators (#3688)
Adds two new simulators and evaluators: one for Protected Materials, and an internal one for ECI.

Tests for all new systems are included, *but are somewhat incomplete
until some backend questions are answered*.
MilesHolland committed Aug 28, 2024
1 parent 8546ad9 commit acdc2f4
Showing 16 changed files with 268,189 additions and 15 deletions.
1 change: 1 addition & 0 deletions src/promptflow-evals/CHANGELOG.md
@@ -2,6 +2,7 @@

## v0.3.3 (Upcoming)
### Features Added
- Add a new evaluator (ProtectedMaterialsEvaluator) and associated adversarial content simulator enum type (AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL) for protected materials; the evaluator determines whether given inputs contain material protected by IP laws.

### Bugs Fixed
- Fixed evaluators to accept (non-Azure) Open AI Configs.
22 changes: 21 additions & 1 deletion src/promptflow-evals/promptflow/evals/_common/constants.py
@@ -31,13 +31,33 @@ class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""

CONTENT_HARM = "content harm"
PROTECTED_MATERIAL = "protected material"


class _InternalAnnotationTasks:
"""Defines types of annotation tasks that are supported for internal use. Such tasks are
experimental and subject to potential change or migration to the main Evaluation Metrics
enum over time."""

ECI = "eci"


class EvaluationMetrics:
"""Evaluation metrics for content safety evaluation."""
"""Evaluation metrics to aid the RAI service in determining what
metrics to request, and how to present them back to the user."""

HATE_FAIRNESS = "hate_fairness"
HATE_UNFAIRNESS = "hate_unfairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"


class _InternalEvaluationMetrics:
"""Evaluation metrics that are not publicly supported.
These metrics are experimental and subject to potential change or migration to the main
enum over time.
"""

ECI = "eci"
81 changes: 71 additions & 10 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -14,15 +14,18 @@
from azure.core.credentials import TokenCredential
from azure.identity import DefaultAzureCredential

try:
from .constants import CommonConstants, EvaluationMetrics, RAIService, Tasks
from .utils import get_harm_severity_level
except ImportError:
from constants import CommonConstants, EvaluationMetrics, RAIService, Tasks
from utils import get_harm_severity_level

from promptflow.evals._http_utils import get_async_http_client

from .constants import (
CommonConstants,
EvaluationMetrics,
RAIService,
Tasks,
_InternalAnnotationTasks,
_InternalEvaluationMetrics,
)
from .utils import get_harm_severity_level

try:
version = importlib.metadata.version("promptflow-evals")
except importlib.metadata.PackageNotFoundError:
@@ -81,6 +84,39 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
)


def generate_payload(normalized_user_text: str, metric: str) -> Dict:
"""Generate the payload for the annotation request
:param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
:type normalized_user_text: str
:param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
in the payload.
:type metric: str
:return: The payload for the annotation request.
:rtype: Dict
"""
include_metric = True
task = Tasks.CONTENT_HARM
if metric == EvaluationMetrics.PROTECTED_MATERIAL:
task = Tasks.PROTECTED_MATERIAL
include_metric = False
elif metric == _InternalEvaluationMetrics.ECI:
task = _InternalAnnotationTasks.ECI
include_metric = False
return (
{
"UserTextList": [normalized_user_text],
"AnnotationTask": task,
"MetricList": [metric],
}
if include_metric
else {
"UserTextList": [normalized_user_text],
"AnnotationTask": task,
}
)
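
As a rough sketch (not part of this diff), here is how the new helper shapes payloads for the two task styles. The import paths mirror the file locations above, the user text is illustrative, and the echoed dictionaries follow from the constants shown in this commit:

from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import generate_payload

user_text = "<Human>What is the capital of France?</><System>Paris.</>"

# Content-harm metrics keep the default task and include a MetricList.
harm_payload = generate_payload(user_text, EvaluationMetrics.VIOLENCE)
# {"UserTextList": [user_text], "AnnotationTask": "content harm", "MetricList": ["violence"]}

# Protected material (and the internal ECI metric) switch the task and drop the MetricList.
pm_payload = generate_payload(user_text, EvaluationMetrics.PROTECTED_MATERIAL)
# {"UserTextList": [user_text], "AnnotationTask": "protected material"}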


async def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, token: str) -> str:
"""Submit request to Responsible AI service for evaluation and return operation ID
Expand All @@ -99,7 +135,7 @@ async def submit_request(question: str, answer: str, metric: str, rai_svc_url: s
"""
user_text = f"<Human>{question}</><System>{answer}</>"
normalized_user_text = user_text.replace("'", '\\"')
payload = {"UserTextList": [normalized_user_text], "AnnotationTask": Tasks.CONTENT_HARM, "MetricList": [metric]}
payload = generate_payload(normalized_user_text, metric)

url = rai_svc_url + "/submitannotation"
headers = get_common_headers(token)
@@ -161,8 +197,33 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre

def parse_response( # pylint: disable=too-many-branches,too-many-statements
batch_response: List[Dict], metric_name: str
) -> List[List[dict]]:
"""Parse the annotation response from Responsible AI service
) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
:param batch_response: The annotation response from Responsible AI service.
:type batch_response: List[Dict]
:param metric_name: The evaluation metric to use.
:type metric_name: str
:return: The parsed annotation result.
:rtype: Dict
"""

if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
response = response.replace("false", "False")
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
return result
return _parse_content_harm_response(batch_response, metric_name)
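
A minimal sketch of the new branch in action, using a hypothetical service response (the service returns the verdict as a JSON-style string with lowercase booleans, which is why the true/false replacement precedes literal_eval):

from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import parse_response

# Hypothetical annotation response for the protected material metric.
mock_batch_response = [
    {"protected_material": '{"label": false, "reasoning": "No copyrighted text detected."}'}
]
result = parse_response(mock_batch_response, EvaluationMetrics.PROTECTED_MATERIAL)
# {"label": False, "reasoning": "No copyrighted text detected."}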


def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
:param batch_response: The annotation response from Responsible AI service.
:type batch_response: List[Dict]
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -15,6 +15,7 @@
from ._f1_score import F1ScoreEvaluator
from ._fluency import FluencyEvaluator
from ._groundedness import GroundednessEvaluator
from ._protected_materials import ProtectedMaterialsEvaluator
from ._qa import QAEvaluator
from ._relevance import RelevanceEvaluator
from ._similarity import SimilarityEvaluator
@@ -34,4 +35,5 @@
"HateUnfairnessEvaluator",
"ContentSafetyEvaluator",
"ContentSafetyChatEvaluator",
"ProtectedMaterialsEvaluator",
]
90 changes: 90 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -0,0 +1,90 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import _InternalEvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service


class _AsyncECIEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

async def __call__(self, *, question: str, answer: str, **kwargs):
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = await evaluate_with_rai_service(
metric_name=_InternalEvaluationMetrics.ECI,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result


class ECIEvaluator:
"""
Initialize an ECI evaluator, which detects whether ECI is present in the AI system's response without a disclaimer.
After the response is annotated by GPT using the binary annotation prompt
(which checks for a disclaimer or authoritative sources),
the evaluator also checks that any disclaimer uses the exact language:
"AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search."
Outputs True or False with AI-generated reasoning.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: ~azure.core.credentials.TokenCredential
:return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
:rtype: Dict[str, str]
**Usage**
.. code-block:: python
project_scope = {
"subscription_id": "<subscription_id>",
"resource_group_name": "<resource_group_name>",
"project_name": "<project_name>",
}
eval_fn = ECIEvaluator(project_scope)
result = eval_fn(question="What is the capital of France?", answer="Paris.")
**Output format**
.. code-block:: python
{
"label": "False",
"reasoning": "Some reason."
}
"""

def __init__(self, project_scope: dict, credential=None) -> None:
self._async_evaluator = _AsyncECIEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates ECI content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The ECI score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
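
For completeness, a hedged usage sketch with an explicit credential. The module path is an assumption based on the new file's location (ECIEvaluator is internal and not exported from promptflow.evals.evaluators), and the question/answer strings are illustrative only:

from azure.identity import DefaultAzureCredential

from promptflow.evals.evaluators._eci._eci import ECIEvaluator  # assumed module path

project_scope = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

# credential is optional; the evaluator also accepts credential=None.
eci_eval = ECIEvaluator(project_scope, credential=DefaultAzureCredential())
result = eci_eval(
    question="Where can I find official election information?",
    answer="AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search.",
)
# Expected shape (per the docstring above): {"label": "False", "reasoning": "..."}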
@@ -0,0 +1,5 @@
from ._protected_materials import ProtectedMaterialsEvaluator

__all__ = [
"ProtectedMaterialsEvaluator",
]
@@ -0,0 +1,95 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service


class _AsyncProtectedMaterialsEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

async def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The evaluation result for the protected material metric, containing a label and reasoning.
:rtype: Any
"""
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = await evaluate_with_rai_service(
metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result


class ProtectedMaterialsEvaluator:
"""
Initialize a protected materials evaluator to detect whether protected material
is present in your AI system's response. Outputs True or False with AI-generated reasoning.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: ~azure.core.credentials.TokenCredential
:return: Whether or not protected material was found in the response, with AI-generated reasoning.
:rtype: Dict[str, str]
**Usage**
.. code-block:: python
project_scope = {
"subscription_id": "<subscription_id>",
"resource_group_name": "<resource_group_name>",
"project_name": "<project_name>",
}
eval_fn = ProtectedMaterialsEvaluator(project_scope)
result = eval_fn(question="What is the capital of France?", answer="Paris.")
**Output format**
.. code-block:: python
{
"label": "False",
"reasoning": "This question does not contain any protected material."
}
"""

def __init__(self, project_scope: dict, credential=None):
self._async_evaluator = _AsyncProtectedMaterialsEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates protected materials content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: A dictionary containing a boolean label and reasoning.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
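
And a short sketch (not from the diff) showing the evaluator through its new public export in promptflow.evals.evaluators; the question/answer strings are illustrative only:

from azure.identity import DefaultAzureCredential

from promptflow.evals.evaluators import ProtectedMaterialsEvaluator

project_scope = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

protected_materials_eval = ProtectedMaterialsEvaluator(project_scope, credential=DefaultAzureCredential())
result = protected_materials_eval(
    question="Can you write out the full lyrics of a popular song?",
    answer="I can't reproduce the full lyrics, but here is a brief summary of its themes.",
)
# Expected shape (per the docstring above): {"label": "False", "reasoning": "..."}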