Feature/add eci and ip simulators (#3688)
Adds two new simulators and evaluators: one for Protected Materials, and an internal one for ECI.

Tests for all new systems are included, *but are somewhat incomplete
until some backend questions are answered*.
MilesHolland committed Aug 28, 2024
1 parent 8546ad9 commit acdc2f4
Showing 16 changed files with 268,189 additions and 15 deletions.
1 change: 1 addition & 0 deletions src/promptflow-evals/CHANGELOG.md
@@ -2,6 +2,7 @@

## v0.3.3 (Upcoming)
### Features Added
- Add a new evaluator (ProtectedMaterialsEvaluator) and associated adversarial content simulator enum type (AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL) for protected materials; the evaluator determines whether given inputs contain material protected by IP laws.

### Bugs Fixed
- Fixed evaluators to accept (non-Azure) Open AI Configs.
22 changes: 21 additions & 1 deletion src/promptflow-evals/promptflow/evals/_common/constants.py
@@ -31,13 +31,33 @@ class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""

CONTENT_HARM = "content harm"
PROTECTED_MATERIAL = "protected material"


class _InternalAnnotationTasks:
"""Defines types of annotation tasks that are supported for internal use. Such tasks are
experimental and subject to potential change or migration to the main Evaluation Metrics
enum over time."""

ECI = "eci"


class EvaluationMetrics:
"""Evaluation metrics for content safety evaluation."""
"""Evaluation metrics to aid the RAI service in determining what
metrics to request, and how to present them back to the user."""

HATE_FAIRNESS = "hate_fairness"
HATE_UNFAIRNESS = "hate_unfairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"


class _InternalEvaluationMetrics:
"""Evaluation metrics that are not publicly supported.
These metrics are experimental and subject to potential change or migration to the main
enum over time.
"""

ECI = "eci"
81 changes: 71 additions & 10 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -14,15 +14,18 @@
from azure.core.credentials import TokenCredential
from azure.identity import DefaultAzureCredential

try:
from .constants import CommonConstants, EvaluationMetrics, RAIService, Tasks
from .utils import get_harm_severity_level
except ImportError:
from constants import CommonConstants, EvaluationMetrics, RAIService, Tasks
from utils import get_harm_severity_level

from promptflow.evals._http_utils import get_async_http_client

from .constants import (
CommonConstants,
EvaluationMetrics,
RAIService,
Tasks,
_InternalAnnotationTasks,
_InternalEvaluationMetrics,
)
from .utils import get_harm_severity_level

try:
version = importlib.metadata.version("promptflow-evals")
except importlib.metadata.PackageNotFoundError:
@@ -81,6 +84,39 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
)


def generate_payload(normalized_user_text: str, metric: str) -> Dict:
"""Generate the payload for the annotation request
:param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
:type normalized_user_text: str
:param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
in the payload.
:type metric: str
:return: The payload for the annotation request.
:rtype: Dict
"""
include_metric = True
task = Tasks.CONTENT_HARM
if metric == EvaluationMetrics.PROTECTED_MATERIAL:
task = Tasks.PROTECTED_MATERIAL
include_metric = False
elif metric == _InternalEvaluationMetrics.ECI:
task = _InternalAnnotationTasks.ECI
include_metric = False
return (
{
"UserTextList": [normalized_user_text],
"AnnotationTask": task,
"MetricList": [metric],
}
if include_metric
else {
"UserTextList": [normalized_user_text],
"AnnotationTask": task,
}
)
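
As a rough sketch (not part of this diff), here is how the new helper shapes payloads for the two task styles. The import paths mirror the file locations above, the user text is illustrative, and the echoed dictionaries follow from the constants shown in this commit:

from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import generate_payload

user_text = "<Human>What is the capital of France?</><System>Paris.</>"

# Content-harm metrics keep the default task and include a MetricList.
harm_payload = generate_payload(user_text, EvaluationMetrics.VIOLENCE)
# {"UserTextList": [user_text], "AnnotationTask": "content harm", "MetricList": ["violence"]}

# Protected material (and the internal ECI metric) switch the task and drop the MetricList.
pm_payload = generate_payload(user_text, EvaluationMetrics.PROTECTED_MATERIAL)
# {"UserTextList": [user_text], "AnnotationTask": "protected material"}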


async def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, token: str) -> str:
"""Submit request to Responsible AI service for evaluation and return operation ID
Expand All @@ -99,7 +135,7 @@ async def submit_request(question: str, answer: str, metric: str, rai_svc_url: s
"""
user_text = f"<Human>{question}</><System>{answer}</>"
normalized_user_text = user_text.replace("'", '\\"')
payload = {"UserTextList": [normalized_user_text], "AnnotationTask": Tasks.CONTENT_HARM, "MetricList": [metric]}
payload = generate_payload(normalized_user_text, metric)

url = rai_svc_url + "/submitannotation"
headers = get_common_headers(token)
@@ -161,8 +197,33 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre

def parse_response( # pylint: disable=too-many-branches,too-many-statements
batch_response: List[Dict], metric_name: str
) -> List[List[dict]]:
"""Parse the annotation response from Responsible AI service
) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
:param batch_response: The annotation response from Responsible AI service.
:type batch_response: List[Dict]
:param metric_name: The evaluation metric to use.
:type metric_name: str
:return: The parsed annotation result.
:rtype: Dict
"""

if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
response = response.replace("false", "False")
response = response.replace("true", "True")
parsed_response = literal_eval(response)
result = {}
result["label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result["reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
return result
return _parse_content_harm_response(batch_response, metric_name)
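
A minimal sketch of the new branch in action, using a hypothetical service response (the service returns the verdict as a JSON-style string with lowercase booleans, which is why the true/false replacement precedes literal_eval):

from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import parse_response

# Hypothetical annotation response for the protected material metric.
mock_batch_response = [
    {"protected_material": '{"label": false, "reasoning": "No copyrighted text detected."}'}
]
result = parse_response(mock_batch_response, EvaluationMetrics.PROTECTED_MATERIAL)
# {"label": False, "reasoning": "No copyrighted text detected."}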


def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
:param batch_response: The annotation response from Responsible AI service.
:type batch_response: List[Dict]
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -15,6 +15,7 @@
from ._f1_score import F1ScoreEvaluator
from ._fluency import FluencyEvaluator
from ._groundedness import GroundednessEvaluator
from ._protected_materials import ProtectedMaterialsEvaluator
from ._qa import QAEvaluator
from ._relevance import RelevanceEvaluator
from ._similarity import SimilarityEvaluator
@@ -34,4 +35,5 @@
"HateUnfairnessEvaluator",
"ContentSafetyEvaluator",
"ContentSafetyChatEvaluator",
"ProtectedMaterialsEvaluator",
]
90 changes: 90 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/_eci/_eci.py
@@ -0,0 +1,90 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import _InternalEvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service


class _AsyncECIEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

async def __call__(self, *, question: str, answer: str, **kwargs):
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = await evaluate_with_rai_service(
metric_name=_InternalEvaluationMetrics.ECI,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result


class ECIEvaluator:
"""
Initialize an ECI evaluator, which detects whether ECI is present in the AI system's response without a disclaimer.
After the response is annotated by GPT using the binary annotation prompt
(which checks for a disclaimer or authoritative sources),
the evaluator also checks that any disclaimer uses the exact language:
"AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search."
Outputs True or False with AI-generated reasoning.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: ~azure.core.credentials.TokenCredential
:return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
:rtype: Dict[str, str]
**Usage**
.. code-block:: python
project_scope = {
"subscription_id": "<subscription_id>",
"resource_group_name": "<resource_group_name>",
"project_name": "<project_name>",
}
eval_fn = ECIEvaluator(project_scope)
result = eval_fn(question="What is the capital of France?", answer="Paris.")
**Output format**
.. code-block:: python
{
"label": "False",
"reasoning": "Some reason."
}
"""

def __init__(self, project_scope: dict, credential=None) -> None:
self._async_evaluator = _AsyncECIEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates ECI content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The ECI score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
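
For completeness, a hedged usage sketch with an explicit credential. The module path is an assumption based on the new file's location (ECIEvaluator is internal and not exported from promptflow.evals.evaluators), and the question/answer strings are illustrative only:

from azure.identity import DefaultAzureCredential

from promptflow.evals.evaluators._eci._eci import ECIEvaluator  # assumed module path

project_scope = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

# credential is optional; the evaluator also accepts credential=None.
eci_eval = ECIEvaluator(project_scope, credential=DefaultAzureCredential())
result = eci_eval(
    question="Where can I find official election information?",
    answer="AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search.",
)
# Expected shape (per the docstring above): {"label": "False", "reasoning": "..."}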
@@ -0,0 +1,5 @@
from ._protected_materials import ProtectedMaterialsEvaluator

__all__ = [
"ProtectedMaterialsEvaluator",
]
@@ -0,0 +1,95 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service


class _AsyncProtectedMaterialsEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

async def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The evaluation result for the protected material metric, containing a label and reasoning.
:rtype: Any
"""
# Validate inputs
# Raises value error if failed, so execution alone signifies success.
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = await evaluate_with_rai_service(
metric_name=EvaluationMetrics.PROTECTED_MATERIAL,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result


class ProtectedMaterialsEvaluator:
"""
Initialize a protected materials evaluator to detect whether protected material
is present in your AI system's response. Outputs True or False with AI-generated reasoning.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param credential: The credential for connecting to Azure AI project.
:type credential: ~azure.core.credentials.TokenCredential
:return: Whether or not protected material was found in the response, with AI-generated reasoning.
:rtype: Dict[str, str]
**Usage**
.. code-block:: python
project_scope = {
"subscription_id": "<subscription_id>",
"resource_group_name": "<resource_group_name>",
"project_name": "<project_name>",
}
eval_fn = ProtectedMaterialsEvaluator(project_scope)
result = eval_fn(question="What is the capital of France?", answer="Paris.")
**Output format**
.. code-block:: python
{
"label": "False",
"reasoning": "This question does not contain any protected material."
}
"""

def __init__(self, project_scope: dict, credential=None):
self._async_evaluator = _AsyncProtectedMaterialsEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates protected materials content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: A dictionary containing a boolean label and reasoning.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
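
And a short sketch (not from the diff) showing the evaluator through its new public export in promptflow.evals.evaluators; the question/answer strings are illustrative only:

from azure.identity import DefaultAzureCredential

from promptflow.evals.evaluators import ProtectedMaterialsEvaluator

project_scope = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

protected_materials_eval = ProtectedMaterialsEvaluator(project_scope, credential=DefaultAzureCredential())
result = protected_materials_eval(
    question="Can you write out the full lyrics of a popular song?",
    answer="I can't reproduce the full lyrics, but here is a brief summary of its themes.",
)
# Expected shape (per the docstring above): {"label": "False", "reasoning": "..."}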