feat: SQL execution based metrics #1318

Closed · wants to merge 3 commits
57 changes: 57 additions & 0 deletions docs/concepts/metrics/sql.md
@@ -0,0 +1,57 @@
# SQL


## Execution based metrics
In these metrics, the generated SQL query is executed on the database and the resulting `response` data is compared against the expected results.
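
Before either metric can run, the generated and reference SQL must actually be executed and the results serialized to CSV. The snippet below is a minimal sketch of that step, not part of ragas; it assumes a local SQLite database and hypothetical `generated_sql` / `reference_sql` queries.

```{code-block} python
import sqlite3

import pandas as pd

# Hypothetical queries: the model's SQL output and the ground-truth SQL.
generated_sql = "SELECT acct_id, dollar_amt, name FROM accounts"
reference_sql = "SELECT acct_id, dollar_amt, name FROM accounts"

with sqlite3.connect("example.db") as conn:
    # Execute both queries and serialize the results to CSV strings,
    # the format the DataCompy based metric below consumes.
    response = pd.read_sql_query(generated_sql, conn).to_csv(index=False)
    reference = pd.read_sql_query(reference_sql, conn).to_csv(index=False)
```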

### DataCompy Score

DataCompy is a Python library that compares two pandas DataFrames and produces a detailed report of the differences. In this metric the `response` is executed on the database and the resulting data is compared with the expected data, i.e. the `reference`. To enable the comparison, both `response` and `reference` must be provided as Comma-Separated Values (CSV) strings, as shown in the example below.
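
For reference, a standalone DataCompy comparison (outside of ragas) looks roughly like this; the join column `acct_id` is an assumption made for the example:

```{code-block} python
from io import StringIO

import datacompy
import pandas as pd

df1 = pd.read_csv(StringIO("acct_id,dollar_amt\n1,123.45\n2,0.45"))
df2 = pd.read_csv(StringIO("acct_id,dollar_amt\n1,123.40\n2,0.45"))

# Compare the two frames on the assumed join column and print the difference report.
compare = datacompy.Compare(df1, df2, join_columns="acct_id")
print(compare.report())
```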

DataFrames can be compared across rows or columns; this is configured with the `mode` parameter.

If `mode` is `rows`, the comparison is done row-wise; if `mode` is `columns`, the comparison is done column-wise.

```{math}
:label: precision
\text{Precision } = {|\text{Number of matching rows in response and reference}| \over |\text{Total number of rows in response}|}
```

```{math}
:label: recall
\text{Recall } = {|\text{Number of matching rows in response and reference}| \over |\text{Total number of rows in reference}|}
```

By default, the mode is set to `rows` and the metric is the F1 score, which is the harmonic mean of precision and recall:
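
```{math}
:label: f1
\text{F1 } = {2 \times \text{Precision} \times \text{Recall} \over \text{Precision} + \text{Recall}}
```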


```{code-block} python
from ragas.metrics._datacompy_score import DataCompyScore
from ragas.dataset_schema import SingleTurnSample

data1 = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.45,George Maharis,14530.1555,2017-01-01
10000001235,0.45,Michael Bluth,1,2017-01-01
10000001236,1345,George Bluth,,2017-01-01
10000001237,123456,Bob Loblaw,345.12,2017-01-01
10000001238,1.05,Lucille Bluth,,2017-01-01
10000001238,1.05,Loose Seal Bluth,,2017-01-01
"""

data2 = """acct_id,dollar_amt,name,float_fld
10000001234,123.4,George Michael Bluth,14530.155
10000001235,0.45,Michael Bluth,
10000001236,1345,George Bluth,1
10000001237,123456,Robert Loblaw,345.12
10000001238,1.05,Loose Seal Bluth,111
"""
sample = SingleTurnSample(response=data1, reference=data2)
scorer = DataCompyScore()
await scorer.single_turn_ascore(sample)
```
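
The `await` call above assumes an async context such as a notebook. In a plain Python script, one option (not shown in the original docs) is to wrap the call with `asyncio.run`:

```{code-block} python
import asyncio

score = asyncio.run(scorer.single_turn_ascore(sample))
print(score)
```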
To change the mode to column-wise comparison, set the `mode` parameter to `columns`.


```{code-block} python
scorer = DataCompyScore(mode="columns", metric="recall")
```
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -23,6 +23,8 @@ all = [
"rouge_score",
"fuzzywuzzy",
"rapidfuzz",
"pandas",
"datacompy",
]

[tool.setuptools]
4 changes: 3 additions & 1 deletion requirements/dev.txt
@@ -13,4 +13,6 @@ graphene
fuzzywuzzy
rouge_score
nltk
rapidfuzz
rapidfuzz
pandas
datacompy
78 changes: 78 additions & 0 deletions src/ragas/metrics/_datacompy_score.py
@@ -0,0 +1,78 @@
import logging
import typing as t
from dataclasses import dataclass, field
from io import StringIO

import numpy as np
from langchain_core.callbacks import Callbacks

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics.base import MetricType, SingleTurnMetric
from ragas.run_config import RunConfig

logger = logging.getLogger(__name__)


@dataclass
class DataCompyScore(SingleTurnMetric):
    name: str = "data_compare_score"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {MetricType.SINGLE_TURN: {"reference", "response"}}
    )
    mode: t.Literal["rows", "columns"] = "rows"
    metric: t.Literal["precision", "recall", "f1"] = "f1"

    def __post_init__(self):
        try:
            import datacompy
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                f"{e.name} is required for DataCompyScore. Please install it using `pip install {e.name}`"
            )

        self.data_compare = datacompy
        self.pd = pd
        if self.mode not in ["rows", "columns"]:
            raise ValueError("Mode should be either rows or columns")

        if self.metric not in ["precision", "recall", "f1"]:
            raise ValueError("Metric should be either precision, recall or f1")

    def init(self, run_config: RunConfig):
        pass

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        reference = sample.reference
        response = sample.response
        assert isinstance(reference, str), "Expecting a string"
        assert isinstance(response, str), "Expecting a string"
        try:
            # Both inputs are expected to be CSV strings; parse them into DataFrames.
            reference_df = self.pd.read_csv(StringIO(reference))
            response_df = self.pd.read_csv(StringIO(response))
        except Exception as e:
            logger.error(f"Error in reading csv: {e}")
            return np.nan

        # Align the two frames on their index and compare cell values.
        compare = self.data_compare.Compare(reference_df, response_df, on_index=True)
        if self.mode == "rows":
            recall = compare.count_matching_rows() / reference_df.shape[0]
            precision = compare.count_matching_rows() / response_df.shape[0]
        else:
            # A column counts as matched when none of its values differ.
            matched_cols = len(
                [col for col in compare.column_stats if col["unequal_cnt"] == 0]
            )
            recall = matched_cols / reference_df.shape[1]
            precision = matched_cols / response_df.shape[1]

        if self.metric == "precision":
            return precision
        elif self.metric == "recall":
            return recall
        else:
            return 2 * (precision * recall) / (precision + recall)

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)