From dc50e3a1aee25b8686c83e970d35b25b3bd472f7 Mon Sep 17 00:00:00 2001
From: Uri Smashnov <uri.smashnov@h2o.ai>
Date: Sat, 21 Sep 2024 08:13:03 +0200
Subject: [PATCH] Fix sanity_check assertion message spacing and align test match patterns

---
 .../text_causal_language_modeling_ds.py       |  2 +-
 .../test_text_causal_language_modeling_ds.py  | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/llm_studio/src/datasets/text_causal_language_modeling_ds.py b/llm_studio/src/datasets/text_causal_language_modeling_ds.py
index ffc9d251b..7d3681afe 100644
--- a/llm_studio/src/datasets/text_causal_language_modeling_ds.py
+++ b/llm_studio/src/datasets/text_causal_language_modeling_ds.py
@@ -338,7 +338,7 @@ def sanity_check(cls, df: pd.DataFrame, cfg: Any, mode: str = "train"):
         ):
             assert (df[cfg.dataset.parent_id_column] != df["id"]).all(), (
                 f"Parent id column:{cfg.dataset.parent_id_column}"
-                "is the same as id column for some rows"
+                " is the same as id column for some rows"
             )
             assert (df[cfg.dataset.parent_id_column].fillna("") == "").sum() > 0, (
                 "Did not find any conversation chain. "
diff --git a/tests/src/datasets/test_text_causal_language_modeling_ds.py b/tests/src/datasets/test_text_causal_language_modeling_ds.py
index d19e89654..143568a4d 100644
--- a/tests/src/datasets/test_text_causal_language_modeling_ds.py
+++ b/tests/src/datasets/test_text_causal_language_modeling_ds.py
@@ -1,3 +1,4 @@
+import re
 from unittest import mock
 from unittest.mock import MagicMock, patch
 
@@ -89,7 +90,8 @@ def test_sanity_check_raises_error():
         }
     )
     with pytest.raises(
-        AssertionError, match="Parent id column is the same as id column for some rows"
+        AssertionError,
+        match=r"Parent id column:.* is the same as id column for some rows",
     ):
         CustomDataset.sanity_check(invalid_df_1, mock_config)
 
@@ -102,8 +104,19 @@ def test_sanity_check_raises_error():
     )
     with pytest.raises(
         AssertionError,
-        match="Did not find any conversation start. "
-        "Please ensure that some parent ids are empty.",
+        match=re.escape(
+            "Did not find any conversation chain. "
+            "Please ensure that some parent ids are empty."
+            "\n"
+            "Conversations are chained using parent id, "
+            "start conversation record should "
+            "not have parent id populated"
+            "\n"
+            "Parent id column checked:parent_id"
+            "\n"
+            "Number of records with empty <parent_id>"
+            "column:{(df[cfg.dataset.parent_id_column].fillna('') == '').sum()}"
+        ),
     ):
         CustomDataset.sanity_check(invalid_df_2, mock_config)