From dc50e3a1aee25b8686c83e970d35b25b3bd472f7 Mon Sep 17 00:00:00 2001 From: Uri Smashnov Date: Sat, 21 Sep 2024 08:13:03 +0200 Subject: [PATCH] Fix pytest errors --- .../text_causal_language_modeling_ds.py | 2 +- .../test_text_causal_language_modeling_ds.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/llm_studio/src/datasets/text_causal_language_modeling_ds.py b/llm_studio/src/datasets/text_causal_language_modeling_ds.py index ffc9d251b..7d3681afe 100644 --- a/llm_studio/src/datasets/text_causal_language_modeling_ds.py +++ b/llm_studio/src/datasets/text_causal_language_modeling_ds.py @@ -338,7 +338,7 @@ def sanity_check(cls, df: pd.DataFrame, cfg: Any, mode: str = "train"): ): assert (df[cfg.dataset.parent_id_column] != df["id"]).all(), ( f"Parent id column:{cfg.dataset.parent_id_column}" - "is the same as id column for some rows" + " is the same as id column for some rows" ) assert (df[cfg.dataset.parent_id_column].fillna("") == "").sum() > 0, ( "Did not find any conversation chain. " diff --git a/tests/src/datasets/test_text_causal_language_modeling_ds.py b/tests/src/datasets/test_text_causal_language_modeling_ds.py index d19e89654..143568a4d 100644 --- a/tests/src/datasets/test_text_causal_language_modeling_ds.py +++ b/tests/src/datasets/test_text_causal_language_modeling_ds.py @@ -1,3 +1,4 @@ +import re from unittest import mock from unittest.mock import MagicMock, patch @@ -89,7 +90,8 @@ def test_sanity_check_raises_error(): } ) with pytest.raises( - AssertionError, match="Parent id column is the same as id column for some rows" + AssertionError, + match=r"Parent id column:.* is the same as id column for some rows", ): CustomDataset.sanity_check(invalid_df_1, mock_config) @@ -102,8 +104,19 @@ def test_sanity_check_raises_error(): ) with pytest.raises( AssertionError, - match="Did not find any conversation start. " - "Please ensure that some parent ids are empty.", + match=re.escape( + "Did not find any conversation chain. " + "Please ensure that some parent ids are empty." + "\n" + "Conversations are chained using parent id, " + "start conversation record should " + "not have parent id populated" + "\n" + "Parent id column checked:parent_id" + "\n" + "Number of records with empty " + "column:{(df[cfg.dataset.parent_id_column].fillna('') == '').sum()}" + ), ): CustomDataset.sanity_check(invalid_df_2, mock_config)