diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index fc55f59f..07d11c1c 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -151,7 +151,7 @@ def _ensureIdCol(self, df: DataFrame) -> DataFrame: id_col_name = self.getIdCol() df_withid = ( df - if self.isSet("idCol") + if id_col_name in df.columns else df.select(monotonically_increasing_id().alias(id_col_name), "*") ) return df_withid diff --git a/python/tests/test_nearest_neighbors.py b/python/tests/test_nearest_neighbors.py index c50d605b..034b2042 100644 --- a/python/tests/test_nearest_neighbors.py +++ b/python/tests/test_nearest_neighbors.py @@ -234,6 +234,17 @@ def assert_knn_metadata_equal(knn_metadata: List[List[str]]) -> None: assert knnjoin_queries[i]["features"] == query[i][0] assert knnjoin_queries[i]["metadata"] == query[i][1] + # Test fit(dataset, ParamMap) that copies existing estimator + # After copy, self.isSet("idCol") becomes true. But the added id column does not exist in the dataframe + paramMap = gpu_knn.extractParamMap() + gpu_model_v2 = gpu_knn.fit(item_df_withid, paramMap) + + assert gpu_knn.isSet("idCol") is False + assert gpu_model_v2.isSet("idCol") is True + + (_, _, knn_df_v2) = gpu_model_v2.kneighbors(query_df) + assert knn_df_v2.collect() == knn_df.collect() + return gpu_knn, gpu_model