From cfdf893f8861a48f0f2522fd3356aa6829aae3eb Mon Sep 17 00:00:00 2001
From: Rishi <77904151+rishic3@users.noreply.github.com>
Date: Sun, 29 Sep 2024 12:22:54 -0400
Subject: [PATCH] Fix UMAP test failure (#745)

* Bump threshold

* Remove subsampling

* signoff

Signed-off-by: Rishi Chandra <rishic@nvidia.com>

* Add sample fraction test

* Update confidence

* formatting

* Set random state

* Avoid hardcoding, cleanup

* black formatting

* formatting

---------

Signed-off-by: Rishi Chandra <rishic@nvidia.com>
---
 python/tests/test_umap.py | 63 ++++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 14 deletions(-)

diff --git a/python/tests/test_umap.py b/python/tests/test_umap.py
index 2af2700b..affbc231 100644
--- a/python/tests/test_umap.py
+++ b/python/tests/test_umap.py
@@ -78,7 +78,6 @@ def _spark_umap_trustworthiness(
     supervised: bool,
     n_parts: int,
     gpu_number: int,
-    sampling_ratio: float,
     dtype: np.dtype,
     feature_type: str,
 ) -> float:
@@ -102,7 +101,7 @@ def _spark_umap_trustworthiness(
             )
 
         data_df = data_df.repartition(n_parts)
-        umap_estimator.setFeaturesCol(features_col).setSampleFraction(sampling_ratio)
+        umap_estimator.setFeaturesCol(features_col)
         umap_model = umap_estimator.fit(data_df)
         pdf = umap_model.transform(data_df).toPandas()
         embedding = cp.asarray(pdf["embedding"].to_list()).astype(cp.float32)
@@ -115,7 +114,6 @@ def _run_spark_test(
     n_parts: int,
     gpu_number: int,
     n_rows: int,
-    sampling_ratio: float,
     supervised: bool,
     dataset: str,
     n_neighbors: int,
@@ -131,15 +129,14 @@ def _run_spark_test(
         supervised,
         n_parts,
         gpu_number,
-        sampling_ratio,
         dtype,
         feature_type,
     )
 
     loc_umap = _local_umap_trustworthiness(local_X, local_y, n_neighbors, supervised)
 
-    print("Local UMAP trustworthiness score : {:.2f}".format(loc_umap))
-    print("Spark UMAP trustworthiness score : {:.2f}".format(dist_umap))
+    print("Local UMAP trustworthiness score : {:.4f}".format(loc_umap))
+    print("Spark UMAP trustworthiness score : {:.4f}".format(dist_umap))
 
     trust_diff = loc_umap - dist_umap
 
@@ -148,7 +145,6 @@ def _run_spark_test(
 
 @pytest.mark.parametrize("n_parts", [2, 9])
 @pytest.mark.parametrize("n_rows", [100, 500])
-@pytest.mark.parametrize("sampling_ratio", [0.55, 0.9])
 @pytest.mark.parametrize("supervised", [True, False])
 @pytest.mark.parametrize("dataset", ["digits", "iris"])
 @pytest.mark.parametrize("n_neighbors", [10])
@@ -159,7 +155,6 @@ def test_spark_umap(
     n_parts: int,
     gpu_number: int,
     n_rows: int,
-    sampling_ratio: float,
     supervised: bool,
     dataset: str,
     n_neighbors: int,
@@ -170,7 +165,6 @@ def test_spark_umap(
         n_parts,
         gpu_number,
         n_rows,
-        sampling_ratio,
         supervised,
         dataset,
         n_neighbors,
@@ -183,7 +177,6 @@ def test_spark_umap(
             n_parts,
             gpu_number,
             n_rows,
-            sampling_ratio,
             supervised,
             dataset,
             n_neighbors,
@@ -196,7 +189,6 @@ def test_spark_umap(
 
 @pytest.mark.parametrize("n_parts", [5])
 @pytest.mark.parametrize("n_rows", [500])
-@pytest.mark.parametrize("sampling_ratio", [0.7])
 @pytest.mark.parametrize("supervised", [True])
 @pytest.mark.parametrize("dataset", ["digits"])
 @pytest.mark.parametrize("n_neighbors", [10])
@@ -206,7 +198,6 @@ def test_spark_umap_fast(
     n_parts: int,
     gpu_number: int,
     n_rows: int,
-    sampling_ratio: float,
     supervised: bool,
     dataset: str,
     n_neighbors: int,
@@ -218,7 +209,6 @@ def test_spark_umap_fast(
         n_parts,
         gpu_number,
         n_rows,
-        sampling_ratio,
         supervised,
         dataset,
         n_neighbors,
@@ -231,7 +221,6 @@ def test_spark_umap_fast(
             n_parts,
             gpu_number,
             n_rows,
-            sampling_ratio,
             supervised,
             dataset,
             n_neighbors,
@@ -375,3 +364,49 @@ def assert_umap_model(model: UMAPModel) -> None:
         trust_diff = loc_umap - dist_umap
 
         assert trust_diff <= 0.15
+
+
+def test_umap_sample_fraction(gpu_number: int) -> None:
+    from cuml.datasets import make_blobs
+
+    n_rows = 5000  # row count passed to make_blobs
+    sample_fraction = 0.5  # value handed to setSampleFraction below
+
+    X, _ = make_blobs(
+        n_rows,
+        10,
+        centers=42,
+        cluster_std=0.1,
+        dtype=np.float32,
+        random_state=10,
+    )
+
+    with CleanSparkSession() as spark:
+        pyspark_type = "float"  # Spark SQL type used for every schema entry
+        feature_cols = [f"c{i}" for i in range(X.shape[1])]
+        schema = [f"{c} {pyspark_type}" for c in feature_cols]
+        df = spark.createDataFrame(X.tolist(), ",".join(schema))
+        df = df.withColumn("features", array(*feature_cols)).drop(*feature_cols)
+
+        umap = (
+            UMAP(num_workers=gpu_number, random_state=42)
+            .setFeaturesCol("features")
+            .setSampleFraction(sample_fraction)
+        )
+        assert umap.getSampleFraction() == sample_fraction  # setter round-trips
+
+        umap_model = umap.fit(df)
+
+        def assert_umap_model(model: UMAPModel) -> None:
+            embedding = np.array(model.embedding)
+            raw_data = np.array(model.raw_data)
+
+            threshold = 2 * np.sqrt(
+                n_rows * sample_fraction * (1 - sample_fraction)
+            )  # 2 std devs of a Binomial(n_rows, sample_fraction) count
+            assert np.abs(n_rows * sample_fraction - embedding.shape[0]) <= threshold
+            assert np.abs(n_rows * sample_fraction - raw_data.shape[0]) <= threshold
+            assert model.dtype == "float32"
+            assert model.n_cols == X.shape[1]
+
+        assert_umap_model(model=umap_model)