Bug fix for woe_1d, plus minor cosmetic and test issues (#98)

* Fix for pd.Series input to woe_1d

  The woe_1d function wasn't running y.reset_index() in the case where its y input was already a pd.Series. This meant that it would return incorrect values when the indexes in X and y were non-continuous. Added a check for this condition to tests/test_metrics.py as well.

* Fix negative IV values in summary tables

  The scikit-learn WOEEncoder uses ln(%bad / %good) instead of ln(%good / %bad) for some reason, which flips all of the signs. A previous fix switched the computation of WOE in the summary table to match. However, that fix caused the IV values in summary tables to be negative, which I found surprising. I added an abs() call to the IV calculation to correct this cosmetic defect.

* DataFrame.append was removed in pandas 2.0

  Tests were failing in pandas 2.0 due to the use of DataFrame.append, which has been removed. I fixed it by changing this to a pd.concat call.

---------

Co-authored-by: Reinier Koops <[email protected]>
ing-bank · Jul 6, 2023 · 439b7e0 · 439b7e0
1 parent b47cfc2
commit 439b7e0
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 9 deletions.
diff --git a/skorecard/metrics/metrics.py b/skorecard/metrics/metrics.py
@@ -18,14 +18,18 @@ def woe_1d(X, y, epsilon=0.00001):
         - counts_0: count of entries per bin where y==0
         - counts_1: count of entries per bin where y==1
     """
-    X = X.copy().reset_index(drop=True)
+    # Make sure y has the right number of rows.
+    if y.shape[0] != X.shape[0]:
+        raise ValueError(f"y has {y.shape[0]}, but expected {X.shape[0]}")
+
+    # Make sure y is a pd.Series so we can reset its index.
     if not isinstance(y, pd.Series):
-        if y.shape[0] == X.shape[0]:
-            y = pd.Series(y).reset_index(drop=True)
-        else:
-            raise ValueError(f"y has {y.shape[0]}, but expected {X.shape[0]}")
+        y = pd.Series(y)
+
+    X = X.reset_index(drop=True)
+    y = y.reset_index(drop=True)
 
-    # Ensure classes in y start at zero
+    # Ensure classes in y start at zero.
     y = y - min(y)
 
     df = pd.concat([X, y], axis=1, ignore_index=True)

diff --git a/skorecard/reporting/report.py b/skorecard/reporting/report.py
@@ -127,7 +127,7 @@ def build_bucket_table(
     stats["WoE"] = (event_percentage / non_event_percentage).apply(lambda x: np.log(x))
     stats.loc[stats["Count"] == 0, "WoE"] = np.nan
 
-    stats["IV"] = (stats["% Non-event"] - stats["% Event"]) * stats["WoE"]
+    stats["IV"] = abs((stats["% Non-event"] - stats["% Event"]) * stats["WoE"])
 
     stats["% Event"] = np.round(100 * stats["% Event"], 2)
     stats["% Non-event"] = np.round(100 * stats["% Non-event"], 2)

diff --git a/skorecard/rescale/rescale.py b/skorecard/rescale/rescale.py
@@ -144,12 +144,13 @@ def _calculate_scorecard_points(self):
         scorecard = pd.concat(
             [
                 scorecard,
-                pd.DataFrame(
+                pd.DataFrame.from_records(
                     [{"feature": "Intercept", "coef": self.model.intercept_[0], "bin_index": 0, "map": 0, "woe": 0}]
                 ),
             ],
             ignore_index=True,
         )
+
         #     return buckets, woes
         scorecard["contribution"] = scorecard["woe"] * scorecard["coef"]
 

diff --git a/tests/test_bucket_table_woe_values.py b/tests/test_bucket_table_woe_values.py
@@ -20,3 +20,5 @@ def test_bucket_table_woe_values():
         b_tab_woes = {x for x in b_tab_woes if pd.notna(x)}
         data_woes = set(np.round(X_woe[c].value_counts().index, 3))
         assert b_tab_woes == data_woes
+        iv_lt_zero = [x < 0 for x in bucket_table["IV"]]
+        assert True not in iv_lt_zero
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -68,8 +68,14 @@ def test_psi_values(X1_X2):
 def test_IV_values(X_y):
     """Assert IV values match expectations."""
     X, y = X_y
-    X = pd.DataFrame(X, columns=["col1", "col2"])
+    random_index = [2 * x for x in range(0, len(y))]
+    X = pd.DataFrame(X, columns=["col1", "col2"], index=random_index)
     expected_iv = {"col1": 5.307, "col2": 4.635}
     iv_vals = skorecard.reporting.report.iv(X, y)
+    np.testing.assert_array_almost_equal(pd.Series(expected_iv).values, pd.Series(iv_vals).values, decimal=2)
 
+    # Make sure these are still accurate if y is a pd.Series with the same
+    # non-continuous indices as X.
+    y = pd.Series(y, index=random_index)
+    iv_vals = skorecard.reporting.report.iv(X, y)
     np.testing.assert_array_almost_equal(pd.Series(expected_iv).values, pd.Series(iv_vals).values, decimal=2)