diff --git a/skorecard/metrics/metrics.py b/skorecard/metrics/metrics.py
index 9969a08..46578c6 100644
--- a/skorecard/metrics/metrics.py
+++ b/skorecard/metrics/metrics.py
@@ -18,14 +18,18 @@ def woe_1d(X, y, epsilon=0.00001):
         - counts_0: count of entries per bin where y==0
         - counts_1: count of entries per bin where y==1
     """
-    X = X.copy().reset_index(drop=True)
+    # Make sure y has the right number of rows.
+    if y.shape[0] != X.shape[0]:
+        raise ValueError(f"y has {y.shape[0]}, but expected {X.shape[0]}")
+
+    # Make sure y is a pd.Series so we can reset its index.
     if not isinstance(y, pd.Series):
-        if y.shape[0] == X.shape[0]:
-            y = pd.Series(y).reset_index(drop=True)
-        else:
-            raise ValueError(f"y has {y.shape[0]}, but expected {X.shape[0]}")
+        y = pd.Series(y)
+
+    X = X.reset_index(drop=True)
+    y = y.reset_index(drop=True)

-    # Ensure classes in y start at zero
+    # Ensure classes in y start at zero.
     y = y - min(y)

     df = pd.concat([X, y], axis=1, ignore_index=True)
diff --git a/skorecard/reporting/report.py b/skorecard/reporting/report.py
index 7cb924b..d52f5e4 100644
--- a/skorecard/reporting/report.py
+++ b/skorecard/reporting/report.py
@@ -127,7 +127,7 @@ def build_bucket_table(
     stats["WoE"] = (event_percentage / non_event_percentage).apply(lambda x: np.log(x))
     stats.loc[stats["Count"] == 0, "WoE"] = np.nan

-    stats["IV"] = (stats["% Non-event"] - stats["% Event"]) * stats["WoE"]
+    stats["IV"] = abs((stats["% Non-event"] - stats["% Event"]) * stats["WoE"])

     stats["% Event"] = np.round(100 * stats["% Event"], 2)
     stats["% Non-event"] = np.round(100 * stats["% Non-event"], 2)
diff --git a/skorecard/rescale/rescale.py b/skorecard/rescale/rescale.py
index 5b9602b..037e1db 100644
--- a/skorecard/rescale/rescale.py
+++ b/skorecard/rescale/rescale.py
@@ -144,12 +144,13 @@ def _calculate_scorecard_points(self):
         scorecard = pd.concat(
             [
                 scorecard,
-                pd.DataFrame(
+                pd.DataFrame.from_records(
                     [{"feature": "Intercept", "coef": self.model.intercept_[0], "bin_index": 0, "map": 0, "woe": 0}]
                 ),
             ],
             ignore_index=True,
         )
+        # return buckets, woes

         scorecard["contribution"] = scorecard["woe"] * scorecard["coef"]

diff --git a/tests/test_bucket_table_woe_values.py b/tests/test_bucket_table_woe_values.py
index dcb2ea6..5f54b3e 100644
--- a/tests/test_bucket_table_woe_values.py
+++ b/tests/test_bucket_table_woe_values.py
@@ -20,3 +20,5 @@ def test_bucket_table_woe_values():
     b_tab_woes = {x for x in b_tab_woes if pd.notna(x)}
     data_woes = set(np.round(X_woe[c].value_counts().index, 3))
     assert b_tab_woes == data_woes
+    iv_lt_zero = [x < 0 for x in bucket_table["IV"]]
+    assert True not in iv_lt_zero
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 98fdb05..7f7c0ab 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -68,8 +68,14 @@ def test_psi_values(X1_X2):

 def test_IV_values(X_y):
     """Assert IV values match expectations."""
     X, y = X_y
-    X = pd.DataFrame(X, columns=["col1", "col2"])
+    random_index = [2 * x for x in range(0, len(y))]
+    X = pd.DataFrame(X, columns=["col1", "col2"], index=random_index)
     expected_iv = {"col1": 5.307, "col2": 4.635}
     iv_vals = skorecard.reporting.report.iv(X, y)
+    np.testing.assert_array_almost_equal(pd.Series(expected_iv).values, pd.Series(iv_vals).values, decimal=2)
+    # Make sure these are still accurate if y is a pd.Series with the same
+    # non-continuous indices as X.
+    y = pd.Series(y, index=random_index)
+    iv_vals = skorecard.reporting.report.iv(X, y)
     np.testing.assert_array_almost_equal(pd.Series(expected_iv).values, pd.Series(iv_vals).values, decimal=2)