Skip to content

Commit

Permalink
Bug fix for woe_1d, plus minor cosmetic and test issues (#98)
Browse files Browse the repository at this point in the history
* Fix for pd.Series input to woe_1d

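The woe_1d function wasn't running y.reset_index() in
the case where its y input was already a pd.Series. Because
X's index was always reset, the column-wise pd.concat then
aligned X and y on mismatched index labels, so woe_1d returned
incorrect values whenever the index shared by X and y was not
the default 0..n-1 range (for example, after rows were filtered out).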

Added a check for this condition to tests/test_metrics.py
as well.

* Fix negative IV values in summary tables

The scikit-learn WOEEncoder uses ln(%bad / %good) instead of
ln(%good / %bad), which flips the sign of every WOE value.
A previous fix switched the computation of WOE in the summary
table to match. However, the IV formula in the summary table,
(% Non-event - % Event) * WoE, was left unchanged, so with the
flipped WOE the IV values came out negative in summary tables,
which I found surprising.

I added an abs() call to the IV calculation to correct this
cosmetic defect.

* DataFrame.append was removed in pandas 2.0

Tests were failing under pandas 2.0 due to the use of
DataFrame.append, which has been removed. I fixed this
by replacing it with a pd.concat call.

---------

Co-authored-by: Reinier Koops <[email protected]>
  • Loading branch information
lorenjan and Reinier Koops authored Jul 6, 2023
1 parent b47cfc2 commit 439b7e0
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 9 deletions.
16 changes: 10 additions & 6 deletions skorecard/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,18 @@ def woe_1d(X, y, epsilon=0.00001):
- counts_0: count of entries per bin where y==0
- counts_1: count of entries per bin where y==1
"""
X = X.copy().reset_index(drop=True)
# Make sure y has the right number of rows.
if y.shape[0] != X.shape[0]:
raise ValueError(f"y has {y.shape[0]}, but expected {X.shape[0]}")

# Make sure y is a pd.Series so we can reset its index.
if not isinstance(y, pd.Series):
if y.shape[0] == X.shape[0]:
y = pd.Series(y).reset_index(drop=True)
else:
raise ValueError(f"y has {y.shape[0]}, but expected {X.shape[0]}")
y = pd.Series(y)

X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Ensure classes in y start at zero
# Ensure classes in y start at zero.
y = y - min(y)

df = pd.concat([X, y], axis=1, ignore_index=True)
Expand Down
2 changes: 1 addition & 1 deletion skorecard/reporting/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def build_bucket_table(
stats["WoE"] = (event_percentage / non_event_percentage).apply(lambda x: np.log(x))
stats.loc[stats["Count"] == 0, "WoE"] = np.nan

stats["IV"] = (stats["% Non-event"] - stats["% Event"]) * stats["WoE"]
stats["IV"] = abs((stats["% Non-event"] - stats["% Event"]) * stats["WoE"])

stats["% Event"] = np.round(100 * stats["% Event"], 2)
stats["% Non-event"] = np.round(100 * stats["% Non-event"], 2)
Expand Down
3 changes: 2 additions & 1 deletion skorecard/rescale/rescale.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,13 @@ def _calculate_scorecard_points(self):
scorecard = pd.concat(
[
scorecard,
pd.DataFrame(
pd.DataFrame.from_records(
[{"feature": "Intercept", "coef": self.model.intercept_[0], "bin_index": 0, "map": 0, "woe": 0}]
),
],
ignore_index=True,
)

# return buckets, woes
scorecard["contribution"] = scorecard["woe"] * scorecard["coef"]

Expand Down
2 changes: 2 additions & 0 deletions tests/test_bucket_table_woe_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ def test_bucket_table_woe_values():
b_tab_woes = {x for x in b_tab_woes if pd.notna(x)}
data_woes = set(np.round(X_woe[c].value_counts().index, 3))
assert b_tab_woes == data_woes
iv_lt_zero = [x < 0 for x in bucket_table["IV"]]
assert True not in iv_lt_zero
8 changes: 7 additions & 1 deletion tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,14 @@ def test_psi_values(X1_X2):
def test_IV_values(X_y):
"""Assert IV values match expectations."""
X, y = X_y
X = pd.DataFrame(X, columns=["col1", "col2"])
random_index = [2 * x for x in range(0, len(y))]
X = pd.DataFrame(X, columns=["col1", "col2"], index=random_index)
expected_iv = {"col1": 5.307, "col2": 4.635}
iv_vals = skorecard.reporting.report.iv(X, y)
np.testing.assert_array_almost_equal(pd.Series(expected_iv).values, pd.Series(iv_vals).values, decimal=2)

# Make sure these are still accurate if y is a pd.Series with the same
# non-continuous indices as X.
y = pd.Series(y, index=random_index)
iv_vals = skorecard.reporting.report.iv(X, y)
np.testing.assert_array_almost_equal(pd.Series(expected_iv).values, pd.Series(iv_vals).values, decimal=2)

0 comments on commit 439b7e0

Please sign in to comment.