Add explicit state setting (#242)
Set the random states explicitly, so that repeated runs produce reproducible results (see the usage sketch after the task list).

Tasks:

- [x] Adjust the code where "sample" does not use random_state
- [x] Adjust the test code for it
- [x] Make sure the tests use it consistently.
- [x] Look into removing some unnecessary checks, as mentioned in issue #221
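As a rough usage sketch of what this enables (the dataset, model, and parameter values below are illustrative, not from this PR):

```python
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.feature_elimination import ShapRFECV

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
clf = LGBMClassifier(random_state=0)

# With random_state set, the SHAP background sampling is now seeded too,
# so two identical runs produce the same elimination report.
shap_elimination = ShapRFECV(clf, step=0.2, cv=5, scoring="roc_auc", random_state=42)
report = shap_elimination.fit_compute(X, y)
```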
Reinier Koops authored Mar 28, 2024
1 parent 9a4d3ab commit c1285c3
Showing 16 changed files with 428 additions and 402 deletions.
30 changes: 15 additions & 15 deletions probatus/feature_elimination/feature_elimination.py
@@ -7,6 +7,7 @@
 from sklearn.base import clone, is_classifier, is_regressor
 from sklearn.model_selection import check_cv
 from sklearn.model_selection._search import BaseSearchCV
+from loguru import logger
 
 from probatus.utils import (
     BaseFitComputePlotClass,
@@ -156,9 +157,8 @@ def __init__(
             Controls verbosity of the output:
             - 0 - neither prints nor warnings are shown
-            - 1 - 50 - only most important warnings
-            - 51 - 100 - shows other warnings and prints
-            - above 100 - presents all prints and all warnings (including SHAP warnings).
+            - 1 - only most important warnings
+            - 2 - shows all prints and all warnings.
         random_state (int, optional):
             Random state set at each round of feature elimination. If it is None, the results will not be
@@ -395,7 +395,7 @@ def _get_feature_shap_values_per_fold(
         score_val = self.scorer.scorer(clf, X_val, y_val)
 
         # Compute SHAP values
-        shap_values = shap_calc(clf, X_val, verbose=self.verbose, **shap_kwargs)
+        shap_values = shap_calc(clf, X_val, verbose=self.verbose, random_state=self.random_state, **shap_kwargs)
         return shap_values, score_train, score_val
 
     def fit(
@@ -537,7 +537,7 @@ def fit(
             self.min_features_to_select = 0
             # This ensures that, if columns_to_keep is provided,
             # the last features remaining are only the columns_to_keep.
-            if self.verbose > 50:
+            if self.verbose > 1:
                 warnings.warn(f"Minimum features to select : {stopping_criteria}")
 
         while len(current_features_set) > stopping_criteria:
@@ -615,8 +615,8 @@ def fit(
                     val_metric_mean=np.mean(scores_val),
                     val_metric_std=np.std(scores_val),
                 )
-            if self.verbose > 50:
-                print(
+            if self.verbose > 1:
+                logger.info(
                     f"Round: {round_number}, Current number of features: {len(current_features_set)}, "
                     f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                     f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
@@ -841,8 +841,8 @@ def _get_best_num_features(self, best_method, standard_error_threshold=1.0):
         )
 
         # Log shap_report for users who want to inspect / debug
-        if self.verbose > 50:
-            print(shap_report)
+        if self.verbose > 1:
+            logger.info(shap_report)
 
         return best_num_features

@@ -1110,10 +1110,9 @@ def __init__(
         verbose (int, optional):
             Controls verbosity of the output:
-            - 0 - nether prints nor warnings are shown
-            - 1 - 50 - only most important warnings
-            - 51 - 100 - shows other warnings and prints
-            - above 100 - presents all prints and all warnings (including SHAP warnings).
+            - 0 - neither prints nor warnings are shown
+            - 1 - only most important warnings
+            - 2 - shows all prints and all warnings.
         random_state (int, optional):
             Random state set at each round of feature elimination. If it is None, the results will not be
@@ -1210,7 +1209,8 @@ def _get_fit_params_lightGBM(
             "eval_set": [(X_val, y_val)],
             "callbacks": [early_stopping(self.early_stopping_rounds, first_metric_only=True)],
         }
-        if self.verbose >= 100:
+
+        if self.verbose >= 2:
             fit_params["callbacks"].append(log_evaluation(1))
         else:
             fit_params["callbacks"].append(log_evaluation(0))
@@ -1505,5 +1505,5 @@ def _get_feature_shap_values_per_fold(
         score_val = self.scorer.scorer(clf, X_val, y_val)
 
         # Compute SHAP values
-        shap_values = shap_calc(clf, X_val, verbose=self.verbose, **shap_kwargs)
+        shap_values = shap_calc(clf, X_val, verbose=self.verbose, random_state=self.random_state, **shap_kwargs)
         return shap_values, score_train, score_val
16 changes: 12 additions & 4 deletions probatus/interpret/model_interpret.py
@@ -80,7 +80,7 @@ class ShapModelInterpreter(BaseFitComputePlotClass):
     <img src="../img/model_interpret_sample.png" width="320" />
     """
 
-    def __init__(self, clf, scoring="roc_auc", verbose=0):
+    def __init__(self, clf, scoring="roc_auc", verbose=0, random_state=None):
         """
         Initializes the class.
@@ -98,13 +98,17 @@ def __init__(self, clf, scoring="roc_auc", verbose=0):
             Controls verbosity of the output:
             - 0 - neither prints nor warnings are shown
-            - 1 - 50 - only most important warnings
-            - 51 - 100 - shows other warnings and prints
-            - above 100 - presents all prints and all warnings (including SHAP warnings).
+            - 1 - only most important warnings
+            - 2 - shows all prints and all warnings.
+        random_state (int, optional):
+            Random state set for the nr of samples. If it is None, the results will not be reproducible. For
+            reproducible results set it to an integer.
         """
         self.clf = clf
         self.scorer = get_single_scorer(scoring)
         self.verbose = verbose
+        self.random_state = random_state

     def fit(
         self,
@@ -186,6 +190,7 @@ def fit(
             column_names=self.column_names,
             class_names=self.class_names,
             verbose=self.verbose,
+            random_state=self.random_state,
             **shap_kwargs,
         )
 
@@ -200,6 +205,7 @@ def fit(
             column_names=self.column_names,
             class_names=self.class_names,
             verbose=self.verbose,
+            random_state=self.random_state,
             **shap_kwargs,
         )
 
@@ -212,6 +218,7 @@ def _prep_shap_related_variables(
         y,
         approximate=False,
         verbose=0,
+        random_state=None,
         column_names=None,
         class_names=None,
         **shap_kwargs,
@@ -228,6 +235,7 @@ def _prep_shap_related_variables(
             X,
             approximate=approximate,
             verbose=verbose,
+            random_state=random_state,
             return_explainer=True,
             **shap_kwargs,
         )
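A hedged usage sketch of the new argument (model and data are illustrative; fit/compute follow the documented ShapModelInterpreter API):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from probatus.interpret import ShapModelInterpreter

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# random_state now also seeds the sampling done inside shap_calc,
# so the computed SHAP values are stable across runs.
interpreter = ShapModelInterpreter(clf, scoring="roc_auc", random_state=42)
interpreter.fit(X_train, X_test, y_train, y_test)
importance_df = interpreter.compute()
```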
21 changes: 16 additions & 5 deletions probatus/interpret/shap_dependence.py
@@ -52,7 +52,7 @@ class DependencePlotter(BaseFitComputePlotClass):
     <img src="../img/model_interpret_dep.png"/>
     """
 
-    def __init__(self, clf, verbose=0):
+    def __init__(self, clf, verbose=0, random_state=None):
         """
         Initializes the class.
@@ -64,12 +64,16 @@ def __init__(self, clf, verbose=0):
             Controls verbosity of the output:
             - 0 - neither prints nor warnings are shown
-            - 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
-            - 51 - 100 - shows most important warnings, prints of the feature removal process
-            - above 100 - presents all prints and all warnings (including SHAP warnings).
+            - 1 - only most important warnings
+            - 2 - shows all prints and all warnings.
+        random_state (int, optional):
+            Random state set for the nr of samples. If it is None, the results will not be reproducible. For
+            reproducible results set it to an integer.
         """
         self.clf = clf
         self.verbose = verbose
+        self.random_state = random_state
 
     def __repr__(self):
         """
@@ -113,7 +117,14 @@ def fit(self, X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs):
         if self.class_names is None:
             self.class_names = ["Negative Class", "Positive Class"]
 
-        self.shap_vals_df = shap_to_df(self.clf, self.X, precalc_shap=precalc_shap, verbose=self.verbose, **shap_kwargs)
+        self.shap_vals_df = shap_to_df(
+            self.clf,
+            self.X,
+            precalc_shap=precalc_shap,
+            verbose=self.verbose,
+            random_state=self.random_state,
+            **shap_kwargs,
+        )
 
         self.fitted = True
         return self
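And the analogous sketch for DependencePlotter (again illustrative data; the integer feature index assumes the default column names given a numpy input):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from probatus.interpret import DependencePlotter

X, y = make_classification(n_samples=300, n_features=5, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

# Seeding the plotter makes the SHAP values behind the dependence plot reproducible.
plotter = DependencePlotter(clf, random_state=42)
plotter.fit(X, y)
plotter.plot(feature=0)
```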
24 changes: 12 additions & 12 deletions probatus/sample_similarity/resemblance_model.py
@@ -21,6 +21,7 @@
 import warnings
 
 import matplotlib.pyplot as plt
+from loguru import logger
 import numpy as np
 import pandas as pd
 from shap import summary_plot
@@ -76,9 +77,8 @@ class is 'roc_auc'.
         Controls verbosity of the output:
         - 0 - neither prints nor warnings are shown
-        - 1 - 50 - only most important warnings
-        - 51 - 100 - shows other warnings and prints
-        - above 100 - presents all prints and all warnings (including SHAP warnings).
+        - 1 - only most important warnings
+        - 2 - shows all prints and all warnings.
     random_state (int, optional):
         Random state set at each round of feature elimination. If it is None, the results will not be
@@ -178,8 +178,8 @@ def fit(self, X1, X2, column_names=None, class_names=None):
             f"Train {self.scorer.metric_name}: {np.round(self.train_score, 3)},\n"
             f"Test {self.scorer.metric_name}: {np.round(self.test_score, 3)}."
         )
-        if self.verbose > 50:
-            print(f"Finished model training: \n{self.results_text}")
+        if self.verbose > 1:
+            logger.info(f"Finished model training: \n{self.results_text}")
 
         if self.verbose > 0:
             if self.train_score > self.test_score:
@@ -343,9 +343,8 @@ class is 'roc_auc'.
         Controls verbosity of the output:
         - 0 - neither prints nor warnings are shown
-        - 1 - 50 - only most important warnings
-        - 51 - 100 - shows other warnings and prints
-        - above 100 - presents all prints and all warnings (including SHAP warnings).
+        - 1 - only most important warnings
+        - 2 - shows all prints and all warnings.
     random_state (int, optional):
         Random state set at each round of feature elimination. If it is None, the results will not be
@@ -572,9 +571,8 @@ class is 'roc_auc'.
         Controls verbosity of the output:
         - 0 - neither prints nor warnings are shown
-        - 1 - 50 - only most important warnings
-        - 51 - 100 - shows other warnings and prints
-        - above 100 - presents all prints and all warnings (including SHAP warnings).
+        - 1 - only most important warnings
+        - 2 - shows all prints and all warnings.
     random_state (int, optional):
         Random state set at each round of feature elimination. If it is None, the results will not be
@@ -630,7 +628,9 @@ def fit(self, X1, X2, column_names=None, class_names=None, **shap_kwargs):
         """
         super().fit(X1=X1, X2=X2, column_names=column_names, class_names=class_names)
 
-        self.shap_values_test = shap_calc(self.clf, self.X_test, verbose=self.verbose, **shap_kwargs)
+        self.shap_values_test = shap_calc(
+            self.clf, self.X_test, verbose=self.verbose, random_state=self.random_state, **shap_kwargs
+        )
         self.report = calculate_shap_importance(self.shap_values_test, self.column_names)
         return self

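The print-to-loguru swap follows the same pattern everywhere in this file; a minimal sketch (the verbose value and message are illustrative):

```python
from loguru import logger

verbose = 2  # illustrative value on the new 0/1/2 scale

# Before: `if self.verbose > 50: print(...)`. Now the gate uses the new scale
# and emits through loguru, which adds timestamps and levels and can be
# silenced or redirected centrally.
if verbose > 1:
    logger.info("Finished model training.")
```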
2 changes: 1 addition & 1 deletion probatus/utils/__init__.py
@@ -36,7 +36,7 @@
     assure_list_values_allowed,
 )
 from .plots import plot_distributions_of_feature
-from .interface import BaseFitComputeClass, BaseFitComputePlotClass
+from .base_class_interface import BaseFitComputeClass, BaseFitComputePlotClass
 
 __all__ = [
     "NotFittedError",
11 changes: 5 additions & 6 deletions probatus/utils/arrayfuncs.py
@@ -189,9 +189,9 @@ def preprocess_data(X, X_name=None, column_names=None, verbose=0):
         Controls verbosity of the output:
         - 0 - neither prints nor warnings are shown
-        - 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
-        - 51 - 100 - shows most important warnings, prints of the feature removal process
-        - above 100 - presents all prints and all warnings (including SHAP warnings).
+        - 1 - only most important warnings
+        - 2 - shows all prints and all warnings.
 
     Returns:
         (pd.DataFrame):
@@ -255,9 +255,8 @@ def preprocess_labels(y, y_name=None, index=None, verbose=0):
         Controls verbosity of the output:
         - 0 - neither prints nor warnings are shown
-        - 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
-        - 51 - 100 - shows most important warnings, prints of the feature removal process
-        - above 100 - presents all prints and all warnings (including SHAP warnings).
+        - 1 - only most important warnings
+        - 2 - shows all prints and all warnings.
 
     Returns:
         (pd.Series):
File renamed without changes: probatus/utils/interface.py → probatus/utils/base_class_interface.py
16 changes: 10 additions & 6 deletions probatus/utils/shap_helpers.py
@@ -33,6 +33,7 @@ def shap_calc(
     X,
     return_explainer=False,
     verbose=0,
+    random_state=None,
     sample_size=100,
     approximate=False,
     check_additivity=True,
@@ -54,10 +55,13 @@
         verbose (int, optional):
             Controls verbosity of the output:
-            - 0 - nether prints nor warnings are shown
-            - 1 - 50 - only most important warnings
-            - 51 - 100 - shows other warnings and prints
-            - above 100 - presents all prints and all warnings (including SHAP warnings).
+            - 0 - neither prints nor warnings are shown
+            - 1 - only most important warnings
+            - 2 - shows all prints and all warnings.
+        random_state (int, optional):
+            Random state set for the nr of samples. If it is None, the results will not be reproducible. For
+            reproducible results set it to an integer.
 
         approximate (boolean):
             if True uses shap approximations - less accurate, but very fast. It applies to tree-based explainers only.
@@ -82,7 +86,7 @@
         )
         # Suppress warnings regarding XGboost and Lightgbm models.
         with warnings.catch_warnings():
-            if verbose <= 100:
+            if verbose <= 1:
                 warnings.simplefilter("ignore")
 
             # For tree explainers, do not pass masker when feature_perturbation is
@@ -100,7 +104,7 @@
                 sample_size = int(np.ceil(X.shape[0] * 0.2))
             else:
                 pass
-            mask = sample(X, sample_size)
+            mask = sample(X, sample_size, random_state=random_state)
             explainer = Explainer(model, masker=mask, **shap_kwargs)
 
         # For tree-explainers allow for using check_additivity and approximate arguments
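To see the seeded masker in isolation, a small sketch built on shap's sampling utility (assuming only that shap.utils.sample accepts a random_state argument, as used in the hunk above; the data is illustrative):

```python
import numpy as np
import pandas as pd
from shap.utils import sample

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.random((1_000, 5)))

# The same seed yields the same 100-row background sample, so an Explainer
# built on this masker produces identical SHAP values on every run.
mask_a = sample(X, 100, random_state=42)
mask_b = sample(X, 100, random_state=42)
assert mask_a.equals(mask_b)
```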
1 change: 1 addition & 0 deletions pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
     "shap>=0.43.0 ; python_version != '3.8'",
     "numpy>=1.23.2",
     "numba>=0.57.0",
+    "loguru>=0.7.2",
 ]

[project.urls]