diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..ce47c998 --- /dev/null +++ b/404.html @@ -0,0 +1,880 @@ + + + +
+ + + + + + + + + + + + + + + + + + +This module focuses on feature elimination and it contains two classes:
+ShapRFECV
+
+
+
+ Bases: BaseFitComputePlotClass
This class performs Backwards Recursive Feature Elimination, using SHAP feature importance.
+At each round, for a + given feature set, starting from all available features, the following steps are applied:
+step
lowest SHAP importance features from the dataset.At the end of the process, the user can plot the performance of the model for each iteration, and select the + optimal number of features and the features set.
+The functionality is
+ similar to RFECV.
+ The main difference is removing the lowest importance features based on SHAP features importance. It also
+ supports the use of sklearn compatible search CV for hyperparameter optimization e.g.
+ GridSearchCV,
+ RandomizedSearchCV, or
+ BayesSearchCV, which
+ needs to be passed as the model
. Thanks to this you can perform hyperparameter optimization at each step of
+ the feature elimination. Lastly, it supports categorical features (object and category dtype) and missing values
+ in the data, as long as the model supports them.
We recommend using LGBMClassifier, + because by default it handles missing values and categorical features. In case of other models, make sure to + handle these issues for your dataset and consider the impact it might have on feature importance.
+Example:
+import numpy as np
+import pandas as pd
+from probatus.feature_elimination import ShapRFECV
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import RandomizedSearchCV
+
+feature_names = [
+ 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7',
+ 'f8', 'f9', 'f10', 'f11', 'f12', 'f13',
+ 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20']
+
+# Prepare two samples
+X, y = make_classification(n_samples=200, class_sep=0.05, n_informative=6, n_features=20,
+ random_state=0, n_redundant=10, n_clusters_per_class=1)
+X = pd.DataFrame(X, columns=feature_names)
+
+
+# Prepare model and parameter search space
+model = RandomForestClassifier(max_depth=5, class_weight='balanced')
+
+param_grid = {
+ 'n_estimators': [5, 7, 10],
+ 'min_samples_leaf': [3, 5, 7, 10],
+}
+search = RandomizedSearchCV(model, param_grid)
+
+
+# Run feature elimination
+shap_elimination = ShapRFECV(
+ model=search, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
+report = shap_elimination.fit_compute(X, y)
+
+# Make plots
+performance_plot = shap_elimination.plot()
+
+# Get final feature set
+final_features_set = shap_elimination.get_reduced_features_set(num_features=3)
+
+
+
+ probatus/feature_elimination/feature_elimination.py
23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 
+423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 +674 +675 +676 +677 +678 +679 +680 +681 +682 +683 +684 +685 +686 +687 +688 +689 +690 +691 +692 +693 +694 +695 +696 +697 +698 +699 +700 +701 +702 +703 +704 +705 +706 +707 +708 +709 +710 +711 +712 +713 +714 +715 +716 +717 +718 +719 +720 +721 +722 +723 +724 +725 +726 +727 +728 +729 +730 +731 +732 +733 +734 +735 +736 +737 +738 +739 +740 +741 +742 +743 +744 +745 +746 +747 +748 +749 +750 +751 +752 +753 +754 +755 +756 +757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 +809 +810 +811 +812 +813 +814 +815 +816 +817 +818 +819 +820 +821 +822 
+823 +824 +825 +826 +827 +828 +829 +830 +831 +832 +833 +834 +835 +836 +837 +838 +839 +840 +841 +842 +843 +844 +845 +846 +847 +848 +849 +850 +851 +852 +853 +854 +855 +856 +857 +858 +859 +860 +861 +862 +863 +864 +865 +866 +867 +868 +869 +870 +871 +872 +873 +874 +875 +876 +877 +878 +879 +880 +881 +882 +883 +884 +885 +886 +887 +888 +889 +890 +891 +892 +893 +894 +895 +896 +897 +898 +899 +900 +901 +902 +903 +904 +905 +906 +907 +908 +909 +910 +911 +912 +913 +914 +915 +916 +917 +918 +919 +920 +921 +922 +923 +924 +925 +926 +927 +928 +929 +930 +931 +932 +933 +934 +935 +936 +937 +938 +939 +940 +941 +942 +943 +944 |
|
__init__(model, step=1, min_features_to_select=1, cv=None, scoring='roc_auc', n_jobs=-1, verbose=0, random_state=None)
+
+This method initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ classifier or regressor, sklearn compatible search CV e.g. GridSearchCV, RandomizedSearchCV or BayesSearchCV
+ |
+
+
+
+ A model that will be optimized and trained at each round of feature elimination. The recommended model +is LGBMClassifier, +because it by default handles the missing values and categorical variables. This parameter also supports +any hyperparameter search schema that is consistent with the sklearn API e.g. +GridSearchCV, +RandomizedSearchCV +or BayesSearchCV. + |
+ + required + | +
step |
+
+ int or float
+ |
+
+
+
+ Number of lowest importance features removed each round. If it is an int, then each round such a number of +features are discarded. If float, such a percentage of remaining features (rounded down) is removed each +iteration. It is recommended to use float, since it is faster for a large number of features, and slows +down and becomes more precise with fewer features. Note: the last round may remove fewer features in +order to reach min_features_to_select. +If columns_to_keep parameter is specified in the fit method, step is the number of features to remove after +keeping those columns. + |
+
+ 1
+ |
+
min_features_to_select |
+
+ int
+ |
+
+
+
+ Minimum number of features to be kept. This is a stopping criterion of the feature elimination. By +default the process stops when one feature is left. If columns_to_keep is specified in the fit method, +it may override this parameter to the maximum of this value and the length of columns_to_keep. + |
+
+ 1
+ |
+
cv |
+
+ int, cross-validation generator or an iterable
+ |
+
+
+
+ Determines the cross-validation splitting strategy. Compatible with sklearn +cv parameter. +If None, then cv of 5 is used. + |
+
+ None
+ |
+
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with predefined +classification scorer names in sklearn. + |
+
+ 'roc_auc'
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of cores to run in parallel while fitting across folds. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
+ |
+
+ -1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set at each round of feature elimination. If it is None, the results will not be +reproducible and in random search at each iteration different hyperparameters might be tested. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/feature_elimination/feature_elimination.py
104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 |
|
compute()
+
+Checks if the fit() method has been run
+and computes the DataFrame with results of feature elimination for each round.
+ + + +Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ DataFrame with results of feature elimination for each round. + |
+
probatus/feature_elimination/feature_elimination.py
178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 |
|
fit(X, y, sample_weight=None, columns_to_keep=None, column_names=None, groups=None, shap_variance_penalty_factor=None, **shap_kwargs)
+
+Fits the object with the provided data.
+The algorithm starts with the entire dataset, and then sequentially
+ eliminates features. If sklearn compatible search CV is passed as model e.g.
+ GridSearchCV,
+ RandomizedSearchCV
+ or BayesSearchCV,
+ the hyperparameter optimization is applied at each step of the elimination.
+ Then, the SHAP feature importance is calculated using Cross-Validation,
+ and step
lowest importance features are removed.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ Provided dataset. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ Labels for X. + |
+ + required + | +
sample_weight |
+
+ (Series, ndarray, list)
+ |
+
+
+
+ array-like of shape (n_samples,) - only use if the model you're using supports +sample weighting (check the corresponding scikit-learn documentation). +Array of weights that are assigned to individual samples. +Note that they're only used for fitting of the model, not during evaluation of metrics. +If not provided, then each sample is given unit weight. + |
+
+ None
+ |
+
columns_to_keep |
+
+ list of str
+ |
+
+
+
+ List of column names to keep. If given, +these columns will not be eliminated by the feature elimination process. +However, these features will be used for the calculation of the SHAP values. + |
+
+ None
+ |
+
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
groups |
+
+ (Series, ndarray, list)
+ |
+
+
+
+ array-like of shape (n_samples,)
+Group labels for the samples used while splitting the dataset into train/test set.
+Only used in conjunction with a "Group" |
+
+ None
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ ShapRFECV
+ |
+
+
+
+ Fitted object. + |
+
probatus/feature_elimination/feature_elimination.py
266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 |
|
fit_compute(X, y, sample_weight=None, columns_to_keep=None, column_names=None, shap_variance_penalty_factor=None, **shap_kwargs)
+
+Fits the object with the provided data.
+The algorithm starts with the entire dataset, and then sequentially
+ eliminates features. If sklearn compatible search CV is passed as model e.g.
+ GridSearchCV,
+ RandomizedSearchCV
+ or BayesSearchCV,
+ the hyperparameter optimization is applied at each step of the elimination.
+ Then, the SHAP feature importance is calculated using Cross-Validation,
+ and step
lowest importance features are removed. At the end, the
+ report containing results from each iteration is computed and returned to the user.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ Provided dataset. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ Labels for X. + |
+ + required + | +
sample_weight |
+
+ (Series, ndarray, list)
+ |
+
+
+
+ array-like of shape (n_samples,) - only use if the model you're using supports +sample weighting (check the corresponding scikit-learn documentation). +Array of weights that are assigned to individual samples. +Note that they're only used for fitting of the model, not during evaluation of metrics. +If not provided, then each sample is given unit weight. + |
+
+ None
+ |
+
columns_to_keep |
+
+ list of str
+ |
+
+
+
+ List of columns to keep. If given, these columns will not be eliminated. + |
+
+ None
+ |
+
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ DataFrame containing results of feature elimination from each iteration. + |
+
probatus/feature_elimination/feature_elimination.py
192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 |
|
get_reduced_features_set(num_features, standard_error_threshold=1.0, return_type='feature_names')
+
+Gets the features set after the feature elimination process, for a given number of features.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
num_features |
+
+ int or str
+ |
+
+
+
+ If int: Number of features in the reduced features set. +If str: One of the following automatic num feature selection methods supported: + 1. best: strictly selects the num_features with the highest model score. + 2. best_coherent: For iterations that are within standard_error_threshold of the highest + score, select the iteration with the lowest standard deviation of model score. + 3. best_parsimonious: For iterations that are within standard_error_threshold of the + highest score, select the iteration with the fewest features. + |
+ + required + | +
standard_error_threshold |
+
+ float
+ |
+
+
+
+ If num_features is 'best_coherent' or 'best_parsimonious', this parameter is used. + |
+
+ 1.0
+ |
+
return_type |
+ + | +
+
+
+ Accepts possible values of 'feature_names', 'support' or 'ranking'. These are defined as: + 1. feature_names: returns column names + 2. support: returns boolean mask + 3. ranking: returns numeric ranking of features + |
+
+ 'feature_names'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ list of str
+ |
+
+
+
+ Reduced features set. + |
+
probatus/feature_elimination/feature_elimination.py
757 +758 +759 +760 +761 +762 +763 +764 +765 +766 +767 +768 +769 +770 +771 +772 +773 +774 +775 +776 +777 +778 +779 +780 +781 +782 +783 +784 +785 +786 +787 +788 +789 +790 +791 +792 +793 +794 +795 +796 +797 +798 +799 +800 +801 +802 +803 +804 +805 +806 +807 +808 |
|
plot(show=True, **figure_kwargs)
+
+Generates plot of the model performance for each iteration of feature elimination.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are showed to the user, otherwise they are not shown. Not showing plot can be useful, +when you want to edit the returned figure, before showing it. + |
+
+ True
+ |
+
**figure_kwargs |
+ + | +
+
+
+ Keyword arguments that are passed to the plt.figure, at its initialization. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ figure
+ |
+
+
+
+ Figure containing the performance plot. + |
+
probatus/feature_elimination/feature_elimination.py
459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 |
|
The aim of this module is to provide tools for model interpretation using the SHAP library. +The class below is a convenience wrapper that implements multiple plots for tree-based & linear models.
+ + +ShapModelInterpreter
+
+
+
+ Bases: BaseFitComputePlotClass
This class is a wrapper that allows you to easily analyse a model's features.
+It allows us to plot SHAP feature importance, + SHAP summary plot and SHAP dependence plots.
+Example:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from probatus.interpret import ShapModelInterpreter
+import numpy as np
+import pandas as pd
+
+feature_names = ['f1', 'f2', 'f3', 'f4']
+
+# Prepare two samples
+X, y = make_classification(n_samples=5000, n_features=4, random_state=0)
+X = pd.DataFrame(X, columns=feature_names)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Prepare and fit model. Remember about class_weight="balanced" or an equivalent.
+model = RandomForestClassifier(class_weight='balanced', n_estimators = 100, max_depth=2, random_state=0)
+model.fit(X_train, y_train)
+
+# Train ShapModelInterpreter
+shap_interpreter = ShapModelInterpreter(model)
+feature_importance = shap_interpreter.fit_compute(X_train, X_test, y_train, y_test)
+
+# Make plots
+ax1 = shap_interpreter.plot('importance')
+ax2 = shap_interpreter.plot('summary')
+ax3 = shap_interpreter.plot('dependence', target_columns=['f1', 'f2'])
+ax4 = shap_interpreter.plot('sample', samples_index=[X_test.index.tolist()[0]])
+
++ + +
+ +probatus/interpret/model_interpret.py
19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 
+419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 |
|
__init__(model, scoring='roc_auc', verbose=0, random_state=None)
+
+Initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ classifier or regressor
+ |
+
+
+
+ Model fitted on X_train. + |
+ + required + | +
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined classification scorer names in sklearn +(link). +Another option is using probatus.utils.Scorer to define a custom metric. + |
+
+ 'roc_auc'
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set for the number of samples. If it is None, the results will not be reproducible. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/interpret/model_interpret.py
63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 |
|
compute(return_scores=False, shap_variance_penalty_factor=None)
+
+Computes the DataFrame that presents the importance of each feature.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return the train and test score of the model, together with +the model interpretation report. If true, the output of this method is a tuple of DataFrame, float, +float. + |
+
+ False
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame or tuple(DataFrame, float, float)
+ |
+
+
+
+ Dataframe with SHAP feature importance, or tuple containing the dataframe, train and test scores of the +model. + |
+
probatus/interpret/model_interpret.py
232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 |
|
fit(X_train, X_test, y_train, y_test, column_names=None, class_names=None, **shap_kwargs)
+
+Fits the object and calculates the shap values for the provided datasets.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X_train |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing training data. + |
+ + required + | +
X_test |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing test data. + |
+ + required + | +
y_train |
+
+ Series
+ |
+
+
+
+ Series of labels for train data. + |
+ + required + | +
y_test |
+
+ Series
+ |
+
+
+
+ Series of labels for test data. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
probatus/interpret/model_interpret.py
93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 |
|
fit_compute(X_train, X_test, y_train, y_test, column_names=None, class_names=None, return_scores=False, shap_variance_penalty_factor=None, **shap_kwargs)
+
+Fits the object and calculates the shap values for the provided datasets.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X_train |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing training data. + |
+ + required + | +
X_test |
+
+ DataFrame
+ |
+
+
+
+ Dataframe containing test data. + |
+ + required + | +
y_train |
+
+ Series
+ |
+
+
+
+ Series of labels for train data. + |
+ + required + | +
y_test |
+
+ Series
+ |
+
+
+
+ Series of labels for test data. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. +If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. +If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return +the train and test score of the model, +together with the model interpretation report. If true, +the output of this method is a tuple of DataFrame, float, +float. + |
+
+ False
+ |
+
shap_variance_penalty_factor |
+
+ int or float
+ |
+
+
+
+ Apply aggregation penalty when computing average of shap values for a given feature. +Results in a preference for features that have smaller standard deviation of shap +values (more coherent shap importance). Recommend value 0.5 - 1.0. +Formula: penalized_shap_mean = (mean_shap - (std_shap * shap_variance_penalty_factor)) + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame or tuple(DataFrame, float, float)
+ |
+
+
+
+ Dataframe with SHAP feature importance, or tuple containing the dataframe, train and test scores of the +model. + |
+
probatus/interpret/model_interpret.py
287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 |
|
plot(plot_type, target_set='test', target_columns=None, samples_index=None, show=True, **plot_kwargs)
+
+Plots the appropriate SHAP plot.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plot_type |
+
+ str
+ |
+
+
+
+ One of the following: +
|
+ + required + | +
target_set |
+
+ str
+ |
+
+
+
+ The set for which the plot should be generated, either |
+
+ 'test'
+ |
+
target_columns |
+
+ None, str or list of str
+ |
+
+
+
+ List of features names, for which the plots should be generated. If None, all features will be plotted. + |
+
+ None
+ |
+
samples_index |
+
+ (None, int, list or Index)
+ |
+
+
+
+ Index of samples to be explained if the plot_type is 'sample'. |
+
+ None
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are showed to the user, otherwise they are not shown. Not showing plot can be useful, +when you want to edit the returned axis, before showing it. + |
+
+ True
+ |
+
**plot_kwargs |
+ + | +
+
+
+ Keyword arguments passed to the plot method. For 'importance' and 'summary' plot_type, the kwargs are +passed to shap.summary_plot, for 'dependence' plot_type, they are passed to +probatus.interpret.DependencePlotter.plot method. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ axes or list(axes)
+ |
+
+
+
+ An Axes with the plot, or list of axes when multiple plots are returned. + |
+
probatus/interpret/model_interpret.py
360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 +480 +481 +482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 |
|
DependencePlotter
+
+
+
+ Bases: BaseFitComputePlotClass
Plotter used to plot SHAP dependence plot together with the target rates.
+Currently it supports tree-based and linear models.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+ + | +
+
+
+ classifier for which interpretation is done. + |
+ + required + | +
Example:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from probatus.interpret import DependencePlotter
+
+X, y = make_classification(n_samples=15, n_features=3, n_informative=3, n_redundant=0, random_state=42)
+model = RandomForestClassifier().fit(X, y)
+bdp = DependencePlotter(model)
+shap_values = bdp.fit_compute(X, y)
+
+bdp.plot(feature=2)
+
+
+
+ probatus/interpret/shap_dependence.py
9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 +250 +251 +252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 |
|
__init__(model, verbose=0, random_state=None)
+
+Initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ regression or classification model or pipeline. + |
+ + required + | +
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set for the nr of samples. If it is None, the results will not be reproducible. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/interpret/shap_dependence.py
35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 |
|
__repr__()
+
+Represent string method.
+ +probatus/interpret/shap_dependence.py
58 +59 +60 +61 +62 |
|
compute()
+
+Computes the report returned to the user, namely the SHAP values generated on the dataset.
+ + + +Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ SHAP Values for X. + |
+
probatus/interpret/shap_dependence.py
112 +113 +114 +115 +116 +117 +118 +119 +120 +121 |
|
fit(X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs)
+
+Fits the plotter to the model and data by computing the shap values.
+If the shap_values are passed, they do not need to be computed.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ input variables. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ target variable. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
precalc_shap |
+
+ (Optional, None or array)
+ |
+
+
+
+ Precalculated shap values. If provided, they don't need to be computed. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
probatus/interpret/shap_dependence.py
64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 |
|
fit_compute(X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs)
+
+Fits the plotter to the model and data by computing the shap values.
+If the shap_values are passed, they do not need to be computed.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X |
+
+ DataFrame
+ |
+
+
+
+ Provided dataset. + |
+ + required + | +
y |
+
+ Series
+ |
+
+
+
+ Labels for X. + |
+ + required + | +
column_names |
+
+ None, or list of str
+ |
+
+
+
+ List of feature names for the dataset. If None, then column names from the X_train dataframe are used. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are +used. + |
+
+ None
+ |
+
precalc_shap |
+
+ (Optional, None or array)
+ |
+
+
+
+ Precalculated shap values. If provided, they don't need to be computed. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+
+
+
+ SHAP Values for X. + |
+
probatus/interpret/shap_dependence.py
123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 |
|
plot(feature, figsize=(15, 10), bins=10, show=True, min_q=0, max_q=1, alpha=1.0)
+
+Plots the shap values for data points for a given feature, as well as the target rate and values distribution.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
feature |
+
+ str or int
+ |
+
+
+
+ Feature name of the feature to be analyzed. + |
+ + required + | +
figsize |
+
+ float, float)
+ |
+
+
+
+ Tuple specifying size (width, height) of resulting figure in inches. + |
+
+ (15, 10)
+ |
+
bins |
+
+ int or list[float]
+ |
+
+
+
+ Number of bins or boundaries of bins (supplied in list) for target-rate plot. + |
+
+ 10
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are showed to the user, otherwise they are not shown. Not showing plot can be useful, +when you want to edit the returned axis, before showing it. + |
+
+ True
+ |
+
min_q |
+
+ float
+ |
+
+
+
+ Optional minimum quantile from which to consider values, used for plotting under outliers. + |
+
+ 0
+ |
+
max_q |
+
+ float
+ |
+
+
+
+ Optional maximum quantile until which data points are considered, used for plotting under outliers. + |
+
+ 1
+ |
+
alpha |
+
+ float
+ |
+
+
+
+ Optional alpha blending value, between 0 (transparent) and 1 (opaque). + |
+
+ 1.0
+ |
+
Returns + (list(matplotlib.axes)): + List of axes that include the plots.
+ +probatus/interpret/shap_dependence.py
160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 |
|
The goal of sample similarity module is understanding how different two samples are from a multivariate perspective.
+One of the ways to indicate this is Resemblance Model. Having two datasets - say X1 and X2 - one can analyse how easy it is to recognize which dataset a randomly selected row comes from. The Resemblance model assigns label 0 to the dataset X1, and label 1 to X2 and trains a binary classification model to predict which sample a given row comes from. +By looking at the test AUC, one can conclude that the samples have a different distribution if the AUC is significantly higher than 0.5. Furthermore, by analysing feature importance one can understand which of the features have predictive power.
+ +The following features are implemented:
+BaseResemblanceModel
+
+
+
+ Bases: BaseFitComputePlotClass
This model checks for the similarity of two samples.
+A possible use case is analysis of whether the train sample differs +from the test sample, due to e.g. non-stationarity.
+This is a base class and needs to be extended by a fit() method, which implements how the data is split, +how the model is trained and evaluated. +Further, inheriting classes need to implement how feature importance should be indicated.
+ +probatus/sample_similarity/resemblance_model.py
15 + 16 + 17 + 18 + 19 + 20 + 21 + 22 + 23 + 24 + 25 + 26 + 27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 +244 +245 +246 +247 +248 +249 |
|
__init__(model, scoring='roc_auc', test_prc=0.25, n_jobs=1, verbose=0, random_state=None)
+
+Initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ Regression or classification model or pipeline. + |
+ + required + | +
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined +classification scorers names in sklearn. +Another option is using probatus.utils.Scorer to define a custom metric. The recommended option for this +class is 'roc_auc'. + |
+
+ 'roc_auc'
+ |
+
test_prc |
+
+ float
+ |
+
+
+
+ Percentage of data used to test the model. By default 0.25 is set. + |
+
+ 0.25
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of parallel executions. If -1 use all available cores. By default 1. + |
+
+ 1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set at each round of feature elimination. If it is None, the results will not be +reproducible and in random search at each iteration a different hyperparameters might be tested. For +reproducible results set it to an integer. + |
+
+ None
+ |
+
probatus/sample_similarity/resemblance_model.py
27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 |
|
compute(return_scores=False)
+
+Checks if fit() method has been run and computes the output variables.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return a tuple (feature importances, train score, +test score), or feature importances. By default the second option is selected. + |
+
+ False
+ |
+
Returns:
+Type | +Description | +
---|---|
+ tuple(DataFrame, float, float) or DataFrame
+ |
+
+
+
+ Depending on value of return_tuple either returns a tuple (feature importances, train AUC, test AUC), or +feature importances. + |
+
probatus/sample_similarity/resemblance_model.py
181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 |
|
fit(X1, X2, column_names=None, class_names=None)
+
+Base fit functionality that should be executed before each fit.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ BaseResemblanceModel
+ |
+
+
+
+ Fitted object + |
+
probatus/sample_similarity/resemblance_model.py
87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 |
|
fit_compute(X1, X2, column_names=None, class_names=None, return_scores=False, **fit_kwargs)
+
+Fits the resemblance model and computes the report regarding feature importance.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
return_scores |
+
+ bool
+ |
+
+
+
+ Flag indicating whether the method should return a tuple (feature importances, train score, +test score), or feature importances. By default the second option is selected. + |
+
+ False
+ |
+
**fit_kwargs |
+ + | +
+
+
+ In case any other arguments are accepted by fit() method, they can be passed as keyword arguments. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ tuple of (pd.DataFrame, float, float) or pd.DataFrame
+ |
+
+
+
+ Depending on value of return_tuple either returns a tuple (feature importances, train AUC, test AUC), or +feature importances. + |
+
probatus/sample_similarity/resemblance_model.py
202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +213 +214 +215 +216 +217 +218 +219 +220 +221 +222 +223 +224 +225 +226 +227 +228 +229 +230 +231 +232 +233 +234 +235 +236 +237 +238 +239 +240 +241 +242 +243 |
|
get_data_splits()
+
+Returns the data splits used to train the Resemblance model.
+ + + +Returns:
+Type | +Description | +
---|---|
+ (DataFrame, DataFrame, Series, Series)
+ |
+
+
+
+ X_train, X_test, y_train, y_test. + |
+
probatus/sample_similarity/resemblance_model.py
170 +171 +172 +173 +174 +175 +176 +177 +178 +179 |
|
plot()
+
+Plot.
+ +probatus/sample_similarity/resemblance_model.py
245 +246 +247 +248 +249 |
|
PermutationImportanceResemblance
+
+
+
+ Bases: BaseResemblanceModel
This model checks the similarity of two samples.
+A possible use case is analysis of whether the train sample differs +from the test sample, due to e.g. non-stationarity.
+It assigns labels to each sample, 0 to the first sample, 1 to the second. Then, it randomly selects a portion of +data to train on. The resulting model tries to distinguish which sample a given test row comes from. This +provides insights on how distinguishable these samples are and which features contribute to that. The feature +importance is calculated using permutation importance.
+If the model achieves a test AUC significantly different than 0.5, it indicates that it is possible to distinguish +between the samples, and therefore, the samples differ. +Features with a high permutation importance contribute to that effect the most. +Thus, their distribution might differ between two samples.
+Examples:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from probatus.sample_similarity import PermutationImportanceResemblance
+X1, _ = make_classification(n_samples=100, n_features=5)
+X2, _ = make_classification(n_samples=100, n_features=5, shift=0.5)
+model = RandomForestClassifier(max_depth=2)
+perm = PermutationImportanceResemblance(model)
+feature_importance = perm.fit_compute(X1, X2)
+perm.plot()
+
+
+
+ probatus/sample_similarity/resemblance_model.py
252 +253 +254 +255 +256 +257 +258 +259 +260 +261 +262 +263 +264 +265 +266 +267 +268 +269 +270 +271 +272 +273 +274 +275 +276 +277 +278 +279 +280 +281 +282 +283 +284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 +347 +348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 +417 +418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 |
|
__init__(model, iterations=100, scoring='roc_auc', test_prc=0.25, n_jobs=1, verbose=0, random_state=None)
+
+Initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ Regression or classification model or pipeline. + |
+ + required + | +
iterations |
+
+ int
+ |
+
+
+
+ Number of iterations performed to calculate permutation importance. By default 100 iterations per +feature are done. + |
+
+ 100
+ |
+
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined +classification scorers names in sklearn. +Another option is using probatus.utils.Scorer to define a custom metric. Recommended option for this +class is 'roc_auc'. + |
+
+ 'roc_auc'
+ |
+
test_prc |
+
+ float
+ |
+
+
+
+ Percentage of data used to test the model. By default 0.25 is set. + |
+
+ 0.25
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of parallel executions. If -1 use all available cores. By default 1. + |
+
+ 1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set at each round of feature elimination. If it is None, the results will not be +reproducible and in random search at each iteration a different hyperparameters might be tested. For +reproducible results set it to integer. + |
+
+ None
+ |
+
probatus/sample_similarity/resemblance_model.py
284 +285 +286 +287 +288 +289 +290 +291 +292 +293 +294 +295 +296 +297 +298 +299 +300 +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +313 +314 +315 +316 +317 +318 +319 +320 +321 +322 +323 +324 +325 +326 +327 +328 +329 +330 +331 +332 +333 +334 +335 +336 +337 +338 +339 +340 +341 +342 +343 +344 +345 +346 |
|
fit(X1, X2, column_names=None, class_names=None)
+
+This function assigns labels to each sample, 0 to the first sample, 1 to the second.
+Then, it randomly selects a + portion of data to train on. The resulting model tries to distinguish which sample a given test row + comes from. This provides insights on how distinguishable these samples are and which features contribute to + that. The feature importance is calculated using permutation importance.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ PermutationImportanceResemblance
+ |
+
+
+
+ Fitted object. + |
+
probatus/sample_similarity/resemblance_model.py
348 +349 +350 +351 +352 +353 +354 +355 +356 +357 +358 +359 +360 +361 +362 +363 +364 +365 +366 +367 +368 +369 +370 +371 +372 +373 +374 +375 +376 +377 +378 +379 +380 +381 +382 +383 +384 +385 +386 +387 +388 +389 +390 +391 +392 +393 +394 +395 +396 +397 +398 +399 +400 +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +413 +414 +415 +416 |
|
plot(ax=None, top_n=None, show=True, **plot_kwargs)
+
+Plots the resulting AUC of the model as well as the feature importances.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
ax |
+
+ axes
+ |
+
+
+
+ Axes to which the output should be plotted. If not provided new axes are created. + |
+
+ None
+ |
+
top_n |
+
+ int
+ |
+
+
+
+ Number of the most important features to be plotted. By default all features are included in the plot. + |
+
+ None
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are shown to the user, otherwise they are not shown. Not showing a plot can be useful +when you want to edit the returned axis before showing it. + |
+
+ True
+ |
+
**plot_kwargs |
+ + | +
+
+
+ Keyword arguments passed to the matplotlib.pyplot.subplots method. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ axes
+ |
+
+
+
+ Axes that include the plot. + |
+
probatus/sample_similarity/resemblance_model.py
418 +419 +420 +421 +422 +423 +424 +425 +426 +427 +428 +429 +430 +431 +432 +433 +434 +435 +436 +437 +438 +439 +440 +441 +442 +443 +444 +445 +446 +447 +448 +449 +450 +451 +452 +453 +454 +455 +456 +457 +458 +459 +460 +461 +462 +463 +464 +465 +466 +467 +468 +469 +470 +471 +472 +473 +474 +475 +476 +477 +478 +479 |
|
SHAPImportanceResemblance
+
+
+
+ Bases: BaseResemblanceModel
This model checks for similarity of two samples.
+A possible use case is analysis of whether the train sample differs + from the test sample, due to e.g. non-stationarity.
+It assigns labels to each sample, 0 to the first sample, 1 to the second. Then, it randomly selects a portion of + data to train on. The resulting model tries to distinguish which sample a given test row comes from. This + provides insights on how distinguishable these samples are and which features contribute to that. The feature + importance is calculated using SHAP feature importance.
+If the model achieves test AUC significantly different than 0.5, it indicates that it is possible to distinguish + between the samples, and therefore, the samples differ. Features with a high permutation importance contribute + to that effect the most. Thus, their distribution might differ between two samples.
+This class currently works only with the Tree based models.
+Examples:
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from probatus.sample_similarity import SHAPImportanceResemblance
+X1, _ = make_classification(n_samples=100, n_features=5)
+X2, _ = make_classification(n_samples=100, n_features=5, shift=0.5)
+model = RandomForestClassifier(max_depth=2)
+rm = SHAPImportanceResemblance(model)
+feature_importance = rm.fit_compute(X1, X2)
+rm.plot()
+
++
+ +probatus/sample_similarity/resemblance_model.py
482 +483 +484 +485 +486 +487 +488 +489 +490 +491 +492 +493 +494 +495 +496 +497 +498 +499 +500 +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +513 +514 +515 +516 +517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 +568 +569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 +663 +664 +665 +666 +667 +668 +669 +670 +671 +672 +673 |
|
__init__(model, scoring='roc_auc', test_prc=0.25, n_jobs=1, verbose=0, random_state=None)
+
+Initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
model |
+
+ model object
+ |
+
+
+
+ Regression or classification model or pipeline. + |
+ + required + | +
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined +classification scorers names in sklearn. +Another option is using probatus.utils.Scorer to define a custom metric. Recommended option for this +class is 'roc_auc'. + |
+
+ 'roc_auc'
+ |
+
test_prc |
+
+ float
+ |
+
+
+
+ Percentage of data used to test the model. By default 0.25 is set. + |
+
+ 0.25
+ |
+
n_jobs |
+
+ int
+ |
+
+
+
+ Number of parallel executions. If -1 use all available cores. By default 1. + |
+
+ 1
+ |
+
verbose |
+
+ int
+ |
+
+
+
+ Controls verbosity of the output: +
|
+
+ 0
+ |
+
random_state |
+
+ int
+ |
+
+
+
+ Random state set at each round of feature elimination. If it is None, the results will not be +reproducible and in random search at each iteration a different hyperparameters might be tested. For +reproducible results set it to integer. + |
+
+ None
+ |
+
probatus/sample_similarity/resemblance_model.py
517 +518 +519 +520 +521 +522 +523 +524 +525 +526 +527 +528 +529 +530 +531 +532 +533 +534 +535 +536 +537 +538 +539 +540 +541 +542 +543 +544 +545 +546 +547 +548 +549 +550 +551 +552 +553 +554 +555 +556 +557 +558 +559 +560 +561 +562 +563 +564 +565 +566 +567 |
|
fit(X1, X2, column_names=None, class_names=None, **shap_kwargs)
+
+This function assigns labels to each sample, 0 to the first sample, 1 to the second.
+Then, it randomly selects a + portion of data to train on. The resulting model tries to distinguish which sample a given test row + comes from. This provides insights on how distinguishable these samples are and which features contribute to + that. The feature importance is calculated using SHAP feature importance.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
X1 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ First sample to be compared. It needs to have the same number of columns as X2. + |
+ + required + | +
X2 |
+
+ ndarray or DataFrame
+ |
+
+
+
+ Second sample to be compared. It needs to have the same number of columns as X1. + |
+ + required + | +
column_names |
+
+ list of str
+ |
+
+
+
+ List of feature names of the provided samples. If provided it will be used to overwrite the existing +feature names. If not provided the existing feature names are used or default feature names are +generated. + |
+
+ None
+ |
+
class_names |
+
+ None, or list of str
+ |
+
+
+
+ List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the +default ['First Sample', 'Second Sample'] are used. + |
+
+ None
+ |
+
**shap_kwargs |
+ + | +
+
+
+ keyword arguments passed to
+shap.Explainer.
+It also enables |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ SHAPImportanceResemblance
+ |
+
+
+
+ Fitted object. + |
+
probatus/sample_similarity/resemblance_model.py
569 +570 +571 +572 +573 +574 +575 +576 +577 +578 +579 +580 +581 +582 +583 +584 +585 +586 +587 +588 +589 +590 +591 +592 +593 +594 +595 +596 +597 +598 +599 +600 +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 |
|
get_shap_values()
+
+Gets the SHAP values generated on the test set.
+ + + +Returns:
+Type | +Description | +
---|---|
+ array
+ |
+
+
+
+ SHAP values generated on the test set. + |
+
probatus/sample_similarity/resemblance_model.py
664 +665 +666 +667 +668 +669 +670 +671 +672 +673 |
|
plot(plot_type='bar', show=True, **summary_plot_kwargs)
+
+Plots the resulting AUC of the model as well as the feature importances.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
plot_type |
+
+ str
+ |
+
+
+
+ Type of plot, used to compute shap.summary_plot. By default 'bar', available ones +are "dot", "bar", "violin". + |
+
+ 'bar'
+ |
+
show |
+
+ bool
+ |
+
+
+
+ If True, the plots are showed to the user, otherwise they are not shown. Not showing plot can be useful, +when you want to edit the returned axis, before showing it. + |
+
+ True
+ |
+
**summary_plot_kwargs |
+ + | +
+
+
+ kwargs passed to the shap.summary_plot. + |
+
+ {}
+ |
+
Returns:
+Type | +Description | +
---|---|
+ axes
+ |
+
+
+
+ Axes that include the plot. + |
+
probatus/sample_similarity/resemblance_model.py
613 +614 +615 +616 +617 +618 +619 +620 +621 +622 +623 +624 +625 +626 +627 +628 +629 +630 +631 +632 +633 +634 +635 +636 +637 +638 +639 +640 +641 +642 +643 +644 +645 +646 +647 +648 +649 +650 +651 +652 +653 +654 +655 +656 +657 +658 +659 +660 +661 +662 |
|
This module contains various smaller functionalities that can be used across the probatus
package.
Scorer
+
+
+Scores a given machine learning model based on the provided metric name and optionally a custom scoring function.
+Examples:
+from probatus.utils import Scorer
+from sklearn.metrics import make_scorer
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+
+# Make ROC AUC scorer
+scorer1 = Scorer('roc_auc')
+
+# Make custom scorer with following function:
+def custom_metric(y_true, y_pred):
+ return (y_true == y_pred).sum()
+scorer2 = Scorer('custom_metric', custom_scorer=make_scorer(custom_metric))
+
+# Prepare two samples
+feature_names = ['f1', 'f2', 'f3', 'f4']
+X, y = make_classification(n_samples=1000, n_features=4, random_state=0)
+X = pd.DataFrame(X, columns=feature_names)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Prepare and fit the model. Remember to set class_weight="balanced" or an equivalent.
+model = RandomForestClassifier(class_weight='balanced', n_estimators = 100, max_depth=2, random_state=0)
+model = model.fit(X_train, y_train)
+
+# Score model
+score_test_scorer1 = scorer1.score(model, X_test, y_test)
+score_test_scorer2 = scorer2.score(model, X_test, y_test)
+
+print(f'Test ROC AUC is {score_test_scorer1}, Test {scorer2.metric_name} is {score_test_scorer2}')
+
+
+ probatus/utils/scoring.py
27 + 28 + 29 + 30 + 31 + 32 + 33 + 34 + 35 + 36 + 37 + 38 + 39 + 40 + 41 + 42 + 43 + 44 + 45 + 46 + 47 + 48 + 49 + 50 + 51 + 52 + 53 + 54 + 55 + 56 + 57 + 58 + 59 + 60 + 61 + 62 + 63 + 64 + 65 + 66 + 67 + 68 + 69 + 70 + 71 + 72 + 73 + 74 + 75 + 76 + 77 + 78 + 79 + 80 + 81 + 82 + 83 + 84 + 85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 |
|
__init__(metric_name, custom_scorer=None)
+
+Initializes the class.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
metric_name |
+
+ str
+ |
+
+
+
+ Name of the metric used to evaluate the model. +If the custom_scorer is not passed, the +metric name needs to be aligned with classification scorer names in sklearn +(link). + |
+ + required + | +
custom_scorer |
+
+ sklearn.metrics Scorer callable
+ |
+
+
+
+ Callable +that can score samples. + |
+
+ None
+ |
+
probatus/utils/scoring.py
67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 |
|
score(model, X, y)
+
+Scores the model on the given samples based on the provided metric name.
+Args: + model (model object): + Model to be scored.
+X (array-like of shape (n_samples,n_features)):
+ Samples on which the model is scored.
+
+y (array-like of shape (n_samples,)):
+ Labels on which the model is scored.
+
+
+
+
+ Returns:
+Type | +Description | +
---|---|
+ float
+ |
+
+
+
+ Score returned by the model + |
+
probatus/utils/scoring.py
85 + 86 + 87 + 88 + 89 + 90 + 91 + 92 + 93 + 94 + 95 + 96 + 97 + 98 + 99 +100 +101 +102 +103 |
|
get_single_scorer(scoring)
+
+Returns a Scorer, based on the input provided in the scoring argument.
+ + + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
scoring |
+
+ string or Scorer
+ |
+
+
+
+ Metric for which the model performance is calculated. It can be either a metric name aligned with +predefined classification scorer names in sklearn +(link). +Another option is using probatus.utils.Scorer to define a custom metric. + |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Scorer
+ |
+
+
+
+ Scorer that can be used for scoring models + |
+
probatus/utils/scoring.py
4 + 5 + 6 + 7 + 8 + 9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 |
|