Bases: ClassifierMixin
, MetaEstimatorMixin
, BaseEstimator
Under Sampling Classifier
It trains an ensemble of estimators on several folds obtained
by using all samples of the minority class and undersampling
the rest of classes.
It implements the strategy described in Ksieniewicz &amp; Woźniak (2018): http://proceedings.mlr.press/v94/ksieniewicz18a/ksieniewicz18a.pdf.
It is compatible with having a resampler before it, as long as the resampler only
performs a partial reduction of the imbalance problem.
Parameters:
Name |
Type |
Description |
Default |
estimator |
estimator object
|
An estimator
object implementing fit and predict_proba |
required
|
n_jobs |
int
|
The number
of jobs to use for the computation |
None
|
Attributes:
Name |
Type |
Description |
estimators_ |
list
|
list of fitted estimators
used for predictions. |
classes_ |
array
|
Class labels |
n_classes_ |
int
|
Number of classes. |
label_encoder_ |
LabelEncoder object
|
LabelEncoder object
used to encode multiclass labels |
n_features_in_ |
int
|
Number of features seen during fit. Only defined if the
underlying estimator exposes such an attribute when fit. |
feature_names_in_ |
ndarray
|
Names of features seen during fit .
Only defined if the underlying estimator exposes such an attribute when fit. |
Source code in nestedcvtraining/under_sampling_classifier.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class UnderSamplingClassifier(
    ClassifierMixin, MetaEstimatorMixin, BaseEstimator
):
    """Under Sampling Classifier.

    Trains an ensemble of estimators on several folds obtained by using
    all samples of the minority class and undersampling the rest of the
    classes.

    It implements the strategy described
    [here](http://proceedings.mlr.press/v94/ksieniewicz18a/ksieniewicz18a.pdf).
    It is compatible with having a resampler before it, as long as the
    resampler only performs a partial reduction of the imbalance problem.

    Args:
        estimator (estimator object): An estimator object implementing
            `fit` and `predict_proba`.
        max_k_under_sampling (int, optional): Upper bound on the number of
            undersampling folds (and hence on the number of fitted
            estimators). Defaults to 5.
        n_jobs (int, optional): The number of jobs to use for the
            computation.

    Attributes:
        estimators_ (list): List of fitted estimators used for predictions.
        classes_ (array): Class labels.
        n_classes_ (int): Number of classes.
        label_encoder_ (LabelEncoder object): LabelEncoder object used to
            encode multiclass labels.
        n_features_in_ (int): Number of features seen during `fit`. Only
            defined if the underlying estimator exposes such an attribute
            when fit.
        feature_names_in_ (ndarray): Names of features seen during `fit`.
            Only defined if the underlying estimator exposes such an
            attribute when fit.
    """

    def __init__(self, estimator, *, max_k_under_sampling=5, n_jobs=None):
        self.estimator = estimator
        self.max_k_under_sampling = max_k_under_sampling
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit underlying estimators by undersampling all classes but the minority class.

        Args:
            X (array-like of shape (n_samples, n_features)): Features.
            y (array-like of shape (n_samples,)): Targets.

        Returns:
            self : object
                Instance of fitted estimator.
        """
        self.label_encoder_ = LabelEncoder()
        Y = self.label_encoder_.fit_transform(y)
        self.classes_ = self.label_encoder_.classes_
        # np.asarray so that plain Python lists can be fancy-indexed below.
        y_arr = np.asarray(y)
        counter_classes = Counter(Y)
        # most_common() sorts descending, so the last entry is the minority class.
        minority_class_key = counter_classes.most_common()[-1][0]
        # BUG FIX: minority_class_key lives in the *encoded* label space
        # (it comes from Counter(Y)), so it must be compared against the
        # encoded labels Y, not the raw labels y.
        minority_class_idxs = np.where(Y == minority_class_key)[0]
        rest_idxs = np.where(Y != minority_class_key)[0]
        # K is the imbalance ratio rounded to int, clipped to the range
        # [2, max_k_under_sampling].
        imbalance_ratio = len(rest_idxs) / len(minority_class_idxs)
        k_majority_class = int(np.around(imbalance_ratio))
        k_majority_class = min(k_majority_class, self.max_k_under_sampling)
        k_majority_class = max(k_majority_class, 2)

        def under_sampling_iterator():
            # Each fold keeps every minority-class sample plus a stratified
            # 1/K slice of the remaining classes.
            splitter = StratifiedKFold(n_splits=k_majority_class)
            for _, index in splitter.split(rest_idxs, Y[rest_idxs]):
                yield np.concatenate([minority_class_idxs, rest_idxs[index]])

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(
                self.estimator,
                X[idxs],
                y_arr[idxs]
            )
            for idxs in under_sampling_iterator()
        )
        # Mirror the attributes of the fitted base estimators when available.
        if hasattr(self.estimators_[0], "n_features_in_"):
            self.n_features_in_ = self.estimators_[0].n_features_in_
        if hasattr(self.estimators_[0], "feature_names_in_"):
            self.feature_names_in_ = self.estimators_[0].feature_names_in_
        return self

    def predict(self, X):
        """Predict class labels as the argmax of the averaged probabilities."""
        check_is_fitted(self)
        proba = self.predict_proba(X)
        return self.classes_.take(np.argmax(proba, axis=1), axis=0)

    def predict_proba(self, X):
        """Return the class probabilities averaged over all fitted estimators."""
        check_is_fitted(self)
        mean_proba = np.zeros((X.shape[0], len(self.classes_)))
        for classifier in self.estimators_:
            mean_proba += classifier.predict_proba(X)
        mean_proba /= len(self.estimators_)
        return mean_proba

    @property
    def n_classes_(self):
        """Number of classes."""
        return len(self.classes_)
|
fit
Fit underlying estimators by undersampling all classes but the minority class.
Parameters:
Name |
Type |
Description |
Default |
X |
array-like of shape (n_samples, n_features)
|
Features |
required
|
y |
array-like of shape (n_samples,)
|
Targets |
required
|
Returns:
Name | Type |
Description |
self |
|
object
Instance of fitted estimator. |
Source code in nestedcvtraining/under_sampling_classifier.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
def fit(self, X, y):
    """Fit underlying estimators by undersampling all classes but the minority class.

    Args:
        X (array-like of shape (n_samples, n_features)): Features.
        y (array-like of shape (n_samples,)): Targets.

    Returns:
        self : object
            Instance of fitted estimator.
    """
    self.label_encoder_ = LabelEncoder()
    Y = self.label_encoder_.fit_transform(y)
    self.classes_ = self.label_encoder_.classes_
    # np.asarray so that plain Python lists can be fancy-indexed below.
    y_arr = np.asarray(y)
    counter_classes = Counter(Y)
    # most_common() sorts descending, so the last entry is the minority class.
    minority_class_key = counter_classes.most_common()[-1][0]
    # BUG FIX: minority_class_key lives in the *encoded* label space
    # (it comes from Counter(Y)), so it must be compared against the
    # encoded labels Y, not the raw labels y.
    minority_class_idxs = np.where(Y == minority_class_key)[0]
    rest_idxs = np.where(Y != minority_class_key)[0]
    # K is the imbalance ratio rounded to int, clipped to the range
    # [2, max_k_under_sampling].
    imbalance_ratio = len(rest_idxs) / len(minority_class_idxs)
    k_majority_class = int(np.around(imbalance_ratio))
    k_majority_class = min(k_majority_class, self.max_k_under_sampling)
    k_majority_class = max(k_majority_class, 2)

    def under_sampling_iterator():
        # Each fold keeps every minority-class sample plus a stratified
        # 1/K slice of the remaining classes.
        splitter = StratifiedKFold(n_splits=k_majority_class)
        for _, index in splitter.split(rest_idxs, Y[rest_idxs]):
            yield np.concatenate([minority_class_idxs, rest_idxs[index]])

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(
            self.estimator,
            X[idxs],
            y_arr[idxs]
        )
        for idxs in under_sampling_iterator()
    )
    # Mirror the attributes of the fitted base estimators when available.
    if hasattr(self.estimators_[0], "n_features_in_"):
        self.n_features_in_ = self.estimators_[0].n_features_in_
    if hasattr(self.estimators_[0], "feature_names_in_"):
        self.feature_names_in_ = self.estimators_[0].feature_names_in_
    return self
|
n_classes_
property
Number of classes.
Source code in nestedcvtraining/under_sampling_classifier.py
@property
def n_classes_(self):
    """Number of distinct class labels seen during ``fit``."""
    known_labels = self.classes_
    return len(known_labels)