from __future__ import division
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from brew.base import Ensemble
from brew.combination.combiner import Combiner
from brew.preprocessing.smote import smote
from .base import PoolGenerator
class SmoteBagging(PoolGenerator):
    """Pool generator that trains each classifier on a SMOTE-balanced bootstrap.

    For every ensemble member a new training set is drawn in which each class
    contributes as many rows as the majority class: the majority class is
    bootstrapped, while every other class is partly resampled and partly
    filled with synthetic rows produced by ``smote``.

    Parameters
    ----------
    base_classifier : estimator, optional
        Prototype that is cloned with ``sklearn.base.clone`` for every
        ensemble member. It must be set before calling ``fit``.
    n_classifiers : int, default 100
        Number of classifiers trained by ``fit``.
    combination_rule : str, default 'majority_vote'
        Rule name forwarded to ``Combiner``.
    k : int, default 5
        Neighbour count forwarded to ``smote``.
    """

    def __init__(self, base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote', k=5):
        self.k = k
        self.n_classifiers = n_classifiers
        self.base_classifier = base_classifier
        self.ensemble = None  # created by fit()
        self.combiner = Combiner(rule=combination_rule)

    def smote_bootstrap_sample(self, X, y, b, k):
        """Draw one class-balanced training set.

        Parameters
        ----------
        X : ndarray
            2-D feature matrix (indexed as ``X[mask, :]``).
        y : array-like
            Numeric class label of every row of ``X``.
        b : float
            Percentage of the majority-class size that each non-majority
            class obtains by plain resampling; the remainder is filled with
            synthetic rows. ``fit`` passes values from 10 to 100.
        k : int
            Neighbour count forwarded to ``smote``.

        Returns
        -------
        data : ndarray
            Stacked rows, ``majority_count`` per class.
        target : ndarray
            Float array holding the class label of every row of ``data``.
        """
        y = np.asarray(y)
        # np.unique (unlike np.bincount) also accepts negative or
        # non-integer labels; ties still resolve to the smallest label.
        classes, count = np.unique(y, return_counts=True)
        majority_class = classes[count.argmax()]
        majority_count = count.max()

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        for i in classes:
            class_data = X[(y == i), :]

            if i == majority_class:
                # regular bootstrap (i.e. 100% sampling rate)
                idx = np.random.choice(majority_count, (majority_count,))
                new_class_data = class_data[idx, :]
            else:
                # bootstrap the class data with the defined sampling rate
                sample_rate = (majority_count /
                               class_data.shape[0]) * (b / 100)
                n_sampled = int(sample_rate * class_data.shape[0])
                # A tiny majority class can truncate the resample size to
                # zero, which would divide by zero below; keep one row.
                n_sampled = max(n_sampled, 1)
                idx = np.random.choice(class_data.shape[0], (n_sampled,))
                sampled_class_data = class_data[idx, :]

                # run smote on bootstrapped data to obtain synthetic samples;
                # ceil to make sure N_smote is a multiple of 100, and the
                # small value to avoid a zero
                N_smote = int(np.ceil(
                    (majority_count / sampled_class_data.shape[0]) *
                    (1 - b / 100 + 10e-8)) * 100)
                synthetic = smote(sampled_class_data, N=N_smote, k=k)

                # top the class up to the majority size with synthetic rows
                n_missing = majority_count - sampled_class_data.shape[0]
                idx = np.random.choice(synthetic.shape[0], (n_missing,))
                new_class_data = np.concatenate(
                    (sampled_class_data, synthetic[idx, :]))

            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target, i * np.ones((new_class_data.shape[0],))))

        return data, target

    def fit(self, X, y):
        """Train ``n_classifiers`` clones of ``base_classifier``.

        Each clone is fitted on its own ``smote_bootstrap_sample``. The
        resampling percentage ``b`` starts at 10, grows by 10 per classifier
        and wraps back to 10 after reaching 100.

        Parameters
        ----------
        X : ndarray
            2-D feature matrix.
        y : array-like
            Class label of every row of ``X``.

        Returns
        -------
        None
        """
        self.ensemble = Ensemble()

        b = 10
        for _ in range(self.n_classifiers):
            data, target = self.smote_bootstrap_sample(
                X, y, b=float(b), k=self.k)

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)
            self.ensemble.add(classifier)

            b = 10 if b >= 100 else b + 10

        return

    def predict(self, X):
        """Return the combiner's result for the ensemble output on ``X``.

        ``fit`` must have been called first so that ``self.ensemble`` exists.
        """
        out = self.ensemble.output(X)
        return self.combiner.combine(out)
class SmoteBaggingNew(SmoteBagging):
    """Variant of ``SmoteBagging`` that balances only two classes.

    Each training set holds a bootstrap of the most frequent class plus an
    equally sized set for the least frequent class, built from resampled and
    synthetic rows. Classes that are neither the most nor the least frequent
    do not appear in the sample.

    ``fit`` is inherited unchanged from ``SmoteBagging``; only the sampling
    step differs.
    """

    def smote_bootstrap_sample(self, X, y, b, k):
        """Draw one training set balanced between majority and minority class.

        Parameters
        ----------
        X : ndarray
            2-D feature matrix (indexed as ``X[mask, :]``).
        y : array-like
            Numeric class label of every row of ``X``.
        b : float
            Percentage of the majority-class size that the minority class
            obtains by plain resampling; the remainder is synthetic.
        k : int
            Neighbour count forwarded to ``smote``.

        Returns
        -------
        data : ndarray
            Majority-class bootstrap followed by the minority-class rows.
        target : ndarray
            Float array holding the class label of every row of ``data``.
        """
        y = np.asarray(y)
        # Count only labels that actually occur: np.bincount reports a zero
        # for every missing integer below the largest label, and argmin would
        # then pick a class that has no rows at all.
        classes, count = np.unique(y, return_counts=True)
        majority_class = classes[count.argmax()]
        majority_count = count.max()
        minority_class = classes[count.argmin()]
        minority_count = count.min()

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        # plain bootstrap of the majority class
        majority_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, majority_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        # b percent of the minority set is resampled, the rest is synthetic
        n_resampled = int(majority_count * (b / 100))
        n_synthetic = majority_count - n_resampled

        minority_data = X[(y == minority_class), :]
        idx = np.random.choice(minority_data.shape[0], (n_resampled,))
        new_class_data = minority_data[idx, :]

        if n_synthetic > 0:
            # smote takes its amount as a percentage in multiples of 100
            N_smote = max(
                100, int(np.ceil(n_synthetic / minority_count)) * 100)
            synthetic = smote(minority_data, N=N_smote, k=k)
            idx = np.random.choice(synthetic.shape[0], (n_synthetic,))
            new_class_data = np.concatenate(
                (new_class_data, synthetic[idx, :]))

        data = np.concatenate((data, new_class_data))
        target = np.concatenate(
            (target, minority_class * np.ones((new_class_data.shape[0],))))

        return data, target
if __name__ == '__main__':
    # Demo: build an imbalanced 4-class toy dataset, fit a small pool and
    # print its accuracy on the training data.
    class_sizes = (100, 60, 35, 5)  # rows for class 0, 1, 2 and 3

    features = []
    labels = []
    for label, size in enumerate(class_sizes):
        features.append(np.random.random((size, 2)))
        labels.append(label * np.ones((size,), dtype='int64'))

    print('DATASET before:')
    for label, class_features in enumerate(features):
        print('class {} : {}'.format(label, class_features.shape))
    print()

    X = np.concatenate(features)
    y = np.concatenate(labels)

    # sklearn.base.clone needs an estimator instance, not the class itself
    knn = KNeighborsClassifier()
    pool = SmoteBagging(base_classifier=knn, n_classifiers=5, k=3)
    pool.fit(X, y)
    print(np.sum(pool.predict(X) == y) / y.size)