# Source code for brew.preprocessing.smote

from __future__ import division

import numpy as np
from sklearn.neighbors import NearestNeighbors


def smote(T, N=100, k=1):
    """Generate synthetic minority-class samples (SMOTE).

    Parameters
    ----------
    T : array-like of shape (n_samples, n_features)
        Minority-class samples.
    N : int, default 100
        Percentage of oversampling. If ``N < 100`` a random subset of
        ``T`` of size ``N%`` is used and one synthetic sample is created
        per kept sample; otherwise ``N`` must be a multiple of 100 and
        ``N / 100`` synthetic samples are created per sample.
    k : int, default 1
        Number of nearest neighbors used to interpolate.

    Returns
    -------
    numpy.ndarray of shape ((N // 100) * n_kept_samples, n_features)
        Synthetic samples; each lies on a segment between a minority
        sample and one of its k nearest neighbors.

    Raises
    ------
    ValueError
        If ``N >= 100`` and ``N`` is not a multiple of 100.
    """
    T = np.asarray(T)

    # Undersampling case: keep a random N% subset and emit one synthetic
    # sample per kept point. max(1, ...) guards against an empty subset
    # (original code crashed when int(n * N/100) == 0).
    if N < 100:
        sz = max(1, int(T.shape[0] * (N / 100)))
        idx = np.random.choice(T.shape[0], size=(sz,), replace=False)
        T = T[idx, :]
        N = 100

    if N % 100 != 0:
        raise ValueError('N must be < 100 OR multiple of 100')

    # If the (possibly subsetted) minority class is too small relative to
    # k, resample with replacement so every point has k neighbor slots.
    # This must happen AFTER the N < 100 subsetting above, otherwise the
    # subset could end up smaller than k + 1 and the k-NN query would fail.
    if T.shape[0] <= k + 1:
        idx = np.random.choice(T.shape[0], size=(k + 1,))
        T = T[idx, :]

    N = int(N / 100)
    n_minority_samples, n_features = T.shape
    synthetic = np.zeros((N * n_minority_samples, n_features))

    # Vectorized brute-force k-NN over the whole minority set (computed
    # once, instead of one kneighbors() call per sample). Column 0 of the
    # argsort is each sample itself (zero distance) and is discarded,
    # matching the original "skip the first returned neighbor" behavior.
    pairwise = T[:, np.newaxis, :] - T[np.newaxis, :, :]
    sq_dists = np.einsum('ijk,ijk->ij', pairwise, pairwise)
    neighbor_idx = np.argsort(sq_dists, axis=1)[:, 1:k + 1]

    count = 0
    for i in range(n_minority_samples):
        # Randomly choose N of the k neighbors (with replacement) and
        # interpolate: synthetic = x_i + gap * (x_neighbor - x_i),
        # gap ~ Uniform[0, 1), so each point lies on the segment.
        nn_idx = np.random.choice(neighbor_idx[i], size=(N,))
        diff = T[nn_idx, :] - T[i, :]
        gap = np.random.uniform(low=0.0, high=1.0, size=N)[:, np.newaxis]
        synthetic[count:count + N, :] = T[i, :] + gap * diff
        count += N
    return synthetic