# Source code for brew.preprocessing.smote

from __future__ import division

import numpy as np
from sklearn.neighbors import NearestNeighbors


def smote(T, N=100, k=1):
    """Generate synthetic minority-class samples (SMOTE).

    Parameters
    ----------
    T : array-like of shape (n_samples, n_features)
        Minority-class samples.
    N : int, default 100
        Percentage of oversampling. If ``N < 100`` a random subset of
        ``T`` of size ``N%`` is used and one synthetic sample is created
        per kept sample; otherwise ``N`` must be a multiple of 100 and
        ``N / 100`` synthetic samples are created per sample.
    k : int, default 1
        Number of nearest neighbors used to interpolate.

    Returns
    -------
    numpy.ndarray of shape ((N // 100) * n_kept_samples, n_features)
        Synthetic samples; each lies on a segment between a minority
        sample and one of its k nearest neighbors.

    Raises
    ------
    ValueError
        If ``N >= 100`` and ``N`` is not a multiple of 100.
    """
    T = np.asarray(T)

    # Undersampling case: keep a random N% subset and emit one synthetic
    # sample per kept point. max(1, ...) guards against an empty subset
    # (original code crashed when int(n * N/100) == 0).
    if N < 100:
        sz = max(1, int(T.shape[0] * (N / 100)))
        idx = np.random.choice(T.shape[0], size=(sz,), replace=False)
        T = T[idx, :]
        N = 100

    if N % 100 != 0:
        raise ValueError('N must be < 100 OR multiple of 100')

    # If the (possibly subsetted) minority class is too small relative to
    # k, resample with replacement so every point has k neighbor slots.
    # This must happen AFTER the N < 100 subsetting above, otherwise the
    # subset could end up smaller than k + 1 and the k-NN query would fail.
    if T.shape[0] <= k + 1:
        idx = np.random.choice(T.shape[0], size=(k + 1,))
        T = T[idx, :]

    N = int(N / 100)
    n_minority_samples, n_features = T.shape
    synthetic = np.zeros((N * n_minority_samples, n_features))

    # Vectorized brute-force k-NN over the whole minority set (computed
    # once, instead of one kneighbors() call per sample). Column 0 of the
    # argsort is each sample itself (zero distance) and is discarded,
    # matching the original "skip the first returned neighbor" behavior.
    pairwise = T[:, np.newaxis, :] - T[np.newaxis, :, :]
    sq_dists = np.einsum('ijk,ijk->ij', pairwise, pairwise)
    neighbor_idx = np.argsort(sq_dists, axis=1)[:, 1:k + 1]

    count = 0
    for i in range(n_minority_samples):
        # Randomly choose N of the k neighbors (with replacement) and
        # interpolate: synthetic = x_i + gap * (x_neighbor - x_i),
        # gap ~ Uniform[0, 1), so each point lies on the segment.
        nn_idx = np.random.choice(neighbor_idx[i], size=(N,))
        diff = T[nn_idx, :] - T[i, :]
        gap = np.random.uniform(low=0.0, high=1.0, size=N)[:, np.newaxis]
        synthetic[count:count + N, :] = T[i, :] + gap * diff
        count += N
    return synthetic