In [None]:
%matplotlib inline

import matplotlib

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.ma as ma

In [None]:
# Load the loan export, indexed by loan id.
#   header=1       -> the second line of the file holds the column names
#                     (presumably the first line is a preamble — confirm against the file)
#   skipfooter=4   -> ignore the last four lines of the file
#   engine='python'-> needed because pandas' C parser does not support skipfooter
dataset = pd.read_csv(
    'LoanStats3d.csv',
    header=1,
    skipfooter=4,
    index_col='id',
    engine='python',
)
def convert_percent_to_float_if_necess(data, feature):
    """Convert a percent-formatted text column (e.g. '13.5%') to floats, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that owns the column; it is modified in place.
    feature : str
        Name of the column to convert.

    Returns
    -------
    None

    Notes
    -----
    * Columns that already hold numbers (any bool/int/float/complex dtype, not
      only float64) are left untouched, so numeric values never lose a digit.
    * Surrounding whitespace is ignored and a trailing '%' is removed only when
      it is actually present.
    * Missing values are skipped and stay missing after the conversion.

    Raises
    ------
    ValueError
        If a non-missing value cannot be parsed as a float.
    """
    # dtype.kind is a one-letter code: b=bool, i/u=int, f=float, c=complex.
    if data[feature].dtype.kind in 'biufc':
        return

    def _percent_text_to_float(raw):
        # Parse one cell, tolerating padding and an optional trailing '%'.
        text = str(raw).strip()
        if text.endswith('%'):
            text = text[:-1]
        return float(text)

    # Assigning the converted (NaN-free) series back aligns on the index, so
    # rows that were dropped here end up as NaN in the column.
    data[feature] = data[feature].dropna().apply(_percent_text_to_float)

# Convert both rate columns in place; a column that was already parsed as
# float64 is left untouched by the helper.
convert_percent_to_float_if_necess(dataset, 'int_rate')
convert_percent_to_float_if_necess(dataset, 'revol_util')

In [None]:
# Feature columns used for the quantile report, the normalisation and the kNN input.
columns = ['annual_inc', 'dti', 'total_rev_hi_lim','revol_util','int_rate']
# Target column holding the class to predict.
label = 'loan_status'

In [None]:
def compare_quantile(df, cols, quantile=[0.0001, 0.01, 0.99, 0.999, 0.99999]):
    """Print the requested quantiles of each column, one line per column.

    Each line has the form ``column <name> <q>:<value> <q>:<value> ...`` and the
    whole report is followed by a blank line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to summarise.
    cols : iterable of str
        Column names to report on.
    quantile : sequence of float, optional
        Quantile levels in [0, 1]. The default list is only read, never
        mutated, so sharing it between calls is safe.

    Returns
    -------
    None
        The report is written to standard output.
    """
    report_lines = []
    for col in cols:
        pieces = ['column ' + col]
        for quant in quantile:
            pieces.append(str(quant) + ':' + str(df[col].quantile(quant)))
        report_lines.append(' '.join(pieces))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    # Every column line keeps its own trailing newline, as before.
    print(''.join(line + '\n' for line in report_lines))

In [None]:
def ret_without_outliers(df, col_name_minq_maxq):
    """Return the rows of ``df`` that lie inside every requested quantile band.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name_minq_maxq : sequence of (str, float, float)
        One ``(column_name, min_quantile, max_quantile)`` triple per column to
        trim. A row is kept only if, for every triple, its value is between
        the column's ``min_quantile`` and ``max_quantile`` values (inclusive).

    Returns
    -------
    pandas.DataFrame
        The surviving rows. With an empty specification nothing is trimmed and
        a copy of ``df`` is returned.

    Notes
    -----
    Comparisons against NaN are False, so rows with a missing value in any
    listed column are dropped as well.
    """
    def filt(values, min_quantile, max_quantile):
        # Boolean mask: True where the value is inside the quantile band.
        lower = values.quantile(min_quantile)
        upper = values.quantile(max_quantile)
        return (values >= lower) & (values <= upper)

    keep = None
    for col_name, min_quantile, max_quantile in col_name_minq_maxq:
        in_band = filt(df[col_name], min_quantile, max_quantile)
        keep = in_band if keep is None else keep & in_band

    if keep is None:
        # No constraints given: keep everything instead of failing on [0][0].
        return df.copy()
    return df[keep]

In [None]:
# Print the extreme quantiles of every feature column to help choose outlier cut-offs.
compare_quantile(dataset, columns)

In [None]:
# Shape before trimming, for comparison with the filtered shape printed below.
# (Single-argument print(...) gives the same output on Python 2 and Python 3.)
print(dataset.shape)

# Trim only the upper tail of each column: the lower bound is the 0-quantile
# (the column minimum), the upper bound is the 98th/99th percentile.
# Rows with a missing value in any of these columns are dropped too, because
# the range comparison is False for NaN.
filtered_dataset = ret_without_outliers(dataset, [
    ('annual_inc', 0, 0.98),
    ('revol_util', 0, 0.99),
    ('dti', 0, 0.99),
    ('total_rev_hi_lim', 0, 0.99),
])
print(filtered_dataset.shape)

In [None]:
def min_max_normalization(s, col_names):
    """Rescale a column to the [0, 1] range if it is one of ``col_names``.

    Intended for ``DataFrame.apply``, which passes each column as a Series
    whose ``name`` is the column label.

    Parameters
    ----------
    s : pandas.Series
        Column to (possibly) rescale.
    col_names : container of str
        Names of the columns that should be normalised.

    Returns
    -------
    pandas.Series
        ``s`` itself when its name is not in ``col_names``; otherwise
        ``(s - min) / (max - min)``.

    Notes
    -----
    * The minimum and maximum come from ``Series.min`` / ``Series.max``, which
      ignore NaN. The builtin ``min``/``max`` used before returned NaN whenever
      the first element was missing, turning the whole column into NaN.
    * A constant column has no range to scale by; it maps to all zeros instead
      of the NaN that 0/0 would produce.
    """
    if s.name not in col_names:
        return s
    min_s = s.min()
    span = s.max() - min_s
    if span == 0:
        return (s - min_s).astype(float)
    return (s - min_s) / span

In [None]:
# apply() hands each column to min_max_normalization as a Series; only the
# columns listed in `columns` are rescaled, every other column is returned as-is.
filtered_dataset_norm = filtered_dataset.apply(min_max_normalization, args=(columns,))

In [None]:
# Rows whose loan_status is exactly 'Fully Paid' form the "good" class.
good_loans = filtered_dataset_norm[filtered_dataset_norm['loan_status'] == 'Fully Paid']

In [None]:
# Number of good-loan rows.
len(good_loans)

In [None]:
# Rows whose loan_status is exactly 'Charged Off' form the "bad" class.
# Rows with any other status end up in neither class.
bad_loans = filtered_dataset_norm[filtered_dataset_norm['loan_status'] == 'Charged Off']

In [None]:
# Number of bad-loan rows.
len(bad_loans)

In [None]:
# Share of the first count in the total of both counts.
# NOTE(review): the literals are presumably the two class sizes copied from the
# len(...) cells — confirm which is which; they go stale as soon as the data
# or the outlier thresholds change.
32522./(5695.+32522.)

In [None]:
# Undersample the good loans to the size of the bad-loan class so both classes
# are equally represented. random_state pins the draw: the notebook seeds
# NumPy only in a later cell, so without it every run picked different rows.
good_loans_sample = good_loans.sample(n=len(bad_loans), random_state=2)

In [None]:
# Size of the undersampled good-loan set (requested to equal len(bad_loans)).
len(good_loans_sample)

In [None]:
# Stack the sampled good loans on top of the bad loans into one frame.
good_and_bad_loans = pd.concat([good_loans_sample, bad_loans])

In [None]:
# Feature matrix as a NumPy array: one row per loan, one column per entry in `columns`.
data = good_and_bad_loans[columns].values

In [None]:
# Shape check of the feature matrix.
data.shape

In [None]:
# Class labels as a NumPy array, in the same row order as `data`.
labels = good_and_bad_loans[label].values

In [None]:
# Shape check of the label vector.
labels.shape

In [None]:
# Fixed seed so the random split below is repeatable.
np.random.seed(2)
# One uniform draw per row; a row goes to the test set when its draw is < 0.3,
# so the test share is roughly (not exactly) 30%.
test_ind = np.random.rand(data.shape[0])<0.3

In [None]:
# Boolean-mask split: rows flagged in test_ind form the test set,
# all remaining rows form the training set.
test_data = data[test_ind]
test_labels = labels[test_ind]
training_data = data[~test_ind]
training_labels = labels[~test_ind]

In [None]:
# Report the size of every split, one per line. Each print takes a single
# pre-formatted string so the output is the same on Python 2 and Python 3.
print('test data:  %s' % (test_data.shape,))
print('test labels %s' % (test_labels.shape,))
print('training data %s' % (training_data.shape,))
print('training labels  %s' % (training_labels.shape,))

In [None]:
def compute_distance_matrix_and_argsort(test_data, training_data):
    """Compute Euclidean distances from every test row to every training row.

    Parameters
    ----------
    test_data : array-like, shape (n_test, n_features) or (n_test,)
        Query points. A 1-D array is treated as ``n_test`` single-feature points.
    training_data : array-like, shape (n_train, n_features) or (n_train,)
        Reference points. When ``test_data`` is 1-D, a 1-D ``training_data`` is
        likewise treated as ``n_train`` single-feature points.

    Returns
    -------
    distances : numpy.ndarray, shape (n_test, n_train)
        ``distances[i, j]`` is the Euclidean distance between test row ``i``
        and training row ``j``.
    dist_sorted : numpy.ndarray of int, shape (n_test, n_train)
        For each test row, the training indices ordered from nearest to
        farthest.

    Notes
    -----
    The difference tensor has shape (n_test, n_train, n_features), so memory
    use grows with the product of all three sizes.
    """
    test_data = np.asarray(test_data)
    training_data = np.asarray(training_data)
    data_shape = test_data.shape
    test_n_instances = data_shape[0]
    dims = 1 if len(data_shape) == 1 else data_shape[1]
    if len(data_shape) == 1 and training_data.ndim == 1:
        # Without this, (n, 1, 1) - (m,) broadcasts to (n, 1, m) and the norm
        # collapses the training axis, giving an (n, 1) result.
        training_data = training_data.reshape(-1, 1)
    # (n_test, 1, dims) - (n_train, dims) broadcasts to (n_test, n_train, dims).
    differences = test_data.reshape(test_n_instances, 1, dims) - training_data
    distances = np.linalg.norm(differences, axis=2)
    dist_sorted = distances.argsort(axis=1)
    return distances, dist_sorted

In [None]:
def select_knn_labels_distances(sim_matrix,argsorted_distances, k, training_labels):
    """Pick the labels and distances of the k nearest training points per test row.

    Parameters
    ----------
    sim_matrix : numpy.ndarray, shape (n_test, n_train)
        Unsorted distance matrix (test rows x training rows).
    argsorted_distances : numpy.ndarray of int, shape (n_test, n_train)
        Per-row training indices ordered from nearest to farthest.
    k : int
        Number of neighbours to keep.
    training_labels : numpy.ndarray, shape (n_train,)
        Label of every training row.

    Returns
    -------
    labels : numpy.ndarray, shape (n_test, k)
        Labels of the k nearest neighbours, nearest first.
    distances : numpy.ndarray, shape (n_test, k)
        Distances to those same neighbours, in the same order.
    """
    res_selection = argsorted_distances[:,:k]
    # Gather distances through the neighbour indices. Slicing the unsorted
    # matrix with [:, :k] would return the distances to the first k training
    # rows, which are unrelated to the selected neighbours.
    row_ids = np.arange(sim_matrix.shape[0])[:, np.newaxis]
    return training_labels[res_selection], sim_matrix[row_ids, res_selection]
    

In [None]:
def knn_classification_majority(sim_matrix,argsorted_distances, k, training_labels):
    """Classify each test row by an unweighted vote of its k nearest neighbours.

    Parameters
    ----------
    sim_matrix : numpy.ndarray
        Distance matrix, passed through to ``select_knn_labels_distances``.
    argsorted_distances : numpy.ndarray of int
        Per-row training indices ordered from nearest to farthest.
    k : int
        Number of neighbours that vote.
    training_labels : numpy.ndarray
        Label of every training row.

    Returns
    -------
    numpy.ndarray
        One predicted label per test row. On a tie the label that comes first
        in ``np.unique(training_labels)`` wins, because ``argmax`` returns the
        first maximum.
    """
    neighbour_labels, _distances = select_knn_labels_distances(
        sim_matrix, argsorted_distances, k, training_labels)
    classes = np.unique(training_labels)
    # One column of vote counts per class, one row per test instance.
    votes = np.column_stack(
        [(neighbour_labels == cls).sum(axis=1) for cls in classes])
    return classes[votes.argmax(axis=1)]

In [None]:
def knn_weighted_classification(sim_matrix,argsorted_distances,\
                                k,\
                                training_labels):
    """Classify each test row by an inverse-square-distance weighted kNN vote.

    Every one of the k neighbours returned by ``select_knn_labels_distances``
    contributes ``1 / distance**2`` to the score of its own label; the label
    with the largest total score is predicted.

    Parameters
    ----------
    sim_matrix : numpy.ndarray
        Distance matrix, passed through to ``select_knn_labels_distances``.
    argsorted_distances : numpy.ndarray of int
        Per-row training indices ordered from nearest to farthest.
    k : int
        Number of neighbours that vote.
    training_labels : numpy.ndarray
        Label of every training row.

    Returns
    -------
    numpy.ndarray
        One predicted label per test row. On a tie the label that comes first
        in ``np.unique(training_labels)`` wins.

    Notes
    -----
    A neighbour at distance 0 gets an infinite weight, so its label dominates
    the vote. Previously the value 0 doubled as the marker for "neighbour has
    another label", which made exact matches count for nothing.
    """
    resulting_labels, sorted_k_distances = select_knn_labels_distances(
        sim_matrix, argsorted_distances, k, training_labels)
    all_labels = np.unique(training_labels)

    # The weights do not depend on the candidate label, so compute them once.
    # 1/0 -> inf is intended here; silence only that warning.
    with np.errstate(divide='ignore'):
        inverse_square = 1. / np.square(np.asarray(sorted_k_distances, dtype=float))

    counts = np.empty([resulting_labels.shape[0], all_labels.shape[0]])
    for i, candidate in enumerate(all_labels):
        # Label membership is tracked by a boolean mask, not by a sentinel
        # distance, so a genuine zero distance keeps its weight.
        votes_for_candidate = resulting_labels == candidate
        counts[:, i] = np.where(votes_for_candidate, inverse_square, 0.).sum(axis=1)
    return all_labels[counts.argmax(axis=1)]

In [None]:
def accuracy(predicted_labels, ground_truth_labels):
    """Return the fraction of predictions that equal the ground truth.

    Both arguments are compared element-wise with ``==`` and the result must
    support ``.sum()`` (e.g. NumPy arrays of equal length).
    """
    n_correct = float((predicted_labels == ground_truth_labels).sum())
    n_total = float(len(ground_truth_labels))
    return n_correct / n_total

In [None]:
# Test-to-training distance matrix and its per-row argsort, computed once so
# the k sweep can reuse them.
distances, argsorted_distances = compute_distance_matrix_and_argsort(test_data, training_data)

In [None]:
# Compare weighted and majority-vote kNN on the test set for k = 1..49.
# The line is built with one format string so the output is the same on
# Python 2 and Python 3.
for i in range(1,50):
    print('k= %s  accuracy weighted knn:  %s  accuracy majority knn:  %s' % (
        i,
        accuracy(knn_weighted_classification(distances, argsorted_distances, i, training_labels), test_labels),
        accuracy(knn_classification_majority(distances, argsorted_distances, i, training_labels), test_labels)))
    

In [None]:
# Display the feature list the results above were computed with.
columns