Classifier Comparison

We have prepared a framework for you that visualizes the decision functions of different classifiers on different datasets. Apply the classification algorithms to the given datasets and describe your observations. Do the decision surfaces look as you would have expected? Also discuss the effects of different parametrizations.
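
Besides inspecting the decision surfaces visually, it can help to quantify how well each classifier generalizes. A minimal sketch of such an evaluation on one of the toy datasets (the snippet repeats the imports so it stays self-contained; train_test_split is also imported by the framework below):

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 30% of the points and report accuracy on the unseen part
X, y = make_moons(noise=0.3, random_state=0, n_samples=200)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print('test accuracy: {:.2f}'.format(classifier.score(X_test, y_test)))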

In [1]:
%matplotlib inline
import itertools

import numpy
import pandas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt, cm
from matplotlib.colors import ListedColormap
plt.rcParams['figure.figsize'] = (12, 9)

import seaborn

import ipywidgets
In [2]:
cmap = plt.get_cmap('RdBu')
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

Data Sets

In [3]:
# Create data sets
n_samples = 200
resolution = (400, 300)
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1, n_samples=n_samples//2)
rng = numpy.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = {
    'moons': dict(zip(['x', 'y'], make_moons(noise=0.3, random_state=0, n_samples=n_samples,))),
    'circles': dict(zip(['x', 'y'], make_circles(noise=0.2, factor=0.5, random_state=1, n_samples=n_samples,))),
    'linear': dict(zip(['x', 'y'], linearly_separable)),
}
In [4]:
# Remove points that are too close
for k in datasets.keys():
    x, y = datasets[k]['x'], datasets[k]['y']
    # Pairwise Euclidean distances between all points
    d = numpy.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)
    too_close = d < .1
    n = x.shape[0]
    mask = numpy.ones(shape=(n,), dtype=bool)
    for i in range(n):
        # Consider each pair only once: ignore the diagonal and upper triangle,
        # then drop the earlier points that lie too close to point i
        too_close[i, i:] = False
        mask[too_close[i, :]] = False
    x, y = x[mask], y[mask]
    datasets[k]['x'], datasets[k]['y'] = x, y
In [5]:
# Create mesh grid
margin = .5
for k in datasets.keys():
    x, y = datasets[k]['x'], datasets[k]['y']
    low, high = x.min(axis=0) - margin, x.max(axis=0) + margin
    xx, yy = numpy.meshgrid(*[numpy.linspace(start=start, stop=stop, num=num) for start, stop, num in zip(low, high, resolution)])
    datasets[k]['mesh'] = xx, yy

Show Data Sets

In [6]:
def select_dataset(key):
    global x, y, xx, yy, ax, name
    name = key
    data = datasets[key]
    x = data['x']
    y = data['y']
    xx, yy = data['mesh']
    fig, ax = plt.subplots()
    ax.scatter(*x.T, c=y, cmap=cm_bright, edgecolors='k')
    ax.set_title(key)
    ax.set_aspect('equal')
    
ipywidgets.interact(select_dataset, key=ipywidgets.Dropdown(options=list(datasets.keys())))
Out[6]:
<function __main__.select_dataset(key)>
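
In a non-interactive run of the notebook, the same globals can also be set by calling the function directly, e.g.:

select_dataset('moons')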
In [7]:
def show_classifier(classifier, ax):
    global name, x, y, xx, yy
    
    classifier.fit(x, y)
    
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    if hasattr(classifier, "decision_function"):
        Z = classifier.decision_function(numpy.c_[xx.ravel(), yy.ravel()])
    else:
        Z = classifier.predict_proba(numpy.c_[xx.ravel(), yy.ravel()])[:, 1]

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    
    ax.contourf(xx, yy, Z, cmap='RdBu', alpha=.8)
    ax.scatter(*x.T, c=y, cmap=cm_bright, edgecolors='k')
    ax.set_aspect('equal')
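
The helper above only draws the decision surface. For the discussion it can be convenient to also report the accuracy of the fitted classifier on the training points; one possible wrapper (show_classifier_with_score is a hypothetical name, not part of the provided framework):

def show_classifier_with_score(classifier, ax):
    # show_classifier fits the classifier on the currently selected (x, y)
    show_classifier(classifier=classifier, ax=ax)
    # Annotate the axis with the accuracy on the training points
    ax.set_xlabel('training accuracy: {:.2f}'.format(classifier.score(x, y)))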

Naive Bayes

In [8]:
fig, ax = plt.subplots()

classifier = GaussianNB()
show_classifier(classifier=classifier, ax=ax)

Linear Classifier

In [9]:
fig, ax = plt.subplots()

classifier = LogisticRegression(solver='lbfgs')
show_classifier(classifier=classifier, ax=ax)
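
The regularization strength C (inverse regularization; C=1.0 is the scikit-learn default) could be varied in the same way. The boundary of a logistic regression stays linear, but the decision function becomes steeper for larger C; a sketch:

fig, axes = plt.subplots(nrows=3, figsize=(10, 30))
for ax, C in zip(axes, [0.01, 1, 100]):
    classifier = LogisticRegression(solver='lbfgs', C=C)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('Logistic Regression, C: {c}'.format(c=C))
plt.tight_layout()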

SVM

In [10]:
fig, axes = plt.subplots(nrows=3, figsize=(10, 30))
for ax, kernel in zip(axes, ['linear', 'rbf', 'poly']):
    classifier = SVC(kernel=kernel, gamma='auto')
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('SVM, kernel: {k}'.format(k=kernel))
plt.tight_layout()
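
For the RBF kernel, the width parameter gamma strongly influences how local the decision surface becomes; a sketch with a few illustrative values (in addition to the gamma='auto' setting used above):

gammas = [0.1, 1, 10]
fig, axes = plt.subplots(nrows=len(gammas), figsize=(10, 10 * len(gammas)))
for ax, gamma in zip(axes, gammas):
    classifier = SVC(kernel='rbf', gamma=gamma)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('SVM, rbf kernel, gamma: {g}'.format(g=gamma))
plt.tight_layout()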

Decision Tree

In [11]:
max_depths = [1, 2, 3, 5, 8, 13, 21]
criteria = ['gini', 'entropy']
fig, axes = plt.subplots(ncols=len(criteria), nrows=len(max_depths), figsize=(10*len(criteria), 10*len(max_depths)))
for row, max_depth in enumerate(max_depths):
    for col, criterion in enumerate(criteria):
        ax = axes[row, col]
        classifier = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)
        show_classifier(classifier=classifier, ax=ax)
        ax.set_title('Decision Tree: max_depth:{d}, crit:{c}'.format(d=max_depth, c=criterion))

Nearest Neighbor

In [12]:
n_neighborss = [1, 2, 3, 5, 8, 13, 21]
weightss = ['uniform', 'distance']
fig, axes = plt.subplots(ncols=len(weightss), nrows=len(n_neighborss), figsize=(10*len(weightss), 10*len(n_neighborss)))
for row, n_neighbors in enumerate(n_neighborss):
    for col, weights in enumerate(weightss):
        ax = axes[row, col]
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        show_classifier(classifier=classifier, ax=ax)
        ax.set_title('NN Classifier: k:{k}, weights:{w}'.format(k=n_neighbors, w=weights))

AdaBoost

In [13]:
n_estimatorss = [2, 3, 5, 8, 13, 21, 34, 55]
fig, axes = plt.subplots(nrows=len(n_estimatorss), figsize=(10, 10*len(n_estimatorss)))
for ax, n_estimators in zip(axes, n_estimatorss):
    classifier = AdaBoostClassifier(n_estimators=n_estimators)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('AdaBoost, #base classifiers: {k}'.format(k=n_estimators))
plt.tight_layout()
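
Besides the number of base classifiers, the learning_rate parameter (the shrinkage applied to each base classifier's contribution) also shapes the ensemble; a sketch with a few illustrative values:

learning_rates = [0.1, 0.5, 1.0]
fig, axes = plt.subplots(nrows=len(learning_rates), figsize=(10, 10 * len(learning_rates)))
for ax, learning_rate in zip(axes, learning_rates):
    classifier = AdaBoostClassifier(n_estimators=21, learning_rate=learning_rate)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('AdaBoost, learning_rate: {lr}'.format(lr=learning_rate))
plt.tight_layout()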
In [ ]: