We have prepared a framework for you for visualizing the decision functions of different classifiers on different datasets. Apply the classification algorithms to the given datasets and describe your observations. Do the decision surfaces look as you would have expected? Also discuss the effects of different parametrizations.
%matplotlib inline
import itertools
import numpy
import pandas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt, cm
from matplotlib.colors import ListedColormap
plt.rcParams['figure.figsize'] = (12, 9)
import seaborn
import ipywidgets
cmap = plt.get_cmap('RdBu')
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# Create data sets
n_samples = 200
resolution = (400, 300)
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1, n_samples=n_samples//2)
rng = numpy.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = {
'moons': dict(zip(['x', 'y'], make_moons(noise=0.3, random_state=0, n_samples=n_samples,))),
'circles': dict(zip(['x', 'y'], make_circles(noise=0.2, factor=0.5, random_state=1, n_samples=n_samples,))),
'linear': dict(zip(['x', 'y'], linearly_separable)),
}
# Remove points that are too close
for k in datasets.keys():
x, y = datasets[k]['x'], datasets[k]['y']
d = numpy.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)
too_close = d < .1
n = x.shape[0]
    mask = numpy.ones(shape=(n,), dtype=bool)
    for i in range(n):
        # consider each pair only once (lower triangle) and drop the
        # lower-index point of any pair closer than the threshold
        too_close[i, i:] = False
        mask[too_close[i, :]] = False
x, y = x[mask], y[mask]
datasets[k]['x'], datasets[k]['y'] = x, y
# Create mesh grid
margin = .5
for k in datasets.keys():
x, y = datasets[k]['x'], datasets[k]['y']
low, high = x.min(axis=0) - margin, x.max(axis=0) + margin
xx, yy = numpy.meshgrid(*[numpy.linspace(start=start, stop=stop, num=num) for start, stop, num in zip(low, high, resolution)])
datasets[k]['mesh'] = xx, yy
Show data sets
def select_dataset(key):
global x, y, xx, yy, ax, name
name = key
data = datasets[key]
x = data['x']
y = data['y']
xx, yy = data['mesh']
fig, ax = plt.subplots()
ax.scatter(*x.T, c=y, cmap=cm_bright, edgecolors='k')
ax.set_title(key)
ax.set_aspect('equal')
ipywidgets.interact(select_dataset, key=ipywidgets.Dropdown(options=list(datasets.keys())))
def show_classifier(classifier, ax):
global name, x, y, xx, yy
classifier.fit(x, y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
if hasattr(classifier, "decision_function"):
Z = classifier.decision_function(numpy.c_[xx.ravel(), yy.ravel()])
else:
Z = classifier.predict_proba(numpy.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap='RdBu', alpha=.8)
ax.scatter(*x.T, c=y, cmap=cm_bright, edgecolors='k')
ax.set_aspect('equal')
fig, ax = plt.subplots()
classifier = GaussianNB()
show_classifier(classifier=classifier, ax=ax)
fig, ax = plt.subplots()
classifier = LogisticRegression(solver='lbfgs')
show_classifier(classifier=classifier, ax=ax)
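The imports above also include KNeighborsClassifier, which is not used in the prepared cells. A minimal sketch (the n_neighbors values below are only example choices, not part of the framework) that reuses the same show_classifier helper:
# Sketch: k-nearest neighbours for a few example values of n_neighbors (assumed values).
neighbor_counts = [1, 5, 15, 50]
fig, axes = plt.subplots(ncols=len(neighbor_counts), figsize=(5 * len(neighbor_counts), 5))
for ax, k in zip(axes, neighbor_counts):
    classifier = KNeighborsClassifier(n_neighbors=k)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('k-NN, n_neighbors={n}'.format(n=k))
plt.tight_layout()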
fig, axes = plt.subplots(nrows=3, figsize=(10, 30))
for ax, kernel in zip(axes, ['linear', 'rbf', 'poly']):
classifier = SVC(kernel=kernel, gamma='auto')
show_classifier(classifier=classifier, ax=ax)
ax.set_title('SVM, kernel: {k}'.format(k=kernel))
plt.tight_layout()
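The SVM cell above only varies the kernel. For the discussion of parametrizations, a small sketch of the regularization parameter C may help (the C values are assumptions chosen for illustration, not prescribed by the exercise):
# Sketch: effect of the regularization parameter C on an RBF-kernel SVM (example values).
Cs = [0.1, 1, 10, 100]
fig, axes = plt.subplots(ncols=len(Cs), figsize=(5 * len(Cs), 5))
for ax, C in zip(axes, Cs):
    classifier = SVC(kernel='rbf', gamma='auto', C=C)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('SVM, rbf, C={c}'.format(c=C))
plt.tight_layout()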
max_depths = [1, 2, 3, 5, 8, 13, 21]
criteria = ['gini', 'entropy']
fig, axes = plt.subplots(ncols=len(criteria), nrows=len(max_depths), figsize=(10*len(criteria), 10*len(max_depths)))
for row, max_depth in enumerate(max_depths):
for col, criterion in enumerate(criteria):
ax = axes[row, col]
classifier = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)
show_classifier(classifier=classifier, ax=ax)
ax.set_title('Decision Tree: max_depth:{d}, crit:{c}'.format(d=max_depth, c=criterion))
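AdaBoostClassifier is imported above but not exercised so far. A minimal sketch that follows the same pattern (the n_estimators values and random_state are example choices, not part of the framework):
# Sketch: AdaBoost with an increasing number of weak learners (assumed parameter values).
n_estimators_list = [1, 10, 50, 200]
fig, axes = plt.subplots(ncols=len(n_estimators_list), figsize=(5 * len(n_estimators_list), 5))
for ax, n_estimators in zip(axes, n_estimators_list):
    classifier = AdaBoostClassifier(n_estimators=n_estimators, random_state=0)
    show_classifier(classifier=classifier, ax=ax)
    ax.set_title('AdaBoost, n_estimators={n}'.format(n=n_estimators))
plt.tight_layout()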