Due to the ever-growing world population, Santa Claus cannot organise the distribution of all presents alone. Hence, he has decided to delegate the organisation. To this end, he wants to subdivide the world into multiple regions and delegate the planning of these subregions to his clever reindeer. These regions cannot be chosen arbitrarily, however, since spatial coherence eases the planning task for the reindeer. Can you help Santa discover the structure hidden in the data of all the cities he has to visit?
This tutorial uses several packages. If you have not installed them yet, you can use a pip command like the one below (the package list is inferred from the imports that follow):
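# Install dependencies; package names inferred from the imports below
!pip install numpy pandas seaborn matplotlib scikit-learn scipy pyclustering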
# Imports - please install these packages first
%matplotlib inline
import urllib.error
import urllib.parse
import urllib.request
from os import path
import hashlib
import numpy
import pandas
import seaborn
from matplotlib import pyplot as plt, cm
from sklearn.cluster import *
from scipy import sparse
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pyclustering.cluster import optics
# For wide screens
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
# Raw data (you do not need to download these - instead use the preprocessed data)
DATA_SOURCE_URL = 'http://download.geonames.org/export/dump/'
RAW_DATA_FILE_NAME = 'allCountries.zip'
README_FILE_NAME = 'readme.txt'
RAW_DATA_SHA256 = '207ecbe83ef1545631e49b46e1987accf53a3164dc64ee95e34dd47ecdff80fe'
README_SHA256 = '33bd18a1ee9eba0199249c2602d7d1752f548a3ff8dfea1d70a123db6d53a364'
# Preprocessed data paths
PREPROCESSED_DATA_SOURCE_URL = 'http://www.dbs.ifi.lmu.de/Lehre/KDD/WS1819/tutorials/'
COMPRESSION = 'xz'
LARGE_PREPROCESSED_FILE_NAME = 'large.csv' + '.' + COMPRESSION
SMALL_PREPROCESSED_FILE_NAME = 'small.csv' + '.' + COMPRESSION
LARGE_PREPROCESSED_SHA256 = 'b0cb7e153ba7f8dd3a189deae5629eeffb4f6e6f738f7d6b08e5f257dbc53724'
SMALL_PREPROCESSED_SHA256 = '6273dd3475bb473f170304e12db13cb1882ef0a7080b1cd422103b7fe95f2b58'
# Column names
INDEX_COLUMN_RAW = 'geonameid'
COLUMN_MINI_CLUSTER = 'mini_cluster'
COLUMNS_RAW_INTERESTING = ['geonameid', 'longitude', 'latitude', 'population']
COLUMNS_3D = ['x', 'y', 'z']
COLUMNS_2D = ['longitude', 'latitude']
Some helper functions.
def sha256(file_path):
    """Compute the SHA-256 hex digest of the file at file_path."""
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        hasher.update(f.read())
    return hasher.hexdigest()

def check_checksum(file_path, expected_checksum):
    """Return True iff the file's SHA-256 digest matches the expected one."""
    checksum = sha256(file_path)
    match = checksum == expected_checksum
    if not match:
        print('[WARNING] checksums do not match; got {a}, expected {b}'.format(a=checksum, b=expected_checksum))
    return match
def maybe_download(file_path, expected_checksum, base_url=DATA_SOURCE_URL):
    """Download base_url/file_path unless a local copy with a matching checksum exists."""
    raw_data_url = urllib.parse.urljoin(base_url, file_path)
    download = True
    if path.isfile(file_path):
        print('Found existing file:', file_path)
        if check_checksum(file_path, expected_checksum=expected_checksum):
            download = False
            print('Verified integrity')
        else:
            print('Failed to verify integrity. Re-downloading')
    if download:
        print('Downloading from', raw_data_url, '(may take some time)')
        try:
            urllib.request.urlretrieve(url=raw_data_url, filename=file_path)
        except urllib.error.HTTPError:
            print('[ERROR] URL not available.')
def show_clustering(x, y, c, ax=None, transfer_csr=None, resolution=(600, 300)):
    """Render a clustering as a rasterised longitude/latitude image.

    Each pixel is coloured by the count-weighted mean colour of the cluster
    labels falling into it. If transfer_csr is given, the labels c of the mini
    cluster centers are first propagated to all their members.
    """
    if ax is None:
        f, ax = plt.subplots()
    if transfer_csr is not None:
        c = transfer_csr @ c
    # Create image
    cmap = cm.get_cmap('hsv')
    image = numpy.zeros(shape=resolution + (4,))
    total_count = numpy.zeros(shape=resolution)
    xmin, xmax, ymin, ymax = x.min(), x.max(), y.min(), y.max()
    x_edges = numpy.linspace(xmin, xmax, num=resolution[0] + 1)
    y_edges = numpy.linspace(ymin, ymax, num=resolution[1] + 1)
    bins = (x_edges, y_edges)
    k = c.max() + 1
    for i in range(k):
        mask = c == i
        counts, xedges, yedges = numpy.histogram2d(bins=bins, x=x[mask], y=y[mask])
        total_count += counts
        image += counts[:, :, None] * numpy.asanyarray(cmap(i / k))[None, None, :]
    image /= numpy.maximum(total_count, 1)[:, :, None]
    image = numpy.transpose(image, axes=(1, 0, 2))
    ax.imshow(image, origin='lower', extent=[xmin, xmax, ymin, ymax])
    ax.set_ylabel('Latitude')
    ax.set_xlabel('Longitude')
def show_clusterings(df, y, transfer_csr, title=None):
    """Plot one clustering per column of y in a grid with three plots per row."""
    n_params = y.shape[1]
    nrows = (n_params + 2) // 3
    f, axes = plt.subplots(ncols=3, nrows=nrows, figsize=(3 * 6, nrows * 3))
    for i in range(n_params):
        ax = axes.flat[i]
        show_clustering(ax=ax, c=y[:, i], x=df['longitude'], y=df['latitude'], transfer_csr=transfer_csr)
    # Hide any unused axes and put the title on the top-centre plot
    for ax in axes.flat[n_params:]:
        ax.set_visible(False)
    axes.flat[1].set_title(title)
    plt.tight_layout()
    plt.show()
def benchmark_algorithm(alg, params, scaling=True):
    """Fit alg once per parameter dict in params, timing each run, and return the label matrix.

    Relies on the globals n (number of points), data (feature matrix), and
    scaler (a StandardScaler), which are defined further below.
    """
    n_params = len(params)
    y = numpy.empty(shape=(n, n_params), dtype=numpy.int32)
    for i, p in enumerate(params):
        print(p)
        clusterer = alg(**p)
        if scaling:
            clusterer = Pipeline([('scaling', scaler), ('clustering', clusterer)])
        %time y[:, i] = clusterer.fit_predict(data)
    return y
This part of the notebook preprocesses the raw input data. It is included for your information only; you can skip it and instead work with the preprocessed data we provide.
Download the raw data
# Raw data
maybe_download(RAW_DATA_FILE_NAME, RAW_DATA_SHA256)
maybe_download(README_FILE_NAME, README_SHA256)
# Take column information from README file
with open(README_FILE_NAME, 'r') as f:
lines = f.readlines()
start = 0
while "main 'geoname' table" not in lines[start]:
start += 1
selection = lines[start + 2:start + 2 + 19]  # the 'geoname' table has 19 columns
columns = [line.split(':')[0].strip() for line in selection]
# Read CSV file
df = pandas.read_csv(
RAW_DATA_FILE_NAME,
sep='\t',
header=None,
names=columns,
index_col=INDEX_COLUMN_RAW,
usecols=COLUMNS_RAW_INTERESTING,
compression='zip'
)
# Keep only inhabited places
old_size = df.shape[0]
df = df[df['population'] > 0]
new_size = df.shape[0]
print('Dropped', old_size - new_size, 'rows', 'out of', old_size, 'due to non-positive population')
df.drop(columns=['population'], inplace=True)
print('Dropped column: "population"')
# Transform longitude/latitude pairs to 3D coordinates using the WGS84 reference ellipsoid.
# cf. https://en.wikipedia.org/wiki/World_Geodetic_System#WGS84
phi = numpy.deg2rad(df['latitude'].values)
lam = numpy.deg2rad(df['longitude'].values)
# WGS 84 semi-major and semi-minor axes [km]
a = 6378.1370
b = 6356.7523
cos_phi = numpy.cos(phi)
sin_phi = numpy.sin(phi)
# Prime vertical radius of curvature
N_phi = a**2 / numpy.sqrt(a**2 * cos_phi**2 + b**2 * sin_phi**2)
x = N_phi * cos_phi * numpy.cos(lam)
y = N_phi * cos_phi * numpy.sin(lam)
z = (b**2 / a**2) * N_phi * sin_phi
df['x'] = x
df['y'] = y
df['z'] = z
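As a quick sanity check of the transformation (our addition, reusing the `a` and `b` defined above): a point on the equator at longitude 0 should map to $(a, 0, 0)$, and the north pole to $(0, 0, b)$.
# Sanity check (illustrative sketch, not part of the original pipeline)
def wgs84_to_xyz(lat_deg, lon_deg):
    phi, lam = numpy.deg2rad(lat_deg), numpy.deg2rad(lon_deg)
    N = a**2 / numpy.sqrt(a**2 * numpy.cos(phi)**2 + b**2 * numpy.sin(phi)**2)
    return (N * numpy.cos(phi) * numpy.cos(lam),
            N * numpy.cos(phi) * numpy.sin(lam),
            (b**2 / a**2) * N * numpy.sin(phi))

print(wgs84_to_xyz(0.0, 0.0))   # approximately (6378.137, 0, 0)
print(wgs84_to_xyz(90.0, 0.0))  # approximately (0, 0, 6356.7523)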
df.head()
df.info()
df.describe()
# Save preprocessed data
df.to_csv(LARGE_PREPROCESSED_FILE_NAME, compression=COMPRESSION)
print('Old checksum', LARGE_PREPROCESSED_SHA256)
LARGE_PREPROCESSED_SHA256 = sha256(LARGE_PREPROCESSED_FILE_NAME)
print('New checksum', LARGE_PREPROCESSED_SHA256)
# 3D plot
from mpl_toolkits.mplot3d import Axes3D
f, ax = plt.subplots(subplot_kw={'projection': '3d'})
ax.scatter(*df.loc[:, ['x', 'y', 'z']].values.T)
# Show data
seaborn.jointplot(data=df, kind='hex', x='longitude', y='latitude', joint_kws={'bins': 'log'})
As the full data set is too large for many clustering algorithms to finish in sensible time, we first aggregate it by running $k$-means with a fairly large $k$, and afterwards cluster only the resulting cluster centers.
Again, this section is only for your information; you can skip it and continue with the preprocessed data we provide.
# Load preprocessed data
df_large = pandas.read_csv(LARGE_PREPROCESSED_FILE_NAME, compression=COMPRESSION, index_col=INDEX_COLUMN_RAW)
data = df_large[COLUMNS_3D].values
# Aggregate by k-Means clustering
aggregator = MiniBatchKMeans(n_clusters=2048, n_init=16, init_size=4*2048)
%time aggregator.fit(data)
# Save assignment
df_large[COLUMN_MINI_CLUSTER] = aggregator.labels_
df_large.to_csv(LARGE_PREPROCESSED_FILE_NAME, compression=COMPRESSION)
# Use the mini cluster centers as new smaller data set
df_small = pandas.DataFrame(data=aggregator.cluster_centers_, columns=COLUMNS_3D)
df_small.to_csv(SMALL_PREPROCESSED_FILE_NAME, compression=COMPRESSION, index=False)
df_large.info()
# Show clustering
plt.subplots(figsize=(20, 10))
plt.scatter(*df_large[COLUMNS_2D].values.T, alpha=.1, s=2, c=df_large[COLUMN_MINI_CLUSTER], cmap='hsv')
plt.tight_layout()
In this section, we apply different clustering algorithms to the city data. First, we download and load the preprocessed data, and build a transfer matrix from mini-cluster labels back to individual cities.
# Download the data
maybe_download(file_path=LARGE_PREPROCESSED_FILE_NAME, expected_checksum=LARGE_PREPROCESSED_SHA256, base_url=PREPROCESSED_DATA_SOURCE_URL)
maybe_download(file_path=SMALL_PREPROCESSED_FILE_NAME, expected_checksum=SMALL_PREPROCESSED_SHA256, base_url=PREPROCESSED_DATA_SOURCE_URL)
# Read data
df_large = pandas.read_csv(LARGE_PREPROCESSED_FILE_NAME, compression=COMPRESSION, index_col=INDEX_COLUMN_RAW)
df = pandas.read_csv(SMALL_PREPROCESSED_FILE_NAME, compression=COMPRESSION)
# Show clustering
plt.subplots(figsize=(20, 10))
plt.scatter(*df_large[COLUMNS_2D].values.T, alpha=.1, s=2, c=df_large[COLUMN_MINI_CLUSTER], cmap='hsv')
plt.tight_layout()
# Use spatial information only
data = df.loc[:, COLUMNS_3D].values
n, d = data.shape
n_large = df_large.shape[0]
n, n_large
Transfer labels from cluster centers to their members
# Converting the WGS84 3D coordinates back to latitude/longitude pairs is cumbersome.
# Instead, we transfer the label of each small cluster's center to all of its
# members, for which we already know the latitude/longitude.
# To this end, we use the following sparse membership matrix.
n_large = df_large.shape[0]
data_coo = numpy.ones(shape=(n_large,))
i_coo = numpy.arange(n_large)
j_coo = df_large[COLUMN_MINI_CLUSTER].values
transfer_coo = sparse.coo_matrix((data_coo, (i_coo, j_coo)), dtype=numpy.int32)
transfer_csr = transfer_coo.tocsr()
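To see how the label transfer works, here is a tiny toy example (our addition, not part of the pipeline): three points assigned to two mini clusters; multiplying the membership matrix by the mini-cluster labels gives each point the label of its center.
# Toy illustration: 3 points, 2 mini clusters (assignments: 0, 1, 0)
toy_membership = sparse.coo_matrix(
    (numpy.ones(3), (numpy.arange(3), numpy.array([0, 1, 0]))),
    dtype=numpy.int32,
).tocsr()
toy_labels = numpy.array([7, 3], dtype=numpy.int32)  # labels of the two centers
print(toy_membership @ toy_labels)  # -> [7 3 7]: each point inherits its center's label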
scaler = StandardScaler()
# Number of clusters to try
ks = [2, 3, 5, 8, 13, 21]
We show the results in longitude/latitude plots. Note, however, that the clustering itself is performed on the actual 3D coordinates.
Apply the $k$-Means algorithm and experiment with the parameters. What do you observe, in particular regarding runtime and the resulting clustering?
y_kmeans = benchmark_algorithm(
alg=KMeans,
params=[
{
'n_clusters': k,
}
for k in ks
],
)
show_clusterings(df=df_large, y=y_kmeans, transfer_csr=transfer_csr, title='k-means')
Apply the EM algorithm and experiment with the parameters. What do you observe, in particular regarding runtime and the resulting clustering?
y_em = benchmark_algorithm(
alg=GaussianMixture,
params=[
{
'n_components': k,
'covariance_type': cov,
'n_init': 10,
}
for cov in ['full', 'spherical'] for k in ks
]
)
show_clusterings(df=df_large, y=y_em, title='EM', transfer_csr=transfer_csr)
Apply spectral clustering and experiment with the parameters. What do you observe, in particular regarding runtime and the resulting clustering?
y_spectral = benchmark_algorithm(
alg=SpectralClustering,
params=[
{
'n_clusters': k,
'affinity': 'nearest_neighbors',
'n_neighbors': nn,
'n_jobs': -1,
}
for nn in [7, 50] for k in ks
]
)
show_clusterings(df=df_large, y=y_spectral, title='Spectral Clustering', transfer_csr=transfer_csr)