# Spectral Clustering
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import cluster, datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Seed NumPy's global random generator so that randomly drawn data
# (e.g. the np.random.rand "no structure" set) is reproducible across runs.
np.random.seed(0)
def getClusterCentroids(X, spectral_labels):
    """Return the centroid (per-feature mean) of every cluster.

    Inputs:
    X - Data points, one row per sample; anything ``pandas.DataFrame``
        accepts (e.g. a 2-D array or a list of rows)
    spectral_labels - Cluster label of each row of X, in row order

    Returns:
    A NumPy array with one row per distinct label (rows ordered by sorted
    label value) and one column per feature of X. X itself is not modified.
    """
    features = pd.DataFrame(X)
    # Group by the label array directly instead of inserting a temporary
    # "spectral_labels" column: a feature column that happens to carry that
    # name is then never overwritten, and no column is added to shared data.
    return features.groupby(np.asarray(spectral_labels)).mean().values
def unnormalizedSpectralClustering(X,k,params):
    """
    Unnormalized Spectral Clustering

    Inputs:
    X - Array of data points
    k - Number of clusters to construct
    params - Additional parameters for constructing the similarity graph

    NOTE(review): only the number of data points is computed here. No
    similarity-graph, Laplacian, eigenvector or k-means step is present and
    nothing is returned, although the driver script reads result['labels']
    from the value this function returns -- the body appears to be truncated.
    """
    # Number of data points.
    n = len(X)
def normalizedRWSpectralClustering(X, k, params):
    """
    Normalized Spectral Clustering according to Shi and Malik (2000)
    Uses the normalized Random Walk Laplacian matrix

    Inputs:
    X - Array of data points
    k - Number of clusters to construct
    params - Additional parameters for constructing the similarity graph

    Returns:
    A dict whose 'labels' entry holds the k-means cluster label of every
    point (the driver script reads result['labels']).
    """
    # NOTE(review): the steps that build the similarity graph from X/params
    # and compute the eigenvector matrix U are not present in this function;
    # U is undefined here and must be restored before this can run.

    ## For i=1,...,n, let y_i be the vector corresponding to the i-th row of U
    Y = U
    ## Cluster the points (y_i)i=1,...,n with the k-means algorithm into
    ## clusters C_1,...,C_k
    kmeans = KMeans(n_clusters=k, random_state=0).fit(Y)
    # NOTE(review): only the 'labels' key is known from the caller; confirm
    # whether further entries (e.g. centroids) were part of the result.
    return {'labels': kmeans.labels_}
def normalizedSymSpectralClustering(X, k, params):
    """
    Normalized Spectral Clustering according to Ng, Jordan and Weiss (2002)
    Uses the normalized symmetric Laplacian matrix

    Inputs:
    X - Array of data points
    k - Number of clusters to construct
    params - Additional parameters for constructing the similarity graph

    Returns:
    A dict whose 'labels' entry holds the k-means cluster label of every
    point (the driver script reads result['labels']).
    """
    # Number of data points.
    n = len(X)
    # NOTE(review): the steps that build the similarity graph from X/params
    # and derive the matrix T are not present in this function; T is
    # undefined here and must be restored before this can run.

    ## For i=1,...,n, let y_i be the vector corresponding to the i-th row of T
    ## Cluster the points (y_i)i=1,...,n with the k-means algorithm into
    ## clusters C_1,...,C_k
    kmeans = KMeans(n_clusters=k, random_state=0).fit(T)
    # NOTE(review): only the 'labels' key is known from the caller; confirm
    # whether further entries (e.g. centroids) were part of the result.
    return {'labels': kmeans.labels_}
# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
# Uniform noise without labels: a dataset with no cluster structure.
no_structure = np.random.rand(n_samples, 2), None

# Anisotropically distributed data: blobs sheared by a linear transformation
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs with varied variances
# NOTE(review): `varied` was referenced below but never defined; this
# definition follows the scikit-learn cluster-comparison example -- confirm
# the cluster_std values against the original notebook.
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)

# Each entry pairs an (X, y) dataset with its display name and the number of
# clusters to look for.
simple_datasets = [
    (noisy_circles, {'name': 'Noisy Circles', 'n_clusters': 2}),
    (noisy_moons, {'name': 'Noisy Moons', 'n_clusters': 2}),
    (varied, {'name': 'Blobs with varied variances', 'n_clusters': 3}),
    (aniso, {'name': 'Anisotropic data', 'n_clusters': 3}),
    (blobs, {'name': 'Blobs', 'n_clusters': 3}),
    (no_structure, {'name': 'No structure', 'n_clusters': 3})]
# Overview figure: one panel per dataset, all in a single row.
plt.figure(figsize=(9 * 2 + 3, 3))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)
plot_num = 1
for i_dataset, (dataset, dataset_params) in enumerate(simple_datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    name = dataset_params['name']
    plt.subplot(1, len(simple_datasets), plot_num)
    plt.title(name, size=18)
    # The labels y are not used for colouring here.
    plt.scatter(X[:, 0], X[:, 1], s=10)
    # Same fixed window for every panel (data was standardized above).
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    # Hide axis ticks.
    plt.xticks(())
    plt.yticks(())
    plot_num += 1
# ============
# Set up cluster parameters
# ============
# Comparison figure: one row per dataset, one column per algorithm.
plt.figure(figsize=(4 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)
plot_num = 1

# NOTE(review): the two loop headers, the dataset unpacking/scaling and the
# subplot/title/scatter calls below were missing from this section (`params`,
# `algorithm` and `t0` were used but never bound). They are reconstructed
# from the names the surviving code uses and from the scikit-learn
# cluster-comparison example -- confirm against the original notebook.
for i_dataset, (dataset, params) in enumerate(simple_datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    clustering_algorithms = (
        ('KMeans', kmeans),
        ('Unnormalized Spectral Clustering', unnormalizedSpectralClustering),
        ('Normalized Spectral Clustering\nRandom Walk',
         normalizedRWSpectralClustering),
        ('Normalized Spectral Clustering\nSymmetric Laplacian',
         normalizedSymSpectralClustering)
    )

    for name, algorithm in clustering_algorithms:
        # Time only the clustering itself, not the plotting.
        t0 = time.time()
        if name == 'KMeans':
            # scikit-learn estimator: fitted in place.
            algorithm.fit(X)
        else:
            # Spectral variants are plain functions returning a result dict.
            k = params['n_clusters']
            spectral_params = {
                'k_neighbors': 12
            }
            result = algorithm(X, k, spectral_params)
        t1 = time.time()

        if name == 'KMeans':
            # `np.int` was removed in NumPy 1.24; it was an alias of the
            # builtin `int`, so this is the same conversion.
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = result['labels']

        plt.subplot(len(simple_datasets), len(clustering_algorithms),
                    plot_num)
        if i_dataset == 0:
            # Column headers only on the first row.
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], s=10, c=y_pred)

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        # Elapsed fit time in the lower-right corner, leading zero stripped.
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()