0% found this document useful (0 votes)
6 views

Spectral Clustering

The document contains Python code for implementing various spectral clustering algorithms, including unnormalized, normalized random walk, and normalized symmetric spectral clustering. It generates synthetic datasets and visualizes the clustering results using KMeans and the spectral methods. The code utilizes libraries such as NumPy, Pandas, Matplotlib, and Scikit-learn for data manipulation and visualization.

Uploaded by

ravintej22
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
6 views

Spectral Clustering

The document contains Python code for implementing various spectral clustering algorithms, including unnormalized, normalized random walk, and normalized symmetric spectral clustering. It generates synthetic datasets and visualizes the clustering results using KMeans and the spectral methods. The code utilizes libraries such as NumPy, Pandas, Matplotlib, and Scikit-learn for data manipulation and visualization.

Uploaded by

ravintej22
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 5

import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture


from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from itertools import cycle, islice

import networkx as nx
from scipy import sparse
from sklearn.cluster import KMeans

np.random.seed(0)

def getClusterCentroids(X,spectral_labels):
    """
    Helper that computes the centroid of every cluster.

    Inputs:
        X - Array of data points (anything pandas.DataFrame accepts), one
            point per row
        spectral_labels - Cluster label of each row of X, same length as X

    Returns:
        2-D array with one row per distinct label, ordered by ascending
        label; each row is the column-wise mean of the points of X that
        carry that label.
    """
    frame = pd.DataFrame(X)
    # Group by the label array itself rather than by a temporary column:
    # nothing is written into `frame`, so a DataFrame passed in as X can never
    # pick up a stray 'spectral_labels' column, and a column of that name in
    # X cannot clash with the labels.
    return frame.groupby(np.asarray(spectral_labels)).mean().values

def unnormalizedSpectralClustering(X,k,params):
    """
    Unnormalized Spectral Clustering

    Inputs:
        X - Array of data points
        k - Number of clusters to construct
        params - Additional parameters for constructing the similarity graph;
                 params['k_neighbors'] is the neighbor count handed to
                 kneighbors_graph

    Returns:
        dict with
        'labels'    - KMeans label of every point, computed on the spectral
                      embedding
        'centroids' - per-cluster means of X as returned by
                      getClusterCentroids
    """
    ## Construct a similarity graph by one of the ways described in Section 2.
    ## Let W be its weighted adjacency matrix
    # Computes the (weighted) graph of k-Neighbors for points in X. The
    # distance is 'euclidean'.
    # NOTE(review): mode='distance' makes the edge weights distances, not
    # similarities -- confirm this is the intended weighting.
    A = kneighbors_graph(X, params['k_neighbors'], mode='distance',
                         metric='euclidean', include_self=True)

    ## Compute the unnormalized Laplacian L
    # nx.from_scipy_sparse_matrix was removed in networkx 3.0; use its
    # replacement when available and fall back on older releases.
    to_graph = getattr(nx, 'from_scipy_sparse_array', None)
    if to_graph is None:
        to_graph = nx.from_scipy_sparse_matrix
    G = to_graph(A)
    L = nx.laplacian_matrix(G)

    ## Compute the first k eigenvectors u_1,...,u_k of L
    eigenvalues, eigenvectors = sparse.linalg.eigs(L, k=k, which='SM')
    # eigs returns complex arrays; drop negligible imaginary parts.
    eigenvectors = np.real_if_close(eigenvectors)
    eigenvalues = np.real_if_close(eigenvalues)
    kfirst_indices = np.argsort(eigenvalues)[:k]

    ## Let U be the matrix containing the vectors u_1,...,u_k as columns
    ## For i=1,...,n, let y_i be the vector corresponding to the i-th row of U
    Y = eigenvectors[:, kfirst_indices]

    ## Cluster the points (y_i)i=1,...,n with the k-means algorithm into
    ## clusters C_1,...,C_k
    kmeans = KMeans(n_clusters=k, random_state=0).fit(Y)

    ## Output: Clusters A_1,...,A_k with A_i = {j|y_j in C_i}
    return {
        'labels': kmeans.labels_,
        'centroids': getClusterCentroids(X, kmeans.labels_)
    }

def normalizedRWSpectralClustering(X,k,params):
    """
    Normalized Spectral Clustering according to Shi and Malik (2000)
    Uses the normalized Random Walk Laplacian matrix

    Inputs:
        X - Array of data points
        k - Number of clusters to construct
        params - Additional parameters for constructing the similarity graph;
                 params['k_neighbors'] is the neighbor count handed to
                 kneighbors_graph

    Returns:
        dict with
        'labels'    - KMeans label of every point, computed on the spectral
                      embedding
        'centroids' - per-cluster means of X as returned by
                      getClusterCentroids
    """
    ## Construct a similarity graph by one of the ways described in Section 2.
    ## Let W be its weighted adjacency matrix
    # Computes the (weighted) graph of k-Neighbors for points in X. The
    # distance is 'euclidean'.
    # NOTE(review): mode='distance' makes the edge weights distances, not
    # similarities -- confirm this is the intended weighting.
    A = kneighbors_graph(X, params['k_neighbors'], mode='distance',
                         metric='euclidean', include_self=True)

    ## Compute the unnormalized Laplacian L
    # nx.from_scipy_sparse_matrix was removed in networkx 3.0; use its
    # replacement when available and fall back on older releases.
    to_graph = getattr(nx, 'from_scipy_sparse_array', None)
    if to_graph is None:
        to_graph = nx.from_scipy_sparse_matrix
    G = to_graph(A)
    L = nx.laplacian_matrix(G)

    # Weighted degree of every node, in node order, as the diagonal matrix D.
    # The (node, degree) pairs are unpacked explicitly instead of relying on
    # numpy to coerce the DegreeView into a two-column array.
    degrees = np.array([degree for _, degree in
                        G.degree(G.nodes(), weight='weight')])
    D = sparse.diags(degrees)

    ## Compute the first k generalized eigenvectors u_1,...,u_k of the
    ## generalized eigenproblem Lu=lambda*Du
    # SM = Smallest Magnitude
    eigenvalues, eigenvectors = sparse.linalg.eigs(L, k=k, M=D, which='SM')
    # eigs returns complex arrays; drop negligible imaginary parts.
    eigenvectors = np.real_if_close(eigenvectors)
    eigenvalues = np.real_if_close(eigenvalues)
    kfirst_indices = np.argsort(eigenvalues)[:k]

    ## Let U be the matrix containing the vectors u_1,...,u_k as columns
    U = eigenvectors[:, kfirst_indices]

    ## For i=1,...,n, let y_i be the vector corresponding to the i-th row of U
    Y = U

    ## Cluster the points (y_i)i=1,...,n with the k-means algorithm into
    ## clusters C_1,...,C_k
    kmeans = KMeans(n_clusters=k, random_state=0).fit(Y)

    ## Output: Clusters A_1,...,A_k with A_i = {j|y_j in C_i}
    return {
        'labels': kmeans.labels_,
        'centroids': getClusterCentroids(X, kmeans.labels_)
    }

def normalizedSymSpectralClustering(X,k,params):
    """
    Normalized Spectral Clustering according to Ng, Jordan and Weiss (2002)
    Uses the normalized symmetric Laplacian matrix

    Inputs:
        X - Array of data points
        k - Number of clusters to construct
        params - Additional parameters for constructing the similarity graph;
                 params['k_neighbors'] is the neighbor count handed to
                 kneighbors_graph

    Returns:
        dict with
        'labels'    - KMeans label of every point, computed on the
                      row-normalized spectral embedding
        'centroids' - per-cluster means of X as returned by
                      getClusterCentroids
    """
    ## Construct a similarity graph by one of the ways described in Section 2.
    ## Let W be its weighted adjacency matrix
    # Computes the (weighted) graph of k-Neighbors for points in X. The
    # distance is 'euclidean'.
    # NOTE(review): mode='distance' makes the edge weights distances, not
    # similarities -- confirm this is the intended weighting.
    A = kneighbors_graph(X, params['k_neighbors'], mode='distance',
                         metric='euclidean', include_self=True)

    ## Compute the normalized Laplacian L_sym
    # nx.from_scipy_sparse_matrix was removed in networkx 3.0; use its
    # replacement when available and fall back on older releases.
    to_graph = getattr(nx, 'from_scipy_sparse_array', None)
    if to_graph is None:
        to_graph = nx.from_scipy_sparse_matrix
    G = to_graph(A)
    L_sym = nx.normalized_laplacian_matrix(G)

    ## Compute the first k eigenvectors u_1,...,u_k of L_sym
    eigenvalues, eigenvectors = sparse.linalg.eigs(L_sym, k=k, which='SM')
    # eigs returns complex arrays; drop negligible imaginary parts.
    eigenvectors = np.real_if_close(eigenvectors)
    eigenvalues = np.real_if_close(eigenvalues)
    kfirst_indices = np.argsort(eigenvalues)[:k]

    ## Let U be the matrix containing the vectors u_1,...,u_k as columns
    U = eigenvectors[:, kfirst_indices]

    ## Form the matrix T from U by normalizing the rows to norm 1
    row_norms = np.sqrt(np.sum(U**2, axis=1))[:, np.newaxis]
    # An all-zero row would give 0/0 = NaN and make KMeans fail; dividing by
    # 1 instead leaves such a row at the origin.
    row_norms[row_norms == 0] = 1
    T = U / row_norms

    ## For i=1,...,n, let y_i be the vector corresponding to the i-th row of T
    ## Cluster the points (y_i)i=1,...,n with the k-means algorithm into
    ## clusters C_1,...,C_k
    kmeans = KMeans(n_clusters=k, random_state=0).fit(T)

    ## Output: Clusters A_1,...,A_k with A_i = {j|y_j in C_i}
    return {
        'labels': kmeans.labels_,
        'centroids': getClusterCentroids(X, kmeans.labels_)
    }

# ============
# Build the toy datasets. 1500 samples is large enough to show how the
# algorithms scale while keeping the running times short.
# ============
n_samples = 1500

# The first, second and fourth generators below draw from the global NumPy
# RNG, so their relative order must not change.
noisy_circles = datasets.make_circles(
    n_samples=n_samples, factor=0.5, noise=0.05
)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = (np.random.rand(n_samples, 2), None)

# Anisotropically distributed data: blobs pushed through a linear map
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs whose clusters have different spreads
varied = datasets.make_blobs(
    n_samples=n_samples,
    cluster_std=[1.0, 2.5, 0.5],
    random_state=random_state,
)

# Each entry pairs an (X, y) dataset with its display name and the number of
# clusters to look for.
simple_datasets = [
    (noisy_circles, dict(name='Noisy Circles', n_clusters=2)),
    (noisy_moons, dict(name='Noisy Moons', n_clusters=2)),
    (varied, dict(name='Blobs with varied variances', n_clusters=3)),
    (aniso, dict(name='Anisotropic data', n_clusters=3)),
    (blobs, dict(name='Blobs', n_clusters=3)),
    (no_structure, dict(name='No structure', n_clusters=3)),
]

# Overview figure: one row of scatter plots showing each standardized dataset
# before any clustering is applied.
plt.figure(figsize=(9 * 2 + 3, 3))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

# Subplot positions are 1-based, hence start=1.
for plot_num, (dataset, dataset_params) in enumerate(simple_datasets, start=1):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    name = dataset_params['name']
    plt.subplot(1, len(simple_datasets), plot_num)
    plt.title(name, size=18)
    plt.scatter(X[:, 0], X[:, 1], s=10)

    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())

# ============
# Run every clustering algorithm on every dataset and plot the labelings:
# one row per dataset, one column per algorithm.
# ============
plt.figure(figsize=(4 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

# Graph parameters shared by the three spectral variants.
spectral_params = {
    'k_neighbors': 12
}

# Colours cycled over the predicted cluster labels.
palette = ['#377eb8', '#ff7f00', '#4daf4a',
           '#f781bf', '#a65628', '#984ea3',
           '#999999', '#e41a1c', '#dede00']

plot_num = 1

for i_dataset, (dataset, params) in enumerate(simple_datasets):
    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    # KMeans is an estimator object; the spectral entries are plain functions
    # called as algorithm(X, k, params).
    clustering_algorithms = (
        ('KMeans', kmeans),
        ('Unnormalized Spectral Clustering', unnormalizedSpectralClustering),
        ('Normalized Spectral Clustering\nRandom Walk',
         normalizedRWSpectralClustering),
        ('Normalized Spectral Clustering\nSymmetric Laplacian',
         normalizedSymSpectralClustering)
    )

    for name, algorithm in clustering_algorithms:
        # Only the fit / clustering call itself is timed.
        t0 = time.time()

        if name == 'KMeans':
            algorithm.fit(X)
        else:
            k = params['n_clusters']
            result = algorithm(X, k, spectral_params)

        t1 = time.time()

        if name == 'KMeans':
            # np.int was removed in NumPy 1.24; the builtin int is the
            # equivalent dtype.
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = result['labels']

        plt.subplot(len(simple_datasets), len(clustering_algorithms), plot_num)

        # Algorithm names are shown only above the first row.
        if i_dataset == 0:
            plt.title(name)

        colors = np.array(list(islice(cycle(palette), int(max(y_pred) + 1))))

        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        # Elapsed time in the lower-right corner of the subplot.
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()

You might also like