ML Python Exercises UOM BDS Cluster Analysis

CHAPTER 4 – CLUSTER ANALYSIS

1. PARTITIONING METHODS :[pg.no:87]


Partitioning methods are a widely used family of clustering algorithms in data mining that divide a dataset into K non-overlapping clusters, with each observation assigned to exactly one cluster; K-means is the most common example.
PROGRAM

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

data = {'x': [25, 34, 22, 27, 33, 33, 31, 22, 35, 34, 67, 54, 57, 43, 50,
              57, 59, 52, 65, 47, 49, 48, 35, 33, 44, 45, 38, 43, 51, 46],
        'y': [79, 51, 53, 78, 59, 74, 73, 57, 69, 75, 51, 32, 40, 47, 53,
              36, 35, 58, 59, 50, 25, 20, 14, 12, 20, 5, 29, 27, 8, 7]}

df = pd.DataFrame(data, columns=['x', 'y'])

# K-means Clustering
kmeans = KMeans(n_clusters=3).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)

labels = kmeans.labels_
plt.scatter(df['x'], df['y'], c=labels.astype(float), s=50)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()

OUTPUT

(The three cluster centroids are printed and a scatter plot of the points, coloured by cluster with the centroids marked in red, is displayed.)

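Note (supplementary, not from the printed exercise): the number of clusters K was fixed at 3 above. A common heuristic for choosing K is the elbow method; the sketch below continues from the program above (it reuses df and the imports) and plots the K-means inertia for K = 1 to 9.

# Supplementary sketch: elbow method for choosing K
inertias = []
ks = range(1, 10)
for k in ks:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(df)
    inertias.append(km.inertia_)   # within-cluster sum of squares

plt.plot(ks, inertias, 'bo-')
plt.xlabel('Number of clusters K')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

A sharp bend ("elbow") in the inertia curve suggests a reasonable value of K.
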
2. AGGLOMERATIVE CLUSTERING :[pg.no:96]
Agglomerative clustering, commonly referred to as AGNES (AGglomerative NESting), works in a bottom-up manner: each observation initially forms a single-element cluster (a leaf), and at each step of the algorithm the two most similar clusters are merged into a new, bigger cluster (a node), until all points belong to a single cluster (the root).
PROGRAM

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

X = np.array([[2, 8], [8, 15], [3, 6], [6, 9], [8, 7], [10, 10]])

# Agglomerative Clustering (Ward linkage always uses Euclidean distance;
# the distance argument is called 'metric' in recent scikit-learn and
# 'affinity' in older versions, so it is omitted here)
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = cluster.fit_predict(X)
print(labels)

# Drawing Dendrograms
plt.figure(figsize=(10, 7))
plt.title("Employee Skill Dendrograms")
dend = shc.dendrogram(shc.linkage(X, method='ward'))
plt.show()
OUTPUT

[1 2 1 0 0 0]
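Note (supplementary, not from the printed exercise): the printed labels correspond to cutting the hierarchy into three clusters. Continuing from the program above, the same flat clustering can also be read off the SciPy linkage matrix that was used for the dendrogram:

from scipy.cluster.hierarchy import fcluster

# Build the linkage matrix once and cut it into 3 flat clusters
Z = shc.linkage(X, method='ward')
flat_labels = fcluster(Z, t=3, criterion='maxclust')
print(flat_labels)   # cluster ids 1..3 (numbering may differ from sklearn's 0..2)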

3. Balanced Iterative Reducing and Clustering using Hierarchies (BIRCH):[pg.no:98]


It is appropriate for very large or streaming datasets because it can find a good clustering solution with only a single scan of the data.
PROGRAM

from sklearn.cluster import Birch

X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]

brc = Birch(branching_factor=50, n_clusters=None,
            threshold=0.5, compute_labels=True)
brc.fit(X)
print(brc.predict(X))

OUTPUT

[0 0 0 1 1 1]
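Note (supplementary, not from the printed exercise): because BIRCH builds its clustering feature (CF) tree incrementally, the same data can be processed in batches, which is what makes it suitable for streaming data. A minimal sketch, assuming the six points above arrive in two batches:

from sklearn.cluster import Birch

# Assumed example: the same six points arriving as two streaming batches
batch_1 = [[0, 1], [0.3, 1], [-0.3, 1]]
batch_2 = [[0, -1], [0.3, -1], [-0.3, -1]]

brc = Birch(branching_factor=50, n_clusters=None,
            threshold=0.5, compute_labels=True)
brc.partial_fit(batch_1)   # CF tree is built from the first batch
brc.partial_fit(batch_2)   # ...and updated with the second batch
print(brc.predict(batch_1 + batch_2))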

4. Density-based clustering (DBSCAN) :[pg.no:101-103]


It locates regions of high density that are separated from one another by regions of low density; points lying in low-density regions are labelled as noise.

PROGRAM

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
                            cluster_std=0.4, random_state=0)
X = StandardScaler().fit_transform(X)

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters)
print('Estimated number of noise points: %d' % n_noise)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o',
             markerfacecolor=tuple(col), markeredgecolor='k',
             markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o',
             markerfacecolor=tuple(col), markeredgecolor='k',
             markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters)
plt.show()

OUTPUT

Estimated number of clusters: 3

Estimated number of noise points: 18

Homogeneity: 0.953

Completeness: 0.883

V-measure: 0.917

Adjusted Rand Index: 0.952

Adjusted Mutual Information: 0.916

Silhouette Coefficient: 0.626
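
Note (supplementary, not from the printed exercise): eps=0.3 was chosen by hand above. A common heuristic is to look for the knee in a sorted k-distance plot; the sketch below continues from the program above and uses min_samples=10 as k.

from sklearn.neighbors import NearestNeighbors

# Distance to the 10th nearest neighbour of every point, sorted ascending;
# a sharp bend ("knee") in this curve is a common heuristic choice for eps.
nbrs = NearestNeighbors(n_neighbors=10).fit(X)
distances, _ = nbrs.kneighbors(X)
k_distances = np.sort(distances[:, -1])

plt.plot(k_distances)
plt.xlabel('Points sorted by 10th-NN distance')
plt.ylabel('10th-NN distance')
plt.title('k-distance plot for choosing eps')
plt.show()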


5. OPTICS CLUSTERING VS DBSCAN CLUSTERING :[pg.no:105-106]
The DBSCAN algorithm assumes a roughly constant density across clusters, whereas the OPTICS algorithm allows clusters of varying density: it orders the points by reachability distance, from which DBSCAN-like clusterings at different eps values can be extracted.
PROGRAM

from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data
np.random.seed(0)
n_points_per_cluster = 250
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05)

# Run the fit
clust.fit(X)

labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_,
                                   eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_,
                                   eps=2)

space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])

# Reachability plot
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = space[labels == klass]
    Rk = reachability[labels == klass]
    ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3)
ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')

# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = X[clust.labels_ == klass]
    ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k.', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')
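# Supplementary panels (an assumed addition, following the layout of the
# scikit-learn OPTICS example): show the DBSCAN-style cuts labels_050 and
# labels_200 computed above in the two remaining GridSpec cells.
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])

# DBSCAN cut at eps = 0.5
colors = ['g.', 'r.', 'b.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_050 == klass]
    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')

# DBSCAN cut at eps = 2
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_200 == klass]
    ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')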
plt.tight_layout()
plt.show()

OUTPUT

(A figure is displayed with the reachability plot on top and the clustering scatter plots below.)
