ML Python Exercises UOM BDS Cluster Analysis

CHAPTER 4 – CLUSTER ANALYSIS

1. PARTITIONING METHODS :[pg.no:87]


Partitioning methods are a widely used family of clustering algorithms in data mining that divide a dataset into K non-overlapping clusters, with each observation assigned to exactly one cluster; K-means is the most common example.
PROGRAM

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

data = {'x': [25, 34, 22, 27, 33, 33, 31, 22, 35, 34, 67, 54, 57, 43, 50,
              57, 59, 52, 65, 47, 49, 48, 35, 33, 44, 45, 38, 43, 51, 46],
        'y': [79, 51, 53, 78, 59, 74, 73, 57, 69, 75, 51, 32, 40, 47, 53,
              36, 35, 58, 59, 50, 25, 20, 14, 12, 20, 5, 29, 27, 8, 7]}

df = pd.DataFrame(data, columns=['x', 'y'])

# K-means Clustering
kmeans = KMeans(n_clusters=3).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)

labels = kmeans.labels_
plt.scatter(df['x'], df['y'], c=labels.astype(float), s=50)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()

OUTPUT

(The three cluster centroids are printed and a scatter plot of the points, coloured by cluster with the centroids marked in red, is displayed.)

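Note (supplementary, not from the printed exercise): the number of clusters K was fixed at 3 above. A common heuristic for choosing K is the elbow method; the sketch below continues from the program above (it reuses df and the imports) and plots the K-means inertia for K = 1 to 9.

# Supplementary sketch: elbow method for choosing K
inertias = []
ks = range(1, 10)
for k in ks:
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(df)
    inertias.append(km.inertia_)   # within-cluster sum of squares

plt.plot(ks, inertias, 'bo-')
plt.xlabel('Number of clusters K')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

A sharp bend ("elbow") in the inertia curve suggests a reasonable value of K.
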
2. AGGLOMERATIVE CLUSTERING :[pg.no:96]
Agglomerative clustering, commonly referred to as AGNES (AGglomerative NESting), works in a bottom-up manner: each observation initially forms a single-element cluster (a leaf), and at each step of the algorithm the two most similar clusters are merged into a new, bigger cluster (a node), until all points belong to a single cluster (the root).
PROGRAM

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

X = np.array([[2, 8], [8, 15], [3, 6], [6, 9], [8, 7], [10, 10]])

# Agglomerative Clustering (Ward linkage always uses Euclidean distance;
# the distance argument is called 'metric' in recent scikit-learn and
# 'affinity' in older versions, so it is omitted here)
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = cluster.fit_predict(X)
print(labels)

# Drawing Dendrograms
plt.figure(figsize=(10, 7))
plt.title("Employee Skill Dendrograms")
dend = shc.dendrogram(shc.linkage(X, method='ward'))
plt.show()
OUTPUT

[1 2 1 0 0 0]
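Note (supplementary, not from the printed exercise): the printed labels correspond to cutting the hierarchy into three clusters. Continuing from the program above, the same flat clustering can also be read off the SciPy linkage matrix that was used for the dendrogram:

from scipy.cluster.hierarchy import fcluster

# Build the linkage matrix once and cut it into 3 flat clusters
Z = shc.linkage(X, method='ward')
flat_labels = fcluster(Z, t=3, criterion='maxclust')
print(flat_labels)   # cluster ids 1..3 (numbering may differ from sklearn's 0..2)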

3. Balanced Iterative Reducing and Clustering using Hierarchies (BIRCH):[pg.no:98]


It is appropriate for very large or streaming datasets because it can find a good clustering solution with only a single scan of the data.
PROGRAM

from sklearn.cluster import Birch

X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]

brc = Birch(branching_factor=50, n_clusters=None,
            threshold=0.5, compute_labels=True)
brc.fit(X)
print(brc.predict(X))

OUTPUT

[0 0 0 1 1 1]
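Note (supplementary, not from the printed exercise): because BIRCH builds its clustering feature (CF) tree incrementally, the same data can be processed in batches, which is what makes it suitable for streaming data. A minimal sketch, assuming the six points above arrive in two batches:

from sklearn.cluster import Birch

# Assumed example: the same six points arriving as two streaming batches
batch_1 = [[0, 1], [0.3, 1], [-0.3, 1]]
batch_2 = [[0, -1], [0.3, -1], [-0.3, -1]]

brc = Birch(branching_factor=50, n_clusters=None,
            threshold=0.5, compute_labels=True)
brc.partial_fit(batch_1)   # CF tree is built from the first batch
brc.partial_fit(batch_2)   # ...and updated with the second batch
print(brc.predict(batch_1 + batch_2))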

4. Density-based clustering (DBSCAN) :[pg.no:101-103]


It locates regions of high density that are separated from one another by regions of low density; points lying in low-density regions are labelled as noise.

PROGRAM

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
                            cluster_std=0.4, random_state=0)
X = StandardScaler().fit_transform(X)

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters)
print('Estimated number of noise points: %d' % n_noise)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# Plot result
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o',
             markerfacecolor=tuple(col), markeredgecolor='k',
             markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o',
             markerfacecolor=tuple(col), markeredgecolor='k',
             markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters)
plt.show()

OUTPUT

Estimated number of clusters: 3

Estimated number of noise points: 18

Homogeneity: 0.953

Completeness: 0.883

V-measure: 0.917

Adjusted Rand Index: 0.952

Adjusted Mutual Information: 0.916

Silhouette Coefficient: 0.626
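
Note (supplementary, not from the printed exercise): eps=0.3 was chosen by hand above. A common heuristic is to look for the knee in a sorted k-distance plot; the sketch below continues from the program above and uses min_samples=10 as k.

from sklearn.neighbors import NearestNeighbors

# Distance to the 10th nearest neighbour of every point, sorted ascending;
# a sharp bend ("knee") in this curve is a common heuristic choice for eps.
nbrs = NearestNeighbors(n_neighbors=10).fit(X)
distances, _ = nbrs.kneighbors(X)
k_distances = np.sort(distances[:, -1])

plt.plot(k_distances)
plt.xlabel('Points sorted by 10th-NN distance')
plt.ylabel('10th-NN distance')
plt.title('k-distance plot for choosing eps')
plt.show()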


5. OPTICS CLUSTERING VS DBSCAN CLUSTERING :[pg.no:105-106]
The DBSCAN algorithm assumes a roughly constant density across clusters, whereas the OPTICS algorithm allows clusters of varying density: it orders the points by reachability distance, from which DBSCAN-like clusterings at different eps values can be extracted.
PROGRAM

from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np

# Generate sample data
np.random.seed(0)
n_points_per_cluster = 250
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05)

# Run the fit
clust.fit(X)

labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_,
                                   eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_,
                                   eps=2)

space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])

# Reachability plot
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = space[labels == klass]
    Rk = reachability[labels == klass]
    ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3)
ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')

# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
    Xk = X[clust.labels_ == klass]
    ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k.', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')
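# Supplementary panels (an assumed addition, following the layout of the
# scikit-learn OPTICS example): show the DBSCAN-style cuts labels_050 and
# labels_200 computed above in the two remaining GridSpec cells.
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])

# DBSCAN cut at eps = 0.5
colors = ['g.', 'r.', 'b.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_050 == klass]
    ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')

# DBSCAN cut at eps = 2
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
    Xk = X[labels_200 == klass]
    ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')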
plt.tight_layout()
plt.show()

OUTPUT

(A figure is displayed with the reachability plot on top and the clustering scatter plots below.)
