Data Science Project VI - Ipynb - Colaboratory
Data Science Project VI - Ipynb - Colaboratory
ipynb - Colaboratory
Dataset Description
Customer Segmentation Clustering
Tujuan dari project ini adalah menemukan pola perilaku customer dan membaginya menjadi
beberapa cluster agar dapat dijadikan insight.
Ket. Dataset
Import Library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import warnings

# Suppress every FutureWarning raised from here on.
warnings.simplefilter('ignore', category=FutureWarning)

from google.colab import drive

# Mount Google Drive under /content/drive so the dataset path below resolves.
drive.mount('/content/drive')
Mounted at /content/drive
# Load the customer dataset from the mounted Google Drive folder.
# The original call was missing its closing parenthesis, which made the cell
# a syntax error.
df = pd.read_csv(r'/content/drive/MyDrive/digital skola/Dataset18_Clustering_Customer.csv')
# Preview the first rows of the frame.
df.head()
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 1/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
0 1 Male 19 15 39
(200, 5)
1 2 Male 21 15 81
2 3 Female 20 16 6
# Number of missing values in each column.
df.isna().sum()
3 4 Female 23 16 77
CustomerID 0
4
Gender 5 Female 310
17 40
Age 0
dtype: int64
# Summary statistics of the frame.
df.describe()
# Correlation heatmap with the coefficient printed in each cell.
# `Gender` holds strings ("Male"/"Female"), so the correlation is restricted
# to numeric columns; pandas >= 2.0 raises on non-numeric data otherwise.
sns.heatmap(df.corr(numeric_only=True), annot=True)
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 2/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
<matplotlib.axes._subplots.AxesSubplot at 0x7f6b9768b450>
Hipotesa awal
Semakin muda maka semakin sering spending; semakin tua maka semakin jarang belanja.
Semakin tinggi gaji maka semakin sering berbelanja.
Data visual
# Distribution of customer ages: 20-bin histogram with a KDE curve on top.
# sns.distplot is deprecated; histplot(kde=True) is its replacement. It puts
# counts on the y-axis, which also makes the "count" label below accurate.
sns.set(style="whitegrid")
sns.histplot(data=df, x='Age', color="blue", bins=20, kde=True)
plt.title("Age Distribution Plot", fontsize=14)
plt.xlabel("Age", fontsize=14)
plt.ylabel("count", fontsize=14)
plt.show()
# One bar per distinct age, showing how many customers have that age.
plt.figure(figsize=(14,8))
# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Age', data=df)
plt.title("Age countplot", fontsize=14)
plt.xlabel("Age", fontsize=14)
plt.ylabel("count", fontsize=14)
plt.show()
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 3/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
# Horizontal box plot of customer ages.
# The column is passed as `x=` explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, which changes how a bare Series is drawn.
sns.boxplot(x=df['Age'])
plt.title("Age box plot",fontsize=14)
plt.xlabel("Age", fontsize=14)
plt.show()
# Age distribution split by gender, drawn as one violin per gender.
age_violin_ax = sns.violinplot(data=df, x="Gender", y="Age")
age_violin_ax.set_title("Age Violin plot with Gender", fontsize=14)
age_violin_ax.set_xlabel("Gender", fontsize=14)
age_violin_ax.set_ylabel("Age", fontsize=14)
plt.show()
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 4/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
spending score
# Distribution of spending scores: 20-bin histogram with a KDE curve on top.
# sns.distplot is deprecated; histplot(kde=True) is its replacement. It puts
# counts on the y-axis, which also makes the "count" label below accurate.
sns.set(style="whitegrid")
sns.histplot(data=df, x='Spending Score (1-100)', color="blue", bins=20, kde=True)
plt.title("Spending Score (1-100) Distribution Plot", fontsize=14)
plt.xlabel("Spending Score (1-100)", fontsize=14)
plt.ylabel("count", fontsize=14)
plt.show()
# Horizontal box plot of spending scores.
# The column is passed as `x=` explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, which changes how a bare Series is drawn.
sns.boxplot(x=df['Spending Score (1-100)'])
plt.title("Spending Score (1-100) box plot",fontsize=14)
plt.xlabel("Spending Score (1-100)", fontsize=14)
plt.show()
# Open a wide 26x8 figure for the plot drawn next.
plt.figure(figsize=(26,8))
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 5/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
# One bar per distinct spending score, showing how many customers have it.
# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Spending Score (1-100)', data=df)
plt.title("Spending Score (1-100) countplot", fontsize=14)
plt.xlabel("Spending Score (1-100)", fontsize=14)
plt.ylabel("count", fontsize=14)
plt.show()
# Spending-score distribution split by gender, one violin per gender.
spending_violin_ax = sns.violinplot(data=df, x="Gender", y="Spending Score (1-100)")
spending_violin_ax.set_title("Spending Score Violin plot with Gender", fontsize=14)
spending_violin_ax.set_xlabel("Gender", fontsize=14)
spending_violin_ax.set_ylabel("Spending Score", fontsize=14)
plt.show()
Perempuan usia 30-40 yang banyak spending dengan rata-rata spending score 40-50
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 6/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
# Age versus spending score, one point per customer, coloured by gender.
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
# NOTE(review): the original call was cut off after `palette=['blue'`; the
# second colour ('red') and the absence of further keyword arguments are
# assumptions -- confirm against the original notebook.
sns.scatterplot(x='Age', y='Spending Score (1-100)', hue='Gender', data=df, palette=['blue', 'red'])
# NOTE(review): this line was cut off at `fontsize=1`; 14 is assumed because
# every other title in the notebook uses fontsize=14.
plt.title("Scatter plot distribution of gender based on Age and spending score", fontsize=14)
plt.xlabel("Age", fontsize=14)
plt.ylabel("Spending Score", fontsize=14)
plt.show()
Data preprocessing
# Use CustomerID as the row index so it is not treated as a feature.
# The guard keeps the cell re-runnable: once CustomerID is the index it is no
# longer a column, and calling set_index again would raise KeyError.
if 'CustomerID' in df.columns:
    df.set_index('CustomerID', inplace=True)
df
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 7/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
CustomerID
1 Male 19 15 39
2 Male 21 15 81
3 Female 20 16 6
4 Female 23 16 77
5 Female 31 17 40
from sklearn.preprocessing import LabelEncoder

# Encode the Gender strings as integers on a working copy of the data.
# `df_ss` and `le` were used here without being defined anywhere in the
# visible notebook, so the cell raised NameError.
# NOTE(review): df_ss is assumed to be a plain copy of df (the preview that
# follows shows unscaled values) -- confirm against the original notebook.
df_ss = df.copy()
le = LabelEncoder()
df_ss['Gender'] = le.fit_transform(df_ss['Gender'])
df_ss.head()
CustomerID
1 1 19 15 39
2 1 21 15 81
3 0 20 16 6
4 0 23 16 77
5 0 31 17 40
# Feature matrix used by every clustering step below: (Age, Spending Score).
Age_Spend = df_ss[['Age','Spending Score (1-100)']].values

# Elbow method: fit K-Means for k = 2..8 and record the inertia of each fit.
# The loop body had lost its indentation, which made the cell an
# IndentationError; it is restored here.
cluster_range = range(2,9)
inertia_list=[]
for i in cluster_range:
    kmeans_us = KMeans(n_clusters=i,n_init=10,max_iter=100, random_state=0)
    kmeans_us.fit(Age_Spend)
    inertia_list.append(kmeans_us.inertia_)

# Inertia against the number of clusters; the bend suggests a suitable k.
plt.figure(figsize=(15,8))
plt.plot(cluster_range,inertia_list)
plt.xlabel("Num of clusters")
plt.ylabel("Distortion")
plt.show()
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 8/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
!pip install scikit-learn-extra
from sklearn_extra.cluster import KMedoids
labels_kmedoid = KMedoids(n_clusters=4).fit_predict(Age_Spend)
labels_kmedoid
array([1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 1, 0, 2, 1, 2,
0, 2, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 3, 2, 3, 1,
0, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 1,
3, 3, 1, 1, 3, 3, 3, 3, 3, 1, 3, 1, 1, 3, 3, 1, 3, 3, 1, 3, 3, 1,
1, 3, 3, 1, 3, 3, 1, 1, 3, 1, 3, 1, 1, 3, 3, 1, 3, 1, 3, 3, 3, 3,
3, 1, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 1, 2, 1, 2, 3, 2, 0, 2, 0, 2,
1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 3, 2, 0, 2, 0, 2, 0, 2,
0, 2, 0, 2, 0, 2, 3, 2, 0, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2,
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 9/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
0, 2, 0, 2, 0, 2, 0, 2, 3, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
0, 2])
# Four-cluster K-Means with k-means++ initialisation.
# random_state and n_init are pinned to the values used in the elbow loop so
# the labels are reproducible between runs; the cluster scatter plot later
# selects points by cluster id. An explicit n_init also avoids depending on
# the default, which differs between scikit-learn releases.
labels_kmean_pp = KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=0).fit_predict(Age_Spend)
labels_kmean_pp
array([3, 1, 0, 1, 3, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 3, 3, 0, 1, 3, 1,
0, 1, 0, 1, 0, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 3,
0, 3, 2, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 3,
2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 3, 3, 2, 2, 3, 2, 2, 3, 2, 2, 3,
3, 2, 2, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 1, 3, 1, 2, 1, 0, 1, 0, 1,
3, 1, 0, 1, 0, 1, 0, 1, 0, 1, 3, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 3, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 3, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1], dtype=int32)
import scipy.cluster.hierarchy as shc
from matplotlib import pyplot

# Hierarchical clustering of Age_Spend with complete linkage, drawn as a dendrogram.
complete_linkage = shc.linkage(Age_Spend, method="complete")
pyplot.figure(figsize=(14,8))
pyplot.title("Dendrograms")
dend = shc.dendrogram(complete_linkage)
# Imports and a 14x8 canvas for the dendrogram drawn next.
import scipy.cluster.hierarchy as shc
from matplotlib import pyplot
pyplot.figure(figsize=(14,8))
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 10/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
# Hierarchical clustering of Age_Spend with single linkage, drawn as a dendrogram.
single_linkage = shc.linkage(Age_Spend, method="single")
pyplot.title("Dendrograms")
dend1 = shc.dendrogram(single_linkage)
import scipy.cluster.hierarchy as shc
from matplotlib import pyplot

# Hierarchical clustering of Age_Spend with average linkage, drawn as a dendrogram.
average_linkage = shc.linkage(Age_Spend, method="average")
pyplot.figure(figsize=(14,8))
pyplot.title("Dendrograms")
dend2 = shc.dendrogram(average_linkage)
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 11/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
import scipy.cluster.hierarchy as shc
from matplotlib import pyplot

# Hierarchical clustering of Age_Spend with Ward linkage, drawn as a dendrogram.
ward_linkage = shc.linkage(Age_Spend, method="ward")
pyplot.figure(figsize=(14,8))
pyplot.title("Dendrograms")
dend3 = shc.dendrogram(ward_linkage)
# Four-cluster agglomerative clustering of Age_Spend, once with Ward linkage
# and once with complete linkage.
# NOTE(review): both original lines were cut off (after `.f` and after
# `linkage="comp`). They are completed as `.fit_predict(Age_Spend)`, matching
# the K-Medoids / K-Means cells and the label arrays shown in the output --
# confirm against the original notebook.
labels_cluster_hierarchical_Ward = AgglomerativeClustering(n_clusters=4, linkage="ward").fit_predict(Age_Spend)
labels_cluster_hierarchical_Complete = AgglomerativeClustering(n_clusters=4, linkage="complete").fit_predict(Age_Spend)
labels_cluster_hierarchical_Ward
array([0, 3, 2, 3, 0, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 0, 0, 0, 3, 0, 3,
2, 3, 2, 3, 0, 1, 0, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 3, 0, 1,
0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 3, 0, 3, 0, 3, 2, 3, 2, 3,
0, 3, 2, 3, 2, 3, 2, 3, 2, 3, 0, 3, 2, 3, 0, 3, 2, 3, 2, 3, 2, 3,
2, 3, 2, 3, 2, 3, 1, 3, 2, 3, 0, 3, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3,
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 12/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
2, 3, 2, 3, 0, 3, 2, 3, 0, 3, 0, 3, 2, 3, 2, 3, 2, 3, 2, 3, 0, 3,
2, 3])
# Notebook display of the labels held in labels_cluster_hierarchical_Complete.
labels_cluster_hierarchical_Complete
array([2, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 2, 1,
0, 1, 0, 1, 0, 2, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 3, 1, 0, 2,
0, 1, 3, 2, 2, 2, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2,
3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3, 2, 3, 3, 2,
2, 3, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3,
3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1,
2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 3, 1, 0, 1, 0, 1, 0, 1, 0, 2, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 2, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1])
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
# Compare the four clusterings on the same feature matrix with two metrics:
# silhouette score and Davies-Bouldin score.
# NOTE(review): every original line was cut off at the page margin. The label
# arguments and the trailing "\n" are completed from the variables defined
# earlier and from the pattern of the first two lines -- confirm against the
# original notebook.
print("Silhouette score of K-Medoids: ", silhouette_score(Age_Spend, labels_kmedoid), "\n")
print("Silhouette score of K-Means++: ", silhouette_score(Age_Spend, labels_kmean_pp), "\n")
print("Silhouette score of Agglo Hierarchical Ward: ", silhouette_score(Age_Spend, labels_cluster_hierarchical_Ward), "\n")
print("Silhouette score of Agglo Hierarchical Complete: ", silhouette_score(Age_Spend, labels_cluster_hierarchical_Complete), "\n")
print("Davies Bouldin score of K-Medoids: ", davies_bouldin_score(Age_Spend, labels_kmedoid), "\n")
print("Davies Bouldin score of K-Means++: ", davies_bouldin_score(Age_Spend, labels_kmean_pp), "\n")
print("Davies Bouldin score of Agglo Hierarchical Ward: ", davies_bouldin_score(Age_Spend, labels_cluster_hierarchical_Ward), "\n")
print("Davies Bouldin score of Agglo Hierarchical Complete: ", davies_bouldin_score(Age_Spend, labels_cluster_hierarchical_Complete), "\n")
# Scatter the (Age, Spending Score) points, one colour per K-Means++ cluster.
plt.figure(figsize=(15,8))
# NOTE(review): the four original plt.scatter calls were cut off right after
# the colour argument (`'pin`, `'ora`, `'gre`, `'red`). 'green' (rather than
# 'grey') and the legend label text are assumptions -- confirm against the
# original notebook.
cluster_colors = ['pink', 'orange', 'green', 'red']
for cluster_id, cluster_color in enumerate(cluster_colors):
    # Boolean mask selecting the rows assigned to this cluster.
    in_cluster = labels_kmean_pp == cluster_id
    plt.scatter(Age_Spend[in_cluster, 0], Age_Spend[in_cluster, 1],
                c=cluster_color, label=f'Cluster {cluster_id}')
plt.legend()
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 13/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
# Title and axis labels for the current axes, then render the figure.
segmentation_ax = plt.gca()
segmentation_ax.set_title('Customer Segmentation using Age and spending score', fontsize=14)
segmentation_ax.set_xlabel('Age', fontsize=14)
segmentation_ax.set_ylabel('Spending Score', fontsize=14)
plt.show()
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 14/15
02/09/22 13.41 Data Science Project VI.ipynb - Colaboratory
https://colab.research.google.com/drive/17aNfpDllpoi6wxjLL4xJekIWDpBsMnqb#scrollTo=ort78Scb66TI&printMode=true 15/15