Day 28 check-in

Take the clusters produced by yesterday's KMeans run and infer the real-world meaning of each cluster.
# First rerun the preprocessing code from the previous days
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('/Users/gj/东财-学习/python相关资料学习/Python60DaysChallenge-main/data.csv')
# Select the string (object-dtype) columns first
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
home_ownership_mapping={
    'Own Home':1,
    'Rent':2,
    'Have Mortgage':3,
    'Home Mortgage':4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)
# years in current job
years_in_job_mapping={
    '<1 year':1,
    '1 year':2,
    '2 years':3,
    '3 years':4,
    '4 years':5,
    '5 years':6,
    '6 years':7,
    '7 years':8,
    '8 years':9,
    '9 years':10,
    '10+ years':11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv("/Users/gj/东财-学习/python相关资料学习/Python60DaysChallenge-main/data.csv")
# Columns present only after get_dummies are the new one-hot columns; cast them to int
list_final = []
for i in data.columns:
    if i not in data2.columns:
        list_final.append(i)
for i in list_final:
    data[i] = data[i].astype('int')
# Map Term to 0/1
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()


# Fill missing values in continuous features with the mode
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    data[feature].fillna(mode_value, inplace=True)

from sklearn.model_selection import train_test_split
X = data.drop(columns=['Credit Default'])
y = data['Credit Default']


import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the metrics under different k values
k_range = range(2, 11)
inertia_values = []
silhouette_scores = []
ch_scores = []
db_scores = []


for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)
    inertia_values.append(kmeans.inertia_)
    silhouette = silhouette_score(X_scaled, kmeans_labels)
    silhouette_scores.append(silhouette)
    ch = calinski_harabasz_score(X_scaled, kmeans_labels)
    ch_scores.append(ch)
    db = davies_bouldin_score(X_scaled, kmeans_labels)
    db_scores.append(db)
    print(f"k={k}, inertia={inertia_values[-1]:.2f}, silhouette={silhouette_scores[-1]:.2f}, ch={ch_scores[-1]:.2f}, db={db_scores[-1]:.2f}")



# Pick a k value (hard-coded here instead of prompting the user)
selected_k = 3
# Run KMeans with the selected k
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
X['KMeans_Cluster'] = kmeans_labels
# Reduce to 2D with PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Visualize the clustering result
plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=X['KMeans_Cluster'], palette='viridis', legend='full')
plt.title('KMeans clustering result')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
# Print the cluster sizes
print(f"KMeans cluster labels (k={selected_k}) added to X:")
print(X[['KMeans_Cluster']].value_counts())

Now we need to give each cluster a real-world meaning. You normally assign meaning based on a handful of features, but the source data has many, so how do you choose which ones? Two approaches:
1. Decide up front, before clustering, which features you want to use to interpret the clusters, and cluster only on those features; the cluster meaning is then read off from that small feature set rather than from everything. For example, to cluster consumers by purchasing habits, features such as past consumption records, purchase history, and purchase amounts are all relevant, so use those to name the clusters, while other features such as age or industry are left out. This works when you already have features with a clear, intended meaning.
2. Cluster on all features first, then treat the resulting cluster labels as the target of a supervised model with the features as X, and use feature importance to decide which features to base the cluster meanings on (see the sketch after this list).
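A minimal sketch of approach 2, reusing the X and kmeans_labels built above; the choice of RandomForestClassifier and the top-10 cutoff are illustrative assumptions, not part of the original notes:

from sklearn.ensemble import RandomForestClassifier

# Fit a supervised model with the cluster labels as the target,
# then rank features by importance to pick the ones used to name the clusters
X_features = X.drop(columns=['KMeans_Cluster'])
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_features, kmeans_labels)

importances = pd.Series(rf.feature_importances_, index=X_features.columns)
top_features = importances.sort_values(ascending=False).head(10)  # top-10 is an arbitrary cutoff
print(top_features)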

X.columns
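Once a few features have been chosen (by either approach), comparing per-cluster statistics is a quick way to attach a meaning to each cluster. A sketch, assuming the hypothetical top_features from the block above:

# Compare per-cluster means of the chosen features to read off what each cluster represents
selected_features = top_features.index.tolist()  # placeholder: substitute your own chosen features
cluster_profile = X.groupby('KMeans_Cluster')[selected_features].mean().round(2)
print(cluster_profile)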

@浙大疏锦行
