day 18
Use the clustering result from yesterday's K-means run and then infer the actual meaning of each cluster.
# First, rerun the preprocessing code from earlier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
data=pd.read_csv('/Users/gj/东财-学习/python相关资料学习/Python60DaysChallenge-main/data.csv')
# First, pick out the string (object-type) columns
discrete_features=data.select_dtypes(include=['object']).columns.tolist()
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4
}
data['Home Ownership']=data['Home Ownership'].map(home_ownership_mapping)
# years in current job
years_in_job_mapping = {
    '<1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11
}
data['Years in current job']=data['Years in current job'].map(years_in_job_mapping)
data=pd.get_dummies(data,columns=['Purpose'])
data2=pd.read_csv("/Users/gj/东财-学习/python相关资料学习/Python60DaysChallenge-main/data.csv")
# Collect the new dummy columns created by get_dummies and cast them to int
list_final = []
for i in data.columns:
    if i not in data2.columns:
        list_final.append(i)
for i in list_final:
    data[i] = data[i].astype('int')
# Map Term to 0/1
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term']=data['Term'].map(term_mapping)
data.rename(columns={'Term':'Long Term'},inplace=True)
continuous_features=data.select_dtypes(include=['int64','float64']).columns.tolist()
# Fill missing values in continuous features with the mode
for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    data[feature].fillna(mode_value, inplace=True)
from sklearn.model_selection import train_test_split
X=data.drop(columns=['Credit Default'])
y=data['Credit Default']
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# Evaluate clustering metrics for different values of k
k_range = range(2, 11)
inertia_values = []
silhouette_scores = []
ch_scores = []
db_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)
    inertia_values.append(kmeans.inertia_)  # within-cluster sum of squares (elbow method)
    silhouette = silhouette_score(X_scaled, kmeans_labels)
    silhouette_scores.append(silhouette)
    ch = calinski_harabasz_score(X_scaled, kmeans_labels)
    ch_scores.append(ch)
    db = davies_bouldin_score(X_scaled, kmeans_labels)
    db_scores.append(db)
    print(f"k={k}, inertia={inertia_values[-1]:.2f}, silhouette={silhouette_scores[-1]:.2f}, "
          f"ch={ch_scores[-1]:.2f}, db={db_scores[-1]:.2f}")
# Select a k value based on the metrics above
selected_k = 3
# Run KMeans clustering with the selected k
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
X['KMeans_Cluster'] = kmeans_labels
# Reduce to 2D with PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Visualize the clustering result
plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=X['KMeans_Cluster'], palette='viridis', legend='full')
plt.title('KMeans Clustering Results')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
# Print the cluster counts
print(f"KMeans cluster labels (k={selected_k}) added to X:")
print(X[['KMeans_Cluster']].value_counts())
Now we need to give each cluster an actual meaning. In general, you assign meaning based on a handful of features, but the source data has many features, so how do you choose which ones to use? There are two approaches:
1. Decide at clustering time: select only the features you ultimately want to use to define the cluster meanings, cluster on that subset, and then interpret the clusters with those same features rather than all of them.
For example, if you want to cluster consumers by purchasing habits, features such as past spending records, purchase history, and purchase amounts are all related to purchasing habits, so you use them to define the cluster meanings, while other features such as age or industry are left out. This works when you have already built features with a clear, intended meaning (see the first sketch below).
2. Cluster on all features first, then treat the features as X and the resulting cluster labels as the target of a supervised model, and use feature importance to decide which features the cluster meanings should be based on (see the second sketch below).
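A minimal sketch of approach 1, assuming a few hand-picked columns ('Annual Income', 'Current Loan Amount', 'Monthly Debt' are placeholders chosen for illustration, not prescribed by the data): cluster on those columns only, then summarize each cluster by their means to name it.
# Sketch of approach 1: cluster on a hand-picked, meaningful subset of features.
# The column names below are illustrative assumptions; replace them with your own.
meaning_cols = ['Annual Income', 'Current Loan Amount', 'Monthly Debt']
X_subset = X[meaning_cols]
X_subset_scaled = StandardScaler().fit_transform(X_subset)
kmeans_subset = KMeans(n_clusters=selected_k, random_state=42)
subset_labels = kmeans_subset.fit_predict(X_subset_scaled)
# Name each cluster from the means of the chosen features within it
print(X_subset.assign(cluster=subset_labels).groupby('cluster').mean())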
X.columns
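For approach 2, here is a minimal sketch: the cluster label becomes the target of a supervised model, and the feature importances tell us which features to interpret the clusters with. The random forest below is an assumed model choice; any model that exposes feature importances would do.
# Sketch of approach 2: supervised model with the cluster label as the target,
# then rank features by importance (random forest is an assumed choice).
from sklearn.ensemble import RandomForestClassifier
X_features = X.drop(columns=['KMeans_Cluster'])   # all original features
y_cluster = X['KMeans_Cluster']                   # cluster labels as the target
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_features, y_cluster)
# The top-ranked features are the candidates for giving each cluster a meaning
importances = pd.Series(rf.feature_importances_, index=X_features.columns)
print(importances.sort_values(ascending=False).head(10))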