# Split the data into training, validation and test sets.
# train_test_split produces only two partitions per call, so it is called
# twice: once to hold out the test set, then again to carve the validation
# set out of the remainder.
from sklearn.model_selection import train_test_split

x = data.drop(columns=['Credit Default'])  # features: every column except the target
y = data['Credit Default']                 # target column

# First split: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# Second split: 25% of the remaining 80% becomes the validation set
# (0.25 * 0.8 = 0.2, i.e. roughly a 60/20/20 split overall).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)

print("Data shapes:")
for label, part in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_val", x_val),
    ("y_val", y_val),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{label}:", part.shape)
# 1. Random forest with default hyperparameters.
# Evaluates the baseline model: fit on the training set, predict on the test set.
print("----1.默认参数随机森林(训练集---》测试集)----")
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

start_time = time.time()  # timestamp before training
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)  # predictions on the held-out test set
end_time = time.time()  # timestamp after prediction

# The measured interval covers both training and prediction.
elapsed = end_time - start_time
print(f"训练与预测耗时: {elapsed:.4f} 秒")

baseline_report = classification_report(y_test, rf_pred)
print("\n默认随机森林 在测试集上的分类报告:")
print(baseline_report)

baseline_matrix = confusion_matrix(y_test, rf_pred)
print("默认随机森林 在测试集上的混淆矩阵:")
print(baseline_matrix)
# ---- 2. Random forest tuned with grid search ----
print("----2.网格搜索优化随机森林----")
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search exhaustively
# (3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold CV).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Grid-search object: 5-fold cross-validation scored by accuracy,
# with n_jobs=-1 to run the candidate fits in parallel.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

start_time = time.time()
# Run the grid search on the training set; cross-validation folds are
# drawn from x_train / y_train only.
grid_search.fit(x_train, y_train)
end_time = time.time()

# Report the search duration from the recorded end timestamp so the figure
# covers exactly the fit() call (previously time.time() was called again and
# end_time was left unused).
print(f"网格搜索耗时: {end_time - start_time:.4f} 秒")

# Best hyperparameter combination found by the search.
print("网格搜索找到的最佳参数组合:")
print(grid_search.best_params_)

# Evaluate the best model on the test set. With GridSearchCV's default
# refit=True, best_estimator_ is refit on the data passed to fit().
best_model = grid_search.best_estimator_
best_pred = best_model.predict(x_test)

print("\n网格搜索找到的最佳模型在测试集上的分类报告:")
print(classification_report(y_test, best_pred))
print("网格搜索找到的最佳模型在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, best_pred))
# Day 15 check-in
# (Web-page residue) Latest recommended article published 2025-08-05 19:55:44