day 16打卡

最新推荐文章于 2025-08-06 18:10:49 发布
原创最新推荐文章于 2025-08-06 18:10:49 发布 · 114 阅读
0 ·
CC 4.0 BY-SA版权
文章标签：
#深度学习 #人工智能 #机器学习
AI学习笔记专栏收录该内容
46 篇文章
订阅专栏
# Split into training, validation and test sets, since the model is to be
# evaluated twice.
# train_test_split only produces one split per call, so it is called twice:
# first to hold out the test set, then to carve the validation set out of
# the remaining rows.
from sklearn.model_selection import train_test_split

y = data['Credit Default']
x = data.drop(columns=['Credit Default'])

# First split: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# Second split: 25% of the remaining 80% (i.e. 20% of the total) becomes
# the validation set, which leaves 60% for training.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)

print("Data shapes:")
for _label, _part in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_val", x_val),
    ("y_val", y_val),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{_label}:", _part.shape)


简单的调参方法
1、随机搜索：在参数空间中随机选择参数组合，然后使用交叉验证来评估每个组合的性能
2、网格搜索
3、贝叶斯优化
基线模型：首先运行一个使用默认参数的RandomForestClassifier，记录其性能作为比较的基准。
1、网格搜索（GridSearchCV）:
需要定义参数的网格，包含所有你想要的特定值的列表。会尝试网格中所有可能的参数组合。
缺点：计算成本非常高，参数个数或每个参数的取值数量稍多，组合数就会呈指数级增长。因此网格通常设置得比较小，或者集中在认为最优参数可能存在的区域。
2、随机搜索（RandomizedSearchCV）
需要定义参数的分布，而不是固定的列表。这是它与网格搜索的主要区别：它不会尝试所有组合，而是在指定次数内随机采样。
补充：随机搜索通常用相对较少的迭代次数（如50-100）就能找到相当好的参数。
3、贝叶斯优化（BayesSearchCV from skopt）   
需要定义参数的搜索空间，与随机搜索类似。当搜索空间非常大时，它通常比网格搜索和随机搜索更高效。
核心优势：它不是随机选择下一个点，而是根据先前评估的结果建立一个概率模型（通常是高斯过程），预测哪些参数组合可能产生更好的结果，并据此选择下一个评估点。这使得它在寻找最优解方面通常比随机搜索更高效（用更少的迭代次数达到更好的性能），特别是当模型训练非常耗时的时候。




总结：计算资源充足时用网格搜索；计算资源不足时用贝叶斯优化。


# 1. Random forest with default parameters.
# Baseline model: its test-set report is the reference point for the tuned
# models that follow.
print("----1.默认参数随机森林（训练集---》测试集）----")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


start_time = time.time()  # timing covers both fitting and prediction
# Fit on the training set; fit() hands back the fitted estimator.
rf_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)  # predict on the held-out test set
end_time = time.time()

_elapsed = end_time - start_time
print(f"训练与预测耗时: {_elapsed:.4f} 秒")

# Print each heading followed by the corresponding test-set metric.
for _heading, _metric in (
    ("\n默认随机森林 在测试集上的分类报告：", classification_report),
    ("默认随机森林 在测试集上的混淆矩阵：", confusion_matrix),
):
    print(_heading)
    print(_metric(y_test, rf_pred))

# ---- 2. Tune the random forest with grid search ----
print("----2.网格搜索优化随机森林----")
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter. Grid search evaluates every
# combination: 3 * 4 * 3 * 3 = 108 candidates, each with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Build the grid search object.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

start_time = time.time()
# Run the grid search on the training set.
grid_search.fit(x_train, y_train)
end_time = time.time()

# Report the elapsed time from the recorded end timestamp, so the figure
# covers exactly the fit() call (same convention as the other sections).
print(f"网格搜索耗时: {end_time - start_time:.4f} 秒")

# Report the best parameter combination found.
print("网格搜索找到的最佳参数组合:")
print(grid_search.best_params_)

# Evaluate the best estimator found by the search on the test set.
best_model = grid_search.best_estimator_
best_pred = best_model.predict(x_test)
print("\n网格搜索找到的最佳模型在测试集上的分类报告：")
print(classification_report(y_test, best_pred))
print("网格搜索找到的最佳模型在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, best_pred))
# 3. Tune the random forest with Bayesian optimization.
# Numbered 3 because sections 1 (default forest) and 2 (grid search) precede it.
print("\n---3.贝叶斯优化随机森林（训练集-》测试集）----")
from  bayes_opt import BayesianOptimization
from  sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
import time 
import numpy as np

# Assumes x_train, y_train, x_test and y_test have already been defined.


def _as_rf_params(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Convert the optimizer's float suggestions into integer forest settings.

    The values are rounded to the nearest integer instead of truncated.
    With truncation every float below an upper bound maps to a smaller
    integer, so the upper bound of each range (e.g. n_estimators=200) could
    only be reached when the optimizer proposed exactly that value.

    Returns:
        dict of keyword arguments for RandomForestClassifier.
    """
    return {
        'n_estimators': int(round(n_estimators)),
        'max_depth': int(round(max_depth)),
        'min_samples_split': int(round(min_samples_split)),
        'min_samples_leaf': int(round(min_samples_leaf)),
    }


def rf_eval(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Objective function for the optimizer.

    Builds a random forest from the (integer-converted) hyperparameters and
    returns its mean 5-fold cross-validated accuracy on the training set.
    """
    model = RandomForestClassifier(
        random_state=42,
        **_as_rf_params(
            n_estimators, max_depth, min_samples_split, min_samples_leaf
        ),
    )
    scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
    return np.mean(scores)


# Search space: (lower, upper) bounds for each hyperparameter.
pbounds_rf = {
    'n_estimators': (50, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
}

# Create the optimizer; verbose=2 shows detailed per-iteration output.
optimizer_rf = BayesianOptimization(
    f=rf_eval,
    pbounds=pbounds_rf,
    verbose=2,
    random_state=42,
)

start_time = time.time()
# Run the optimization: 5 initial random points, then 32 guided iterations.
optimizer_rf.maximize(init_points=5, n_iter=32)
end_time = time.time()
print(f"贝叶斯优化耗时：{end_time-start_time:.2f}秒")

# Report the best parameters as returned by the optimizer (raw floats).
print("最优参数:", optimizer_rf.max['params'])

# Refit with the best parameters, converted exactly as in rf_eval so the
# final model matches the configuration that was actually scored.
best_params_rf = optimizer_rf.max['params']
best_model = RandomForestClassifier(
    random_state=42,
    **_as_rf_params(**best_params_rf),
)
best_model.fit(x_train, y_train)
best_pred = best_model.predict(x_test)
print("\n贝叶斯优化的随机森林，在测试模型上的分类报告：")
print(classification_report(y_test, best_pred))
print("贝叶斯优化后的随机森林 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, best_pred))
@浙大疏锦行