模型选择与调参
模型选择与调参
超参数调优是机器学习项目中的关键环节。合理的超参数设置能显著提升模型性能。本教程将介绍几种主流的超参数优化方法。
数据准备
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Build a synthetic binary-classification dataset; the seed is fixed so the
# tutorial's numbers are reproducible.
dataset_options = {
    'n_samples': 2000,
    'n_features': 20,
    'n_informative': 10,
    'n_classes': 2,
    'random_state': 42,
}
X, y = make_classification(**dataset_options)

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(f"训练集: {X_train.shape}, 测试集: {X_test.shape}")
网格搜索(Grid Search)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import time
# Parameter grid: every combination of the values below is evaluated
# (3 * 4 * 3 * 3 = 108 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Grid search with 5-fold cross-validation, scored by accuracy.
# time.perf_counter() is used rather than time.time() because it is a
# monotonic clock meant for measuring elapsed intervals; time.time() can jump
# if the system clock is adjusted while the search is running.
start_time = time.perf_counter()
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
grid_time = time.perf_counter() - start_time

print(f"网格搜索耗时: {grid_time:.2f}秒")
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

# Test-set evaluation. The predictions are kept in y_pred; the accuracy
# printed below is computed separately by grid_search.score().
y_pred = grid_search.predict(X_test)
print(f"测试集准确率: {grid_search.score(X_test, y_test):.4f}")
随机搜索(Random Search)
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
# Parameter distributions sampled by the random search.
# scipy's randint(low, high) draws integers from [low, high), and
# uniform(loc, scale) draws from [loc, loc + scale], so max_features is
# sampled from [0.1, 1.0] here.
param_distributions = {
    'n_estimators': randint(50, 300),
    'max_depth': [3, 5, 10, 15, 20, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.9),
}

# Random search with 5-fold cross-validation, scored by accuracy.
# time.perf_counter() is used rather than time.time() because it is a
# monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=50,  # number of sampled parameter settings
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)
random_time = time.perf_counter() - start_time

print(f"随机搜索耗时: {random_time:.2f}秒")
print(f"最佳参数: {random_search.best_params_}")
print(f"最佳交叉验证分数: {random_search.best_score_:.4f}")

# Test-set evaluation
print(f"测试集准确率: {random_search.score(X_test, y_test):.4f}")
贝叶斯优化
# 使用scikit-optimize进行贝叶斯优化
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
# Search space explored by the Bayesian optimiser: integer-valued dimensions
# for the tree hyperparameters and a real-valued one for max_features.
search_space = {
    'n_estimators': Integer(50, 300),
    'max_depth': Integer(3, 20),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Real(0.1, 0.9),
}

# Bayesian optimisation with 5-fold cross-validation, scored by accuracy.
# time.perf_counter() is used rather than time.time() because it is a
# monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
bayes_search = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_space,
    n_iter=30,  # number of optimisation iterations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
bayes_search.fit(X_train, y_train)
bayes_time = time.perf_counter() - start_time

print(f"贝叶斯优化耗时: {bayes_time:.2f}秒")
print(f"最佳参数: {bayes_search.best_params_}")
print(f"最佳交叉验证分数: {bayes_search.best_score_:.4f}")

# Test-set evaluation
print(f"测试集准确率: {bayes_search.score(X_test, y_test):.4f}")
方法对比
# Compare the three search strategies side by side.
# The number of evaluated candidates is read from each fitted search object
# (cv_results_['params'] holds one entry per candidate) instead of being
# hard-coded, so the table stays correct if param_grid or n_iter are changed.
searches = [
    ('网格搜索', grid_search, grid_time),
    ('随机搜索', random_search, random_time),
    ('贝叶斯优化', bayes_search, bayes_time),
]
results = [
    {
        '方法': label,
        '最佳分数': search.best_score_,
        '测试准确率': search.score(X_test, y_test),
        '耗时(秒)': elapsed,
        '迭代次数': len(search.cv_results_['params']),
    }
    for label, search, elapsed in searches
]

results_df = pd.DataFrame(results)
print("\n方法对比结果:")
print(results_df.to_string(index=False))
多模型选择
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Candidate models, each paired with the parameter grid to search.
models = {
    '逻辑回归': {
        # The default 'lbfgs' solver does not support the 'l1' penalty, so
        # every 'l1' candidate in the grid would fail to fit. 'liblinear'
        # supports both 'l1' and 'l2'.
        'model': LogisticRegression(
            random_state=42, max_iter=1000, solver='liblinear'
        ),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
        },
    },
    '随机森林': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, None],
        },
    },
    '梯度提升': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
        },
    },
}
# Run a grid search (5-fold CV, accuracy) for every candidate model and
# record the refit best estimator together with its scores.
best_models = {}
for name, config in models.items():
    print(f"\n正在优化 {name}...")
    grid = GridSearchCV(
        config['model'],
        config['params'],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Evaluate on the held-out test set once and reuse the value for both the
    # stored record and the printed report.
    test_score = grid.score(X_test, y_test)
    best_models[name] = {
        'model': grid.best_estimator_,
        'best_params': grid.best_params_,
        'best_score': grid.best_score_,
        'test_score': test_score,
    }
    print(f" 最佳参数: {grid.best_params_}")
    print(f" 最佳CV分数: {grid.best_score_:.4f}")
    print(f" 测试集分数: {test_score:.4f}")

# Pick the winner by cross-validation score, not by test-set score.
best_model_name = max(best_models, key=lambda x: best_models[x]['best_score'])
print(f"\n最佳模型: {best_model_name}")
print(f"最佳模型参数: {best_models[best_model_name]['best_params']}")
学习曲线分析
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, cv=5):
    """Plot training and validation accuracy against training-set size.

    Calls ``learning_curve`` at ten training-set fractions evenly spaced
    between 0.1 and 1.0, then draws the mean score of each curve with a
    shaded band of one standard deviation across the CV folds.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        title: Figure title.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation setting forwarded to ``learning_curve``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 10),
        n_jobs=-1,
    )

    plt.figure(figsize=(8, 5))
    # Draw the training curve first, then the validation curve.
    curves = (
        (fit_scores, 'blue', '训练分数'),
        (val_scores, 'red', '验证分数'),
    )
    for fold_scores, colour, label in curves:
        centre = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=label)
        plt.fill_between(
            sizes, centre - spread, centre + spread, alpha=0.1, color=colour
        )

    plt.xlabel('训练样本数')
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
# Plot the learning curve of the selected best model on the training data.
best_estimator = best_models[best_model_name]['model']
plot_learning_curve(
    estimator=best_estimator,
    title=f'{best_model_name}学习曲线',
    X=X_train,
    y=y_train,
)
验证曲线
from sklearn.model_selection import validation_curve
def plot_validation_curve(estimator, title, X, y, param_name, param_range, cv=5,
                          xscale='log'):
    """Plot training and validation accuracy against one hyperparameter.

    Calls ``validation_curve`` for each value in ``param_range`` and draws
    the mean score of each curve with a shaded band of one standard
    deviation across the CV folds.

    Args:
        estimator: Estimator passed to ``validation_curve``.
        title: Figure title.
        X: Feature matrix.
        y: Target labels.
        param_name: Name of the hyperparameter being varied.
        param_range: Values of the hyperparameter to evaluate; also used as
            the x coordinates of the plot.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        xscale: Scale of the x axis, forwarded to ``plt.xscale``. Defaults
            to ``'log'``, the scale that was previously hard-coded; pass
            ``'linear'`` for evenly spaced parameter values.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    train_scores, test_scores = validation_curve(
        estimator, X, y,
        param_name=param_name,
        param_range=param_range,
        cv=cv, n_jobs=-1,
        scoring='accuracy',
    )

    # Aggregate across CV folds (axis 1) for each parameter value.
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    test_mean = test_scores.mean(axis=1)
    test_std = test_scores.std(axis=1)

    plt.figure(figsize=(8, 5))
    plt.plot(param_range, train_mean, 'o-', color='blue', label='训练分数')
    plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color='blue')
    plt.plot(param_range, test_mean, 'o-', color='red', label='验证分数')
    plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,
                     alpha=0.1, color='red')
    plt.xlabel(param_name)
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xscale(xscale)
    plt.show()
# Validation curve: how the number of trees affects a random forest's accuracy.
plot_validation_curve(
    estimator=RandomForestClassifier(random_state=42),
    title='随机森林 - n_estimators验证曲线',
    X=X_train,
    y=y_train,
    param_name='n_estimators',
    param_range=[10, 50, 100, 200, 300],
)
总结
超参数调优是提升模型性能的关键步骤。网格搜索会穷举所有参数组合,适合较小的参数空间;随机搜索在固定的迭代预算内从参数分布中采样,在较大的参数空间中效率更高;贝叶斯优化利用已有的评估结果指导后续采样,通常能以更少的迭代次数找到较优的参数组合。实际项目中建议:先用随机搜索快速定位大致范围,再用贝叶斯优化精细调优。同时要结合学习曲线和验证曲线诊断模型状态(例如是否过拟合或欠拟合)。