🤖

模型选择与调参

📂 ai ⏱ 4 min 626 words

模型选择与调参

超参数调优是机器学习项目中的关键环节。合理的超参数设置能显著提升模型性能。本教程将介绍几种主流的超参数优化方法。

数据准备

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic binary-classification dataset: 2000 samples with 20 features,
# 10 of which are informative. The seed is fixed for reproducibility.
dataset_options = dict(
    n_samples=2000,
    n_features=20,
    n_informative=10,
    n_classes=2,
    random_state=42,
)
X, y = make_classification(**dataset_options)

# Hold out 20% of the samples as the test set (seeded split).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

print(f"训练集: {X_train.shape}, 测试集: {X_test.shape}")

网格搜索（Grid Search）

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import time

# Parameter grid: every combination is evaluated
# (3 * 4 * 3 * 3 = 108 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 5-fold cross-validation on all CPU cores.
# perf_counter() is a monotonic, high-resolution clock; unlike time.time()
# it is not affected by system clock adjustments, so it is the right tool
# for measuring an elapsed interval.
start_time = time.perf_counter()
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

grid_time = time.perf_counter() - start_time

print(f"网格搜索耗时: {grid_time:.2f}秒")
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

# Test-set evaluation. The predictions are computed once and reused:
# the fraction of matching labels is exactly the accuracy that
# grid_search.score(X_test, y_test) reports when scoring='accuracy'.
y_pred = grid_search.predict(X_test)
print(f"测试集准确率: {np.mean(y_pred == y_test):.4f}")

随机搜索（Random Search）

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Parameter distributions sampled by the random search.
# Note the scipy conventions:
#   * randint(low, high) draws integers from [low, high), high excluded.
#   * uniform(loc, scale) draws from [loc, loc + scale], NOT [loc, scale].
#     uniform(0.1, 0.8) therefore covers [0.1, 0.9], the intended range
#     for max_features (the previous uniform(0.1, 0.9) reached up to 1.0).
param_distributions = {
    'n_estimators': randint(50, 300),
    'max_depth': [3, 5, 10, 15, 20, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.1, 0.8)
}

# Randomized search: 50 sampled candidates, 5-fold cross-validation.
# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed time.
start_time = time.perf_counter()
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=50,  # number of sampled candidates
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
random_search.fit(X_train, y_train)

random_time = time.perf_counter() - start_time

print(f"随机搜索耗时: {random_time:.2f}秒")
print(f"最佳参数: {random_search.best_params_}")
print(f"最佳交叉验证分数: {random_search.best_score_:.4f}")

# Test-set evaluation
print(f"测试集准确率: {random_search.score(X_test, y_test):.4f}")

贝叶斯优化

# 使用scikit-optimize进行贝叶斯优化
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical

# Search space for Bayesian optimisation, expressed as skopt dimensions.
search_space = {
    'n_estimators': Integer(50, 300),
    'max_depth': Integer(3, 20),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Real(0.1, 0.9)
}

# Bayesian optimisation: 30 iterations, 5-fold cross-validation.
# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed time (time.time() can jump when
# the system clock is adjusted).
start_time = time.perf_counter()
bayes_search = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_space,
    n_iter=30,  # number of optimisation iterations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
bayes_search.fit(X_train, y_train)

bayes_time = time.perf_counter() - start_time

print(f"贝叶斯优化耗时: {bayes_time:.2f}秒")
print(f"最佳参数: {bayes_search.best_params_}")
print(f"最佳交叉验证分数: {bayes_search.best_score_:.4f}")

# Test-set evaluation
print(f"测试集准确率: {bayes_search.score(X_test, y_test):.4f}")

方法对比

# Side-by-side comparison of the three search strategies.
# The iteration counts are read from the fitted search objects instead of
# being hard-coded, so the table stays correct if n_iter or the grid above
# is edited.
results = [
    {
        '方法': '网格搜索',
        '最佳分数': grid_search.best_score_,
        '测试准确率': grid_search.score(X_test, y_test),
        '耗时(秒)': grid_time,
        # One entry in cv_results_['params'] per evaluated candidate.
        '迭代次数': len(grid_search.cv_results_['params'])
    },
    {
        '方法': '随机搜索',
        '最佳分数': random_search.best_score_,
        '测试准确率': random_search.score(X_test, y_test),
        '耗时(秒)': random_time,
        '迭代次数': random_search.n_iter
    },
    {
        '方法': '贝叶斯优化',
        '最佳分数': bayes_search.best_score_,
        '测试准确率': bayes_search.score(X_test, y_test),
        '耗时(秒)': bayes_time,
        '迭代次数': bayes_search.n_iter
    }
]

results_df = pd.DataFrame(results)
print("\n方法对比结果:")
print(results_df.to_string(index=False))

多模型选择

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Candidate models, each paired with the parameter grid to search.
models = {
    '逻辑回归': {
        # The default 'lbfgs' solver does not support the L1 penalty, so
        # every penalty='l1' candidate would fail to fit and be scored NaN.
        # 'liblinear' supports both 'l1' and 'l2'.
        'model': LogisticRegression(
            random_state=42, max_iter=1000, solver='liblinear'
        ),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        }
    },
    '随机森林': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, None]
        }
    },
    '梯度提升': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance']
        }
    }
}

# Grid-search every candidate model with 5-fold cross-validation.
best_models = {}
for name, config in models.items():
    print(f"\n正在优化 {name}...")
    grid = GridSearchCV(
        config['model'],
        config['params'],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_train, y_train)

    # Score the test set once and reuse the value for storage and printing.
    test_score = grid.score(X_test, y_test)

    best_models[name] = {
        'model': grid.best_estimator_,
        'best_params': grid.best_params_,
        'best_score': grid.best_score_,
        'test_score': test_score
    }

    print(f"  最佳参数: {grid.best_params_}")
    print(f"  最佳CV分数: {grid.best_score_:.4f}")
    print(f"  测试集分数: {test_score:.4f}")

# Pick the winner by cross-validation score (not test score), so the test
# set is not used for model selection.
best_model_name = max(best_models, key=lambda x: best_models[x]['best_score'])
print(f"\n最佳模型: {best_model_name}")
print(f"最佳模型参数: {best_models[best_model_name]['best_params']}")

学习曲线分析

from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, cv=5):
    """Plot training and validation accuracy against training-set size.

    Calls ``learning_curve`` at 10 evenly spaced training-set fractions
    from 0.1 to 1.0, then draws the mean score of each curve (averaged
    along axis 1 of the returned score arrays) with a shaded band of one
    standard deviation on either side.

    Args:
        estimator: Estimator passed through to ``learning_curve``.
        title: Title of the figure.
        X: Feature matrix.
        y: Target labels.
        cv: Cross-validation setting passed to ``learning_curve``.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 10),
        n_jobs=-1,
    )

    plt.figure(figsize=(8, 5))
    # Training curve first, then validation curve; each is a line plus band.
    curves = (
        (fit_scores, 'blue', '训练分数'),
        (val_scores, 'red', '验证分数'),
    )
    for scores, colour, label in curves:
        centre = scores.mean(axis=1)
        spread = scores.std(axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=label)
        plt.fill_between(
            sizes, centre - spread, centre + spread, alpha=0.1, color=colour
        )

    plt.xlabel('训练样本数')
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Learning curve of the model with the best cross-validation score.
best_estimator = best_models[best_model_name]['model']
learning_curve_title = f'{best_model_name}学习曲线'
plot_learning_curve(best_estimator, learning_curve_title, X_train, y_train)

验证曲线

from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, title, X, y, param_name, param_range, cv=5):
    """Plot training and validation accuracy against one hyperparameter.

    Calls ``validation_curve`` for every value in ``param_range``, then
    draws the mean score of each curve (averaged along axis 1 of the
    returned score arrays) with a shaded band of one standard deviation on
    either side. The x-axis is always drawn on a logarithmic scale.

    Args:
        estimator: Estimator passed through to ``validation_curve``.
        title: Title of the figure.
        X: Feature matrix.
        y: Target labels.
        param_name: Name of the hyperparameter to vary.
        param_range: Values of the hyperparameter to evaluate.
        cv: Cross-validation setting passed to ``validation_curve``.
    """
    fit_scores, val_scores = validation_curve(
        estimator, X, y,
        param_name=param_name,
        param_range=param_range,
        scoring='accuracy',
        cv=cv,
        n_jobs=-1,
    )

    plt.figure(figsize=(8, 5))
    # Training curve first, then validation curve; each is a line plus band.
    curves = (
        (fit_scores, 'blue', '训练分数'),
        (val_scores, 'red', '验证分数'),
    )
    for scores, colour, label in curves:
        centre = scores.mean(axis=1)
        spread = scores.std(axis=1)
        plt.plot(param_range, centre, 'o-', color=colour, label=label)
        plt.fill_between(
            param_range, centre - spread, centre + spread,
            alpha=0.1, color=colour
        )

    plt.xlabel(param_name)
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xscale('log')
    plt.show()

# Validation curve for the random forest's n_estimators hyperparameter.
n_estimators_grid = [10, 50, 100, 200, 300]
plot_validation_curve(
    RandomForestClassifier(random_state=42),
    '随机森林 - n_estimators验证曲线',
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=n_estimators_grid,
)

总结

超参数调优是提升模型性能的关键步骤。网格搜索会穷举所有参数组合，适合较小的参数空间；随机搜索在相同的计算预算下能覆盖更大的搜索范围，效率通常更高；贝叶斯优化利用已有的评估结果指导后续采样，往往能用更少的迭代次数找到较优的参数组合。实际项目中建议：先用随机搜索快速定位大致范围，再用贝叶斯优化精细调优。同时要结合学习曲线和验证曲线诊断模型是否存在过拟合或欠拟合。