🤖

交叉验证

📂 ai ⏱ 2 min 362 words

交叉验证 K折交叉验证 留一法 模型验证

交叉验证

交叉验证是评估机器学习模型泛化能力的重要技术。它通过将数据集划分为多个子集，多次训练和验证模型，从而得到更可靠的性能估计。

为什么需要交叉验证

简单的训练集/测试集划分存在以下问题：

模型性能评估结果依赖于数据划分方式
划作验证集的那部分数据无法参与训练，数据利用不充分
可能导致过拟合或欠拟合的误判

交叉验证通过多次划分和验证，提供了更稳定、更可靠的性能估计。

K折交叉验证

K折交叉验证将数据集划分为K个大小相似的子集，每次使用K-1个子集训练，剩下的1个子集验证，重复K次。

import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Synthetic classification data shared by the later examples in this file.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# The two estimators being compared.
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Score both estimators with 5-fold cross-validation on accuracy.
scores_lr, scores_rf = (
    cross_val_score(estimator, X, y, cv=5, scoring='accuracy')
    for estimator in (lr, rf)
)

# Report the per-fold accuracies, then the mean with a two-standard-deviation
# spread, once per model.
for header, fold_scores in (
    ("逻辑回归 5折交叉验证结果:", scores_lr),
    ("\n随机森林 5折交叉验证结果:", scores_rf),
):
    print(header)
    print(f"各折准确率: {fold_scores}")
    print(f"平均准确率: {fold_scores.mean():.4f} (+/- {fold_scores.std() * 2:.4f})")

可视化交叉验证结果

# Compare the two models' cross-validation accuracy as a bar chart with
# one-standard-deviation error bars.

# Prefer a CJK-capable font so the Chinese labels below render as text rather
# than missing-glyph boxes; the previously configured fonts stay as fallbacks.
# NOTE(review): assumes at least one of these fonts is installed — confirm.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Noto Sans CJK SC',
    *plt.rcParams['font.sans-serif'],
]
# Some CJK fonts lack the Unicode minus glyph; fall back to the ASCII hyphen.
plt.rcParams['axes.unicode_minus'] = False

models = ['逻辑回归', '随机森林']
means = [scores_lr.mean(), scores_rf.mean()]
stds = [scores_lr.std(), scores_rf.std()]

plt.figure(figsize=(8, 6))
bars = plt.bar(models, means, yerr=stds, capsize=5, color=['steelblue', 'forestgreen'])
plt.ylabel('准确率')
plt.title('模型交叉验证性能对比')
plt.ylim(0.7, 1.0)

# Annotate each bar with "mean±std". The label is placed above the top of the
# error bar (height + std) so it does not overlap the whisker and cap.
for bar, mean, std in zip(bars, means, stds):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + std + 0.01,
             f'{mean:.3f}±{std:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

留一法交叉验证

留一法是K折交叉验证的特例，K等于样本数。每次只用一个样本作为验证集。

from sklearn.model_selection import LeaveOneOut

# A deliberately small dataset for the leave-one-out demonstration.
X_small, y_small = make_classification(n_samples=50, n_features=10, random_state=42)

# Leave-one-out: every sample takes one turn as the validation set.
loo = LeaveOneOut()
scores_loo = cross_val_score(lr, X_small, y_small, cv=loo, scoring='accuracy')

# Summarise the per-sample scores.
loo_mean = scores_loo.mean()
loo_std = scores_loo.std()
print("留一法交叉验证结果:")
print(f"样本数: {len(y_small)}")
print(f"平均准确率: {loo_mean:.4f}")
print(f"标准差: {loo_std:.4f}")

分层交叉验证

分层交叉验证确保每个折中的类别比例与整体数据集保持一致，特别适用于类别不平衡的数据。

from sklearn.model_selection import StratifiedKFold

# Build an imbalanced dataset with class weights of 0.9 and 0.1.
X_imbalanced, y_imbalanced = make_classification(
    n_samples=1000,
    n_features=20,
    weights=[0.9, 0.1],
    random_state=42,
)

class_counts = np.bincount(y_imbalanced)
print(f"类别分布: {class_counts}")

# Both splitters share the same fold count, shuffling and seed, so the only
# difference between the two runs is whether the folds are stratified.
split_config = {'n_splits': 5, 'shuffle': True, 'random_state': 42}

# Stratified K-fold, scored with F1.
skf = StratifiedKFold(**split_config)
scores_stratified = cross_val_score(
    rf, X_imbalanced, y_imbalanced, cv=skf, scoring='f1'
)

stratified_mean = scores_stratified.mean()
stratified_spread = scores_stratified.std() * 2
print("\n分层交叉验证F1分数:")
print(f"各折F1: {scores_stratified}")
print(f"平均F1: {stratified_mean:.4f} (+/- {stratified_spread:.4f})")

# Plain K-fold on the same data, for comparison.
kf = KFold(**split_config)
scores_kfold = cross_val_score(
    rf, X_imbalanced, y_imbalanced, cv=kf, scoring='f1'
)

kfold_mean = scores_kfold.mean()
kfold_spread = scores_kfold.std() * 2
print("\n普通K折交叉验证F1分数:")
print(f"各折F1: {scores_kfold}")
print(f"平均F1: {kfold_mean:.4f} (+/- {kfold_spread:.4f})")

重复交叉验证

重复交叉验证通过多次重复K折划分，进一步提高评估的稳定性。

from sklearn.model_selection import RepeatedStratifiedKFold

# Stratified 5-fold splitting repeated 3 times.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
scores_repeated = cross_val_score(rf, X, y, cv=rskf, scoring='accuracy')

# Print the number of scores followed by their mean and standard deviation.
summary_lines = [
    "重复分层交叉验证结果:",
    f"总验证次数: {len(scores_repeated)}",
    f"平均准确率: {scores_repeated.mean():.4f}",
    f"标准差: {scores_repeated.std():.4f}",
]
for summary_line in summary_lines:
    print(summary_line)

时间序列交叉验证

对于时间序列数据，样本之间存在时间先后顺序，随机划分会让模型用未来的数据去预测过去。因此需要使用特殊的交叉验证方法，保证训练集在时间上始终位于验证集之前，以避免数据泄露。

from sklearn.model_selection import TimeSeriesSplit

# Synthetic time series: 5 standard-normal features per step, and a target
# built as the cumulative sum of standard-normal noise.
# A seeded generator keeps this example reproducible, matching the fixed
# random_state=42 used by the other examples in this file.
n_samples = 100
rng = np.random.default_rng(42)
X_ts = rng.standard_normal((n_samples, 5))
y_ts = np.cumsum(rng.standard_normal(n_samples))

# Time-series splitter with 5 splits.
tscv = TimeSeriesSplit(n_splits=5)

# Show the index range of each split; both printed bounds are the first and
# last index of the respective set (i.e. inclusive).
print("时间序列交叉验证划分:")
for i, (train_idx, test_idx) in enumerate(tscv.split(X_ts)):
    print(f"折 {i+1}: 训练集 [{train_idx[0]}:{train_idx[-1]}], 测试集 [{test_idx[0]}:{test_idx[-1]}]")

交叉验证用于超参数调优

from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 * 4 * 3 = 36 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Grid search that scores every candidate with 5-fold cross-validation on
# accuracy, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

print("最佳参数:", grid_search.best_params_)
print("最佳交叉验证分数:", f"{grid_search.best_score_:.4f}")

# Show the first few candidates as stored in cv_results_.
# NOTE(review): these are the first entries in cv_results_ order, not the
# best-ranked ones — confirm that is the intended selection.
results = grid_search.cv_results_
n_shown = 5
print("\n前5个参数组合的结果:")
# Slice all three sequences explicitly instead of relying on zip() stopping at
# the one shortened operand.
for mean, std, params in zip(results['mean_test_score'][:n_shown],
                             results['std_test_score'][:n_shown],
                             results['params'][:n_shown]):
    # Report a two-standard-deviation spread, consistent with the other
    # "(+/- ...)" reports in this file.
    print(f"准确率: {mean:.4f} (+/- {std * 2:.4f}), 参数: {params}")

交叉验证最佳实践

选择合适的K值：通常K=5或K=10
使用分层交叉验证：当类别不平衡时
数据预处理应在交叉验证内部进行：避免数据泄露
多次重复：提高评估稳定性
选择合适的评估指标：根据问题类型选择

交叉验证是机器学习中不可或缺的模型评估技术，能够帮助我们更准确地评估模型的泛化能力，避免过拟合，为模型选择和超参数调优提供可靠依据。