交叉验证
交叉验证
交叉验证是评估机器学习模型泛化能力的重要技术。它通过将数据集划分为多个子集,多次训练和验证模型,从而得到更可靠的性能估计。
为什么需要交叉验证
简单的训练集/测试集划分存在以下问题:
- 模型性能评估结果依赖于数据划分方式
- 一部分数据只能用于验证、无法参与训练,数据利用不充分
- 可能导致过拟合或欠拟合的误判
交叉验证通过多次划分和验证,提供了更稳定、更可靠的性能估计。
K折交叉验证
K折交叉验证将数据集划分为K个大小相似的子集,每次使用K-1个子集训练,剩下的1个子集验证,重复K次。
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# Synthetic binary-classification data shared by the examples in this file.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Two candidate models, both seeded for reproducible results.
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold cross-validation scored by accuracy, with identical settings for both models.
_cv_options = {"cv": 5, "scoring": 'accuracy'}
scores_lr = cross_val_score(lr, X, y, **_cv_options)
scores_rf = cross_val_score(rf, X, y, **_cv_options)

# Report per-fold accuracy and mean (+/- two standard deviations) for each model.
for _header, _fold_scores in (
    ("逻辑回归 5折交叉验证结果:", scores_lr),
    ("\n随机森林 5折交叉验证结果:", scores_rf),
):
    print(_header)
    print(f"各折准确率: {_fold_scores}")
    print(f"平均准确率: {_fold_scores.mean():.4f} (+/- {_fold_scores.std() * 2:.4f})")
可视化交叉验证结果
# Compare the two models' cross-validation accuracy in a bar chart.
# NOTE(review): the labels below are Chinese; Matplotlib's default font may
# not include CJK glyphs -- confirm a suitable font is configured (e.g. via
# plt.rcParams['font.sans-serif']) if the text renders as empty boxes.
models = ['逻辑回归', '随机森林']
means = [scores_lr.mean(), scores_rf.mean()]
stds = [scores_lr.std(), scores_rf.std()]

plt.figure(figsize=(8, 6))
# Error bars span one standard deviation of the per-fold scores.
bars = plt.bar(models, means, yerr=stds, capsize=5, color=['steelblue', 'forestgreen'])
plt.ylabel('准确率')
plt.title('模型交叉验证性能对比')
plt.ylim(0.7, 1.0)

# Annotate each bar with "mean±std". The label is placed above the error-bar
# cap (bar height + std) so it does not overlap the error bar drawn at the
# same x position.
for bar, mean, std in zip(bars, means, stds):
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + std + 0.01,
        f'{mean:.3f}±{std:.3f}',
        ha='center',
        va='bottom',
    )

plt.tight_layout()
plt.show()
留一法交叉验证
留一法是K折交叉验证的特例,K等于样本数。每次只用一个样本作为验证集。
from sklearn.model_selection import LeaveOneOut
# Leave-one-out fits one model per sample, so the demo uses a small dataset.
X_small, y_small = make_classification(
    n_samples=50,
    n_features=10,
    random_state=42,
)

# Each split holds out exactly one sample for validation.
loo = LeaveOneOut()
scores_loo = cross_val_score(lr, X_small, y_small, cv=loo, scoring='accuracy')

# Summarise the per-sample scores.
print(
    "留一法交叉验证结果:",
    f"样本数: {len(y_small)}",
    f"平均准确率: {scores_loo.mean():.4f}",
    f"标准差: {scores_loo.std():.4f}",
    sep="\n",
)
分层交叉验证
分层交叉验证确保每个折中的类别比例与整体数据集保持一致,特别适用于类别不平衡的数据。
from sklearn.model_selection import StratifiedKFold
# Imbalanced binary dataset generated with class weights 0.9 / 0.1.
X_imbalanced, y_imbalanced = make_classification(
    n_samples=1000,
    n_features=20,
    weights=[0.9, 0.1],
    random_state=42,
)
_class_counts = np.bincount(y_imbalanced)
print(f"类别分布: {_class_counts}")

# Both splitters share the same configuration so only the stratification differs.
_splitter_options = {"n_splits": 5, "shuffle": True, "random_state": 42}

# Stratified K-fold, scored with F1.
skf = StratifiedKFold(**_splitter_options)
scores_stratified = cross_val_score(
    rf, X_imbalanced, y_imbalanced, cv=skf, scoring='f1'
)
print("\n分层交叉验证F1分数:")
print(f"各折F1: {scores_stratified}")
print(f"平均F1: {scores_stratified.mean():.4f} (+/- {scores_stratified.std() * 2:.4f})")

# Plain K-fold on the same data for comparison.
kf = KFold(**_splitter_options)
scores_kfold = cross_val_score(
    rf, X_imbalanced, y_imbalanced, cv=kf, scoring='f1'
)
print("\n普通K折交叉验证F1分数:")
print(f"各折F1: {scores_kfold}")
print(f"平均F1: {scores_kfold.mean():.4f} (+/- {scores_kfold.std() * 2:.4f})")
重复交叉验证
重复交叉验证通过多次重复K折划分,进一步提高评估的稳定性。
from sklearn.model_selection import RepeatedStratifiedKFold
# Stratified 5-fold repeated 3 times.
rskf = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=42,
)
scores_repeated = cross_val_score(rf, X, y, cv=rskf, scoring='accuracy')

# Report how many scores were produced and their mean and spread.
print(
    "重复分层交叉验证结果:",
    f"总验证次数: {len(scores_repeated)}",
    f"平均准确率: {scores_repeated.mean():.4f}",
    f"标准差: {scores_repeated.std():.4f}",
    sep="\n",
)
时间序列交叉验证
对于时间序列数据,需要使用特殊的交叉验证方法以避免数据泄露:训练集必须始终位于验证集之前,不能用未来的数据去预测过去。
from sklearn.model_selection import TimeSeriesSplit
# Build a synthetic time series: 5 random features and a random-walk target.
# A seeded generator keeps the example reproducible, in line with the
# random_state=42 used by the other examples in this file.
n_samples = 100
rng = np.random.default_rng(42)
X_ts = rng.standard_normal((n_samples, 5))
y_ts = np.cumsum(rng.standard_normal(n_samples))

# Time-series splitter with 5 splits.
tscv = TimeSeriesSplit(n_splits=5)

# Print the first and last index of the train and test portion of each split.
print("时间序列交叉验证划分:")
for i, (train_idx, test_idx) in enumerate(tscv.split(X_ts)):
    print(f"折 {i+1}: 训练集 [{train_idx[0]}:{train_idx[-1]}], 测试集 [{test_idx[0]}:{test_idx[-1]}]")
交叉验证用于超参数调优
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the random forest: 3 * 4 * 3 = 36 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
}

# Grid search that scores every combination with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

print("最佳参数:", grid_search.best_params_)
print("最佳交叉验证分数:", f"{grid_search.best_score_:.4f}")

# Show the first five entries of cv_results_ (in stored order, not ranked by score).
# All three sequences are sliced explicitly instead of relying on zip()
# stopping at the shortest input.
results = grid_search.cv_results_
print("\n前5个参数组合的结果:")
for mean, std, params in zip(
    results['mean_test_score'][:5],
    results['std_test_score'][:5],
    results['params'][:5],
):
    print(f"准确率: {mean:.4f} (+/- {std:.4f}), 参数: {params}")
交叉验证最佳实践
- 选择合适的K值:通常K=5或K=10
- 使用分层交叉验证:当类别不平衡时
- 数据预处理应在交叉验证内部进行:避免数据泄露
- 多次重复:提高评估稳定性
- 选择合适的评估指标:根据问题类型选择
交叉验证是机器学习中不可或缺的模型评估技术,能够帮助我们更准确地评估模型的泛化能力,避免过拟合,为模型选择和超参数调优提供可靠依据。