过拟合与欠拟合
过拟合与欠拟合
过拟合和欠拟合是机器学习模型训练中最常见的问题。理解它们的成因和解决方法,对于构建高性能模型至关重要。
偏差-方差权衡
偏差-方差权衡是理解过拟合和欠拟合的核心理论:模型过于简单时偏差高(欠拟合),模型过于复杂时方差高(过拟合),泛化误差在两者取得平衡时最小。
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
# Synthetic non-linear data: a sine wave on [0, 10) with Gaussian noise.
np.random.seed(42)
X = np.sort(np.random.rand(100, 1) * 10, axis=0)
y = np.sin(X).ravel() + np.random.randn(100) * 0.3

# Polynomial degrees to compare, one subplot per degree.
degrees = [1, 3, 10]

# Dense grid used only for drawing the fitted curves. It does not depend on
# the degree, so it is built once instead of on every loop iteration.
X_test = np.linspace(0, 10, 100).reshape(-1, 1)

fig, axes = plt.subplots(1, len(degrees), figsize=(15, 4))
for ax, degree in zip(axes, degrees):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(X, y)
    y_pred = model.predict(X_test)

    # Only the training error is reported: X_test is a plotting grid and has
    # no noisy targets to score against.
    y_train_pred = model.predict(X)
    train_mse = mean_squared_error(y, y_train_pred)

    ax.scatter(X, y, s=20, alpha=0.5, label='训练数据')
    ax.plot(X_test, y_pred, 'r-', label='模型预测')
    ax.plot(X_test, np.sin(X_test), 'g--', label='真实函数')
    ax.set_title(f'多项式次数={degree}\n训练MSE={train_mse:.3f}')
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.legend()
    ax.set_ylim(-2, 2)

fig.tight_layout()
plt.show()
学习曲线
学习曲线展示了模型性能随训练样本数量的变化,是诊断过拟合和欠拟合的有效工具。
# Learning curves: training / validation MSE versus number of training samples.
from sklearn.model_selection import KFold


def _plot_learning_curve(ax, estimator, X, y, title):
    """Draw mean +/- one std of training and validation MSE for `estimator` on `ax`."""
    # X was sorted when it was generated. Unshuffled folds would hold out
    # contiguous x-ranges and the size-limited training subsets would take the
    # smallest x values first, so the curves would measure extrapolation.
    # Shuffle both the folds and the sub-sampled training subsets.
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='neg_mean_squared_error',
        shuffle=True, random_state=42)

    # Scores are negated MSE; flip the sign so the y-axis reads as an error.
    for scores, label in ((train_scores, '训练误差'), (val_scores, '验证误差')):
        mean_mse = -scores.mean(axis=1)
        std_mse = scores.std(axis=1)
        ax.plot(train_sizes, mean_mse, label=label)
        ax.fill_between(train_sizes, mean_mse - std_mse, mean_mse + std_mse,
                        alpha=0.1)

    ax.set_xlabel('训练样本数')
    ax.set_ylabel('MSE')
    ax.set_title(title)
    ax.legend()
    ax.grid(True)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Under-fitting model: plain linear regression.
model_underfit = LinearRegression()
_plot_learning_curve(axes[0], model_underfit, X, y, '欠拟合模型(线性回归)')

# Well-matched model: degree-3 polynomial features + linear regression.
model_good_fit = make_pipeline(PolynomialFeatures(3), LinearRegression())
_plot_learning_curve(axes[1], model_good_fit, X, y, '合适模型(多项式次数=3)')

plt.tight_layout()
plt.show()
验证曲线
验证曲线展示了模型性能随超参数变化的情况,帮助选择最佳超参数。
# Validation curve: training / validation MSE versus polynomial degree.
from sklearn.model_selection import KFold

degrees = np.arange(1, 15)

# X was sorted when it was generated, so unshuffled 5-fold CV would validate
# on contiguous x-ranges the model never saw (extrapolation) and bias the
# choice of degree. Shuffle the folds, with a fixed seed for reproducibility.
shuffled_cv = KFold(n_splits=5, shuffle=True, random_state=42)
train_scores, val_scores = validation_curve(
    make_pipeline(PolynomialFeatures(), LinearRegression()),
    X, y,
    param_name='polynomialfeatures__degree',
    param_range=degrees,
    cv=shuffled_cv,
    scoring='neg_mean_squared_error')

# Scores are negated MSE; convert once to positive errors and their spread.
train_mse_mean = -train_scores.mean(axis=1)
train_mse_std = train_scores.std(axis=1)
val_mse_mean = -val_scores.mean(axis=1)
val_mse_std = val_scores.std(axis=1)

plt.figure(figsize=(10, 6))
plt.plot(degrees, train_mse_mean, label='训练误差', marker='o')
plt.plot(degrees, val_mse_mean, label='验证误差', marker='s')
plt.fill_between(degrees,
                 train_mse_mean - train_mse_std,
                 train_mse_mean + train_mse_std, alpha=0.1)
plt.fill_between(degrees,
                 val_mse_mean - val_mse_std,
                 val_mse_mean + val_mse_std, alpha=0.1)
plt.xlabel('多项式次数')
plt.ylabel('MSE')
plt.title('验证曲线')
plt.legend()
plt.grid(True)
plt.show()

# Pick the degree with the lowest mean validation error.
best_degree = degrees[np.argmin(val_mse_mean)]
print(f"最佳多项式次数: {best_degree}")
过拟合的诊断与解决
from sklearn.model_selection import train_test_split

# Over-fitting example: a high-degree polynomial fitted by ordinary least squares.
degree_overfit = 15
model_overfit = make_pipeline(PolynomialFeatures(degree_overfit), LinearRegression())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_overfit.fit(X_train, y_train)
train_mse = mean_squared_error(y_train, model_overfit.predict(X_train))
test_mse = mean_squared_error(y_test, model_overfit.predict(X_test))
print(f"过拟合模型 (次数={degree_overfit}):")
print(f"训练MSE: {train_mse:.4f}")
print(f"测试MSE: {test_mse:.4f}")
# Ratio of test to training error as a rough over-fitting indicator.
print(f"过拟合程度: {test_mse/train_mse:.2f}倍")

# Remedy 1: L2 regularization (Ridge).
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# The Ridge penalty depends on feature scale. Raw powers of x up to x**15
# differ by many orders of magnitude, which leaves the high-order terms
# almost unpenalized and the problem poorly conditioned. Standardize the
# polynomial features first. The constant column is dropped because it would
# have zero variance and Ridge fits its own intercept.
model_ridge = make_pipeline(
    PolynomialFeatures(degree_overfit, include_bias=False),
    StandardScaler(),
    Ridge(alpha=1.0))
model_ridge.fit(X_train, y_train)
train_mse_ridge = mean_squared_error(y_train, model_ridge.predict(X_train))
test_mse_ridge = mean_squared_error(y_test, model_ridge.predict(X_test))
print("\n正则化后:")
print(f"训练MSE: {train_mse_ridge:.4f}")
print(f"测试MSE: {test_mse_ridge:.4f}")
欠拟合的诊断与解决
# Under-fitting example: a plain linear model scored on the existing
# train/test split.
model_underfit = LinearRegression()
model_underfit.fit(X_train, y_train)
pred_train_under = model_underfit.predict(X_train)
pred_test_under = model_underfit.predict(X_test)
train_mse_under = mean_squared_error(y_train, pred_train_under)
test_mse_under = mean_squared_error(y_test, pred_test_under)
print("欠拟合模型(线性回归):")
print(f"训练MSE: {train_mse_under:.4f}")
print(f"测试MSE: {test_mse_under:.4f}")

# Remedy: raise model capacity with degree-3 polynomial features.
model_better = make_pipeline(PolynomialFeatures(3), LinearRegression())
model_better.fit(X_train, y_train)
pred_train_better = model_better.predict(X_train)
pred_test_better = model_better.predict(X_test)
train_mse_better = mean_squared_error(y_train, pred_train_better)
test_mse_better = mean_squared_error(y_test, pred_test_better)
print("\n改进后(多项式次数=3):")
print(f"训练MSE: {train_mse_better:.4f}")
print(f"测试MSE: {test_mse_better:.4f}")
偏差-方差分解
def bias_variance_decomposition(model, X, y, n_iterations=100, random_state=None):
    """Estimate squared bias and variance of `model` by bootstrap resampling.

    The model is refitted on `n_iterations` bootstrap resamples of (X, y),
    drawn with replacement at the original size. Each fit predicts on the
    full X, and the spread of those predictions gives the two terms.

    The bias term is measured against the observed `y`, so any noise in `y`
    is folded into it. Predictions are made on the same points the resamples
    are drawn from, so this is an in-sample estimate.

    Args:
        model: Object exposing `fit(X, y)` and `predict(X)`. It is refitted
            in place on every iteration.
        X: Array-like of samples, indexable along the first axis.
        y: Array-like of targets aligned with `X`.
        n_iterations: Number of bootstrap resamples; must be at least 1.
        random_state: Optional seed for the resampling. When None, NumPy's
            global generator is used, so `np.random.seed` still applies.

    Returns:
        Tuple `(bias_squared, variance)`: the mean squared gap between the
        average prediction and `y`, and the mean per-sample prediction
        variance across resamples.

    Raises:
        ValueError: If `n_iterations` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Accept lists and other array-likes; fancy indexing below needs ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    # Keep the legacy global generator as the default so existing callers that
    # seed via np.random.seed(...) get the same draws as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    predictions = []
    for _ in range(n_iterations):
        # Sample row indices with replacement (bootstrap).
        indices = rng.choice(n_samples, n_samples, replace=True)
        model.fit(X[indices], y[indices])
        predictions.append(model.predict(X))
    predictions = np.array(predictions)

    # Squared bias: gap between the average prediction and the targets.
    bias_squared = np.mean((predictions.mean(axis=0) - y) ** 2)
    # Variance: how much predictions for each sample move across resamples.
    variance = np.mean(predictions.var(axis=0))
    return bias_squared, variance
# Estimate bias and variance for models of increasing complexity.
degrees_to_test = [1, 3, 10]
bias_values, variance_values = [], []
for degree in degrees_to_test:
    estimator = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    bias_sq, variance = bias_variance_decomposition(estimator, X, y)
    bias_values.append(bias_sq)
    variance_values.append(variance)
    print(f"次数={degree}: 偏差平方={bias_sq:.4f}, 方差={variance:.4f}")

# Total error shown on the plot is the element-wise sum of the two terms.
total_error = np.add(bias_values, variance_values)

# Visualize the bias-variance trade-off across the tested degrees.
plt.figure(figsize=(8, 6))
plt.plot(degrees_to_test, bias_values, 'bo-', label='偏差平方')
plt.plot(degrees_to_test, variance_values, 'rs-', label='方差')
plt.plot(degrees_to_test, total_error, 'g^-', label='总误差')
plt.xlabel('模型复杂度(多项式次数)')
plt.ylabel('误差')
plt.title('偏差-方差权衡')
plt.legend()
plt.grid(True)
plt.show()
理解过拟合和欠拟合是机器学习模型优化的关键。通过学习曲线、验证曲线和偏差-方差分析,我们可以诊断模型问题并采取相应的解决措施,构建泛化能力更强的模型。