正则化
正则化是防止机器学习模型过拟合的重要技术。通过在损失函数中添加惩罚项,正则化可以限制模型的复杂度,提高模型的泛化能力。
L2正则化(Ridge回归)
L2正则化通过添加权重的平方和作为惩罚项,使模型权重趋向于较小的值。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# Build a reproducible toy data set: 100 sorted inputs scaled by 10 and a
# sine target perturbed by Gaussian noise scaled by 0.5.
np.random.seed(42)
X = np.random.rand(100, 1) * 10
X.sort(axis=0)  # in-place sort along the sample axis; shape stays (100, 1)
y = np.sin(X).ravel() + np.random.randn(100) * 0.5
# Hold out 20% of the samples as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build high-degree polynomial features. The transformer is fitted on the
# training split only and then applied to both splits.
degree = 10
poly = PolynomialFeatures(degree=degree)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)
# Ridge regression for several alpha values.
# alpha == 0 is fitted with plain LinearRegression so the sweep includes an
# unregularized baseline; every other alpha uses Ridge.
# NOTE(review): the polynomial features are not standardized before the
# penalized fit -- consider whether a scaler step is wanted here.
alphas = [0, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores = []
test_scores = []
coefficients = []

for alpha in alphas:
    regressor = LinearRegression() if alpha == 0 else Ridge(alpha=alpha)
    model = make_pipeline(PolynomialFeatures(degree), regressor)
    model.fit(X_train, y_train)

    train_mse = mean_squared_error(y_train, model.predict(X_train))
    test_mse = mean_squared_error(y_test, model.predict(X_test))
    train_scores.append(train_mse)
    test_scores.append(test_mse)

    # Collect the coefficients of the final pipeline step (the regressor),
    # whichever estimator class it is.
    coef = model.steps[-1][1].coef_
    coefficients.append(coef)
# Plot the error curves (left) and the first few coefficient paths (right).
# `alphas` contains 0, which a pure log axis cannot display, so a symlog axis
# is used: linear up to the smallest positive alpha, logarithmic beyond it.
smallest_positive_alpha = min(a for a in alphas if a > 0)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(alphas, train_scores, 'b-o', label='训练误差')
plt.plot(alphas, test_scores, 'r-o', label='测试误差')
plt.xscale('symlog', linthresh=smallest_positive_alpha)
plt.xlabel('alpha (log scale)')
plt.ylabel('MSE')
plt.title('Ridge回归:alpha与误差的关系')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
# One row per alpha, one column per polynomial coefficient.
coef_matrix = np.array(coefficients)
# Draw at most the first five coefficient columns.
for i, coef_path in enumerate(coef_matrix.T[:5]):
    plt.plot(alphas, coef_path, label=f'系数 {i}')
plt.xscale('symlog', linthresh=smallest_positive_alpha)
plt.xlabel('alpha (log scale)')
plt.ylabel('系数值')
plt.title('Ridge回归:alpha与系数的关系')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()
L1正则化(Lasso回归)
L1正则化通过添加权重的绝对值之和作为惩罚项,可以产生稀疏解,实现特征选择。
# Lasso regression for several alpha values: record train/test MSE and the
# number of non-zero coefficients for each fit.
alphas_lasso = [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
train_scores_lasso = []
test_scores_lasso = []
coef_counts = []

for alpha in alphas_lasso:
    model = make_pipeline(
        PolynomialFeatures(degree),
        Lasso(alpha=alpha, max_iter=10000),
    )
    model.fit(X_train, y_train)

    train_mse = mean_squared_error(y_train, model.predict(X_train))
    test_mse = mean_squared_error(y_test, model.predict(X_test))
    train_scores_lasso.append(train_mse)
    test_scores_lasso.append(test_mse)

    # Count the coefficients that are not exactly zero.
    coef = model.named_steps['lasso'].coef_
    coef_counts.append(np.sum(coef != 0))
# Plot the Lasso results: error curves on the left, sparsity on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
error_ax, count_ax = axes

error_ax.plot(alphas_lasso, train_scores_lasso, 'b-o', label='训练误差')
error_ax.plot(alphas_lasso, test_scores_lasso, 'r-o', label='测试误差')
error_ax.set(
    xscale='log',
    xlabel='alpha (log scale)',
    ylabel='MSE',
    title='Lasso回归:alpha与误差的关系',
)
error_ax.legend()
error_ax.grid(True)

count_ax.plot(alphas_lasso, coef_counts, 'g-o')
count_ax.set(
    xscale='log',
    xlabel='alpha (log scale)',
    ylabel='非零系数数量',
    title='Lasso回归:alpha与特征选择',
)
count_ax.grid(True)

plt.tight_layout()
plt.show()

# Report the alpha with the lowest test MSE.
# NOTE(review): the selection is made on the test split itself.
best_idx = np.argmin(test_scores_lasso)
print(f"最佳alpha: {alphas_lasso[best_idx]}")
print(f"非零系数数量: {coef_counts[best_idx]}")
print(f"测试MSE: {test_scores_lasso[best_idx]:.4f}")
Elastic Net
Elastic Net结合了L1和L2正则化的优点,通过l1_ratio参数控制两者的比例。
# Elastic Net: fit one model per (alpha, l1_ratio) pair and record its test
# MSE and number of non-zero coefficients.
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
alphas_en = [0.01, 0.1, 1.0, 10.0]
results = []

for alpha in alphas_en:
    for l1_ratio in l1_ratios:
        model = make_pipeline(
            PolynomialFeatures(degree),
            ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10000),
        )
        model.fit(X_train, y_train)

        test_mse = mean_squared_error(y_test, model.predict(X_test))
        coef = model.named_steps['elasticnet'].coef_
        n_nonzero = np.sum(coef != 0)

        results.append({
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'test_mse': test_mse,
            'n_nonzero': n_nonzero,
        })

# Tabulate the grid and pick the row with the lowest test MSE.
# pandas (`pd`) must be imported before this point.
results_df = pd.DataFrame(results)
best_params = results_df.loc[results_df['test_mse'].idxmin()]
print("Elastic Net最佳参数:")
print(best_params)
# Visualize the Elastic Net grid: test MSE against l1_ratio for selected
# alphas (left) and against alpha for selected l1_ratios (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for alpha in [0.1, 1.0]:
    # Rows of the grid that were fitted with this alpha.
    alpha_results = results_df[results_df['alpha'] == alpha]
    axes[0].plot(
        alpha_results['l1_ratio'],
        alpha_results['test_mse'],
        'o-',
        label=f'alpha={alpha}',
    )
axes[0].set_xlabel('l1_ratio')
axes[0].set_ylabel('测试MSE')
axes[0].set_title('Elastic Net: l1_ratio与误差的关系')
axes[0].legend()
axes[0].grid(True)

for l1_ratio in [0.3, 0.5, 0.7]:
    # Rows of the grid that were fitted with this l1_ratio.
    ratio_results = results_df[results_df['l1_ratio'] == l1_ratio]
    axes[1].plot(
        ratio_results['alpha'],
        ratio_results['test_mse'],
        'o-',
        label=f'l1_ratio={l1_ratio}',
    )
axes[1].set_xscale('log')
axes[1].set_xlabel('alpha (log scale)')
axes[1].set_ylabel('测试MSE')
axes[1].set_title('Elastic Net: alpha与误差的关系')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()
早停法
早停法在训练过程中监控验证集性能,当性能不再提升时停止训练。
import time
from sklearn.neural_network import MLPRegressor
# Configure a two-hidden-layer MLP regressor with early stopping enabled.
mlp_settings = {
    'hidden_layer_sizes': (100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'learning_rate_init': 0.001,
    'max_iter': 1000,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_iter_no_change': 10,
    'random_state': 42,
}
mlp = MLPRegressor(**mlp_settings)

# Train the network, then report the iteration count and train/test MSE.
mlp.fit(X_train, y_train)

mlp_train_mse = mean_squared_error(y_train, mlp.predict(X_train))
mlp_test_mse = mean_squared_error(y_test, mlp.predict(X_test))
print(f"训练轮数: {mlp.n_iter_}")
print(f"训练MSE: {mlp_train_mse:.4f}")
print(f"测试MSE: {mlp_test_mse:.4f}")
# Plot the training loss curve and, when available, the validation scores.
plt.figure(figsize=(10, 6))
plt.plot(mlp.loss_curve_, label='训练损失')
# The attribute can exist but be None (scikit-learn documents it as None when
# early stopping is off), so a hasattr check alone is not sufficient.
validation_scores = getattr(mlp, 'validation_scores_', None)
if validation_scores is not None:
    plt.plot(validation_scores, label='验证分数')
plt.xlabel('迭代次数')
plt.ylabel('损失/分数')
plt.title('神经网络训练曲线(早停法)')
plt.legend()
plt.grid(True)
plt.show()
正则化参数选择
from sklearn.model_selection import GridSearchCV
# Choose the Ridge penalty strength by 5-fold cross-validated grid search.
param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
model_ridge = make_pipeline(PolynomialFeatures(degree), Ridge())

grid_search = GridSearchCV(
    estimator=model_ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# The scorer is negated MSE, so flip the sign back before printing.
best_alpha = grid_search.best_params_['ridge__alpha']
best_cv_mse = -grid_search.best_score_
print(f"最佳alpha: {best_alpha}")
print(f"最佳交叉验证分数: {best_cv_mse:.4f}")
# Visualize the grid-search outcome: mean CV MSE per alpha with error bars
# taken from the per-fold standard deviation.
results = grid_search.cv_results_
alphas = param_grid['ridge__alpha']
mean_scores = np.negative(results['mean_test_score'])  # scores are negated MSE
std_scores = results['std_test_score']

plt.figure(figsize=(8, 6))
cv_ax = plt.gca()
cv_ax.errorbar(alphas, mean_scores, yerr=std_scores, fmt='o-', capsize=5)
cv_ax.set(
    xscale='log',
    xlabel='alpha (log scale)',
    ylabel='MSE',
    title='Ridge回归:GridSearchCV结果',
)
cv_ax.grid(True)
plt.show()
正则化技术对比
# Candidate regressors to compare; each one is wrapped in the same
# polynomial-feature pipeline below.
regressors = {
    '无正则化': LinearRegression(),
    'Ridge (L2)': Ridge(alpha=1.0),
    'Lasso (L1)': Lasso(alpha=0.01, max_iter=10000),
    'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000),
}
models = {
    label: make_pipeline(PolynomialFeatures(degree), regressor)
    for label, regressor in regressors.items()
}
# For every candidate pipeline: 5-fold CV on the training split, a final fit,
# the held-out test MSE, and the number of non-zero coefficients.
results_compare = []
for name, model in models.items():
    # Cross-validation (scores are negated MSE).
    cv_scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )

    # Fit on the full training split.
    model.fit(X_train, y_train)

    # Error on the held-out test split.
    test_mse = mean_squared_error(y_test, model.predict(X_test))

    # Read the coefficients from the final pipeline step. This avoids a
    # per-class if/elif chain that could leave `coef` unbound (or stale from
    # a previous iteration) if no branch matched.
    coef = model.steps[-1][1].coef_

    results_compare.append({
        '模型': name,
        '交叉验证MSE': -cv_scores.mean(),
        '测试MSE': test_mse,
        '非零系数': np.sum(coef != 0),
    })
# Print the comparison table without the index column.
import pandas as pd
df_compare = pd.DataFrame(results_compare)
print("正则化方法对比:", df_compare.to_string(index=False), sep="\n")
正则化是机器学习中不可或缺的技术。通过合理选择正则化方法和参数,可以有效防止过拟合,提高模型的泛化能力。在实际应用中,需要根据问题特点和数据特性选择合适的正则化策略。