模型选择与比较详解
模型选择与比较详解
模型选择是机器学习中的关键步骤,通过比较不同算法的性能来选择最适合特定问题的模型。
模型选择原则
选择考虑因素
- 问题类型:分类、回归、聚类等
- 数据规模:样本数量和特征维度
- 计算资源:训练时间和内存需求
- 可解释性:模型的可解释程度
- 过拟合风险:模型的泛化能力
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import time
# Synthetic classification dataset: 1000 samples with 20 features, of which
# 10 are informative and 5 are redundant. The seed is fixed for repeatability.
X_cls, y_cls = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

# Synthetic regression dataset of the same size, generated with noise=0.1.
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=20,
    noise=0.1,
    random_state=42,
)

# Report the shape of each feature matrix.
print(f"分类数据: {X_cls.shape}")
print(f"回归数据: {X_reg.shape}")
分类模型比较
常用分类算法
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# Candidate classifiers, keyed by the display name printed in the comparison
# table. Estimators that accept a seed are pinned to random_state=42.
classification_models = {
    '逻辑回归': LogisticRegression(max_iter=1000, random_state=42),
    '决策树': DecisionTreeClassifier(random_state=42),
    '随机森林': RandomForestClassifier(n_estimators=100, random_state=42),
    '梯度提升': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier(),
    '朴素贝叶斯': GaussianNB(),
}

# Hold out 20% of the classification data as a test split.
(X_train_cls, X_test_cls, y_train_cls, y_test_cls) = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler_cls = StandardScaler().fit(X_train_cls)
X_train_cls_scaled = scaler_cls.transform(X_train_cls)
X_test_cls_scaled = scaler_cls.transform(X_test_cls)
# Benchmark every candidate classifier: fit time, hold-out accuracy and
# 5-fold cross-validated accuracy. Results are printed as a table and also
# collected in `classification_results` keyed by model name.
from sklearn.pipeline import make_pipeline

print("分类模型比较:")
print("-" * 70)
print(f"{'模型':<15} {'训练时间':<12} {'准确率':<10} {'交叉验证':<15}")
print("-" * 70)

classification_results = {}
for name, model in classification_models.items():
    # Time the fit with perf_counter(): it is monotonic and high-resolution,
    # whereas time.time() can be too coarse for sub-millisecond fits and can
    # jump when the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(X_train_cls_scaled, y_train_cls)
    training_time = time.perf_counter() - start_time

    # Accuracy of the fitted model on the held-out test split.
    y_pred_cls = model.predict(X_test_cls_scaled)
    accuracy = accuracy_score(y_test_cls, y_pred_cls)

    # Cross-validate on the *unscaled* training split with the scaler inside
    # the pipeline, so every fold's scaler is fitted on that fold's training
    # part only and no validation-fold statistics leak into preprocessing.
    # cross_val_score clones the estimator it is given, so the fitted `model`
    # used above is left untouched.
    cv_scores = cross_val_score(
        make_pipeline(StandardScaler(), model),
        X_train_cls,
        y_train_cls,
        cv=5,
        scoring='accuracy',
    )

    classification_results[name] = {
        'training_time': training_time,
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
    }

    # The printed "+/-" band is two standard deviations of the fold scores.
    print(f"{name:<15} {training_time:<12.4f} {accuracy:<10.4f} {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
分类性能可视化
# Two-panel figure for the classifier benchmark: accuracy on the left,
# training time on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Extract the plotted series from the results table (dict order is kept, so
# every series lines up with `model_names`).
model_names = list(classification_results)
accuracies = [res['accuracy'] for res in classification_results.values()]
cv_means = [res['cv_mean'] for res in classification_results.values()]
training_times = [res['training_time'] for res in classification_results.values()]

# Left panel: grouped bars, test-set accuracy next to cross-validated accuracy.
x = np.arange(len(model_names))
width = 0.35
axes[0].bar(x - width / 2, accuracies, width, label='测试集准确率', color='skyblue')
axes[0].bar(x + width / 2, cv_means, width, label='交叉验证准确率', color='lightcoral')
axes[0].set(xlabel='模型', ylabel='准确率', title='分类模型准确率比较')
axes[0].set_xticks(x)
axes[0].set_xticklabels(model_names, rotation=45)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: one bar per model showing the measured training time.
axes[1].bar(model_names, training_times, color='lightgreen', edgecolor='black')
axes[1].set(xlabel='模型', ylabel='训练时间 (秒)', title='分类模型训练时间比较')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
回归模型比较
常用回归算法
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
# Candidate regressors, keyed by the display name printed in the comparison
# table. Estimators that are given a seed are pinned to random_state=42.
regression_models = {
    '线性回归': LinearRegression(),
    'Ridge回归': Ridge(alpha=1.0),
    'Lasso回归': Lasso(alpha=1.0),
    '随机森林': RandomForestRegressor(n_estimators=100, random_state=42),
    '梯度提升': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(),
    'KNN回归': KNeighborsRegressor(),
}

# Hold out 20% of the regression data as a test split.
(X_train_reg, X_test_reg, y_train_reg, y_test_reg) = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler_reg = StandardScaler().fit(X_train_reg)
X_train_reg_scaled = scaler_reg.transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)
# Benchmark every candidate regressor: fit time, test-set MSE and R^2.
# Results are printed as a table and also collected in `regression_results`
# keyed by model name.
print("\n回归模型比较:")
print("-" * 70)
print(f"{'模型':<15} {'训练时间':<12} {'MSE':<12} {'R²':<10}")
print("-" * 70)

regression_results = {}
for name, model in regression_models.items():
    # Time the fit with perf_counter(): it is monotonic and high-resolution,
    # whereas time.time() can be too coarse for sub-millisecond fits and can
    # jump when the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(X_train_reg_scaled, y_train_reg)
    training_time = time.perf_counter() - start_time

    # Score the fitted model on the held-out test split.
    y_pred_reg = model.predict(X_test_reg_scaled)
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)

    regression_results[name] = {
        'training_time': training_time,
        'mse': mse,
        'r2': r2,
    }

    print(f"{name:<15} {training_time:<12.4f} {mse:<12.4f} {r2:<10.4f}")
回归性能可视化
# Two-panel figure for the regressor benchmark: R^2 on the left, MSE on the
# right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Extract the plotted series from the results table (dict order is kept, so
# every series lines up with `model_names`).
model_names = list(regression_results)
r2_scores = [res['r2'] for res in regression_results.values()]
mse_scores = [res['mse'] for res in regression_results.values()]

# Left panel: R^2 score per model.
axes[0].bar(model_names, r2_scores, color='skyblue', edgecolor='black')
axes[0].set(xlabel='模型', ylabel='R²分数', title='回归模型R²分数比较')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3)

# Right panel: MSE per model, drawn on a logarithmic y-axis.
axes[1].bar(model_names, mse_scores, color='lightcoral', edgecolor='black')
axes[1].set(xlabel='模型', ylabel='MSE', title='回归模型MSE比较')
axes[1].tick_params(axis='x', rotation=45)
axes[1].set_yscale('log')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
超参数调优
网格搜索
# Hyper-parameter tuning, using the random forest classifier as the example.
# The grid spans 3 x 4 x 3 = 36 combinations, each scored by accuracy with
# 5-fold cross-validation.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
}

grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train_cls_scaled, y_train_cls)

# Report the winning parameter combination and its cross-validated score.
print(f"最佳参数: {grid_search_rf.best_params_}")
print(f"最佳交叉验证准确率: {grid_search_rf.best_score_:.4f}")

# Evaluate the best estimator found by the search on the held-out test split.
best_rf = grid_search_rf.best_estimator_
y_pred_best = best_rf.predict(X_test_cls_scaled)
print(f"测试集准确率: {accuracy_score(y_test_cls, y_pred_best):.4f}")
模型解释
特征重要性分析
# Feature-importance analysis helper.
def plot_feature_importance(model, feature_names, title="特征重要性"):
    """Plot a model's feature importances as a descending bar chart.

    Args:
        model: Object to inspect. Only its ``feature_importances_`` attribute
            is read; it is sorted with ``np.argsort`` and indexed with the
            resulting index array.
        feature_names: Labels indexed by feature position.
        title: Title placed on the figure.

    Returns:
        A list of ``(feature_name, importance)`` pairs ordered from the
        largest importance to the smallest, or ``None`` (after printing a
        message) when ``model`` has no ``feature_importances_`` attribute.
    """
    # Guard clause: nothing to plot for models without importances.
    if not hasattr(model, 'feature_importances_'):
        print("该模型没有特征重要性属性")
        return None

    scores = model.feature_importances_
    order = np.argsort(scores)[::-1]  # feature indices, most important first
    positions = range(len(scores))

    plt.figure(figsize=(10, 6))
    plt.bar(
        positions,
        scores[order],
        align='center',
        color='lightblue',
        edgecolor='black',
    )
    # Tick labels follow the same descending order as the bars.
    ranked_names = [feature_names[idx] for idx in order]
    plt.xticks(positions, ranked_names, rotation=45)
    plt.xlabel('特征')
    plt.ylabel('重要性')
    plt.title(title)
    plt.tight_layout()
    plt.show()

    # Pair each label with its importance, most important first.
    return list(zip(ranked_names, scores[order]))
# Plot the tuned random forest's feature importances, labelling the features
# by their column position in X_cls.
feature_names = [f'特征{i}' for i in range(X_cls.shape[1])]
important_features = plot_feature_importance(
    best_rf,
    feature_names,
    "随机森林特征重要性",
)

# List the five highest-ranked features; nothing is listed when the helper
# returned None or an empty list.
print("\n前5个重要特征:")
for name, importance in (important_features or [])[:5]:
    print(f"{name}: {importance:.4f}")
模型选择最佳实践
- 理解问题:明确问题类型和业务需求
- 数据探索:了解数据特点和分布
- 基线模型:从简单模型开始
- 交叉验证:使用交叉验证评估模型
- 超参数调优:优化模型参数
- 模型解释:理解模型决策过程
模型选择是机器学习项目中的关键步骤,掌握模型选择方法有助于构建更有效的机器学习系统。