回归分析实践详解
回归分析实践详解
回归分析是机器学习中最基础的任务之一：通过建立自变量（特征）与因变量（目标）之间的关系，对连续型数值进行预测。
房价预测案例
数据准备
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
# Build a synthetic house-price dataset (fixed seed so every run is identical).
np.random.seed(42)
n_samples = 500

# Features. The order of these draws determines the generated values under
# the fixed seed, so it must stay as is.
area = np.random.randint(50, 300, n_samples)      # floor area, integers in [50, 300)
rooms = np.random.randint(1, 6, n_samples)        # number of rooms, integers in [1, 6)
age = np.random.randint(0, 30, n_samples)         # building age, integers in [0, 30)
distance = np.random.uniform(0.5, 10, n_samples)  # distance to the city centre

# Target: a linear combination of the features plus Gaussian noise
# (mean 0, standard deviation 20000).
price = (
    area * 1000
    + rooms * 50000
    - age * 2000
    - distance * 10000
    + np.random.normal(0, 20000, n_samples)
)

# Collect everything in one DataFrame; later sections refer to these column
# names, so they are part of the script's contract.
df = pd.DataFrame({
    '面积': area,
    '房间数': rooms,
    '房龄': age,
    '距离市中心': distance,
    '房价': price,
})

print("房价数据集信息:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n数据统计描述:")
print(df.describe())
数据探索
# Exploratory plots: one scatter panel per feature against the price.
# Each entry is (feature column, x-axis label, panel title).
panel_specs = [
    ('面积', '面积 (平方米)', '面积 vs 房价'),
    ('房间数', '房间数', '房间数 vs 房价'),
    ('房龄', '房龄 (年)', '房龄 vs 房价'),
    ('距离市中心', '距离市中心 (km)', '距离 vs 房价'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# axes.flat walks the 2x2 grid row by row, which matches the order of
# panel_specs: top-left, top-right, bottom-left, bottom-right.
for ax, (column, x_label, title) in zip(axes.flat, panel_specs):
    ax.scatter(df[column], df['房价'], alpha=0.5)
    ax.set_xlabel(x_label)
    ax.set_ylabel('房价 (元)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation of every column with the price, largest value first.
print("\n特征相关性:")
price_corr = df.corr()['房价']
print(price_corr.sort_values(ascending=False))
模型训练与评估
数据预处理
# Separate the target column from the feature matrix.
y = df['房价']
X = df.drop(columns='房价')

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"训练集大小: {X_train.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
多种回归模型比较
# Candidate regressors, keyed by the display name used in reports and charts.
models = {
    '线性回归': LinearRegression(),
    '岭回归': Ridge(alpha=1.0),
    'Lasso回归': Lasso(alpha=1.0),
    '随机森林': RandomForestRegressor(n_estimators=100, random_state=42),
    '梯度提升': GradientBoostingRegressor(n_estimators=100, random_state=42),
}


def _evaluate(estimator):
    """Fit *estimator* on the scaled training data and score it.

    Returns a dict with the test-set MSE, RMSE, MAE and R², plus the mean and
    standard deviation of the 5-fold cross-validated R² on the training data.
    The estimator is fitted in place.
    """
    estimator.fit(X_train_scaled, y_train)
    predictions = estimator.predict(X_test_scaled)

    # Test-set metrics.
    test_mse = mean_squared_error(y_test, predictions)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test, predictions)
    test_r2 = r2_score(y_test, predictions)

    # 5-fold cross-validation on the training split only.
    fold_r2 = cross_val_score(
        estimator, X_train_scaled, y_train, cv=5, scoring='r2'
    )

    return {
        'MSE': test_mse,
        'RMSE': test_rmse,
        'MAE': test_mae,
        'R²': test_r2,
        'CV_R²': fold_r2.mean(),
        'CV_std': fold_r2.std(),
    }


# Train every candidate, keep its metrics and print a short report.
results = {}
for name, model in models.items():
    metrics = _evaluate(model)
    results[name] = metrics

    print(f"\n{name}:")
    print(f" MSE: {metrics['MSE']:.2f}")
    print(f" RMSE: {metrics['RMSE']:.2f}")
    print(f" MAE: {metrics['MAE']:.2f}")
    print(f" R²: {metrics['R²']:.4f}")
    # The reported spread is two standard deviations of the fold scores.
    print(f" 交叉验证R²: {metrics['CV_R²']:.4f} (+/- {metrics['CV_std']*2:.4f})")
模型性能可视化
# Side-by-side bar charts comparing the evaluated models.
model_names = list(results)


def _metric_column(key):
    """Return the values of metric *key* for every model, in model_names order."""
    return [results[name][key] for name in model_names]


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
r2_ax, rmse_ax = axes

# Left panel: grouped bars, test-set R² next to cross-validated R².
positions = np.arange(len(model_names))
bar_width = 0.35
half_width = bar_width / 2
r2_ax.bar(positions - half_width, _metric_column('R²'), bar_width,
          label='测试集R²', color='skyblue')
r2_ax.bar(positions + half_width, _metric_column('CV_R²'), bar_width,
          label='交叉验证R²', color='lightcoral')
r2_ax.set_xlabel('模型')
r2_ax.set_ylabel('R²分数')
r2_ax.set_title('模型R²分数比较')
r2_ax.set_xticks(positions)
r2_ax.set_xticklabels(model_names, rotation=45)
r2_ax.legend()
r2_ax.grid(True, alpha=0.3)

# Right panel: test-set RMSE per model.
rmse_ax.bar(model_names, _metric_column('RMSE'),
            color='lightgreen', edgecolor='black')
rmse_ax.set_xlabel('模型')
rmse_ax.set_ylabel('RMSE')
rmse_ax.set_title('模型RMSE比较')
rmse_ax.tick_params(axis='x', rotation=45)
rmse_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
最佳模型选择
超参数调优
from sklearn.model_selection import GridSearchCV
# Pick the model with the highest test-set R² from the comparison above.
best_model_name = max(results, key=lambda x: results[x]['R²'])
print(f"最佳模型: {best_model_name}")

# Hyperparameter tuning. A search grid is only defined for gradient boosting;
# any other winner is used as already fitted during the comparison.
if best_model_name == '梯度提升':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
    }
    grid_search = GridSearchCV(
        GradientBoostingRegressor(random_state=42),
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)
    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳交叉验证R²: {grid_search.best_score_:.4f}")
    # With the default refit=True, best_estimator_ is the best configuration
    # refitted on the whole training set.
    best_model = grid_search.best_estimator_
else:
    # Fallback: without this branch best_model would be undefined whenever a
    # model other than gradient boosting wins, and the feature-importance and
    # model-saving sections below would fail with NameError.
    best_model = models[best_model_name]

# Score the selected model on the held-out test set.
y_pred_best = best_model.predict(X_test_scaled)
print(f"测试集R²: {r2_score(y_test, y_pred_best):.4f}")
特征重要性分析
# Feature importances are only exposed by some estimators (e.g. tree-based
# models), so the whole section is skipped when the attribute is missing.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = best_model.feature_importances_
    feature_names = X.columns
    n_features = X.shape[1]

    # Feature positions ordered from most to least important.
    indices = np.argsort(feature_importance)[::-1]
    ranked_names = [feature_names[i] for i in indices]
    ranked_scores = feature_importance[indices]

    # Bar chart of the ranked importances.
    plt.figure(figsize=(10, 6))
    plt.bar(range(n_features), ranked_scores, align='center',
            color='lightblue', edgecolor='black')
    plt.xticks(range(n_features), ranked_names)
    plt.xlabel('特征')
    plt.ylabel('重要性')
    plt.title('特征重要性排序')
    plt.tight_layout()
    plt.show()

    # Same ranking as text.
    print("\n特征重要性排序:")
    for feature, score in zip(ranked_names, ranked_scores):
        print(f"{feature}: {score:.4f}")
模型部署
保存模型
import joblib
# Bundle the selected model with its scaler and the training column order, so
# that a single file holds everything needed to make predictions later.
model_data = dict(
    model=best_model,
    scaler=scaler,
    feature_names=list(X.columns),
)
joblib.dump(model_data, 'house_price_model.pkl')
print("模型已保存")
# Prediction helper
def predict_house_price(area, rooms, age, distance,
                        model_path='house_price_model.pkl'):
    """Predict the price of a single house with a saved model bundle.

    Args:
        area: Value for the '面积' (floor area) feature.
        rooms: Value for the '房间数' (number of rooms) feature.
        age: Value for the '房龄' (building age) feature.
        distance: Value for the '距离市中心' (distance to city centre) feature.
        model_path: Path of the bundle written with ``joblib.dump``. The
            bundle is a dict with the keys 'model' and 'scaler' and,
            optionally, 'feature_names'.

    Returns:
        The model's prediction for the given house.

    Raises:
        KeyError: If the bundle lists a feature name that is not one of the
            four columns built here.
    """
    # NOTE(security): joblib files are pickle-based, so loading one can run
    # arbitrary code. Only load bundles from a trusted source.
    model_data = joblib.load(model_path)

    # One-row frame with the same column names that were used for training.
    input_data = pd.DataFrame({
        '面积': [area],
        '房间数': [rooms],
        '房龄': [age],
        '距离市中心': [distance],
    })

    # The scaler works on column positions, so enforce the column order that
    # was stored with the model instead of relying on the literal above.
    feature_names = model_data.get('feature_names')
    if feature_names is not None:
        input_data = input_data[feature_names]

    # Apply the training-time scaling, then predict.
    input_scaled = model_data['scaler'].transform(input_data)
    return model_data['model'].predict(input_scaled)[0]
# Smoke-test the prediction helper on one example house.
sample_house = {'area': 100, 'rooms': 3, 'age': 5, 'distance': 2.0}
sample_prediction = predict_house_price(**sample_house)

print("\n示例预测:")
print("面积: 100平方米, 房间数: 3, 房龄: 5年, 距离: 2.0km")
print(f"预测房价: {sample_prediction:,.2f}元")
总结
通过这个完整的回归分析案例，我们掌握了：
- 数据探索和可视化
- 多种回归模型的比较
- 模型评估指标的选择
- 超参数调优
- 特征重要性分析
- 模型部署和预测
回归分析是机器学习的基础，掌握它对于理解更复杂的模型至关重要。