🤖

回归分析实践详解

📂 ai ⏱ 4 min 606 words

回归分析实践详解

回归分析是机器学习中最基础的任务之一，通过建立自变量和因变量之间的关系来进行预测。

房价预测案例

数据准备

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns

# Build a synthetic house-price dataset; the fixed seed makes every run
# produce the same numbers.
np.random.seed(42)
n_samples = 500

# Features
area = np.random.randint(50, 300, n_samples)      # floor area, integers in [50, 300)
rooms = np.random.randint(1, 6, n_samples)        # number of rooms, integers in [1, 6)
age = np.random.randint(0, 30, n_samples)         # building age, integers in [0, 30)
distance = np.random.uniform(0.5, 10, n_samples)  # distance to the city centre

# Target (price): a linear combination of the features plus Gaussian noise
# with mean 0 and standard deviation 20000.
price = (area * 1000 +
         rooms * 50000 -
         age * 2000 -
         distance * 10000 +
         np.random.normal(0, 20000, n_samples))

# Assemble everything into one DataFrame; the last column is the target.
df = pd.DataFrame({
    '面积': area,
    '房间数': rooms,
    '房龄': age,
    '距离市中心': distance,
    '房价': price
})

print("房价数据集信息:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\n数据统计描述:")
print(df.describe())

数据探索

# Exploratory plots: one scatter panel per feature against the price.

# Matplotlib's default font has no CJK glyphs, so the Chinese labels used in
# the charts would render as empty boxes. Put whichever of these common CJK
# fonts is actually installed in front of the existing sans-serif list; when
# none is installed the configuration is left untouched.
from matplotlib import font_manager

_cjk_candidates = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti TC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Source Han Sans SC',
]
_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
_cjk_available = [name for name in _cjk_candidates if name in _installed_fonts]
if _cjk_available:
    plt.rcParams['font.sans-serif'] = (
        _cjk_available + list(plt.rcParams['font.sans-serif'])
    )
    # Many CJK fonts lack the Unicode minus sign (U+2212); fall back to the
    # ASCII hyphen so negative tick labels stay readable.
    plt.rcParams['axes.unicode_minus'] = False

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (column, x-axis label, panel title), filled row by row on the 2x2 grid.
panels = [
    ('面积', '面积 (平方米)', '面积 vs 房价'),
    ('房间数', '房间数', '房间数 vs 房价'),
    ('房龄', '房龄 (年)', '房龄 vs 房价'),
    ('距离市中心', '距离市中心 (km)', '距离 vs 房价'),
]
for ax, (column, xlabel, title) in zip(axes.flat, panels):
    ax.scatter(df[column], df['房价'], alpha=0.5)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('房价 (元)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation of every column with the price, largest value first.
print("\n特征相关性:")
print(df.corr()['房价'].sort_values(ascending=False))

模型训练与评估

数据预处理

# Separate the target column from the feature matrix.
y = df['房价']
X = df.drop(columns='房价')

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print(f"训练集大小: {len(X_train)}")
print(f"测试集大小: {len(X_test)}")

# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

多种回归模型比较

# Candidate regressors, keyed by the display name used in the report.
models = {
    '线性回归': LinearRegression(),
    '岭回归': Ridge(alpha=1.0),
    'Lasso回归': Lasso(alpha=1.0),
    '随机森林': RandomForestRegressor(n_estimators=100, random_state=42),
    '梯度提升': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Fit every candidate, score it on the held-out test split, and run 5-fold
# cross-validation (R²) on the training split. One metrics dict per model is
# collected in `results`.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Test-set error metrics.
    mse = mean_squared_error(y_test, y_pred)
    metrics = {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred),
    }

    # Cross-validated R² on the training data.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    metrics['CV_R²'] = cv_scores.mean()
    metrics['CV_std'] = cv_scores.std()

    results[name] = metrics

    print(f"\n{name}:")
    print(f"  MSE: {metrics['MSE']:.2f}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  MAE: {metrics['MAE']:.2f}")
    print(f"  R²: {metrics['R²']:.4f}")
    # The reported spread is two standard deviations of the fold scores.
    print(f"  交叉验证R²: {metrics['CV_R²']:.4f} (+/- {metrics['CV_std']*2:.4f})")

模型性能可视化

# Two charts side by side: R² (test vs. cross-validation) on the left,
# RMSE on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_r2, ax_rmse = axes

# Pull one list per metric out of `results`, all in the same model order.
model_names = list(results)
r2_scores, cv_r2_scores, rmse_scores = (
    [results[name][metric] for name in model_names]
    for metric in ('R²', 'CV_R²', 'RMSE')
)

# Grouped bars: each model gets two bars centred on its x position.
x = np.arange(len(model_names))
width = 0.35
half_width = width / 2

ax_r2.bar(x - half_width, r2_scores, width, label='测试集R²', color='skyblue')
ax_r2.bar(x + half_width, cv_r2_scores, width, label='交叉验证R²', color='lightcoral')
ax_r2.set_xlabel('模型')
ax_r2.set_ylabel('R²分数')
ax_r2.set_title('模型R²分数比较')
ax_r2.set_xticks(x)
ax_r2.set_xticklabels(model_names, rotation=45)
ax_r2.legend()
ax_r2.grid(True, alpha=0.3)

# RMSE per model as a plain bar chart.
ax_rmse.bar(model_names, rmse_scores, color='lightgreen', edgecolor='black')
ax_rmse.set_xlabel('模型')
ax_rmse.set_ylabel('RMSE')
ax_rmse.set_title('模型RMSE比较')
ax_rmse.tick_params(axis='x', rotation=45)
ax_rmse.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

最佳模型选择

超参数调优

from sklearn.model_selection import GridSearchCV

# Pick the model with the highest test-set R².
best_model_name = max(results, key=lambda name: results[name]['R²'])
print(f"最佳模型: {best_model_name}")

# Start from the already-fitted winner so that `best_model` is defined no
# matter which model won. Without this, any winner other than gradient
# boosting left `best_model` unbound and the feature-importance and
# model-saving steps further down raised NameError.
best_model = models[best_model_name]

# Hyperparameter tuning: a search grid is only defined for gradient boosting.
if best_model_name == '梯度提升':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }

    # Exhaustive search over the grid with 5-fold cross-validated R²,
    # using all available CPU cores.
    grid_search = GridSearchCV(
        GradientBoostingRegressor(random_state=42),
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1
    )

    grid_search.fit(X_train_scaled, y_train)

    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳交叉验证R²: {grid_search.best_score_:.4f}")

    # Replace the default with the tuned estimator and score it on the
    # held-out test split.
    best_model = grid_search.best_estimator_
    y_pred_best = best_model.predict(X_test_scaled)
    print(f"测试集R²: {r2_score(y_test, y_pred_best):.4f}")

特征重要性分析

# Feature importances (for tree-based models); skipped when the chosen model
# does not expose `feature_importances_`.
if hasattr(best_model, 'feature_importances_'):
    feature_names = X.columns
    feature_importance = best_model.feature_importances_

    # Feature positions ordered from most to least important.
    indices = np.argsort(feature_importance)[::-1]
    ranked_names = [feature_names[i] for i in indices]
    ranked_scores = feature_importance[indices]
    bar_positions = range(X.shape[1])

    # Bar chart of the importances in descending order.
    plt.figure(figsize=(10, 6))
    plt.bar(bar_positions, ranked_scores, align='center',
            color='lightblue', edgecolor='black')
    plt.xticks(bar_positions, ranked_names)
    plt.xlabel('特征')
    plt.ylabel('重要性')
    plt.title('特征重要性排序')
    plt.tight_layout()
    plt.show()

    # Same ranking as text.
    print("\n特征重要性排序:")
    for feature, score in zip(ranked_names, ranked_scores):
        print(f"{feature}: {score:.4f}")

模型部署

保存模型

import joblib

# Bundle the model with the fitted scaler and the training column names so
# that a single file holds everything needed at prediction time.
model_data = dict(
    model=best_model,
    scaler=scaler,
    feature_names=list(X.columns),
)

joblib.dump(model_data, 'house_price_model.pkl')
print("模型已保存")

# 预测函数
def predict_house_price(area, rooms, age, distance,
                        model_path='house_price_model.pkl'):
    """Predict the price of one house using a saved model bundle.

    Args:
        area: Floor area of the house.
        rooms: Number of rooms.
        age: Age of the building.
        distance: Distance to the city centre.
        model_path: Path of the joblib file to load. The file must contain a
            dict with a fitted ``'model'`` and ``'scaler'``, and may contain
            ``'feature_names'`` (the training column order).

    Returns:
        The model's prediction for the given house.

    Raises:
        KeyError: If the bundle lists a feature name that is not one of the
            four input columns built here.
    """
    # NOTE: joblib files are pickle-based, so loading one can execute
    # arbitrary code. Only pass paths to files you created or trust.
    model_data = joblib.load(model_path)

    # One-row frame using the same column names as the training data.
    input_data = pd.DataFrame({
        '面积': [area],
        '房间数': [rooms],
        '房龄': [age],
        '距离市中心': [distance]
    })

    # Put the columns in the order recorded at training time so the scaler
    # and the model receive each feature in the position they were fitted on.
    feature_names = model_data.get('feature_names')
    if feature_names is not None:
        input_data = input_data[feature_names]

    # Apply the training-time scaling, then predict; the model returns an
    # array with one entry per input row.
    input_scaled = model_data['scaler'].transform(input_data)
    return model_data['model'].predict(input_scaled)[0]

# Smoke-test the prediction helper with one example house.
sample_input = {'area': 100, 'rooms': 3, 'age': 5, 'distance': 2.0}
sample_prediction = predict_house_price(**sample_input)

print(f"\n示例预测:")
print(f"面积: 100平方米, 房间数: 3, 房龄: 5年, 距离: 2.0km")
print(f"预测房价: {sample_prediction:,.2f}元")

总结

通过这个完整的回归分析案例，我们掌握了：

- 数据探索和可视化
- 多种回归模型的比较
- 模型评估指标的选择
- 超参数调优
- 特征重要性分析
- 模型部署和预测

回归分析是机器学习的基础，掌握回归分析对于理解更复杂的模型至关重要。