🤖

特征选择技术详解

📂 ai ⏱ 3 min 559 words

特征选择技术详解

特征选择是从原始特征中选择最相关特征的过程，可以提高模型性能、减少过拟合和加快训练速度。

特征选择方法

1. 过滤法（Filter Methods）

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Generate a synthetic binary-classification dataset: 1000 samples and 20
# features, of which 10 are informative and 5 are redundant.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dataset dimensions.
print(f"原始特征数量: {X.shape[1]}")
print(f"训练集大小: {X_train.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")

方差阈值过滤

from sklearn.feature_selection import VarianceThreshold

# Variance-threshold filter: fit on the training data with a threshold of 0.1,
# then keep only the columns the fitted selector retains.
var_selector = VarianceThreshold(threshold=0.1)
var_selector.fit(X_train)
X_var = var_selector.transform(X_train)

# Column indices that survived the filter (positions of True in the mask).
selected_features_var = np.flatnonzero(var_selector.get_support())
print(f"方差阈值过滤后保留的特征: {len(selected_features_var)}")
print(f"保留的特征索引: {selected_features_var}")

单变量特征选择

# Compare three univariate scoring functions on the same training data.
scoring_functions = {
    'F-value': f_classif,
    '卡方': chi2,
    '互信息': mutual_info_classif
}

# chi2 only accepts non-negative features and raises ValueError otherwise.
# The synthetic features used here take negative values, so build a copy of
# the training data rescaled to [0, 1] and feed that copy to chi2 only.  The
# other scorers keep working on the raw training data.
from sklearn.preprocessing import MinMaxScaler

X_train_nonneg = MinMaxScaler().fit_transform(X_train)

# One subplot per scoring function.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for idx, (name, func) in enumerate(scoring_functions.items()):
    # Pick the input matrix the scorer can actually handle.
    X_input = X_train_nonneg if func is chi2 else X_train

    # Keep the 10 highest-scoring features.
    selector = SelectKBest(score_func=func, k=10)
    X_selected = selector.fit_transform(X_input, y_train)

    # Per-feature scores computed during fit.
    scores = selector.scores_

    # Bar chart of every feature's score.
    ax = axes[idx]
    ax.bar(range(len(scores)), scores, color='skyblue', edgecolor='black')
    ax.set_xlabel('特征索引')
    ax.set_ylabel('分数')
    ax.set_title(f'{name}评分')
    ax.grid(True, alpha=0.3)

    # Overlay red markers on the features that were selected.
    selected_indices = np.where(selector.get_support())[0]
    ax.scatter(selected_indices, scores[selected_indices],
               c='red', s=100, zorder=5, label='选中特征')
    ax.legend()

plt.tight_layout()
plt.show()

2. 包装法（Wrapper Methods）

from sklearn.feature_selection import RFE, RFECV

# Recursive feature elimination (RFE) driven by a random forest: step=1 and
# n_features_to_select=10 configure the elimination schedule and final size.
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    n_features_to_select=10,
    step=1,
)
X_rfe = rfe.fit_transform(X_train, y_train)

# Indices of the features RFE kept, followed by the rank of every feature.
selected_features_rfe = np.flatnonzero(rfe.get_support())
print(f"RFE选中的特征: {len(selected_features_rfe)}")
print(f"特征排名: {rfe.ranking_}")

# Bar chart of the rank assigned to each feature (rank 1 = selected).
rank_fig, rank_ax = plt.subplots(figsize=(10, 6))
rank_ax.bar(
    range(len(rfe.ranking_)),
    rfe.ranking_,
    color='lightcoral',
    edgecolor='black',
)
rank_ax.set_xlabel('特征索引')
rank_ax.set_ylabel('排名 (1=最佳)')
rank_ax.set_title('RFE特征排名')
rank_ax.grid(True, alpha=0.3)
plt.show()

# RFE with 5-fold cross-validation: the number of features to keep is chosen
# by cross-validated accuracy instead of being fixed up front.
rfecv = RFECV(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    step=1,
    cv=5,
    scoring='accuracy',
    min_features_to_select=1,
)
X_rfecv = rfecv.fit_transform(X_train, y_train)

print(f"最佳特征数量: {rfecv.n_features_}")
print(f"交叉验证分数: {rfecv.cv_results_['mean_test_score']}")

# Mean CV accuracy per candidate feature count.  The x axis starts at 1 and
# advances by 1, matching min_features_to_select=1 and step=1 above.
mean_scores = rfecv.cv_results_['mean_test_score']
feature_counts_cv = range(1, len(mean_scores) + 1)

cv_fig, cv_ax = plt.subplots(figsize=(10, 6))
cv_ax.plot(feature_counts_cv, mean_scores, 'b-o')
cv_ax.set_xlabel('特征数量')
cv_ax.set_ylabel('交叉验证准确率')
cv_ax.set_title('RFECV：特征数量对性能的影响')
# Dashed vertical line marks the selected number of features.
cv_ax.axvline(
    x=rfecv.n_features_,
    color='r',
    linestyle='--',
    label=f'最佳特征数量: {rfecv.n_features_}',
)
cv_ax.legend()
cv_ax.grid(True, alpha=0.3)
plt.show()

3. 嵌入法（Embedded Methods）

from sklearn.feature_selection import SelectFromModel

# Embedded selection: fit a random forest and keep the features that pass the
# 'mean' importance threshold.
sfm_rf = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='mean',
)
X_sfm_rf = sfm_rf.fit_transform(X_train, y_train)

# Indices of the features that passed the threshold.
selected_features_sfm = np.flatnonzero(sfm_rf.get_support())
print(f"基于随机森林选中的特征: {len(selected_features_sfm)}")

# Importances from the fitted forest, ordered from most to least important.
importances = sfm_rf.estimator_.feature_importances_
indices = np.argsort(importances)[::-1]

# Bar chart of the importances in descending order, labelled by feature index.
bar_positions = range(X_train.shape[1])
imp_fig, imp_ax = plt.subplots(figsize=(12, 6))
imp_ax.bar(
    bar_positions,
    importances[indices],
    align='center',
    color='lightgreen',
    edgecolor='black',
)
imp_ax.set_xticks(bar_positions)
imp_ax.set_xticklabels([f'特征{i}' for i in indices], rotation=45)
imp_ax.set_xlabel('特征')
imp_ax.set_ylabel('重要性')
imp_ax.set_title('基于随机森林的特征重要性')
imp_fig.tight_layout()
plt.show()

特征选择效果评估

模型性能比较

# Train/test feature matrices produced by each selection strategy; the test
# matrices are obtained by applying the already-fitted selectors to X_test.
methods = {
    '原始特征': (X_train, X_test),
    '方差阈值': (X_var, var_selector.transform(X_test)),
    'RFE': (X_rfe, rfe.transform(X_test)),
    '基于模型': (X_sfm_rf, sfm_rf.transform(X_test))
}

# Table header.
separator = "-" * 60
print("特征选择效果比较:")
print(separator)
print(f"{'方法':<15} {'特征数量':<10} {'准确率':<10}")
print(separator)

# Fit an identically configured random forest on each feature set and report
# its accuracy on the held-out test split.
for name, (X_tr, X_te) in methods.items():
    clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_train)
    accuracy = accuracy_score(y_test, clf.predict(X_te))

    print(f"{name:<15} {X_tr.shape[1]:<10} {accuracy:<10.4f}")

可视化比较

# Side-by-side bar charts: feature count (left) and test accuracy (right) for
# each selection method.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
count_ax, accuracy_ax = axes
bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']

# Number of columns in each method's training matrix.
method_names = list(methods.keys())
feature_counts = [train_part.shape[1] for train_part, _ in methods.values()]

count_ax.bar(method_names, feature_counts, color=bar_colors)
count_ax.set_ylabel('特征数量')
count_ax.set_title('不同方法选择的特征数量')
count_ax.tick_params(axis='x', rotation=45)
count_ax.grid(True, alpha=0.3)

# Test accuracy of a freshly trained random forest per method, in the same
# order as method_names.
accuracies = [
    accuracy_score(
        y_test,
        RandomForestClassifier(n_estimators=100, random_state=42)
        .fit(train_part, y_train)
        .predict(test_part),
    )
    for train_part, test_part in methods.values()
]

accuracy_ax.bar(method_names, accuracies, color=bar_colors)
accuracy_ax.set_ylabel('准确率')
accuracy_ax.set_title('不同方法的分类准确率')
accuracy_ax.tick_params(axis='x', rotation=45)
accuracy_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

实际应用

特征选择管道

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline with feature selection: standardise, keep the 10 best features by
# f_classif score, then classify with a random forest.
feature_selection_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Baseline pipeline: same scaler and classifier, no feature selection.
full_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# 5-fold cross-validated accuracy of the pipeline with feature selection.
# Every step is refit inside each fold because the whole pipeline is passed
# to cross_val_score.
cv_scores = cross_val_score(
    feature_selection_pipeline, X, y, cv=5, scoring='accuracy'
)
print(f"特征选择管道交叉验证准确率: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

# Same evaluation for the baseline that uses all features.
cv_scores_full = cross_val_score(
    full_pipeline, X, y, cv=5, scoring='accuracy'
)
print(f"完整特征交叉验证准确率: {cv_scores_full.mean():.4f} (+/- {cv_scores_full.std()*2:.4f})")

特征重要性分析

# Random-forest based feature-importance analysis.
def analyze_feature_importance(X, y, feature_names=None):
    """Fit a random forest, plot its feature importances, and return them ranked.

    Args:
        X: Feature matrix passed to ``RandomForestClassifier.fit``. Its
            ``shape[1]`` is read only when ``feature_names`` is None.
        y: Target labels passed to ``RandomForestClassifier.fit``.
        feature_names: Optional sequence of display names indexed by column
            position. Defaults to ``特征0``, ``特征1``, ... when None.

    Returns:
        A list of ``(feature_name, importance)`` tuples ordered from the most
        important feature to the least important one.
    """
    if feature_names is None:
        feature_names = [f'特征{i}' for i in range(X.shape[1])]

    # Fit the forest with a fixed seed so the importances are reproducible.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)

    # Column order from most to least important.
    importances = forest.feature_importances_
    order = np.argsort(importances)[::-1]
    ranked_names = [feature_names[i] for i in order]
    ranked_importances = importances[order]

    # Bar chart of the importances in descending order.
    bar_positions = range(len(importances))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(
        bar_positions,
        ranked_importances,
        align='center',
        color='lightblue',
        edgecolor='black',
    )
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(ranked_names, rotation=45)
    ax.set_xlabel('特征')
    ax.set_ylabel('重要性')
    ax.set_title('综合特征重要性分析')
    fig.tight_layout()
    plt.show()

    # Pair every name with its importance, keeping the descending order.
    return list(zip(ranked_names, ranked_importances))

# Run the importance analysis on the full dataset with generated column names.
feature_names = [f'特征{i}' for i in range(X.shape[1])]
important_features = analyze_feature_importance(
    X, y, feature_names=feature_names
)

# Print the ten highest-ranked features with their importance scores.
print("\n前10个重要特征:")
top_features = important_features[:10]
for name, importance in top_features:
    print(f"{name}: {importance:.4f}")

特征选择最佳实践

1. 理解问题：根据问题类型选择合适的特征选择方法。
2. 多方法结合：结合多种特征选择方法，提高结果的稳定性。
3. 评估效果：使用交叉验证评估特征选择对模型性能的影响。
4. 可视化分析：通过可视化理解特征重要性。
5. 迭代优化：不断尝试并优化特征选择策略。

特征选择是机器学习中的重要技术，掌握特征选择可以提高模型性能、减少过拟合和加快训练速度。