特征选择技术详解
特征选择技术详解
特征选择是从原始特征中选择最相关特征的过程,可以提高模型性能、减少过拟合和加快训练速度。
特征选择方法
1. 过滤法(Filter Methods)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
# Build a synthetic binary-classification dataset. The generator settings
# are collected in one place so they are easy to tweak.
dataset_params = dict(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)
X, y = make_classification(**dataset_params)

# Hold out 20% of the samples as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the basic dimensions of the data.
print(f"原始特征数量: {X.shape[1]}")
print(f"训练集大小: {X_train.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")
方差阈值过滤
from sklearn.feature_selection import VarianceThreshold
# Variance-threshold filter (threshold 0.1), fitted on the training split
# only so the test split does not influence which features are kept.
var_selector = VarianceThreshold(threshold=0.1)
var_selector.fit(X_train)
X_var = var_selector.transform(X_train)

# Column indices of the features that survived the filter.
selected_features_var = np.flatnonzero(var_selector.get_support())
print(f"方差阈值过滤后保留的特征: {len(selected_features_var)}")
print(f"保留的特征索引: {selected_features_var}")
单变量特征选择
单变量特征选择对每个特征单独打分,并保留得分最高的 K 个特征。注意:卡方检验(chi2)要求输入特征非负,对含负值的数据需先缩放到非负区间(例如使用 MinMaxScaler)。
# Compare three univariate scoring functions on the training split and plot
# the per-feature scores, highlighting the 10 features each one selects.
#
# Imported here so this section stays runnable on its own.
from sklearn.preprocessing import MinMaxScaler

scoring_functions = {
    'F-value': f_classif,
    '卡方': chi2,
    '互信息': mutual_info_classif
}

# chi2 raises "Input X must be non-negative" on data containing negative
# values, which the synthetic features here do. Give chi2 a copy rescaled
# to [0, 1]; the other scorers keep using the raw training data.
# NOTE: chi2 is designed for counts/frequencies, so its scores on rescaled
# continuous features are only a rough relevance indicator.
X_train_nonneg = MinMaxScaler().fit_transform(X_train)

# One bar chart per scoring function, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for idx, (name, func) in enumerate(scoring_functions.items()):
    # Pick the input representation that this scorer accepts.
    X_input = X_train_nonneg if func is chi2 else X_train

    # Keep the 10 best-scoring features.
    selector = SelectKBest(score_func=func, k=10)
    X_selected = selector.fit_transform(X_input, y_train)

    # Per-feature scores computed during fitting.
    scores = selector.scores_

    # Bar chart of every feature's score.
    axes[idx].bar(range(len(scores)), scores, color='skyblue', edgecolor='black')
    axes[idx].set_xlabel('特征索引')
    axes[idx].set_ylabel('分数')
    axes[idx].set_title(f'{name}评分')
    axes[idx].grid(True, alpha=0.3)

    # Overlay red markers on the features that were selected.
    selected_indices = np.where(selector.get_support())[0]
    axes[idx].scatter(selected_indices, scores[selected_indices],
                      c='red', s=100, zorder=5, label='选中特征')
    axes[idx].legend()

plt.tight_layout()
plt.show()
2. 包装法(Wrapper Methods)
from sklearn.feature_selection import RFE, RFECV
# Recursive feature elimination (RFE) driven by a random forest:
# eliminate with step=1 until 10 features remain.
rfe_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)
X_rfe = rfe.transform(X_train)

# Column indices of the features RFE kept.
selected_features_rfe = np.flatnonzero(rfe.get_support())
print(f"RFE选中的特征: {len(selected_features_rfe)}")
print(f"特征排名: {rfe.ranking_}")

# Bar chart of the ranking assigned to every feature (1 = selected).
ranking = rfe.ranking_
plt.figure(figsize=(10, 6))
plt.bar(
    range(len(ranking)),
    ranking,
    color='lightcoral',
    edgecolor='black',
)
plt.xlabel('特征索引')
plt.ylabel('排名 (1=最佳)')
plt.title('RFE特征排名')
plt.grid(True, alpha=0.3)
plt.show()
# RFE with 5-fold cross-validation: the number of features to keep is
# chosen by cross-validated accuracy.
rfecv_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
rfecv = RFECV(
    estimator=rfecv_estimator,
    step=1,
    cv=5,
    scoring='accuracy',
    min_features_to_select=1,
)
X_rfecv = rfecv.fit_transform(X_train, y_train)

print(f"最佳特征数量: {rfecv.n_features_}")
print(f"交叉验证分数: {rfecv.cv_results_['mean_test_score']}")

# Plot mean CV accuracy against the number of features kept.
# The x axis starts at 1 and advances by 1, matching
# min_features_to_select=1 and step=1 above.
mean_scores = rfecv.cv_results_['mean_test_score']
feature_counts_axis = range(1, len(mean_scores) + 1)

plt.figure(figsize=(10, 6))
plt.plot(feature_counts_axis, mean_scores, 'b-o')
plt.xlabel('特征数量')
plt.ylabel('交叉验证准确率')
plt.title('RFECV:特征数量对性能的影响')
# Dashed vertical line at the selected feature count.
plt.axvline(
    x=rfecv.n_features_,
    color='r',
    linestyle='--',
    label=f'最佳特征数量: {rfecv.n_features_}',
)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
3. 嵌入法(Embedded Methods)
from sklearn.feature_selection import SelectFromModel
# Embedded selection: keep the features whose random-forest importance
# meets the 'mean' threshold.
sfm_forest = RandomForestClassifier(n_estimators=100, random_state=42)
sfm_rf = SelectFromModel(sfm_forest, threshold='mean')
X_sfm_rf = sfm_rf.fit_transform(X_train, y_train)

# Column indices of the selected features.
selected_features_sfm = np.flatnonzero(sfm_rf.get_support())
print(f"基于随机森林选中的特征: {len(selected_features_sfm)}")

# Importances from the fitted forest, ordered from most to least important.
importances = sfm_rf.estimator_.feature_importances_
indices = np.flip(np.argsort(importances))

# Bar chart of the importances in descending order; tick labels carry the
# original feature index.
n_features = X_train.shape[1]
tick_labels = [f'特征{i}' for i in indices]

plt.figure(figsize=(12, 6))
plt.bar(
    range(n_features),
    importances[indices],
    align='center',
    color='lightgreen',
    edgecolor='black',
)
plt.xticks(range(n_features), tick_labels, rotation=45)
plt.xlabel('特征')
plt.ylabel('重要性')
plt.title('基于随机森林的特征重要性')
plt.tight_layout()
plt.show()
特征选择效果评估
模型性能比较
# Compare the feature-selection methods: each entry maps a display name to
# its (training matrix, test matrix) pair. The test matrices are produced
# with the selectors that were fitted on the training split.
methods = {
    '原始特征': (X_train, X_test),
    '方差阈值': (X_var, var_selector.transform(X_test)),
    'RFE': (X_rfe, rfe.transform(X_test)),
    '基于模型': (X_sfm_rf, sfm_rf.transform(X_test))
}

# Table header.
rule = "-" * 60
print("特征选择效果比较:")
print(rule)
print(f"{'方法':<15} {'特征数量':<10} {'准确率':<10}")
print(rule)

# One row per method: train an identical random forest on the reduced
# training data and score it on the matching test data.
for name, (X_tr, X_te) in methods.items():
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr, y_train)
    accuracy = accuracy_score(y_test, clf.predict(X_te))
    print(f"{name:<15} {X_tr.shape[1]:<10} {accuracy:<10.4f}")
可视化比较
# Two side-by-side bar charts: feature count per method (left) and test
# accuracy per method (right).
bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'gold']
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: how many features each method keeps.
method_names = list(methods.keys())
feature_counts = [train_matrix.shape[1] for train_matrix, _ in methods.values()]
axes[0].bar(method_names, feature_counts, color=bar_colors)
axes[0].set_ylabel('特征数量')
axes[0].set_title('不同方法选择的特征数量')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3)

# Right panel: test accuracy of a random forest trained on each feature set.
accuracies = []
for X_tr, X_te in methods.values():
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr, y_train)
    y_pred = clf.predict(X_te)
    accuracies.append(accuracy_score(y_test, y_pred))

axes[1].bar(method_names, accuracies, color=bar_colors)
axes[1].set_ylabel('准确率')
axes[1].set_title('不同方法的分类准确率')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
实际应用
特征选择管道
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Pipeline with feature selection: scale, keep the 10 best features by
# ANOVA F-score, then classify with a random forest.
selection_steps = [
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
]
feature_selection_pipeline = Pipeline(selection_steps)

# Baseline pipeline: the same scaler and classifier, without the selection step.
baseline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
]
full_pipeline = Pipeline(baseline_steps)

# 5-fold cross-validated accuracy of the selection pipeline on the full data.
cv_scores = cross_val_score(
    feature_selection_pipeline, X, y, cv=5, scoring='accuracy'
)
print(f"特征选择管道交叉验证准确率: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

# Same evaluation for the baseline that uses every feature.
cv_scores_full = cross_val_score(
    full_pipeline, X, y, cv=5, scoring='accuracy'
)
print(f"完整特征交叉验证准确率: {cv_scores_full.mean():.4f} (+/- {cv_scores_full.std()*2:.4f})")
特征重要性分析
# Random-forest based feature-importance analysis.
def analyze_feature_importance(X, y, feature_names=None):
    """Fit a random forest and rank the features by importance.

    A bar chart of the importances, sorted from highest to lowest, is
    displayed with ``plt.show()`` as a side effect.

    Args:
        X: Feature matrix; ``X.shape[1]`` is used as the feature count
            when ``feature_names`` is omitted.
        y: Target labels passed to the forest's ``fit``.
        feature_names: Optional labels indexed by column position.
            Defaults to ``'特征0'``, ``'特征1'``, ...

    Returns:
        A list of ``(feature_name, importance)`` tuples ordered from the
        most to the least important feature.
    """
    if feature_names is None:
        feature_names = [f'特征{i}' for i in range(X.shape[1])]

    # Fit the forest with a fixed seed so the ranking is repeatable.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)

    # Column indices ordered by descending importance.
    importances = forest.feature_importances_
    order = np.flip(np.argsort(importances))
    ranked_names = [feature_names[i] for i in order]
    ranked_scores = importances[order]

    # Bar chart of the importances in ranked order.
    positions = range(len(importances))
    plt.figure(figsize=(12, 6))
    plt.bar(positions, ranked_scores, align='center',
            color='lightblue', edgecolor='black')
    plt.xticks(positions, ranked_names, rotation=45)
    plt.xlabel('特征')
    plt.ylabel('重要性')
    plt.title('综合特征重要性分析')
    plt.tight_layout()
    plt.show()

    # Pair each name with its importance, most important first.
    return list(zip(ranked_names, ranked_scores))
# Run the importance analysis on the full dataset with explicit labels.
feature_names = [f'特征{i}' for i in range(X.shape[1])]
important_features = analyze_feature_importance(X, y, feature_names)

# Print the ten highest-ranked features with their importance scores.
print("\n前10个重要特征:")
top_features = important_features[:10]
for name, importance in top_features:
    print(f"{name}: {importance:.4f}")
特征选择最佳实践
- 理解问题:根据问题类型选择合适的特征选择方法
- 多方法结合:结合多种特征选择方法提高稳定性
- 评估效果:使用交叉验证评估特征选择对模型性能的影响
- 可视化分析:通过可视化理解特征重要性
- 迭代优化:不断尝试和优化特征选择策略
特征选择是机器学习中的重要技术,掌握特征选择可以提高模型性能、减少过拟合和加快训练速度。