LLM公平性:消除模型偏见
---
title: "LLM公平性:消除模型偏见"
description: "评估和消除LLM中的偏见,确保AI系统公平对待所有群体"
tags: ["公平性", "偏见消除", "AI公平", "LLM", "无偏见"]
category: "llm"
icon: "⚖️"
---
LLM公平性:消除模型偏见
公平性概述
AI公平性旨在确保LLM对所有群体一视同仁,避免因种族、性别、年龄等因素产生歧视性输出。
偏见检测
1. 统计均等性检测
import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass
@dataclass
class BiasMetric:
    """Outcome of one fairness measurement.

    Bundles the measured value with the threshold it is reported against,
    the resulting verdict, and any supporting figures.
    """

    # Human-readable name of the metric.
    metric_name: str
    # Measured value of the metric.
    value: float
    # Cut-off reported alongside the value.
    threshold: float
    # Verdict: True when the measurement was flagged as biased.
    is_biased: bool
    # Supporting numbers supplied by whoever computed the metric.
    details: Dict
class StatisticalParityDetector:
    """Group-fairness checks over binary predictions.

    Both metrics compare the positive-prediction rate (the mean of the
    predictions) of a privileged group against that of an unprivileged
    group, where group membership is read from a parallel list of
    sensitive-attribute values.
    """

    # Bounds of the "80% rule" band used by compute_disparate_impact.
    _DI_LOWER = 0.8
    _DI_UPPER = 1.25

    def __init__(self, default_threshold: float = 0.1):
        """Create a detector.

        Args:
            default_threshold: Largest absolute statistical parity
                difference that is still reported as unbiased.
        """
        self.default_threshold = default_threshold

    @staticmethod
    def _positive_rates(
        predictions: List[int],
        sensitive_attribute: List[str],
        privileged_value: str,
        unprivileged_value: str
    ) -> Tuple[float, float, int, int]:
        """Return (privileged_rate, unprivileged_rate, privileged_count, unprivileged_count).

        A group with no members gets a rate of 0.0. Rates are converted to
        plain ``float`` so NumPy scalar types do not leak into the results.
        """
        privileged_preds = [
            p for p, a in zip(predictions, sensitive_attribute)
            if a == privileged_value
        ]
        unprivileged_preds = [
            p for p, a in zip(predictions, sensitive_attribute)
            if a == unprivileged_value
        ]
        privileged_rate = float(np.mean(privileged_preds)) if privileged_preds else 0.0
        unprivileged_rate = float(np.mean(unprivileged_preds)) if unprivileged_preds else 0.0
        return (
            privileged_rate,
            unprivileged_rate,
            len(privileged_preds),
            len(unprivileged_preds),
        )

    def compute_statistical_parity_difference(
        self,
        predictions: List[int],
        sensitive_attribute: List[str],
        privileged_value: str,
        unprivileged_value: str
    ) -> BiasMetric:
        """Compute the statistical parity difference.

        The value is ``privileged_rate - unprivileged_rate``; it is flagged
        as biased when its absolute value exceeds ``self.default_threshold``.

        Args:
            predictions: Predicted labels, one per sample.
            sensitive_attribute: Group value of each sample, aligned with
                ``predictions``.
            privileged_value: Attribute value marking the privileged group.
            unprivileged_value: Attribute value marking the unprivileged group.

        Returns:
            A BiasMetric whose ``details`` hold both rates and group sizes.
        """
        privileged_rate, unprivileged_rate, privileged_count, unprivileged_count = (
            self._positive_rates(
                predictions, sensitive_attribute, privileged_value, unprivileged_value
            )
        )
        spd = privileged_rate - unprivileged_rate
        return BiasMetric(
            metric_name="Statistical Parity Difference",
            value=spd,
            threshold=self.default_threshold,
            is_biased=abs(spd) > self.default_threshold,
            details={
                "privileged_rate": privileged_rate,
                "unprivileged_rate": unprivileged_rate,
                "privileged_count": privileged_count,
                "unprivileged_count": unprivileged_count
            }
        )

    def compute_disparate_impact(
        self,
        predictions: List[int],
        sensitive_attribute: List[str],
        privileged_value: str,
        unprivileged_value: str
    ) -> BiasMetric:
        """Compute the disparate impact ratio.

        The value is ``unprivileged_rate / privileged_rate``. It is flagged
        as biased when it falls outside the 0.8-1.25 band (the "80% rule").

        When the privileged rate is 0 the ratio is undefined: it is reported
        as 1.0 if the unprivileged rate is also 0 (both groups are treated
        alike) and as ``float("inf")`` otherwise, so that a one-sided
        disparity is flagged instead of being reported as fair.

        Args:
            predictions: Predicted labels, one per sample.
            sensitive_attribute: Group value of each sample, aligned with
                ``predictions``.
            privileged_value: Attribute value marking the privileged group.
            unprivileged_value: Attribute value marking the unprivileged group.

        Returns:
            A BiasMetric whose ``details`` hold both rates, the ratio and
            the group sizes.
        """
        privileged_rate, unprivileged_rate, privileged_count, unprivileged_count = (
            self._positive_rates(
                predictions, sensitive_attribute, privileged_value, unprivileged_value
            )
        )

        if privileged_rate > 0:
            di_ratio = unprivileged_rate / privileged_rate
        elif unprivileged_rate > 0:
            # Only the unprivileged group receives positives: unbounded ratio.
            di_ratio = float("inf")
        else:
            # Neither group receives positives: identical treatment.
            di_ratio = 1.0

        is_biased = di_ratio < self._DI_LOWER or di_ratio > self._DI_UPPER
        return BiasMetric(
            metric_name="Disparate Impact Ratio",
            value=di_ratio,
            threshold=self._DI_LOWER,
            is_biased=is_biased,
            details={
                "privileged_rate": privileged_rate,
                "unprivileged_rate": unprivileged_rate,
                "ratio": di_ratio,
                "privileged_count": privileged_count,
                "unprivileged_count": unprivileged_count
            }
        )
2. 文本偏见检测
class TextBiasDetector:
    """Keyword-based detector for biased or stereotypical wording.

    Detection is plain substring matching against a built-in phrase list
    (``detect_bias_in_text``) or against caller-supplied terms combined
    with a small set of generalising words (``detect_stereotype_in_generation``).
    """

    def __init__(self):
        self.bias_terms = self._load_bias_terms()

    def _load_bias_terms(self) -> Dict[str, Dict[str, List[str]]]:
        """Return the built-in phrase list as ``{category: {type: [phrases]}}``."""
        return {
            "gender": {
                "stereotypes": ["护士通常是女性", "工程师通常是男性"],
                "biases": ["女人更情绪化", "男人更理性"]
            },
            "race": {
                "stereotypes": ["某个种族更聪明", "某个种族更暴力"],
                "biases": ["种族优越感", "种族歧视"]
            },
            "age": {
                "stereotypes": ["年轻人不懂事", "老年人跟不上时代"],
                "biases": ["年龄歧视", "代际偏见"]
            }
        }

    def detect_bias_in_text(self, text: str) -> Dict:
        """Report every built-in phrase that occurs in ``text``.

        Args:
            text: Text to scan.

        Returns:
            A dict with ``has_bias``, ``bias_count``, ``biases`` (one entry
            per matched phrase with its category, type, the phrase and the
            index of its first occurrence) and ``severity``: "high" for more
            than three matches, "medium" for one to three, "low" for none.
        """
        detected_biases = []
        for category, patterns in self.bias_terms.items():
            for pattern_type, terms in patterns.items():
                for term in terms:
                    # One search serves as both membership test and position.
                    position = text.find(term)
                    if position != -1:
                        detected_biases.append({
                            "category": category,
                            "type": pattern_type,
                            "term": term,
                            "position": position
                        })

        bias_count = len(detected_biases)
        if bias_count > 3:
            severity = "high"
        elif bias_count:
            severity = "medium"
        else:
            severity = "low"

        return {
            "has_bias": bias_count > 0,
            "bias_count": bias_count,
            "biases": detected_biases,
            "severity": severity
        }

    def detect_stereotype_in_generation(self, generations: List[str],
                                        sensitive_terms: List[str]) -> Dict:
        """Score generated texts for stereotypical use of sensitive terms.

        A generation scores one point for each sensitive term it contains
        whose surrounding context (see ``_get_context``) includes a
        generalising indicator (see ``_is_stereotypical``). Only the first
        occurrence of each term is inspected.

        Args:
            generations: Generated texts to score.
            sensitive_terms: Terms whose context is inspected.

        Returns:
            A dict with the mean and maximum score, whether any generation
            scored above zero, and the fraction of generations that did.
            With no generations, all values are zero / False instead of
            raising.
        """
        stereotype_scores = []
        for generation in generations:
            score = sum(
                1
                for term in sensitive_terms
                if term in generation
                and self._is_stereotypical(self._get_context(generation, term))
            )
            stereotype_scores.append(score)

        if not stereotype_scores:
            # max() and the frequency division are undefined for no samples.
            return {
                "mean_stereotype_score": 0.0,
                "max_stereotype_score": 0,
                "has_stereotypes": False,
                "stereotype_frequency": 0.0
            }

        flagged = sum(1 for s in stereotype_scores if s > 0)
        return {
            "mean_stereotype_score": float(np.mean(stereotype_scores)),
            "max_stereotype_score": max(stereotype_scores),
            "has_stereotypes": flagged > 0,
            "stereotype_frequency": flagged / len(stereotype_scores)
        }

    def _get_context(self, text: str, term: str, window: int = 20) -> str:
        """Return the first occurrence of ``term`` plus ``window`` characters on each side.

        Returns an empty string when ``term`` does not occur in ``text``.
        """
        pos = text.find(term)
        if pos == -1:
            # Without this guard the slice would be an unrelated text prefix.
            return ""
        start = max(0, pos - window)
        end = min(len(text), pos + len(term) + window)
        return text[start:end]

    def _is_stereotypical(self, context: str) -> bool:
        """Return True when ``context`` contains a generalising indicator word.

        Simplified heuristic: a fixed list of words such as "always" or
        "usually" is matched by substring.
        """
        stereotype_indicators = ["总是", "通常", "应该", "必须"]
        return any(indicator in context for indicator in stereotype_indicators)
偏见缓解
1. 预处理方法
class PreprocessingDebiasing:
    """Pre-processing mitigation: adjusts the data before any model sees it."""

    def __init__(self):
        """No configuration is needed."""

    def reweight_samples(self, data: List[Dict], sensitive_attribute: str) -> List[Dict]:
        """Attach a balancing ``"weight"`` to a copy of every sample.

        Each group's weight is ``total / (number_of_groups * group_size)``,
        so smaller groups receive larger weights. The input records are
        not modified.

        Args:
            data: Sample records; each must contain ``sensitive_attribute``.
            sensitive_attribute: Key whose value identifies a record's group.

        Returns:
            Shallow copies of the records, each with a ``"weight"`` entry.
        """
        # First pass: how many records fall into each group.
        group_sizes = {}
        for record in data:
            key = record[sensitive_attribute]
            group_sizes[key] = group_sizes.get(key, 0) + 1

        sample_count = len(data)
        group_count = len(group_sizes)
        weight_by_group = {
            key: sample_count / (group_count * size)
            for key, size in group_sizes.items()
        }

        # Second pass: copy each record and stamp its group's weight on it.
        reweighted = []
        for record in data:
            clone = record.copy()
            clone["weight"] = weight_by_group[record[sensitive_attribute]]
            reweighted.append(clone)
        return reweighted

    def transform_label(self, data: List[Dict], sensitive_attribute: str) -> List[Dict]:
        """Label transformation placeholder: hands back ``data`` untouched.

        Simplified implementation; a real one would need a more elaborate
        method.
        """
        return data
2. 处理中方法
class InProcessingDebiasing:
    """In-processing mitigation: placeholders for fairness-aware training.

    Both methods are simplified stubs that perform no training and
    return ``None``.
    """

    def __init__(self, model, lambda_fairness: float = 0.5):
        """Store the model and the fairness weight.

        Args:
            model: Model object to be debiased; stored as given.
            lambda_fairness: Fairness weight; stored as given.
        """
        self.lambda_fairness = lambda_fairness
        self.model = model

    def adversarial_debiasing(self, data: List[Dict], sensitive_attribute: str,
                              epochs: int = 10):
        """Adversarial debiasing stub: loops ``epochs`` times doing nothing.

        The intended per-epoch steps are: train the main model, train the
        adversary, update the loss.
        """
        for _ in range(epochs):
            # Placeholder for: train main model, train adversary, update loss.
            continue

    def prejudice_remover(self, data: List[Dict], sensitive_attribute: str):
        """Prejudice remover stub: does nothing."""
        return None
3. 后处理方法
class PostprocessingDebiasing:
    """Post-processing mitigation: adjusts predictions after the model ran."""

    def __init__(self):
        """No configuration is needed."""

    def equalized_odds_postprocessing(self, predictions: List[int],
                                      true_labels: List[int],
                                      sensitive_attribute: List[str],
                                      privileged_value: str) -> List[int]:
        """Equalized-odds placeholder: returns a copy of ``predictions``.

        Simplified implementation. The per-group true/false positive rates
        are not computed yet, so no prediction is actually changed.
        """
        return predictions.copy()

    def reject_option_classification(self, predictions: List[int],
                                     probabilities: List[float],
                                     threshold: float = 0.5) -> List[int]:
        """Replace uncertain predictions with ``-1`` (needs human review).

        A prediction counts as uncertain when its probability lies closer
        than ``threshold`` to 0.5; all others pass through unchanged.

        NOTE(review): with the default ``threshold`` of 0.5, every
        probability strictly between 0 and 1 is sent to review. Confirm
        that this default is intended.

        Args:
            predictions: Predicted labels.
            probabilities: Probability paired with each prediction.
            threshold: Half-width of the uncertainty band around 0.5.

        Returns:
            The predictions with uncertain entries replaced by ``-1``.
        """
        needs_review = -1
        return [
            needs_review if abs(probability - 0.5) < threshold else label
            for label, probability in zip(predictions, probabilities)
        ]
公平性评估
class FairnessEvaluator:
    """Runs the group-fairness checks and assembles a single report."""

    def __init__(self):
        self.detector = StatisticalParityDetector()

    def comprehensive_evaluation(self, predictions: List[int],
                                 true_labels: List[int],
                                 sensitive_attribute: List[str],
                                 privileged_value: str,
                                 unprivileged_value: str) -> Dict:
        """Evaluate predictions for group fairness.

        Args:
            predictions: Predicted labels, one per sample.
            true_labels: Ground-truth labels aligned with ``predictions``.
                May be ``None``, in which case the accuracy figures are
                reported as ``None``.
            sensitive_attribute: Group value of each sample.
            privileged_value: Attribute value marking the privileged group.
            unprivileged_value: Attribute value marking the unprivileged group.

        Returns:
            A dict with:
            - ``overall_fair``: True when neither statistical parity nor
              disparate impact is flagged as biased.
            - ``statistical_parity`` / ``disparate_impact``: value, verdict
              and details of each metric.
            - ``recommendations``: one message per flagged metric.
            - ``accuracy_parity``: per-group accuracy and their difference.
              Informational only; it does not influence ``overall_fair``.
        """
        spd = self.detector.compute_statistical_parity_difference(
            predictions, sensitive_attribute, privileged_value, unprivileged_value
        )
        di = self.detector.compute_disparate_impact(
            predictions, sensitive_attribute, privileged_value, unprivileged_value
        )

        return {
            "overall_fair": not spd.is_biased and not di.is_biased,
            "statistical_parity": {
                "value": spd.value,
                "is_biased": spd.is_biased,
                "details": spd.details
            },
            "disparate_impact": {
                "value": di.value,
                "is_biased": di.is_biased,
                "details": di.details
            },
            "recommendations": self._generate_recommendations(spd, di),
            "accuracy_parity": self._accuracy_parity(
                predictions, true_labels, sensitive_attribute,
                privileged_value, unprivileged_value
            )
        }

    @staticmethod
    def _accuracy_parity(predictions, true_labels, sensitive_attribute,
                         privileged_value, unprivileged_value) -> Dict:
        """Return each group's accuracy and the privileged-minus-unprivileged gap.

        A value is ``None`` when it cannot be computed: no labels were
        supplied, or the group has no samples.
        """
        def group_accuracy(group_value):
            hits = [
                int(pred == label)
                for pred, label, attr in zip(predictions, true_labels, sensitive_attribute)
                if attr == group_value
            ]
            return sum(hits) / len(hits) if hits else None

        if true_labels is None:
            privileged_accuracy = None
            unprivileged_accuracy = None
        else:
            privileged_accuracy = group_accuracy(privileged_value)
            unprivileged_accuracy = group_accuracy(unprivileged_value)

        if privileged_accuracy is None or unprivileged_accuracy is None:
            difference = None
        else:
            difference = privileged_accuracy - unprivileged_accuracy

        return {
            "privileged_accuracy": privileged_accuracy,
            "unprivileged_accuracy": unprivileged_accuracy,
            "difference": difference
        }

    def _generate_recommendations(self, spd: BiasMetric, di: BiasMetric) -> List[str]:
        """Return one remediation message for each metric flagged as biased."""
        recommendations = []
        if spd.is_biased:
            recommendations.append("统计均等性差异过大,建议使用重新加权或对抗性去偏见方法")
        if di.is_biased:
            recommendations.append("差异化影响不符合80%规则,建议检查训练数据分布")
        return recommendations
最佳实践
- 早期检测:在模型开发早期检测偏见
- 多维度评估:从多个角度评估公平性
- 持续监控:在生产环境中持续监控公平性指标
- 透明报告:向用户透明地报告公平性评估结果
总结
AI公平性是构建负责任LLM的关键要素。通过系统化的偏见检测和缓解方法,可以构建更公平、无偏见的AI系统。