LLM偏见:识别和消除AI偏见
---
title: "LLM偏见:识别和消除AI偏见"
description: "识别LLM中的各种偏见类型,并采用方法消除偏见"
tags: ["AI偏见", "偏见识别", "偏见消除", "LLM", "公平性"]
category: "llm"
icon: "🔍"
---
LLM偏见:识别和消除AI偏见
偏见类型
1. 数据偏见
import numpy as np
from typing import List, Dict, Tuple
from collections import Counter
class DataBiasDetector:
    """Detect representation, label and selection bias in a labelled dataset.

    Each record is a dict. A sensitive attribute is read with
    ``item.get(attribute)`` and the label with ``item.get("label")``.
    A value counts as missing only when it is ``None``, so legitimate falsy
    values such as ``0`` or ``False`` are kept.
    """

    # Largest allowed gap between a group's share and the uniform share
    # for the attribute to be reported as balanced.
    BALANCE_TOLERANCE = 0.1
    # Largest allowed gap in a label's probability between any two groups
    # before the label distribution is reported as biased.
    LABEL_GAP_THRESHOLD = 0.2

    def __init__(self):
        # Maps the result key to the check that produces it.
        self.detection_methods = {
            "representation": self._check_representation,
            "label": self._check_label_bias,
            "selection": self._check_selection_bias,
        }

    def detect_all_biases(self, data: List[Dict], sensitive_attributes: List[str]) -> Dict:
        """Run every registered check for every sensitive attribute.

        Args:
            data: Dataset records.
            sensitive_attributes: Keys of the attributes to examine.

        Returns:
            ``{attribute: {check_name: check_result}}``.
        """
        return {
            attr: {
                name: check(data, attr)
                for name, check in self.detection_methods.items()
            }
            for attr in sensitive_attributes
        }

    def _check_representation(self, data: List[Dict], attribute: str) -> Dict:
        """Measure how evenly the values of ``attribute`` are represented.

        Returns:
            A dict with the share of each value (``"representation"``), the
            largest absolute deviation from a uniform share
            (``"max_deviation"``) and whether that deviation is below
            ``BALANCE_TOLERANCE`` (``"is_balanced"``). Empty input yields an
            empty distribution that is reported as balanced.
        """
        # Compare against None so that 0 / False / "" remain valid groups.
        values = [
            item.get(attribute)
            for item in data
            if item.get(attribute) is not None
        ]
        counter = Counter(values)
        total = len(values)
        representation = {value: count / total for value, count in counter.items()}

        # A perfectly balanced attribute gives every value the same share.
        expected = 1.0 / len(counter) if counter else 0
        max_deviation = max(
            (abs(share - expected) for share in representation.values()),
            default=0,
        )
        return {
            "representation": representation,
            "is_balanced": max_deviation < self.BALANCE_TOLERANCE,
            "max_deviation": max_deviation,
        }

    def _check_label_bias(self, data: List[Dict], attribute: str) -> Dict:
        """Compare the label distribution across the groups of ``attribute``.

        Records missing either the attribute or the label (``None``) are
        skipped. A label of ``0`` is a valid label and is kept.

        Returns:
            A dict with the per-group label distribution
            (``"group_distributions"``) and the verdict of
            ``_compare_distributions`` (``"has_bias"``).
        """
        # Group the labels by attribute value.
        groups: Dict = {}
        for item in data:
            attr_value = item.get(attribute)
            label = item.get("label")
            if attr_value is None or label is None:
                continue
            groups.setdefault(attr_value, []).append(label)

        # Turn each group's labels into a probability distribution.
        group_label_distributions = {}
        for group, labels in groups.items():
            total = len(labels)
            group_label_distributions[group] = {
                label: count / total for label, count in Counter(labels).items()
            }
        return {
            "group_distributions": group_label_distributions,
            "has_bias": self._compare_distributions(group_label_distributions),
        }

    def _check_selection_bias(self, data: List[Dict], attribute: str) -> Dict:
        """Return a fixed reminder that the collection process needs review.

        Placeholder: neither ``data`` nor ``attribute`` is inspected.
        """
        return {
            "methodology_review_needed": True,
            "potential_biases": ["sampling_bias", "survivorship_bias"],
        }

    def _compare_distributions(self, distributions: Dict) -> bool:
        """Report whether any label's probability differs too much between groups.

        Args:
            distributions: ``{group: {label: probability}}``.

        Returns:
            ``True`` when, for some label, the gap between its highest and
            lowest probability across groups exceeds ``LABEL_GAP_THRESHOLD``.
            A label absent from a group counts as probability 0. Fewer than
            two groups always gives ``False``.
        """
        if len(distributions) < 2:
            return False

        all_labels = set()
        for dist in distributions.values():
            all_labels.update(dist.keys())

        for label in all_labels:
            probs = [dist.get(label, 0) for dist in distributions.values()]
            if max(probs) - min(probs) > self.LABEL_GAP_THRESHOLD:
                return True
        return False
2. 模型偏见
class ModelBiasDetector:
    """Probe a language model for bias in its embeddings and generations.

    NOTE(review): ``model`` and ``tokenizer`` are called as
    ``tokenizer(text, return_tensors="pt")``, ``model(**inputs,
    output_hidden_states=True)`` and ``model.generate(...)``. That looks like
    the Hugging Face ``transformers`` interface; confirm against the caller.
    """

    # Scores whose absolute value exceeds this are flagged as biased.
    BIAS_THRESHOLD = 0.1

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def detect_embedding_bias(self, word_pairs: List[Tuple[str, str]]) -> Dict:
        """Score each word pair by the cosine similarity of its embeddings.

        Args:
            word_pairs: Pairs of words to compare.

        Returns:
            ``{"word1-word2": {"bias_score": float, "is_biased": bool}}``,
            where ``is_biased`` is ``abs(bias_score) > BIAS_THRESHOLD``.
        """
        biases = {}
        for word1, word2 in word_pairs:
            embedding1 = self._get_word_embedding(word1)
            embedding2 = self._get_word_embedding(word2)
            bias_score = self._compute_bias_score(embedding1, embedding2)
            biases[f"{word1}-{word2}"] = {
                "bias_score": bias_score,
                "is_biased": abs(bias_score) > self.BIAS_THRESHOLD,
            }
        return biases

    def _get_word_embedding(self, word: str):
        """Return an embedding for ``word`` from the model's last hidden state.

        The last entry of ``outputs.hidden_states`` is averaged over dim 1
        (presumably the token axis; confirm) and squeezed.
        """
        # Imported here because the snippet uses torch but no file-level
        # import of it is visible.
        import torch

        inputs = self.tokenizer(word, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
        return outputs.hidden_states[-1].mean(dim=1).squeeze()

    def _compute_bias_score(self, embedding1, embedding2) -> float:
        """Return the cosine similarity of the two embeddings as a float."""
        import torch

        return torch.nn.functional.cosine_similarity(
            embedding1.unsqueeze(0), embedding2.unsqueeze(0)
        ).item()

    def detect_generation_bias(self, prompts: List[str], sensitive_terms: List[str]) -> Dict:
        """Generate text for each prompt and record sensitive terms it contains.

        Args:
            prompts: Prompts fed to ``model.generate`` (up to 50 new tokens).
            sensitive_terms: Substrings to look for in the decoded output.

        Returns:
            A dict with ``"has_bias"``, ``"bias_count"`` and ``"biases"``, the
            last being one entry per (prompt, matched term) with the
            generation and the context around the match.
        """
        import torch

        generation_biases = []
        for prompt in prompts:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=50)
            generation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Plain substring match; case-sensitive.
            for term in sensitive_terms:
                if term in generation:
                    generation_biases.append({
                        "prompt": prompt,
                        "generation": generation,
                        "sensitive_term": term,
                        "context": self._get_context(generation, term),
                    })
        return {
            "has_bias": len(generation_biases) > 0,
            "bias_count": len(generation_biases),
            "biases": generation_biases,
        }

    def _get_context(self, text: str, term: str, window: int = 20) -> str:
        """Return ``term`` with up to ``window`` characters on either side.

        Only the first occurrence is used. Returns ``""`` when ``term`` does
        not occur in ``text``.
        """
        pos = text.find(term)
        if pos < 0:
            # find() returns -1 on a miss; slicing from that would return
            # an unrelated piece of the text.
            return ""
        start = max(0, pos - window)
        end = min(len(text), pos + len(term) + window)
        return text[start:end]
偏见消除方法
1. 训练数据去偏见
class TrainingDataDebiasing:
    """Rebalance training data across the groups of a sensitive attribute.

    A record belongs to a group when ``item.get(sensitive_attribute)`` is not
    ``None``. Falsy values such as ``0`` are valid groups.
    """

    def __init__(self):
        pass

    def reweight_samples(self, data: List[Dict], sensitive_attribute: str) -> List[Dict]:
        """Attach a balancing ``"sample_weight"`` to a copy of every record.

        Each group gets weight ``n_grouped / (n_groups * group_count)``, so
        every group contributes the same total weight. ``n_grouped`` counts
        only records that have a group. Records without a group get 1.0.

        Args:
            data: Records to weight. The input dicts are not modified.
            sensitive_attribute: Key of the grouping attribute.

        Returns:
            Shallow copies of the records, in order, each with
            ``"sample_weight"`` set.
        """
        group_counts: Dict = {}
        for item in data:
            group = item.get(sensitive_attribute)
            if group is not None:
                group_counts[group] = group_counts.get(group, 0) + 1

        # Base the total on grouped records only; otherwise records with no
        # group would inflate every weight.
        total = sum(group_counts.values())
        weights = {
            group: total / (len(group_counts) * count)
            for group, count in group_counts.items()
        }

        weighted_data = []
        for item in data:
            weighted_item = item.copy()
            group = item.get(sensitive_attribute)
            weighted_item["sample_weight"] = (
                weights.get(group, 1.0) if group is not None else 1.0
            )
            weighted_data.append(weighted_item)
        return weighted_data

    def augment_minority_groups(self, data: List[Dict], sensitive_attribute: str,
                                target_ratio: float = 0.5) -> List[Dict]:
        """Oversample small groups by duplicating randomly chosen records.

        Every group smaller than ``int(largest_group * target_ratio)`` is
        topped up to that size with copies of its own records, drawn with
        replacement via ``np.random``.

        Args:
            data: Records to augment. The input list is not modified.
            sensitive_attribute: Key of the grouping attribute.
            target_ratio: Minimum group size as a fraction of the largest.

        Returns:
            A new list with the original records followed by the duplicates.
            If no record has a group, this is a plain copy of ``data``.
        """
        groups: Dict = {}
        for item in data:
            group = item.get(sensitive_attribute)
            if group is not None:
                groups.setdefault(group, []).append(item)

        augmented_data = list(data)
        if not groups:
            # Nothing to balance; max() on an empty sequence would raise.
            return augmented_data

        max_group_size = max(len(items) for items in groups.values())
        target_size = int(max_group_size * target_ratio)

        for items in groups.values():
            deficit = target_size - len(items)
            for _ in range(deficit):
                # Draw an index rather than passing the dicts to NumPy, which
                # would first convert them into an array.
                picked = items[np.random.choice(len(items))]
                # Copy so a duplicate does not alias the original record.
                augmented_data.append(picked.copy())
        return augmented_data
2. 模型去偏见
class ModelDebiasing:
    """In-training debiasing strategies for a wrapped model.

    Every method here is a placeholder: none of them touches ``self.model``
    or the training data, and each returns ``None``.
    """

    def __init__(self, model):
        self.model = model

    def adversarial_debiasing(self, train_data: List[Dict], sensitive_attribute: str,
                              epochs: int = 10, lambda_fairness: float = 0.5):
        """Placeholder for adversarial debiasing.

        Intended outline per epoch: train the main-task model, train an
        adversary that predicts the sensitive attribute, then optimise the
        two jointly. The loop currently does nothing.
        """
        for _epoch in range(epochs):
            continue
        return None

    def prejudice_remover(self, train_data: List[Dict], sensitive_attribute: str):
        """Placeholder for a prejudice-remover regulariser; does nothing."""
        return None

    def learning_fair_representations(self, train_data: List[Dict], sensitive_attribute: str):
        """Placeholder for learning fair representations; does nothing."""
        return None
3. 后处理去偏见
class PostprocessingDebiasing:
    """Debiasing steps applied to predictions after the model has run."""

    def __init__(self):
        pass

    def calibrate_odds(self, predictions: List[int], true_labels: List[int],
                       sensitive_attribute: List[str]) -> List[int]:
        """Placeholder for equalised-odds calibration.

        Returns ``predictions`` unchanged (the same list object).
        """
        return predictions

    def reject_option(self, predictions: List[int], confidence_scores: List[float],
                      threshold: float = 0.3) -> List[int]:
        """Replace low-confidence predictions with ``-1``.

        A prediction whose confidence is strictly below ``threshold`` becomes
        ``-1``, marking it for manual review. All others pass through.
        """
        needs_review = -1
        return [
            needs_review if confidence < threshold else prediction
            for prediction, confidence in zip(predictions, confidence_scores)
        ]
公平性指标
class FairnessMetrics:
    """Group-fairness metrics for binary predictions.

    In every metric the privileged group is the set of samples whose
    sensitive attribute equals ``privileged_value``. All other samples form
    the unprivileged group. Rates are means of the predictions, so
    predictions are expected to be 0/1. An empty group has rate 0.
    """

    @staticmethod
    def _mean_or_zero(values) -> float:
        """Return the mean of ``values`` as a float, or 0.0 when empty."""
        return float(np.mean(values)) if values else 0.0

    @staticmethod
    def _positive_rates(predictions, sensitive_attribute, privileged_value):
        """Return ``(privileged_rate, unprivileged_rate)`` of the predictions."""
        privileged, unprivileged = [], []
        for pred, attr in zip(predictions, sensitive_attribute):
            (privileged if attr == privileged_value else unprivileged).append(pred)
        return (
            FairnessMetrics._mean_or_zero(privileged),
            FairnessMetrics._mean_or_zero(unprivileged),
        )

    @staticmethod
    def demographic_parity_difference(predictions: List[int],
                                      sensitive_attribute: List[str],
                                      privileged_value: str) -> float:
        """Return privileged minus unprivileged positive-prediction rate.

        0 means parity. A positive value favours the privileged group.
        """
        privileged_rate, unprivileged_rate = FairnessMetrics._positive_rates(
            predictions, sensitive_attribute, privileged_value
        )
        return privileged_rate - unprivileged_rate

    @staticmethod
    def equal_opportunity_difference(predictions: List[int], true_labels: List[int],
                                     sensitive_attribute: List[str],
                                     privileged_value: str) -> float:
        """Return privileged minus unprivileged true-positive rate.

        Only samples with ``true_label == 1`` are considered. The
        unprivileged group is every sample whose attribute differs from
        ``privileged_value``.
        """
        privileged_tpr = FairnessMetrics._true_positive_rate(
            predictions, true_labels, sensitive_attribute, privileged_value
        )
        unprivileged_tpr = FairnessMetrics._true_positive_rate(
            predictions, true_labels, sensitive_attribute, privileged_value,
            exclude=True,
        )
        return privileged_tpr - unprivileged_tpr

    @staticmethod
    def _true_positive_rate(predictions, true_labels, sensitive_attribute, value,
                            exclude: bool = False):
        """Return the true-positive rate of one group.

        Args:
            predictions: Predicted labels.
            true_labels: Ground-truth labels; only ``1`` counts as positive.
            sensitive_attribute: Group membership per sample.
            value: Attribute value that defines the group.
            exclude: When ``True``, use samples whose attribute differs from
                ``value`` (the complement group) instead.

        Returns:
            Mean prediction over the group's positive samples, or 0.0 when
            the group has none.
        """
        preds = [
            pred
            for pred, attr, truth in zip(predictions, sensitive_attribute, true_labels)
            if truth == 1 and ((attr != value) if exclude else (attr == value))
        ]
        return FairnessMetrics._mean_or_zero(preds)

    @staticmethod
    def disparate_impact_ratio(predictions: List[int], sensitive_attribute: List[str],
                               privileged_value: str) -> float:
        """Return unprivileged over privileged positive-prediction rate.

        1.0 means parity. When the privileged rate is 0 the ratio is
        undefined: 1.0 is returned if the unprivileged rate is also 0
        (both groups treated alike), otherwise ``float("inf")``.
        """
        privileged_rate, unprivileged_rate = FairnessMetrics._positive_rates(
            predictions, sensitive_attribute, privileged_value
        )
        if privileged_rate > 0:
            return unprivileged_rate / privileged_rate
        # Reporting 1.0 here would hide a real disparity when only the
        # unprivileged group receives positive predictions.
        return float("inf") if unprivileged_rate > 0 else 1.0
最佳实践
- 早期检测:在模型开发早期检测偏见
- 多维度评估:从多个角度评估偏见
- 持续监控:在生产环境中持续监控偏见
- 透明报告:向用户透明地报告偏见评估结果
总结
识别和消除LLM偏见是构建公平AI系统的关键。通过系统化的检测和消除方法,可以减少AI系统对不同群体的不公平对待。