← 返回首页
🧠

LLM公平性:消除模型偏见

📂 llm ⏱ 4 min 744 words

---
title: "LLM公平性:消除模型偏见"
description: "评估和消除LLM中的偏见,确保AI系统公平对待所有群体"
tags: ["公平性", "偏见消除", "AI公平", "LLM", "无偏见"]
category: "llm"
icon: "⚖️"
---

LLM公平性:消除模型偏见

公平性概述

AI公平性旨在确保LLM对所有群体一视同仁,不因种族、性别、年龄等敏感属性而产生歧视性输出。

偏见检测

1. 统计均等性检测

import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass

@dataclass
class BiasMetric:
    """Outcome of a single fairness measurement."""
    metric_name: str  # display name of the metric
    value: float  # measured value
    threshold: float  # cut-off reported alongside the value
    is_biased: bool  # True when the value falls outside the accepted range
    details: Dict  # supporting figures such as per-group rates and counts


class StatisticalParityDetector:
    """Group-fairness metrics computed from binary predictions.

    Both metrics compare the mean prediction (the positive rate when
    predictions are 0/1) of a privileged group with that of an
    unprivileged group.
    """

    # Accepted range of the disparate impact ratio (the "80% rule" and
    # its reciprocal).
    DI_LOWER_BOUND = 0.8
    DI_UPPER_BOUND = 1.25

    def __init__(self, default_threshold: float = 0.1):
        """Create a detector.

        Args:
            default_threshold: Largest absolute statistical parity
                difference that is still reported as unbiased.
        """
        self.default_threshold = default_threshold

    @staticmethod
    def _group_rate(
        predictions: List[int],
        sensitive_attribute: List[str],
        group_value: str
    ) -> Tuple[float, int]:
        """Return ``(mean prediction, sample count)`` for one group.

        An empty group yields a rate of 0.0; the count lets callers tell
        that case apart from a genuine zero rate.
        """
        selected = [p for p, a in zip(predictions, sensitive_attribute)
                    if a == group_value]
        # Convert to a plain float so numpy scalars do not leak into results.
        rate = float(np.mean(selected)) if selected else 0.0
        return rate, len(selected)

    def compute_statistical_parity_difference(
        self, 
        predictions: List[int], 
        sensitive_attribute: List[str],
        privileged_value: str,
        unprivileged_value: str
    ) -> BiasMetric:
        """Compute the statistical parity difference (SPD).

        SPD is the privileged group's rate minus the unprivileged
        group's rate.

        Args:
            predictions: Model predictions, aligned with
                ``sensitive_attribute``.
            sensitive_attribute: Group label of each sample.
            privileged_value: Label identifying the privileged group.
            unprivileged_value: Label identifying the unprivileged group.

        Returns:
            A ``BiasMetric`` flagged as biased when ``abs(SPD)`` exceeds
            ``self.default_threshold``. ``details`` holds both rates and
            both group sizes.
        """
        privileged_rate, privileged_count = self._group_rate(
            predictions, sensitive_attribute, privileged_value
        )
        unprivileged_rate, unprivileged_count = self._group_rate(
            predictions, sensitive_attribute, unprivileged_value
        )

        spd = privileged_rate - unprivileged_rate

        return BiasMetric(
            metric_name="Statistical Parity Difference",
            value=spd,
            threshold=self.default_threshold,
            is_biased=bool(abs(spd) > self.default_threshold),
            details={
                "privileged_rate": privileged_rate,
                "unprivileged_rate": unprivileged_rate,
                "privileged_count": privileged_count,
                "unprivileged_count": unprivileged_count
            }
        )

    def compute_disparate_impact(
        self,
        predictions: List[int],
        sensitive_attribute: List[str],
        privileged_value: str,
        unprivileged_value: str
    ) -> BiasMetric:
        """Compute the disparate impact (DI) ratio.

        DI is the unprivileged group's rate divided by the privileged
        group's rate.

        Args:
            predictions: Model predictions, aligned with
                ``sensitive_attribute``.
            sensitive_attribute: Group label of each sample.
            privileged_value: Label identifying the privileged group.
            unprivileged_value: Label identifying the unprivileged group.

        Returns:
            A ``BiasMetric`` flagged as biased when the ratio lies
            outside ``[DI_LOWER_BOUND, DI_UPPER_BOUND]``. When the
            privileged rate is zero the ratio is ``inf`` if the
            unprivileged rate is positive, and 1.0 if both rates are
            zero.
        """
        privileged_rate, _ = self._group_rate(
            predictions, sensitive_attribute, privileged_value
        )
        unprivileged_rate, _ = self._group_rate(
            predictions, sensitive_attribute, unprivileged_value
        )

        if privileged_rate > 0:
            di_ratio = unprivileged_rate / privileged_rate
        elif unprivileged_rate > 0:
            # Only the unprivileged group receives positives: the ratio is
            # unbounded and must not be reported as parity.
            di_ratio = float("inf")
        else:
            # Neither group receives positives, so the rates are equal.
            di_ratio = 1.0

        is_biased = (di_ratio < self.DI_LOWER_BOUND
                     or di_ratio > self.DI_UPPER_BOUND)

        return BiasMetric(
            metric_name="Disparate Impact Ratio",
            value=di_ratio,
            threshold=0.8,
            is_biased=is_biased,
            details={
                "privileged_rate": privileged_rate,
                "unprivileged_rate": unprivileged_rate,
                "ratio": di_ratio
            }
        )

2. 文本偏见检测

class TextBiasDetector:
    """Substring-based detector for biased or stereotypical wording."""

    def __init__(self):
        self.bias_terms = self._load_bias_terms()

    def _load_bias_terms(self) -> Dict[str, Dict[str, List[str]]]:
        """Return the built-in lexicon: category -> pattern type -> phrases."""
        return {
            "gender": {
                "stereotypes": ["护士通常是女性", "工程师通常是男性"],
                "biases": ["女人更情绪化", "男人更理性"]
            },
            "race": {
                "stereotypes": ["某个种族更聪明", "某个种族更暴力"],
                "biases": ["种族优越感", "种族歧视"]
            },
            "age": {
                "stereotypes": ["年轻人不懂事", "老年人跟不上时代"],
                "biases": ["年龄歧视", "代际偏见"]
            }
        }

    def detect_bias_in_text(self, text: str) -> Dict:
        """Find lexicon phrases that occur verbatim in ``text``.

        Args:
            text: The text to scan.

        Returns:
            A dict with ``has_bias``, ``bias_count``, ``biases`` (one entry
            per matched phrase, carrying its category, pattern type, the
            phrase and the index of its first occurrence) and ``severity``:
            "high" for more than three matches, "medium" for one to three,
            "low" for none.
        """
        detected_biases = []

        for category, patterns in self.bias_terms.items():
            for pattern_type, terms in patterns.items():
                for term in terms:
                    position = text.find(term)
                    if position != -1:
                        detected_biases.append({
                            "category": category,
                            "type": pattern_type,
                            "term": term,
                            "position": position
                        })

        if len(detected_biases) > 3:
            severity = "high"
        elif detected_biases:
            severity = "medium"
        else:
            severity = "low"

        return {
            "has_bias": len(detected_biases) > 0,
            "bias_count": len(detected_biases),
            "biases": detected_biases,
            "severity": severity
        }

    def detect_stereotype_in_generation(self, generations: List[str], 
                                       sensitive_terms: List[str]) -> Dict:
        """Score generated texts for stereotypical use of sensitive terms.

        A text earns one point for every sensitive term it contains whose
        surrounding context (around the first occurrence) includes a
        generalising indicator word.

        Args:
            generations: Generated texts to score.
            sensitive_terms: Terms whose context is inspected.

        Returns:
            A dict with the mean and maximum score, whether any text
            scored above zero, and the fraction of texts that did. An
            empty ``generations`` list yields zeros and ``False`` rather
            than raising.
        """
        if not generations:
            # max() and the frequency division are undefined on no input.
            return {
                "mean_stereotype_score": 0.0,
                "max_stereotype_score": 0,
                "has_stereotypes": False,
                "stereotype_frequency": 0.0
            }

        stereotype_scores = []
        for generation in generations:
            score = 0
            for term in sensitive_terms:
                if term in generation:
                    context = self._get_context(generation, term)
                    if self._is_stereotypical(context):
                        score += 1
            stereotype_scores.append(score)

        flagged = sum(1 for s in stereotype_scores if s > 0)

        return {
            "mean_stereotype_score": float(np.mean(stereotype_scores)),
            "max_stereotype_score": max(stereotype_scores),
            "has_stereotypes": flagged > 0,
            "stereotype_frequency": flagged / len(stereotype_scores)
        }

    def _get_context(self, text: str, term: str, window: int = 20) -> str:
        """Return ``term`` plus up to ``window`` characters on each side.

        Only the first occurrence of ``term`` is considered. Returns an
        empty string when ``term`` does not occur in ``text``.
        """
        pos = text.find(term)
        if pos == -1:
            # find() signals "absent" with -1; slicing from it would return
            # an unrelated prefix of the text.
            return ""
        start = max(0, pos - window)
        end = min(len(text), pos + len(term) + window)
        return text[start:end]

    def _is_stereotypical(self, context: str) -> bool:
        """Return True when ``context`` contains a generalising indicator word.

        This is a deliberately simple heuristic based on a fixed word list.
        """
        stereotype_indicators = ["总是", "通常", "应该", "必须"]
        return any(indicator in context for indicator in stereotype_indicators)

偏见缓解

1. 预处理方法

class PreprocessingDebiasing:
    """Debiasing steps applied to the data before a model is trained."""

    def __init__(self):
        pass

    def reweight_samples(self, data: List[Dict], sensitive_attribute: str) -> List[Dict]:
        """Attach a balancing weight to every sample.

        Each group's weight is ``total / (number_of_groups * group_size)``,
        so smaller groups receive larger weights.

        Args:
            data: Samples as dicts; each must contain ``sensitive_attribute``.
            sensitive_attribute: Key whose value identifies a sample's group.

        Returns:
            A new list of shallow copies of the samples, each with an
            added ``"weight"`` entry. The input dicts are left untouched.
        """
        # Count the members of each group.
        group_sizes = {}
        for record in data:
            key = record[sensitive_attribute]
            group_sizes[key] = group_sizes.get(key, 0) + 1

        n_samples = len(data)
        n_groups = len(group_sizes)
        weight_of = {
            key: n_samples / (n_groups * size)
            for key, size in group_sizes.items()
        }

        # Copy each record so the caller's data is not modified.
        result = []
        for record in data:
            clone = record.copy()
            clone["weight"] = weight_of[record[sensitive_attribute]]
            result.append(clone)
        return result

    def transform_label(self, data: List[Dict], sensitive_attribute: str) -> List[Dict]:
        """Placeholder for label transformation; returns ``data`` unchanged."""
        return data

2. 处理中方法

class InProcessingDebiasing:
    """Placeholder for debiasing techniques applied during training.

    Both methods are stubs: they perform no training and return ``None``.
    """

    def __init__(self, model, lambda_fairness: float = 0.5):
        self.lambda_fairness = lambda_fairness
        self.model = model

    def adversarial_debiasing(self, data: List[Dict], sensitive_attribute: str, 
                             epochs: int = 10):
        """Stub for adversarial debiasing; iterates ``epochs`` times and does nothing."""
        for _ in range(epochs):
            # Not implemented yet: train the main model, train the
            # adversary, then update the loss.
            continue

    def prejudice_remover(self, data: List[Dict], sensitive_attribute: str):
        """Stub for the prejudice remover; currently does nothing."""
        return None

3. 后处理方法

class PostprocessingDebiasing:
    """Debiasing steps applied to a model's predictions after inference."""

    def __init__(self):
        pass

    def equalized_odds_postprocessing(self, predictions: List[int], 
                                      true_labels: List[int],
                                      sensitive_attribute: List[str],
                                      privileged_value: str) -> List[int]:
        """Placeholder for equalized-odds post-processing.

        No adjustment is implemented yet: the method returns a copy of
        ``predictions`` and ignores the other arguments.
        """
        adjusted_predictions = predictions.copy()
        return adjusted_predictions

    def reject_option_classification(self, predictions: List[int],
                                    probabilities: List[float],
                                    threshold: float = 0.1) -> List[int]:
        """Mark low-confidence predictions for manual review.

        A prediction is low-confidence when its probability lies strictly
        within ``threshold`` of 0.5.

        Args:
            predictions: Predicted labels.
            probabilities: Predicted probability for each sample, aligned
                with ``predictions``.
            threshold: Half-width of the uncertainty band around 0.5. The
                default is 0.1; a value of 0.5 would cover every
                probability strictly between 0 and 1 and so send every
                sample to review.

        Returns:
            A new list in which low-confidence predictions are replaced by
            -1 (needs review) and all others are kept.
        """
        return [
            -1 if abs(prob - 0.5) < threshold else pred
            for pred, prob in zip(predictions, probabilities)
        ]

公平性评估

class FairnessEvaluator:
    """Combines the statistical fairness metrics into a single report."""

    def __init__(self):
        self.detector = StatisticalParityDetector()

    def comprehensive_evaluation(self, predictions: List[int], 
                                true_labels: List[int],
                                sensitive_attribute: List[str],
                                privileged_value: str,
                                unprivileged_value: str) -> Dict:
        """Evaluate predictions with statistical parity and disparate impact.

        Args:
            predictions: Model predictions, aligned with
                ``sensitive_attribute``.
            true_labels: Ground-truth labels. Accepted for interface
                compatibility but not used: no accuracy-based metric is
                implemented yet.
            sensitive_attribute: Group label of each sample.
            privileged_value: Label identifying the privileged group.
            unprivileged_value: Label identifying the unprivileged group.

        Returns:
            A dict with ``overall_fair`` (True only when neither metric is
            flagged as biased), a ``statistical_parity`` section, a
            ``disparate_impact`` section, and ``recommendations``.
        """
        spd = self.detector.compute_statistical_parity_difference(
            predictions, sensitive_attribute, privileged_value, unprivileged_value
        )
        di = self.detector.compute_disparate_impact(
            predictions, sensitive_attribute, privileged_value, unprivileged_value
        )

        overall_fair = not spd.is_biased and not di.is_biased

        return {
            "overall_fair": overall_fair,
            "statistical_parity": {
                "value": spd.value,
                "is_biased": spd.is_biased,
                "details": spd.details
            },
            "disparate_impact": {
                "value": di.value,
                "is_biased": di.is_biased,
                "details": di.details
            },
            "recommendations": self._generate_recommendations(spd, di)
        }

    def _generate_recommendations(self, spd: BiasMetric, di: BiasMetric) -> List[str]:
        """Return one remediation hint for every metric flagged as biased."""
        recommendations = []

        if spd.is_biased:
            recommendations.append("统计均等性差异过大,建议使用重新加权或对抗性去偏见方法")

        if di.is_biased:
            recommendations.append("差异化影响不符合80%规则,建议检查训练数据分布")

        return recommendations

最佳实践

  1. 早期检测:在模型开发早期检测偏见
  2. 多维度评估:从多个角度评估公平性
  3. 持续监控:在生产环境中持续监控公平性指标
  4. 透明报告:向用户透明地报告公平性评估结果

总结

AI公平性是构建负责任LLM的关键要素。通过系统化的偏见检测和缓解方法,可以构建更公平、无偏见的AI系统。