← 返回首页
🧠

LLM偏见:识别和消除AI偏见

📂 llm ⏱ 5 min 914 words

---
title: "LLM偏见:识别和消除AI偏见"
description: "识别LLM中的各种偏见类型,并采用方法消除偏见"
tags: ["AI偏见", "偏见识别", "偏见消除", "LLM", "公平性"]
category: "llm"
icon: "🔍"
---

LLM偏见:识别和消除AI偏见

偏见类型

1. 数据偏见

from collections import Counter
from typing import Dict, List, Tuple

import numpy as np
import torch

class DataBiasDetector:
    """Detect dataset-level bias with respect to sensitive attributes.

    Every record is a dict. A sensitive attribute is a key of that dict, and
    the class label (where needed) is read from the ``"label"`` key. A value
    counts as missing only when it is ``None``; falsy-but-valid values such
    as ``0``, ``False`` or ``""`` are kept, so binary labels and integer-coded
    groups are analysed correctly.
    """

    def __init__(self, balance_tolerance: float = 0.1,
                 label_gap_threshold: float = 0.2):
        """Configure the detector.

        Args:
            balance_tolerance: A group may deviate from the uniform share by
                strictly less than this amount and still count as balanced.
            label_gap_threshold: A label whose frequency differs between two
                groups by strictly more than this amount signals label bias.
        """
        self.balance_tolerance = balance_tolerance
        self.label_gap_threshold = label_gap_threshold
        # Check name -> bound method; every check takes (data, attribute)
        # and returns a result dict.
        self.detection_methods = {
            "representation": self._check_representation,
            "label": self._check_label_bias,
            "selection": self._check_selection_bias,
        }

    def detect_all_biases(self, data: List[Dict], sensitive_attributes: List[str]) -> Dict:
        """Run every registered check for every sensitive attribute.

        Returns:
            ``{attribute: {check_name: check_result}}``.
        """
        return {
            attr: {
                name: check(data, attr)
                for name, check in self.detection_methods.items()
            }
            for attr in sensitive_attributes
        }

    def _check_representation(self, data: List[Dict], attribute: str) -> Dict:
        """Measure how evenly the values of ``attribute`` are represented.

        Returns:
            A dict with the share of each value (``"representation"``), the
            largest absolute deviation from the uniform share
            (``"max_deviation"``) and whether that deviation is below the
            configured tolerance (``"is_balanced"``). With no usable values
            the result is an empty, balanced report.
        """
        observed = (item.get(attribute) for item in data)
        counts = Counter(value for value in observed if value is not None)
        total = sum(counts.values())

        shares = {value: count / total for value, count in counts.items()}

        # Under perfect balance every distinct value holds the same share.
        uniform_share = 1.0 / len(counts) if counts else 0
        max_deviation = max(
            (abs(share - uniform_share) for share in shares.values()),
            default=0,
        )

        return {
            "representation": shares,
            "is_balanced": max_deviation < self.balance_tolerance,
            "max_deviation": max_deviation,
        }

    def _check_label_bias(self, data: List[Dict], attribute: str) -> Dict:
        """Compare the label distribution across the groups of ``attribute``.

        Records lacking either the attribute or the label (value ``None``)
        are skipped. A label of ``0`` is a valid label and is counted.

        Returns:
            A dict with the per-group label distribution
            (``"group_distributions"``) and a ``"has_bias"`` flag.
        """
        # Group the labels by attribute value.
        labels_by_group = {}
        for item in data:
            group = item.get(attribute)
            label = item.get("label")
            if group is None or label is None:
                continue
            labels_by_group.setdefault(group, []).append(label)

        # Turn each group's labels into relative frequencies.
        group_label_distributions = {}
        for group, labels in labels_by_group.items():
            size = len(labels)
            group_label_distributions[group] = {
                label: count / size for label, count in Counter(labels).items()
            }

        return {
            "group_distributions": group_label_distributions,
            "has_bias": self._compare_distributions(group_label_distributions),
        }

    def _check_selection_bias(self, data: List[Dict], attribute: str) -> Dict:
        """Return a fixed reminder that selection bias needs manual review.

        Simplified implementation: neither ``data`` nor ``attribute`` is
        inspected; the result is always the same static report.
        """
        return {
            "methodology_review_needed": True,
            "potential_biases": ["sampling_bias", "survivorship_bias"],
        }

    def _compare_distributions(self, distributions: Dict) -> bool:
        """Report whether any label's frequency differs too much across groups.

        Args:
            distributions: ``{group: {label: probability}}``.

        Returns:
            ``True`` when, for at least one label, the gap between its highest
            and lowest probability over all groups exceeds the configured
            threshold. Fewer than two groups can never be biased against each
            other, so that case returns ``False``.
        """
        if len(distributions) < 2:
            return False

        all_labels = set()
        for dist in distributions.values():
            all_labels.update(dist)

        for label in all_labels:
            # A label absent from a group has probability 0 there.
            probs = [dist.get(label, 0) for dist in distributions.values()]
            if max(probs) - min(probs) > self.label_gap_threshold:
                return True
        return False

2. 模型偏见

class ModelBiasDetector:
    """Probe a language model for bias in its embeddings and generations.

    The ``model`` and ``tokenizer`` objects are used as follows: the tokenizer
    is called with ``return_tensors="pt"`` and provides ``decode``; the model
    is called with ``output_hidden_states=True`` and provides ``generate``.
    NOTE(review): this matches the Hugging Face transformers interface --
    confirm against the objects actually passed in.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def detect_embedding_bias(self, word_pairs: List[Tuple[str, str]]) -> Dict:
        """Score each word pair by the cosine similarity of its embeddings.

        Args:
            word_pairs: Pairs ``(word1, word2)`` to compare.

        Returns:
            ``{"word1-word2": {"bias_score": float, "is_biased": bool}}``,
            where ``is_biased`` is set when the absolute score exceeds 0.1.
        """
        biases = {}

        for word1, word2 in word_pairs:
            embedding1 = self._get_word_embedding(word1)
            embedding2 = self._get_word_embedding(word2)

            # Cosine similarity between the two embeddings of the pair.
            bias_score = self._compute_bias_score(embedding1, embedding2)

            biases[f"{word1}-{word2}"] = {
                "bias_score": bias_score,
                "is_biased": abs(bias_score) > 0.1,
            }

        return biases

    def _get_word_embedding(self, word: str):
        """Return an embedding for ``word`` from the model's last hidden layer.

        The last entry of ``hidden_states`` is averaged over dimension 1 and
        squeezed.
        """
        inputs = self.tokenizer(word, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
            # assumes hidden_states[-1] is (batch, seq, dim), so this is a
            # mean over tokens -- TODO confirm
            embedding = outputs.hidden_states[-1].mean(dim=1).squeeze()
        return embedding

    def _compute_bias_score(self, embedding1, embedding2) -> float:
        """Return the cosine similarity of two 1-D embeddings as a float."""
        similarity = torch.nn.functional.cosine_similarity(
            embedding1.unsqueeze(0), embedding2.unsqueeze(0)
        )
        return similarity.item()

    def detect_generation_bias(self, prompts: List[str], sensitive_terms: List[str]) -> Dict:
        """Generate text for each prompt and flag sensitive terms in it.

        Args:
            prompts: Prompts to feed to ``model.generate`` (up to 50 new
                tokens each).
            sensitive_terms: Substrings searched for (case-sensitively) in
                the decoded output.

        Returns:
            A dict with ``"has_bias"``, ``"bias_count"`` and ``"biases"``;
            the last is one entry per (prompt, matched term) carrying the
            prompt, the decoded text, the term and a context snippet.
        """
        generation_biases = []

        for prompt in prompts:
            inputs = self.tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=50)
                # NOTE(review): for decoder-only models the decoded sequence
                # presumably still contains the prompt, so a term present in
                # the prompt itself would be flagged -- confirm and strip the
                # prompt tokens if that is not intended.
                generation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            for term in sensitive_terms:
                if term in generation:
                    generation_biases.append({
                        "prompt": prompt,
                        "generation": generation,
                        "sensitive_term": term,
                        "context": self._get_context(generation, term),
                    })

        return {
            "has_bias": len(generation_biases) > 0,
            "bias_count": len(generation_biases),
            "biases": generation_biases,
        }

    def _get_context(self, text: str, term: str, window: int = 20) -> str:
        """Return ``term`` with up to ``window`` characters on either side.

        Only the first occurrence of ``term`` is used. When ``term`` does not
        occur in ``text`` an empty string is returned, rather than an
        unrelated slice taken from the start of the text.
        """
        pos = text.find(term)
        if pos < 0:
            return ""
        start = max(0, pos - window)
        end = min(len(text), pos + len(term) + window)
        return text[start:end]

偏见消除方法

1. 训练数据去偏见

class TrainingDataDebiasing:
    """Pre-processing techniques that rebalance training data across groups.

    A record belongs to a group when its value for the sensitive attribute is
    not ``None``; falsy values such as ``0`` or ``False`` are valid groups.
    """

    def __init__(self):
        """Create a stateless debiasing helper."""

    def reweight_samples(self, data: List[Dict], sensitive_attribute: str) -> List[Dict]:
        """Attach a ``"sample_weight"`` that equalises total weight per group.

        Each group ``g`` gets weight ``n / (k * count(g))``, where ``n`` is
        the number of records that have a group and ``k`` the number of
        groups, so the grouped records keep an average weight of 1. Records
        without a group get weight 1.0.

        Args:
            data: Input records; they are not modified.
            sensitive_attribute: Key holding the group of each record.

        Returns:
            Shallow copies of the records, each with ``"sample_weight"`` set.
        """
        group_counts = Counter(
            item[sensitive_attribute]
            for item in data
            if item.get(sensitive_attribute) is not None
        )
        # Normalise by grouped records only, so ungrouped records (weight 1.0)
        # do not inflate the weights of the grouped ones.
        grouped_total = sum(group_counts.values())
        n_groups = len(group_counts)
        weights = {
            group: grouped_total / (n_groups * count)
            for group, count in group_counts.items()
        }

        weighted_data = []
        for item in data:
            weighted_item = item.copy()
            weighted_item["sample_weight"] = weights.get(
                item.get(sensitive_attribute), 1.0
            )
            weighted_data.append(weighted_item)
        return weighted_data

    def augment_minority_groups(self, data: List[Dict], sensitive_attribute: str,
                               target_ratio: float = 0.5) -> List[Dict]:
        """Oversample small groups up to a fraction of the largest group.

        Every group smaller than ``int(largest_group_size * target_ratio)``
        is topped up with randomly chosen duplicates of its own records
        (sampling with replacement via ``np.random``).

        Args:
            data: Input records; they are not modified.
            sensitive_attribute: Key holding the group of each record.
            target_ratio: Minimum group size as a fraction of the largest
                group.

        Returns:
            A new list with the original records followed by the duplicates.
            Duplicates are shallow copies, so editing one row later does not
            silently edit its source row. With no grouped records the result
            is just a copy of ``data``.
        """
        members_by_group = {}
        for item in data:
            group = item.get(sensitive_attribute)
            if group is not None:
                members_by_group.setdefault(group, []).append(item)

        augmented_data = list(data)
        if not members_by_group:
            # Nothing to balance; also avoids max() on an empty sequence.
            return augmented_data

        largest = max(len(members) for members in members_by_group.values())
        target_size = int(largest * target_ratio)

        for members in members_by_group.values():
            deficit = target_size - len(members)
            if deficit <= 0:
                continue
            # Simple duplication-based augmentation.
            picks = np.random.randint(len(members), size=deficit)
            augmented_data.extend(members[int(i)].copy() for i in picks)

        return augmented_data

2. 模型去偏见

class ModelDebiasing:
    """Placeholder collection of in-training debiasing strategies.

    Only the method signatures are defined; none of the strategies is
    implemented, so every method is a no-op that returns ``None``.
    """

    def __init__(self, model):
        # The model is stored but not yet used by any strategy.
        self.model = model

    def adversarial_debiasing(self, train_data: List[Dict], sensitive_attribute: str,
                             epochs: int = 10, lambda_fairness: float = 0.5):
        """Adversarial debiasing (simplified stub, performs no training).

        The intended per-epoch steps are: train the main-task model, train an
        adversary that predicts the sensitive attribute, then optimise both
        jointly. Currently the loop body is empty.
        """
        for _ in range(epochs):
            continue

    def prejudice_remover(self, train_data: List[Dict], sensitive_attribute: str):
        """Prejudice remover (simplified stub, does nothing)."""
        return None

    def learning_fair_representations(self, train_data: List[Dict], sensitive_attribute: str):
        """Learning fair representations (simplified stub, does nothing)."""
        return None

3. 后处理去偏见

class PostprocessingDebiasing:
    """Debiasing techniques applied to predictions after the model has run."""

    def __init__(self):
        """Create a stateless post-processing helper."""

    def calibrate_odds(self, predictions: List[int], true_labels: List[int],
                      sensitive_attribute: List[str]) -> List[int]:
        """Calibrate odds across groups (simplified stub).

        No calibration is performed: ``predictions`` is returned unchanged
        and the other arguments are ignored.
        """
        return predictions

    def reject_option(self, predictions: List[int], confidence_scores: List[float],
                     threshold: float = 0.3) -> List[int]:
        """Replace low-confidence predictions with a review marker.

        Args:
            predictions: Model predictions.
            confidence_scores: Confidence for each prediction, paired by
                position.
            threshold: Predictions whose confidence is strictly below this
                value are rejected.

        Returns:
            A new list in which every rejected prediction is ``-1`` (meaning
            "needs manual review") and all others are kept as they were.
        """
        return [
            -1 if confidence < threshold else prediction
            for prediction, confidence in zip(predictions, confidence_scores)
        ]

公平性指标

class FairnessMetrics:
    """Group-fairness metrics for binary predictions.

    All metrics compare the *privileged* group (records whose sensitive
    attribute equals ``privileged_value``) with the *unprivileged* group
    (every other record). Predictions and labels are expected to be 0/1, so
    that a mean over predictions is a positive rate.
    """

    @staticmethod
    def _mean_or_zero(values) -> float:
        """Return the mean of ``values`` as a float, or 0.0 when empty."""
        return float(np.mean(values)) if values else 0.0

    @staticmethod
    def _group_positive_rates(predictions, sensitive_attribute, privileged_value):
        """Return ``(privileged_rate, unprivileged_rate)`` of positive predictions."""
        privileged, unprivileged = [], []
        for pred, attr in zip(predictions, sensitive_attribute):
            bucket = privileged if attr == privileged_value else unprivileged
            bucket.append(pred)
        return (
            FairnessMetrics._mean_or_zero(privileged),
            FairnessMetrics._mean_or_zero(unprivileged),
        )

    @staticmethod
    def demographic_parity_difference(predictions: List[int],
                                     sensitive_attribute: List[str],
                                     privileged_value: str) -> float:
        """Return the positive-rate gap: privileged minus unprivileged.

        0 means parity; a positive value means the privileged group receives
        positive predictions more often. An empty group has rate 0.
        """
        privileged_rate, unprivileged_rate = FairnessMetrics._group_positive_rates(
            predictions, sensitive_attribute, privileged_value
        )
        return privileged_rate - unprivileged_rate

    @staticmethod
    def equal_opportunity_difference(predictions: List[int], true_labels: List[int],
                                    sensitive_attribute: List[str],
                                    privileged_value: str) -> float:
        """Return the true-positive-rate gap: privileged minus unprivileged.

        The unprivileged group is every record whose attribute differs from
        ``privileged_value``; the privileged records are excluded from it.
        """
        privileged_tpr = FairnessMetrics._true_positive_rate(
            predictions, true_labels, sensitive_attribute, privileged_value
        )
        unprivileged_tpr = FairnessMetrics._true_positive_rate(
            predictions, true_labels, sensitive_attribute, privileged_value,
            complement=True,
        )
        return privileged_tpr - unprivileged_tpr

    @staticmethod
    def _true_positive_rate(predictions, true_labels, sensitive_attribute, value,
                            complement=False):
        """Return the true positive rate within one group.

        Args:
            value: Attribute value identifying the group.
            complement: When true, use every record whose attribute is *not*
                ``value`` instead.

        Returns:
            Mean prediction over the group's records with true label 1, or
            0.0 when the group has no such record.
        """
        positives = [
            pred
            for pred, attr, label in zip(predictions, sensitive_attribute, true_labels)
            if label == 1 and (attr == value) != complement
        ]
        return FairnessMetrics._mean_or_zero(positives)

    @staticmethod
    def disparate_impact_ratio(predictions: List[int], sensitive_attribute: List[str],
                              privileged_value: str) -> float:
        """Return unprivileged positive rate divided by privileged positive rate.

        1.0 means parity. When the privileged rate is 0 the ratio is
        undefined: 1.0 is returned if the unprivileged rate is 0 as well
        (no group receives positives), otherwise ``inf``, so that a real
        disparity is never reported as parity.
        """
        privileged_rate, unprivileged_rate = FairnessMetrics._group_positive_rates(
            predictions, sensitive_attribute, privileged_value
        )
        if privileged_rate > 0:
            return unprivileged_rate / privileged_rate
        return float("inf") if unprivileged_rate > 0 else 1.0

最佳实践

  1. 早期检测:在模型开发早期检测偏见
  2. 多维度评估:从多个角度评估偏见
  3. 持续监控:在生产环境中持续监控偏见
  4. 透明报告:向用户透明地报告偏见评估结果

总结

识别和消除LLM偏见是构建公平AI系统的关键。通过系统化的检测和消除方法,可以减少AI系统对不同群体的不公平对待。