← 返回首页
🧠

风险评分:评估模型输出风险

📂 llm ⏱ 4 min 697 words

---
title: "风险评分:评估模型输出风险"
description: "为LLM输出生成风险评分,识别潜在的有害或不当内容"
tags: ["风险评分", "内容安全", "风险评估", "LLM", "安全过滤"]
category: "llm"
icon: "⚠️"
---

风险评分:评估模型输出风险

风险评分概述

风险评分是评估LLM输出潜在风险的技术,识别有害、偏见或不当内容,确保输出安全可靠。

风险类别

1. 内容风险评分

import numpy as np
from typing import Dict, List
from dataclasses import dataclass
from enum import Enum

class RiskCategory(Enum):
    """Closed set of risk categories a piece of text can be scored against.

    Each member's value is a short string label; the content scorer in this
    file uses that label as the key of its per-category score mapping.
    """

    # Member order matters: iteration over the enum follows declaration order.
    HARMFUL = "harmful"
    BIASED = "biased"
    MISLEADING = "misleading"
    INAPPROPRIATE = "inappropriate"
    CONFIDENTIAL = "confidential"
    FACTUAL_ERROR = "factual_error"

@dataclass
class RiskAssessment:
    """Result of a risk assessment for a single piece of text."""
    # The text that was assessed.
    text: str
    # Aggregate risk score for the whole text.
    overall_risk: float
    # Per-category scores, keyed by the category's string label.
    category_scores: Dict[str, float]
    # Human-readable descriptions of what contributed to the scores.
    risk_factors: List[str]
    # Suggested follow-up actions derived from the scores.
    recommendations: List[str]

class ContentRiskScorer:
    """Keyword-based content risk scorer.

    Every risk category owns a list of trigger substrings. Each trigger found
    in the text adds 0.3 to that category's score (capped at 1.0), and the
    overall risk is the mean of all category scores.
    """

    def __init__(self):
        self.risk_patterns = self._initialize_patterns()

    def _initialize_patterns(self) -> Dict[RiskCategory, List[str]]:
        """Return the trigger substrings for each risk category."""
        table = {}
        table[RiskCategory.HARMFUL] = ["暴力", "伤害", "自杀", "自残"]
        table[RiskCategory.BIASED] = ["性别歧视", "种族歧视", "地域歧视"]
        table[RiskCategory.MISLEADING] = ["虚假信息", "谣言", "误导"]
        table[RiskCategory.INAPPROPRIATE] = ["色情", "赌博", "非法"]
        table[RiskCategory.CONFIDENTIAL] = ["密码", "身份证", "银行卡"]
        table[RiskCategory.FACTUAL_ERROR] = ["据研究表明", "数据显示"]
        return table

    def score(self, text: str) -> RiskAssessment:
        """Score ``text`` against every category and return the assessment.

        Args:
            text: The text to inspect for trigger substrings.

        Returns:
            A ``RiskAssessment`` holding the per-category scores, their mean
            as the overall risk, one risk-factor message per matched trigger,
            and the recommendations derived from the scores.
        """
        per_category = {}
        factors = []

        for category, triggers in self.risk_patterns.items():
            hits = [trigger for trigger in triggers if trigger in text]
            factors.extend(
                f"包含{category.value}内容: {hit}" for hit in hits
            )
            # sum() starts at integer 0 and adds 0.3 per hit, left to right.
            raw_score = sum(0.3 for _ in hits)
            per_category[category.value] = min(raw_score, 1.0)

        # Overall risk is the plain average over all categories.
        mean_risk = np.mean(list(per_category.values()))

        return RiskAssessment(
            text=text,
            overall_risk=mean_risk,
            category_scores=per_category,
            risk_factors=factors,
            recommendations=self._generate_recommendations(mean_risk, per_category),
        )

    def _generate_recommendations(self, overall_risk: float, category_scores: Dict) -> List[str]:
        """Turn the overall and per-category scores into advice strings.

        Args:
            overall_risk: The aggregate risk score.
            category_scores: Mapping from category label to its score.

        Returns:
            Advice strings: at most one for the overall level, followed by
            one per category ("harmful", "biased") whose score exceeds 0.5.
        """
        advice = []

        # Overall verdict: only the strongest matching level is reported.
        if overall_risk > 0.7:
            advice.append("建议完全拒绝该输出")
        elif overall_risk > 0.4:
            advice.append("建议人工审核后使用")

        # Category-specific advice, checked in a fixed order.
        category_advice = (
            ("harmful", "检测到潜在有害内容,建议屏蔽"),
            ("biased", "检测到潜在偏见内容,建议修改"),
        )
        for label, message in category_advice:
            if category_scores.get(label, 0) > 0.5:
                advice.append(message)

        return advice

2. 模型风险评分

class ModelRiskScorer:
    """Estimate how risky a model's sampled generations are for a prompt."""

    def __init__(self, model, tokenizer):
        # NOTE(review): presumably a Hugging Face-style causal LM and its
        # tokenizer (the code calls ``tokenizer(...)``, ``model.generate`` and
        # ``tokenizer.decode``) -- confirm against the caller.
        self.model = model
        self.tokenizer = tokenizer

    def score_generation_risk(self, prompt: str, n_samples: int = 5) -> Dict:
        """Sample several generations for ``prompt`` and score their risk.

        Args:
            prompt: The prompt to feed to the model.
            n_samples: Number of generations to sample; must be at least 1.

        Returns:
            A dict with the prompt, the sample count, a consistency score in
            [0, 1] (1.0 when all samples are identical, 0.0 when all differ),
            the mean / max / variance of the per-sample content risk scores,
            the generations themselves and their individual risk scores.

        Raises:
            ValueError: If ``n_samples`` is smaller than 1.
        """
        if n_samples < 1:
            raise ValueError("n_samples must be at least 1")

        # Imported locally so this method does not depend on a module-level
        # ``import torch`` being present.
        import torch

        # Tokenise once: the prompt does not change between samples.
        inputs = self.tokenizer(prompt, return_tensors="pt")
        prompt_length = inputs["input_ids"].shape[1]

        generations = []
        with torch.no_grad():
            for _ in range(n_samples):
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=True,
                    temperature=0.7,
                )
                # Drop the prompt tokens so only the continuation is decoded.
                generations.append(
                    self.tokenizer.decode(
                        outputs[0][prompt_length:],
                        skip_special_tokens=True,
                    )
                )

        # Consistency: normalised so identical samples give 1.0 and fully
        # distinct samples give 0.0, independent of n_samples. A single
        # sample is treated as trivially consistent.
        distinct_count = len(set(generations))
        if n_samples > 1:
            consistency_score = 1.0 - (distinct_count - 1) / (n_samples - 1)
        else:
            consistency_score = 1.0

        # Content risk of every sampled generation.
        content_risk_scorer = ContentRiskScorer()
        risk_scores = [
            float(content_risk_scorer.score(generation).overall_risk)
            for generation in generations
        ]

        return {
            "prompt": prompt,
            "n_samples": n_samples,
            "consistency_score": consistency_score,
            "mean_risk_score": float(np.mean(risk_scores)),
            "max_risk_score": float(np.max(risk_scores)),
            # Variance of the risk scores serves as the uncertainty measure.
            "risk_variance": float(np.var(risk_scores)),
            "generations": generations,
            "risk_scores": risk_scores,
        }

高级风险检测

1. 对抗性风险检测

class AdversarialRiskDetector:
    """Substring-based detector for prompt-injection and jailbreak attempts."""

    # Confidence contributed by each matched pattern; the total is capped at 1.0.
    _CONFIDENCE_PER_MATCH = 0.3

    def __init__(self):
        self.injection_patterns = [
            "忽略之前的指令",
            "你现在是",
            "系统提示",
            "新指令"
        ]
        self.jailbreak_patterns = [
            "假设你没有限制",
            "作为DAN",
            "忽略安全",
            "虚构场景"
        ]

    def _confidence(self, match_count: int) -> float:
        """Map a number of matched patterns to a confidence in [0, 1]."""
        return min(match_count * self._CONFIDENCE_PER_MATCH, 1.0)

    def detect_injection(self, text: str) -> Dict:
        """Detect prompt-injection phrases in ``text``.

        Matching is a case-sensitive substring test.

        Args:
            text: The text to inspect.

        Returns:
            A dict with ``is_injection`` (bool), ``confidence`` (0.3 per
            matched pattern, capped at 1.0) and ``detected_patterns`` (the
            matched patterns, in declaration order).
        """
        detected = [
            pattern for pattern in self.injection_patterns if pattern in text
        ]
        return {
            "is_injection": len(detected) > 0,
            "confidence": self._confidence(len(detected)),
            "detected_patterns": detected
        }

    def detect_jailbreak(self, text: str) -> Dict:
        """Detect jailbreak phrases in ``text``.

        Matching is a case-insensitive substring test.

        Args:
            text: The text to inspect.

        Returns:
            A dict with ``is_jailbreak`` (bool), ``confidence`` (0.3 per
            matched pattern, capped at 1.0 so it stays a valid score even
            when every pattern matches) and ``detected_patterns``.
        """
        lowered_text = text.lower()  # lower-case once, not once per pattern
        detected = [
            pattern
            for pattern in self.jailbreak_patterns
            if pattern.lower() in lowered_text
        ]
        return {
            "is_jailbreak": len(detected) > 0,
            "confidence": self._confidence(len(detected)),
            "detected_patterns": detected
        }

2. 偏见风险检测

class BiasRiskDetector:
    """Flag text that mentions terms from demographic term lists."""

    def __init__(self):
        self.bias_terms = {
            "gender": ["他", "她", "男人", "女人", "男性", "女性"],
            "race": ["白人", "黑人", "亚洲人"],
            "age": ["年轻人", "老年人", "老人"]
        }

    def detect_bias(self, text: str) -> Dict:
        """Report which bias term groups appear in ``text``.

        Args:
            text: The text to inspect.

        Returns:
            A dict with ``has_bias`` (whether any group matched),
            ``bias_score`` (0.2 per matched group, capped at 1.0),
            ``detected_biases`` (matched terms per group) and ``severity``
            ("high", "medium" or "low", derived from the uncapped score).
        """
        # Collect the matching terms of every group, then keep only the
        # groups that matched at least one term.
        matches_by_group = {
            group: [term for term in terms if term in text]
            for group, terms in self.bias_terms.items()
        }
        flagged = {
            group: matches
            for group, matches in matches_by_group.items()
            if matches
        }

        # The score depends on how many groups matched, not how many terms.
        raw_score = len(flagged) * 0.2

        if raw_score > 0.5:
            level = "high"
        elif raw_score > 0.2:
            level = "medium"
        else:
            level = "low"

        return {
            "has_bias": len(flagged) > 0,
            "bias_score": min(raw_score, 1.0),
            "detected_biases": flagged,
            "severity": level
        }

综合风险评估

class ComprehensiveRiskAssessor:
    """Combine content, adversarial, bias and optional model risk checks."""

    def __init__(self, model=None, tokenizer=None):
        """Create the individual scorers.

        Args:
            model: Optional generation model; when omitted, model-level risk
                scoring is skipped.
            tokenizer: Tokenizer paired with ``model``.
        """
        self.content_scorer = ContentRiskScorer()
        # Compare against None explicitly: a model object's truthiness is
        # not a reliable "was it provided" signal.
        self.model_scorer = (
            ModelRiskScorer(model, tokenizer) if model is not None else None
        )
        self.adversarial_detector = AdversarialRiskDetector()
        self.bias_detector = BiasRiskDetector()

    def assess(self, text: str, prompt: str = None) -> Dict:
        """Run every available risk check on ``text``.

        Args:
            text: The model output to assess.
            prompt: Optional prompt; when given (and a model scorer exists)
                the model's generation risk for that prompt is also reported.

        Returns:
            A dict with ``content_risk``, ``adversarial_risk``, ``bias_risk``,
            optionally ``model_risk``, plus ``overall_risk`` and
            ``recommendations``.
        """
        results = {}

        # Content risk
        content_risk = self.content_scorer.score(text)
        results["content_risk"] = {
            "overall": content_risk.overall_risk,
            "categories": content_risk.category_scores,
            "factors": content_risk.risk_factors
        }

        # Adversarial risk
        results["adversarial_risk"] = {
            "injection": self.adversarial_detector.detect_injection(text),
            "jailbreak": self.adversarial_detector.detect_jailbreak(text)
        }

        # Bias risk
        results["bias_risk"] = self.bias_detector.detect_bias(text)

        # Model risk (only when a prompt and a model scorer are available).
        # It is reported but not folded into the overall score.
        if prompt and self.model_scorer is not None:
            model_risk = self.model_scorer.score_generation_risk(prompt)
            results["model_risk"] = {
                "consistency": model_risk["consistency_score"],
                "mean_risk": model_risk["mean_risk_score"]
            }

        # The overall score must exist before recommendations are generated.
        results["overall_risk"] = self._compute_overall_risk(results)
        results["recommendations"] = self._generate_recommendations(results)

        return results

    def _compute_overall_risk(self, results: Dict) -> float:
        """Average the content, adversarial and bias scores found in ``results``.

        The adversarial component is the larger of the injection and
        jailbreak confidences, clamped to 1.0 so an out-of-range confidence
        cannot push the overall score above the [0, 1] scale.

        Returns:
            The mean of the available component scores, or 0 when none of
            the components is present.
        """
        scores = []

        if "content_risk" in results:
            scores.append(results["content_risk"]["overall"])

        if "adversarial_risk" in results:
            adversarial = results["adversarial_risk"]
            strongest = max(
                adversarial["injection"]["confidence"],
                adversarial["jailbreak"]["confidence"]
            )
            scores.append(min(strongest, 1.0))

        if "bias_risk" in results:
            scores.append(results["bias_risk"]["bias_score"])

        # float() turns the NumPy scalar into a plain Python float.
        return float(np.mean(scores)) if scores else 0

    def _generate_recommendations(self, results: Dict) -> List[str]:
        """Map ``results["overall_risk"]`` to a single recommendation string."""
        overall = results["overall_risk"]

        if overall > 0.7:
            return ["高风险:建议完全拒绝该输出"]
        if overall > 0.4:
            return ["中风险:建议人工审核"]
        if overall > 0.2:
            return ["低风险:可使用但需注意"]
        return ["安全:可直接使用"]

可视化

import matplotlib.pyplot as plt

def plot_risk_assessment(risk_results: Dict):
    """Draw a two-panel figure summarising a risk assessment.

    The left panel shows one bar per content risk category (only when
    ``risk_results`` contains ``"content_risk"``); the right panel shows the
    overall risk as a single horizontal bar, coloured by severity.

    Args:
        risk_results: Assessment dict; must contain ``"overall_risk"`` and
            may contain ``"content_risk"`` with a ``"categories"`` mapping.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    category_ax, overall_ax = axes

    # Left panel: per-category content risk.
    if "content_risk" in risk_results:
        category_scores = risk_results["content_risk"]["categories"]
        category_ax.bar(
            list(category_scores.keys()),
            list(category_scores.values()),
        )
        category_ax.set_ylabel("Risk Score")
        category_ax.set_title("Content Risk Categories")
        category_ax.tick_params(axis="x", rotation=45)

    # Right panel: overall risk, coloured by severity band.
    overall = risk_results["overall_risk"]
    if overall > 0.7:
        bar_color = "red"
    elif overall > 0.4:
        bar_color = "orange"
    else:
        bar_color = "green"
    overall_ax.barh(["Overall Risk"], [overall], color=bar_color)
    overall_ax.set_xlim(0, 1)
    overall_ax.set_title("Overall Risk Score")

    plt.tight_layout()
    plt.show()

最佳实践

  1. 多维度评估:从多个角度评估风险
  2. 实时检测:在输出生成时进行实时风险检测
  3. 持续更新:根据新的风险模式更新检测规则
  4. 用户透明:向用户展示风险评估结果

总结

风险评分是确保LLM输出安全可靠的重要技术。通过多维度的风险评估,可以有效识别和过滤有害内容。