风险评分:评估模型输出风险
---
title: "风险评分:评估模型输出风险"
description: "为LLM输出生成风险评分,识别潜在的有害或不当内容"
tags: ["风险评分", "内容安全", "风险评估", "LLM", "安全过滤"]
category: "llm"
icon: "⚠️"
---
风险评分:评估模型输出风险
风险评分概述
风险评分是评估LLM输出潜在风险的技术,识别有害、偏见或不当内容,确保输出安全可靠。
风险类别
1. 内容风险评分
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional

import numpy as np
class RiskCategory(Enum):
    """Closed set of risk categories a piece of text can be scored against.

    Each member's value is the string key used for that category in score
    dictionaries.
    """

    # Content that could cause harm.
    HARMFUL = "harmful"
    # Discriminatory or biased content.
    BIASED = "biased"
    # Misinformation or misleading statements.
    MISLEADING = "misleading"
    # Content unsuitable for the audience.
    INAPPROPRIATE = "inappropriate"
    # Leaked secrets or personal identifiers.
    CONFIDENTIAL = "confidential"
    # Claims that may be factually wrong.
    FACTUAL_ERROR = "factual_error"
@dataclass
class RiskAssessment:
    """Result of scoring one piece of text for risk."""

    # The text that was assessed.
    text: str
    # Aggregate risk score across all categories.
    overall_risk: float
    # Per-category score, keyed by the category's string value.
    category_scores: Dict[str, float]
    # Human-readable descriptions of what triggered the scores.
    risk_factors: List[str]
    # Suggested follow-up actions derived from the scores.
    recommendations: List[str]
class ContentRiskScorer:
    """Score text for content risk by keyword matching per risk category.

    Every pattern found in the text adds ``match_weight`` to its category's
    score, and each category score is capped at 1.0. The overall risk is the
    arithmetic mean of all category scores, so a text that triggers a single
    category has a low overall score even when that category is saturated.
    """

    def __init__(self, match_weight: float = 0.3):
        """Create a scorer.

        Args:
            match_weight: Score added to a category for each of its patterns
                found in the text. Must be positive.

        Raises:
            ValueError: If ``match_weight`` is not positive.
        """
        if match_weight <= 0:
            raise ValueError("match_weight must be positive")
        self.match_weight = match_weight
        self.risk_patterns = self._initialize_patterns()

    def _initialize_patterns(self) -> Dict[RiskCategory, List[str]]:
        """Return the keyword patterns that signal each risk category."""
        return {
            RiskCategory.HARMFUL: ["暴力", "伤害", "自杀", "自残"],
            RiskCategory.BIASED: ["性别歧视", "种族歧视", "地域歧视"],
            RiskCategory.MISLEADING: ["虚假信息", "谣言", "误导"],
            RiskCategory.INAPPROPRIATE: ["色情", "赌博", "非法"],
            RiskCategory.CONFIDENTIAL: ["密码", "身份证", "银行卡"],
            RiskCategory.FACTUAL_ERROR: ["据研究表明", "数据显示"],
        }

    def score(self, text: str) -> RiskAssessment:
        """Assess the risk of ``text``.

        Args:
            text: The text to scan for risk patterns (substring matching).

        Returns:
            A ``RiskAssessment`` holding the per-category scores, their mean
            as a builtin ``float``, one risk factor per matched pattern, and
            the derived recommendations.
        """
        category_scores: Dict[str, float] = {}
        risk_factors: List[str] = []

        for category, patterns in self.risk_patterns.items():
            matched = [pattern for pattern in patterns if pattern in text]
            risk_factors.extend(
                f"包含{category.value}内容: {pattern}" for pattern in matched
            )
            # Multiplying by a float keeps unmatched categories at 0.0 rather
            # than the int 0.
            category_scores[category.value] = min(
                len(matched) * self.match_weight, 1.0
            )

        # np.mean of an empty list is nan (with a warning); treat an empty
        # pattern table as "no risk". float() unwraps the numpy scalar so the
        # result prints and serializes like a plain number.
        if category_scores:
            overall_risk = float(np.mean(list(category_scores.values())))
        else:
            overall_risk = 0.0

        recommendations = self._generate_recommendations(overall_risk, category_scores)

        return RiskAssessment(
            text=text,
            overall_risk=overall_risk,
            category_scores=category_scores,
            risk_factors=risk_factors,
            recommendations=recommendations,
        )

    def _generate_recommendations(
        self, overall_risk: float, category_scores: Dict[str, float]
    ) -> List[str]:
        """Translate scores into recommendation messages.

        Args:
            overall_risk: Mean risk score across categories.
            category_scores: Per-category scores keyed by category value.

        Returns:
            Zero or more recommendation strings. The overall-risk message
            (if any) comes first, followed by category-specific ones.
        """
        recommendations = []

        if overall_risk > 0.7:
            recommendations.append("建议完全拒绝该输出")
        elif overall_risk > 0.4:
            recommendations.append("建议人工审核后使用")

        if category_scores.get("harmful", 0) > 0.5:
            recommendations.append("检测到潜在有害内容,建议屏蔽")

        if category_scores.get("biased", 0) > 0.5:
            recommendations.append("检测到潜在偏见内容,建议修改")

        return recommendations
2. 模型风险评分
class ModelRiskScorer:
    """Estimate the risk of a model's generations for a given prompt.

    The model and tokenizer are used through ``tokenizer(prompt,
    return_tensors="pt")``, ``model.generate(**inputs, ...)`` and
    ``tokenizer.decode(...)``.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer used for sampling."""
        self.model = model
        self.tokenizer = tokenizer

    def score_generation_risk(self, prompt: str, n_samples: int = 5) -> Dict:
        """Sample several generations for ``prompt`` and score their risk.

        Args:
            prompt: The prompt to sample continuations for.
            n_samples: Number of generations to draw. Must be at least 1.

        Returns:
            A dict with keys ``prompt``, ``n_samples``, ``consistency_score``
            (1.0 when all samples are identical, 0.0 when all are distinct),
            ``mean_risk_score``, ``max_risk_score``, ``risk_variance``,
            ``generations`` and ``risk_scores``. Numeric values are builtin
            floats.

        Raises:
            ValueError: If ``n_samples`` is less than 1.
        """
        if n_samples < 1:
            raise ValueError("n_samples must be at least 1")

        # Imported here so the keyword-based scorers in this module stay
        # usable without torch installed.
        import torch

        # The prompt is the same for every sample, so tokenize it once.
        inputs = self.tokenizer(prompt, return_tensors="pt")
        prompt_length = inputs["input_ids"].shape[1]

        generations = []
        for _ in range(n_samples):
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=True,
                    temperature=0.7,
                )
            # Drop the prompt tokens so only the newly generated text is kept.
            generations.append(
                self.tokenizer.decode(
                    outputs[0][prompt_length:], skip_special_tokens=True
                )
            )

        # Consistency: normalized so identical samples give 1.0 and fully
        # distinct samples give 0.0. A single sample is trivially consistent.
        distinct = len(set(generations))
        if n_samples == 1:
            consistency_score = 1.0
        else:
            consistency_score = 1.0 - (distinct - 1) / (n_samples - 1)

        # Content risk of each individual generation.
        content_risk_scorer = ContentRiskScorer()
        risk_scores = [
            float(content_risk_scorer.score(gen).overall_risk) for gen in generations
        ]

        return {
            "prompt": prompt,
            "n_samples": n_samples,
            "consistency_score": consistency_score,
            "mean_risk_score": float(np.mean(risk_scores)),
            "max_risk_score": float(np.max(risk_scores)),
            # Variance of the per-sample risk serves as the uncertainty measure.
            "risk_variance": float(np.var(risk_scores)),
            "generations": generations,
            "risk_scores": risk_scores,
        }
高级风险检测
1. 对抗性风险检测
class AdversarialRiskDetector:
    """Detect prompt-injection and jailbreak attempts by phrase matching."""

    # Confidence contributed by each matched phrase.
    _WEIGHT_PER_MATCH = 0.3

    def __init__(self):
        """Set up the phrase lists used by both detectors."""
        self.injection_patterns = [
            "忽略之前的指令",
            "你现在是",
            "系统提示",
            "新指令",
        ]
        self.jailbreak_patterns = [
            "假设你没有限制",
            "作为DAN",
            "忽略安全",
            "虚构场景",
        ]

    @staticmethod
    def _find_patterns(text: str, patterns: List[str]) -> List[str]:
        """Return the patterns that occur in ``text``, ignoring case."""
        lowered = text.lower()
        return [pattern for pattern in patterns if pattern.lower() in lowered]

    def _confidence(self, n_matches: int) -> float:
        """Return the confidence for ``n_matches`` hits, capped at 1.0."""
        return min(n_matches * self._WEIGHT_PER_MATCH, 1.0)

    def detect_injection(self, text: str) -> Dict:
        """Detect prompt-injection phrases in ``text``.

        Returns:
            A dict with ``is_injection`` (bool), ``confidence`` (0.0 to 1.0)
            and ``detected_patterns`` (the matched phrases).
        """
        detected = self._find_patterns(text, self.injection_patterns)
        return {
            "is_injection": len(detected) > 0,
            "confidence": self._confidence(len(detected)),
            "detected_patterns": detected,
        }

    def detect_jailbreak(self, text: str) -> Dict:
        """Detect jailbreak phrases in ``text`` (case-insensitive).

        Returns:
            A dict with ``is_jailbreak`` (bool), ``confidence`` (0.0 to 1.0)
            and ``detected_patterns`` (the matched phrases).
        """
        detected = self._find_patterns(text, self.jailbreak_patterns)
        return {
            "is_jailbreak": len(detected) > 0,
            # Capped like detect_injection so four matches cannot yield 1.2.
            "confidence": self._confidence(len(detected)),
            "detected_patterns": detected,
        }
2. 偏见风险检测
class BiasRiskDetector:
    """Flag text that mentions terms associated with common bias dimensions."""

    def __init__(self):
        """Set up the term lists for each bias dimension."""
        self.bias_terms = {
            "gender": ["他", "她", "男人", "女人", "男性", "女性"],
            "race": ["白人", "黑人", "亚洲人"],
            "age": ["年轻人", "老年人", "老人"],
        }

    def detect_bias(self, text: str) -> Dict:
        """Report which bias dimensions have terms present in ``text``.

        Detection is plain substring matching; the score grows by 0.2 for each
        dimension with at least one matching term.

        Returns:
            A dict with ``has_bias``, ``bias_score`` (capped at 1.0),
            ``detected_biases`` (dimension -> matched terms) and ``severity``
            (``"high"``, ``"medium"`` or ``"low"``).
        """
        hits = {}
        for dimension, vocabulary in self.bias_terms.items():
            present = []
            for word in vocabulary:
                if word in text:
                    present.append(word)
            if present:
                hits[dimension] = present

        # One fixed increment per triggered dimension.
        raw_score = len(hits) * 0.2

        if raw_score > 0.5:
            level = "high"
        elif raw_score > 0.2:
            level = "medium"
        else:
            level = "low"

        return {
            "has_bias": len(hits) > 0,
            "bias_score": min(raw_score, 1.0),
            "detected_biases": hits,
            "severity": level,
        }
综合风险评估
class ComprehensiveRiskAssessor:
    """Combine content, adversarial, bias and (optionally) model risk checks."""

    def __init__(self, model=None, tokenizer=None):
        """Create the individual detectors.

        Args:
            model: Optional generation model. Model-based scoring is enabled
                only when both ``model`` and ``tokenizer`` are provided.
            tokenizer: Optional tokenizer matching ``model``.
        """
        self.content_scorer = ContentRiskScorer()
        # Compare against None instead of relying on truthiness, and require
        # the tokenizer too: the model scorer cannot run without it.
        if model is not None and tokenizer is not None:
            self.model_scorer = ModelRiskScorer(model, tokenizer)
        else:
            self.model_scorer = None
        self.adversarial_detector = AdversarialRiskDetector()
        self.bias_detector = BiasRiskDetector()

    def assess(self, text: str, prompt: Optional[str] = None) -> Dict:
        """Run every available check on ``text`` and aggregate the results.

        Args:
            text: The text to assess.
            prompt: Optional prompt. When it is non-empty and a model scorer
                is configured, generation-based model risk is also reported.

        Returns:
            A dict with ``content_risk``, ``adversarial_risk``, ``bias_risk``,
            optionally ``model_risk``, plus ``overall_risk`` and
            ``recommendations``.
        """
        results = {}

        # Content risk
        content_risk = self.content_scorer.score(text)
        results["content_risk"] = {
            "overall": content_risk.overall_risk,
            "categories": content_risk.category_scores,
            "factors": content_risk.risk_factors,
        }

        # Adversarial risk
        injection_risk = self.adversarial_detector.detect_injection(text)
        jailbreak_risk = self.adversarial_detector.detect_jailbreak(text)
        results["adversarial_risk"] = {
            "injection": injection_risk,
            "jailbreak": jailbreak_risk,
        }

        # Bias risk
        results["bias_risk"] = self.bias_detector.detect_bias(text)

        # Model risk (only with a prompt and a configured model scorer)
        if prompt and self.model_scorer is not None:
            model_risk = self.model_scorer.score_generation_risk(prompt)
            results["model_risk"] = {
                "consistency": model_risk["consistency_score"],
                "mean_risk": model_risk["mean_risk_score"],
            }

        # The recommendations read "overall_risk", so it must be stored first.
        results["overall_risk"] = self._compute_overall_risk(results)
        results["recommendations"] = self._generate_recommendations(results)

        return results

    def _compute_overall_risk(self, results: Dict) -> float:
        """Average the content, adversarial and bias scores found in ``results``.

        The adversarial component is the larger of the injection and jailbreak
        confidences.

        Returns:
            The mean of the available component scores as a builtin ``float``,
            or 0.0 when no component is present.
        """
        # NOTE(review): "model_risk" is reported in the results but is not
        # folded into this average - confirm whether that is intended.
        scores = []

        if "content_risk" in results:
            scores.append(results["content_risk"]["overall"])

        if "adversarial_risk" in results:
            adversarial = results["adversarial_risk"]
            scores.append(
                max(
                    adversarial["injection"]["confidence"],
                    adversarial["jailbreak"]["confidence"],
                )
            )

        if "bias_risk" in results:
            scores.append(results["bias_risk"]["bias_score"])

        # float() unwraps the numpy scalar so the value prints and serializes
        # like a plain number.
        return float(np.mean(scores)) if scores else 0.0

    def _generate_recommendations(self, results: Dict) -> List[str]:
        """Map ``results["overall_risk"]`` to a single recommendation message."""
        overall = results["overall_risk"]

        if overall > 0.7:
            message = "高风险:建议完全拒绝该输出"
        elif overall > 0.4:
            message = "中风险:建议人工审核"
        elif overall > 0.2:
            message = "低风险:可使用但需注意"
        else:
            message = "安全:可直接使用"

        return [message]
可视化
import matplotlib.pyplot as plt
def plot_risk_assessment(risk_results: Dict):
    """Draw a two-panel figure for a risk assessment result.

    The left panel is a bar chart of the content-risk category scores (drawn
    only when ``risk_results`` has a ``"content_risk"`` entry); the right
    panel is a single horizontal bar for ``risk_results["overall_risk"]``,
    colored by threshold.

    Args:
        risk_results: Result dict containing ``"overall_risk"`` and,
            optionally, ``"content_risk"`` with a ``"categories"`` mapping.
    """
    _, (category_ax, overall_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: per-category content risk.
    if "content_risk" in risk_results:
        per_category = risk_results["content_risk"]["categories"]
        category_ax.bar(list(per_category.keys()), list(per_category.values()))
        category_ax.set_ylabel("Risk Score")
        category_ax.set_title("Content Risk Categories")
        category_ax.tick_params(axis="x", rotation=45)

    # Right panel: overall risk, colored by threshold.
    overall = risk_results["overall_risk"]
    if overall > 0.7:
        bar_color = "red"
    elif overall > 0.4:
        bar_color = "orange"
    else:
        bar_color = "green"
    overall_ax.barh(["Overall Risk"], [overall], color=bar_color)
    overall_ax.set_xlim(0, 1)
    overall_ax.set_title("Overall Risk Score")

    plt.tight_layout()
    plt.show()
最佳实践
- 多维度评估:从多个角度评估风险
- 实时检测:在输出生成时进行实时风险检测
- 持续更新:根据新的风险模式更新检测规则
- 用户透明:向用户展示风险评估结果
总结
风险评分是确保LLM输出安全可靠的重要技术。通过多维度的风险评估,可以有效识别和过滤有害内容。