内容过滤
---
title: "内容过滤"
description: "LLM内容过滤技术详解,包括毒性检测、内容审核和安全过滤实现"
tags: ["内容过滤", "毒性检测", "审核", "安全"]
category: "llm"
icon: "🧠"
---
内容过滤
内容过滤的重要性
LLM的内容过滤是确保模型输出安全、合规的关键技术。通过内容过滤，可以识别并阻止有害、不当或违反政策的内容生成。
内容过滤类型
1. 基于关键词的过滤
from typing import List, Dict
class KeywordFilter:
    """Flag text that contains blocked keywords, grouped by risk level.

    Matching is a plain, case-sensitive substring test (``word in text``),
    so a keyword is reported wherever it appears, even inside a longer word.
    """

    # Built-in keyword table, used when the caller does not supply one.
    DEFAULT_BLOCKED_WORDS = {
        "high_risk": ["暴力", "自残", "非法"],
        "medium_risk": ["仇恨", "歧视", "骚扰"],
        "low_risk": ["赌博", "成人", "欺诈"],
    }

    def __init__(self, blocked_words=None):
        """Create a filter.

        Args:
            blocked_words: Optional mapping of risk level to a list of
                keywords. When omitted, ``DEFAULT_BLOCKED_WORDS`` is used.
        """
        source = (
            self.DEFAULT_BLOCKED_WORDS if blocked_words is None else blocked_words
        )
        # Copy the lists so that edits to one instance's table never leak
        # into the class-level defaults or into the caller's mapping.
        self.blocked_words = {
            level: list(words) for level, words in source.items()
        }

    def check_content(self, text: str) -> Dict:
        """Scan ``text`` for every configured keyword.

        Args:
            text: The text to inspect. Empty or ``None`` input is treated
                as containing nothing to filter.

        Returns:
            ``{"safe": bool, "issues": list}`` where each issue is a dict
            with the matched ``word``, its ``level`` and an ``action``
            (``"block"`` for ``high_risk`` keywords, ``"flag"`` otherwise).
            ``safe`` is True only when no keyword matched.
        """
        if not text:
            return {"safe": True, "issues": []}

        issues = [
            {
                "word": word,
                "level": level,
                "action": "block" if level == "high_risk" else "flag",
            }
            for level, words in self.blocked_words.items()
            for word in words
            # An empty keyword is a substring of every text; skip it so a
            # stray "" in a custom table cannot flag all content.
            if word and word in text
        ]
        return {"safe": not issues, "issues": issues}
# Usage example
# Named keyword_filter so the builtin filter() is not shadowed.
keyword_filter = KeywordFilter()
result = keyword_filter.check_content("这是一段测试文本")
print(result)
2. 基于模型的毒性检测
from transformers import pipeline
import torch
class ToxicityDetector:
    """Score text with a text-classification model and flag risky labels.

    Each label returned by the classifier is compared against a per-label
    threshold; the text is unsafe as soon as one label exceeds its threshold.
    """

    # Threshold applied to labels that have no entry in ``self.thresholds``.
    DEFAULT_THRESHOLD = 0.8

    def __init__(self, model_name="unitary/toxic-bert"):
        """Load the classification pipeline and set the per-label thresholds.

        Args:
            model_name: Model identifier passed to ``transformers.pipeline``.
        """
        self.classifier = pipeline(
            "text-classification",
            model=model_name,
            # Use the first GPU when CUDA is available, otherwise the CPU.
            device=0 if torch.cuda.is_available() else -1,
        )
        self.thresholds = {
            "toxic": 0.8,
            "severe_toxic": 0.9,
            "obscene": 0.85,
            "threat": 0.9,
            "insult": 0.8,
        }

    def analyze(self, text: str) -> Dict:
        """Classify ``text`` and report which labels exceed their threshold.

        Args:
            text: The text to score.

        Returns:
            A dict with the original ``text``, an overall ``is_safe`` flag and
            ``categories``: a mapping of lower-cased label to its ``score``,
            the ``threshold`` used and whether it was ``flagged``.
        """
        # top_k=None requests the score of every label. With the default
        # (top-1) only the single best label comes back, so the thresholds
        # of all other categories would never be checked.
        results = self.classifier(text, top_k=None)

        # Accept both result shapes: a flat list of {"label", "score"} dicts,
        # a list wrapping such a list, or a single dict.
        if isinstance(results, dict):
            results = [results]
        elif results and isinstance(results[0], list):
            results = results[0]

        analysis = {
            "text": text,
            "is_safe": True,
            "categories": {},
        }
        for entry in results:
            label = entry["label"].lower()
            score = float(entry["score"])
            threshold = self.thresholds.get(label, self.DEFAULT_THRESHOLD)
            flagged = score > threshold
            analysis["categories"][label] = {
                "score": score,
                "threshold": threshold,
                "flagged": flagged,
            }
            if flagged:
                analysis["is_safe"] = False
        return analysis
# Real-world usage: load the detector once, then score incoming text.
detector = ToxicityDetector()
result = detector.analyze("用户输入文本")
if not result["is_safe"]:
    print("检测到不当内容")
3. 语义内容理解
from sentence_transformers import SentenceTransformer
import numpy as np
class SemanticContentFilter:
    """Flag text whose embedding is close to known unsafe example phrases.

    Every unsafe category is represented by the mean embedding of its
    example phrases; a text is flagged for a category when the cosine
    similarity to that mean exceeds a threshold.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model and precompute the category embeddings.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
                NOTE(review): the example phrases below are Chinese; confirm
                the default model embeds Chinese text well, or pass a
                multilingual model here.
        """
        self.model = SentenceTransformer(model_name)
        self.unsafe_categories = {
            "violence": ["如何制造武器", "伤害他人方法"],
            "illegal": ["如何逃税", "如何欺诈"],
            "harmful": ["自残方法", "危险行为"],
        }
        self.category_embeddings = {}
        self._precompute_embeddings()

    def _precompute_embeddings(self):
        """Store the mean example embedding of every unsafe category."""
        for category, examples in self.unsafe_categories.items():
            if not examples:
                # The mean of zero vectors is undefined; skip empty categories.
                continue
            embeddings = self.model.encode(examples)
            self.category_embeddings[category] = np.mean(embeddings, axis=0)

    @staticmethod
    def _cosine_similarity(a, b) -> float:
        """Return the cosine similarity of two vectors as a Python float.

        A zero-length vector has no direction, so the similarity is defined
        as 0.0 instead of dividing by zero.
        """
        denominator = float(np.linalg.norm(a) * np.linalg.norm(b))
        if denominator == 0.0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def check_safety(self, text: str, threshold: float = 0.6) -> Dict:
        """Compare ``text`` against every unsafe category.

        Args:
            text: The text to check.
            threshold: Similarity above which a category is flagged.

        Returns:
            ``{"safe": bool, "details": dict}`` where ``details`` maps each
            category to its ``similarity`` and ``flagged`` state. All values
            are plain Python types so the result can be JSON-serialized.
        """
        text_embedding = self.model.encode([text])[0]
        results = {}
        for category, category_emb in self.category_embeddings.items():
            similarity = self._cosine_similarity(text_embedding, category_emb)
            results[category] = {
                "similarity": similarity,
                # Comparing the Python float yields a real bool rather than
                # a numpy.bool_, which json.dumps cannot serialize.
                "flagged": similarity > threshold,
            }
        is_safe = not any(entry["flagged"] for entry in results.values())
        return {"safe": is_safe, "details": results}
多层过滤系统
class MultiLayerContentFilter:
    """Run keyword, toxicity and semantic checks in order.

    The layers run in that order and the first one that reports a problem
    ends the check, so later layers are skipped for already-blocked text.
    """

    def __init__(self, keyword_filter=None, toxicity_detector=None,
                 semantic_filter=None):
        """Set up the three layers.

        Args:
            keyword_filter: Object with ``check_content(text)``; defaults to
                a new ``KeywordFilter``.
            toxicity_detector: Object with ``analyze(text)``; defaults to a
                new ``ToxicityDetector``.
            semantic_filter: Object with ``check_safety(text)``; defaults to
                a new ``SemanticContentFilter``.
        """
        self.keyword_filter = (
            KeywordFilter() if keyword_filter is None else keyword_filter
        )
        self.toxicity_detector = (
            ToxicityDetector() if toxicity_detector is None else toxicity_detector
        )
        self.semantic_filter = (
            SemanticContentFilter() if semantic_filter is None else semantic_filter
        )

    @staticmethod
    def _blocked(reason: str, details: Dict) -> Dict:
        """Build the result returned when a layer rejects the text."""
        # "safe" is included so blocked and passing results share the same
        # keys and callers can always read result["safe"].
        return {
            "blocked": True,
            "safe": False,
            "reason": reason,
            "details": details,
        }

    def comprehensive_check(self, text: str) -> Dict:
        """Check ``text`` against all layers.

        Args:
            text: The text to check.

        Returns:
            ``{"blocked": False, "safe": True}`` when every layer passes;
            otherwise a dict with ``blocked=True``, ``safe=False``, the
            ``reason`` (``"keyword"``, ``"toxicity"`` or ``"semantic"``) and
            the rejecting layer's own result under ``details``.
        """
        # Layer 1: keyword check.
        # NOTE(review): any keyword hit blocks here, including hits whose
        # own "action" is "flag" — confirm that is intended.
        keyword_result = self.keyword_filter.check_content(text)
        if not keyword_result["safe"]:
            return self._blocked("keyword", keyword_result)

        # Layer 2: toxicity detection.
        toxicity_result = self.toxicity_detector.analyze(text)
        if not toxicity_result["is_safe"]:
            return self._blocked("toxicity", toxicity_result)

        # Layer 3: semantic similarity.
        semantic_result = self.semantic_filter.check_safety(text)
        if not semantic_result["safe"]:
            return self._blocked("semantic", semantic_result)

        return {"blocked": False, "safe": True}
# Run the multi-layer filter on one input.
filter_system = MultiLayerContentFilter()
result = filter_system.comprehensive_check("用户输入")
# Fall back to 'blocked' when the result carries no "safe" key.
print(f"内容安全状态: {result.get('safe', 'blocked')}")
LLM输出过滤
class LLMOutputFilter:
    """Redact sensitive information from model output before it is returned."""

    def __init__(self):
        """Define the regular expressions used to detect PII."""
        self.pii_patterns = [
            r'\b\d{3}-\d{2}-\d{4}\b',  # SSN
            # Credit card: 16 digits, optionally grouped in fours by a
            # single space or dash (e.g. "1234 5678 1234 5678").
            r'\b\d{4}[ -]?\d{4}[ -]?\d{4}[ -]?\d{4}\b',
            # Email. The TLD class is [A-Za-z]; a "|" inside a character
            # class is a literal pipe, not an alternation.
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        ]

    def filter_pii(self, text: str) -> str:
        """Replace every PII match in ``text`` with ``[REDACTED]``.

        Args:
            text: The text to scrub.

        Returns:
            The text with every match of ``self.pii_patterns`` replaced.
        """
        import re

        filtered = text
        for pattern in self.pii_patterns:
            filtered = re.sub(pattern, '[REDACTED]', filtered)
        return filtered

    def filter_sensitive_info(self, response: str, context: Dict) -> str:
        """Filter ``response`` according to the flags in ``context``.

        Args:
            response: The model output to filter.
            context: Flags describing the response. ``contains_pii`` enables
                PII redaction; ``contains_confidential`` replaces the whole
                response. An empty or ``None`` context applies no filtering.

        Returns:
            The filtered response.
        """
        if not context:
            return response

        # Redact personal information.
        if context.get("contains_pii"):
            response = self.filter_pii(response)

        # Confidential business content is withheld entirely, which also
        # overrides the PII-redacted text.
        if context.get("contains_confidential"):
            response = "[内容已过滤]"
        return response
最佳实践
- 分层过滤：结合多种过滤技术提高准确率
- 持续训练：定期更新过滤模型以应对新型有害内容
- 误报处理：建立误报反馈机制，优化过滤阈值
- 透明度：向用户说明内容过滤政策
总结
内容过滤是LLM安全的重要组成部分。通过关键词过滤、模型检测和语义理解等多层技术，可以有效保护用户和平台安全。