← 返回首页
🧠

内容过滤

📂 llm ⏱ 3 min 442 words

---
title: "内容过滤"
description: "LLM内容过滤技术详解，包括毒性检测、内容审核和安全过滤实现"
tags: ["内容过滤", "毒性检测", "审核", "安全"]
category: "llm"
icon: "🧠"
---

内容过滤

内容过滤的重要性

LLM 的内容过滤是确保模型输出安全、合规的关键技术。通过内容过滤，可以识别并阻止有害、不当或违反政策的内容生成。

内容过滤类型

1. 基于关键词的过滤

from typing import List, Dict

class KeywordFilter:
    """Substring-based content filter driven by a tiered blocklist."""

    def __init__(self):
        # Maps a risk tier name to the substrings that trigger it.
        self.blocked_words = {
            "high_risk": ["暴力", "自残", "非法"],
            "medium_risk": ["仇恨", "歧视", "骚扰"],
            "low_risk": ["赌博", "成人", "欺诈"]
        }

    def check_content(self, text: str) -> Dict:
        """Report every blocklisted word that occurs in ``text``.

        Returns a dict with:
          * ``"safe"``   -- True only when no blocklisted word was found.
          * ``"issues"`` -- one entry per hit, in blocklist order, holding the
            word, its risk tier, and the action (``"block"`` for the
            ``high_risk`` tier, ``"flag"`` for every other tier).
        """
        issues = [
            {
                "word": term,
                "level": tier,
                "action": "flag" if tier != "high_risk" else "block",
            }
            for tier, terms in self.blocked_words.items()
            for term in terms
            if term in text
        ]
        return {"safe": not issues, "issues": issues}

# Usage example. The instance is deliberately not named ``filter`` so the
# built-in ``filter()`` stays usable in the rest of the module.
keyword_filter = KeywordFilter()
result = keyword_filter.check_content("这是一段测试文本")
print(result)

2. 基于模型的毒性检测

from transformers import pipeline
import torch

class ToxicityDetector:
    """Score text with a text-classification pipeline and flag categories
    whose score exceeds a per-category threshold."""

    # Threshold applied to labels that have no entry in ``self.thresholds``.
    DEFAULT_THRESHOLD = 0.8

    def __init__(self, model_name="unitary/toxic-bert"):
        """Load the classification pipeline, on GPU 0 when CUDA is available,
        otherwise on CPU.

        Args:
            model_name: Model identifier passed to ``pipeline``.
        """
        self.classifier = pipeline(
            "text-classification",
            model=model_name,
            device=0 if torch.cuda.is_available() else -1
        )
        # Per-label score above which the label is considered flagged.
        self.thresholds = {
            "toxic": 0.8,
            "severe_toxic": 0.9,
            "obscene": 0.85,
            "threat": 0.9,
            "insult": 0.8
        }

    def analyze(self, text: str) -> Dict:
        """Classify ``text`` and compare every label score to its threshold.

        Args:
            text: The text to classify.

        Returns:
            A dict with the original ``"text"``, an ``"is_safe"`` bool that is
            False as soon as one label exceeds its threshold, and
            ``"categories"`` mapping each lower-cased label to its score,
            threshold and ``flagged`` status.
        """
        # By default the pipeline reports only the single best label, which
        # would leave all other thresholds unchecked. ``top_k=None`` requests
        # the score of every label instead.
        results = self.classifier(text, top_k=None)

        # NOTE(review): depending on the library version / how top_k is
        # supplied, the per-label list may come wrapped in an outer list;
        # unwrap it so both shapes are handled.
        if results and isinstance(results[0], list):
            results = results[0]

        analysis = {
            "text": text,
            "is_safe": True,
            "categories": {}
        }

        for result in results:
            label = result['label'].lower()
            score = result['score']
            threshold = self.thresholds.get(label, self.DEFAULT_THRESHOLD)
            flagged = score > threshold

            analysis["categories"][label] = {
                "score": score,
                "threshold": threshold,
                "flagged": flagged
            }

            if flagged:
                analysis["is_safe"] = False

        return analysis

# Practical usage: analyse one piece of user input and report unsafe content.
detector = ToxicityDetector()
result = detector.analyze(text="用户输入文本")
if not result["is_safe"]:
    print("检测到不当内容")

3. 语义内容理解

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticContentFilter:
    """Flag text whose embedding is close to the mean embedding of example
    phrases for an unsafe category."""

    def __init__(self):
        """Load the sentence-embedding model and precompute one reference
        embedding per unsafe category."""
        # NOTE(review): the example phrases are Chinese; confirm that this
        # embedding model represents Chinese text well enough for the task.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Category name -> example phrases that characterise the category.
        self.unsafe_categories = {
            "violence": ["如何制造武器", "伤害他人方法"],
            "illegal": ["如何逃税", "如何欺诈"],
            "harmful": ["自残方法", "危险行为"]
        }
        # Category name -> mean embedding of its example phrases.
        self.category_embeddings = {}
        self._precompute_embeddings()

    def _precompute_embeddings(self):
        """Encode each category's examples and store their mean vector."""
        for category, examples in self.unsafe_categories.items():
            embeddings = self.model.encode(examples)
            self.category_embeddings[category] = np.mean(embeddings, axis=0)

    @staticmethod
    def _cosine_similarity(a, b) -> float:
        """Return the cosine similarity of ``a`` and ``b`` as a Python float.

        A zero-length vector has no direction, so the similarity is defined
        as 0.0 here instead of dividing by zero and producing NaN.
        """
        denominator = float(np.linalg.norm(a) * np.linalg.norm(b))
        if denominator == 0.0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def check_safety(self, text: str, threshold: float = 0.6) -> Dict:
        """Compare ``text`` against every unsafe category.

        Args:
            text: The text to check.
            threshold: A category is flagged when its cosine similarity to
                the text is strictly greater than this value.

        Returns:
            ``{"safe": bool, "details": {category: {"similarity": float,
            "flagged": bool}}}``. All values are plain Python types so the
            result can be passed to ``json.dumps`` directly.
        """
        text_embedding = self.model.encode([text])[0]

        results = {}
        for category, category_emb in self.category_embeddings.items():
            similarity = self._cosine_similarity(text_embedding, category_emb)
            results[category] = {
                "similarity": similarity,
                # bool(): keep the flag a Python bool rather than numpy.bool_.
                "flagged": bool(similarity > threshold)
            }

        is_safe = not any(r["flagged"] for r in results.values())
        return {"safe": is_safe, "details": results}

多层过滤系统

class MultiLayerContentFilter:
    """Run keyword, toxicity and semantic checks in sequence, stopping at the
    first layer that rejects the text."""

    def __init__(self):
        self.keyword_filter = KeywordFilter()
        self.toxicity_detector = ToxicityDetector()
        self.semantic_filter = SemanticContentFilter()

    def comprehensive_check(self, text: str) -> Dict:
        """Pass ``text`` through each layer in order.

        Returns ``{"blocked": True, "reason": <layer>, "details": <layer
        result>}`` for the first layer that reports the text as unsafe, or
        ``{"blocked": False, "safe": True}`` when every layer accepts it.
        """
        # (reason label, check callable, key holding the layer's safe flag),
        # listed in the order the layers are applied.
        layers = (
            ("keyword", self.keyword_filter.check_content, "safe"),
            ("toxicity", self.toxicity_detector.analyze, "is_safe"),
            ("semantic", self.semantic_filter.check_safety, "safe"),
        )

        for reason, run_check, safe_key in layers:
            outcome = run_check(text)
            if not outcome[safe_key]:
                # Later layers are skipped once one layer rejects the text.
                return {"blocked": True, "reason": reason, "details": outcome}

        return {"blocked": False, "safe": True}

# Run the multi-layer filter on a sample input. A blocked result carries no
# "safe" key, so the literal 'blocked' is printed in that case.
filter_system = MultiLayerContentFilter()
result = filter_system.comprehensive_check("用户输入")
print(f"内容安全状态: {result.get('safe', 'blocked')}")

LLM输出过滤

class LLMOutputFilter:
    """Redact personally identifiable information and confidential content
    from model responses."""

    def __init__(self):
        # Regular expressions whose matches are replaced by "[REDACTED]".
        self.pii_patterns = [
            r'\b\d{3}-\d{2}-\d{4}\b',  # SSN
            # 16-digit card number, contiguous or in groups of four
            # separated by a space or a dash.
            r'\b\d{4}(?:[ -]?\d{4}){3}\b',
            # Email. The TLD class is [A-Za-z]; a "|" inside a character
            # class is a literal pipe, not alternation.
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        ]

    def filter_pii(self, text: str) -> str:
        """Return ``text`` with every match of ``self.pii_patterns`` replaced
        by ``"[REDACTED]"``."""
        import re
        filtered = text
        for pattern in self.pii_patterns:
            filtered = re.sub(pattern, '[REDACTED]', filtered)
        return filtered

    def filter_sensitive_info(self, response: str, context: Dict) -> str:
        """Filter ``response`` according to the flags in ``context``.

        Args:
            response: The model output to filter.
            context: Flags describing the response. ``"contains_confidential"``
                replaces the whole response with a placeholder;
                ``"contains_pii"`` redacts PII matches. ``None`` is treated
                as an empty dict.

        Returns:
            The filtered response.
        """
        context = context or {}

        # Confidential content replaces the entire response, so PII redaction
        # would be wasted work in that case.
        if context.get("contains_confidential"):
            return "[内容已过滤]"

        if context.get("contains_pii"):
            return self.filter_pii(response)

        return response

最佳实践

  1. 分层过滤：结合多种过滤技术提高准确率
  2. 持续训练：定期更新过滤模型以应对新型有害内容
  3. 误报处理：建立误报反馈机制，优化过滤阈值
  4. 透明度：向用户说明内容过滤政策

总结

内容过滤是 LLM 安全的重要组成部分。通过关键词过滤、模型检测和语义理解等多层技术，可以有效保护用户和平台安全。