← 返回首页
🧠

幻觉检测

📂 llm ⏱ 3 min 402 words

---
title: "幻觉检测"
description: "大语言模型幻觉检测技术,包括事实性验证、接地技术和可信度评估"
tags: ["幻觉检测", "事实性", "接地技术", "可信度"]
category: "llm"
icon: "🧠"
---

幻觉检测

大语言模型的幻觉(Hallucination)问题是指模型生成看似合理但实际不准确或完全虚构的内容。检测和缓解幻觉是构建可信AI系统的关键挑战。

幻觉类型

事实性幻觉

模型生成与真实世界事实不符的内容:

# Hallucination examples: each record holds a question, a hallucinated answer
# to it, and the factual answer.
hallucination_examples = [
    dict(zip(("query", "hallucinated", "fact"), record))
    for record in (
        (
            "爱因斯坦何时获得诺贝尔奖?",
            # The prize is right but the stated reason is wrong.
            "1921年,因相对论获得诺贝尔物理学奖",
            "1921年,因光电效应获得诺贝尔物理学奖",
        ),
        (
            "北京的人口是多少?",
            # The number is obviously wrong.
            "约5000万",
            "约2100万(2020年数据)",
        ),
    )
]

内在幻觉 vs 外在幻觉

class HallucinationDetector:
    """Skeleton of a detector for intrinsic and extrinsic hallucinations.

    Both detection methods are unimplemented placeholders that return ``None``.
    """

    def __init__(self, reference_docs):
        """Store the reference documents on the instance."""
        self.reference_docs = reference_docs

    def detect内在幻觉(self, query, response):
        """Intrinsic hallucination: the response contradicts the query context.

        Placeholder: meant to check whether the response contradicts a premise
        stated in the query. Not implemented yet; always returns ``None``.
        """
        return None

    def detect外在幻觉(self, response):
        """Extrinsic hallucination: the response cannot be verified from context.

        Placeholder: meant to check whether the response contains claims that
        cannot be verified. Not implemented yet; always returns ``None``.
        """
        return None

基于检索的幻觉检测(RAG验证)

通过检索外部知识库验证生成内容的事实性:

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

class RAGFactChecker:
    """Score claims by embedding similarity against a knowledge base.

    Every document in the knowledge base is embedded once and stored in a FAISS
    inner-product index. Embeddings are L2-normalised before they are indexed or
    searched, so the scores returned by the index are cosine similarities.

    NOTE: a high similarity only shows that a closely related passage exists in
    the knowledge base; it does not prove that the passage entails the claim.
    """

    # Characters after which a response is cut into separate claims
    # (ASCII and full-width sentence terminators).
    _SENTENCE_ENDINGS = "。!?!?"

    def __init__(self, knowledge_base, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
        """Load the embedding model and index the knowledge base.

        Args:
            knowledge_base: Non-empty sequence of document strings.
            model_name: Name of the sentence-transformers model to load.

        Raises:
            ValueError: If ``knowledge_base`` is empty.
        """
        self.model = SentenceTransformer(model_name)
        self.kb = knowledge_base
        self.index = self._build_index(knowledge_base)

    def _embed(self, texts):
        """Return L2-normalised float32 embeddings for ``texts``, one row each."""
        vectors = np.asarray(self.model.encode(texts), dtype="float32")
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Leave all-zero vectors untouched instead of dividing by zero.
        norms[norms == 0] = 1.0
        return vectors / norms

    def _build_index(self, docs):
        """Build the inner-product vector index over ``docs``.

        Raises:
            ValueError: If ``docs`` is empty.
        """
        if len(docs) == 0:
            raise ValueError("knowledge_base must contain at least one document")
        embeddings = self._embed(docs)
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        return index

    def check_factuality(self, claim, top_k=5, threshold=0.75):
        """Look up the passages most similar to ``claim`` and judge support.

        Args:
            claim: The statement to verify.
            top_k: Maximum number of evidence passages to return.
            threshold: Cosine similarity the best passage must exceed for the
                claim to count as supported.

        Returns:
            A dict with the keys ``claim``, ``supported`` (bool), ``confidence``
            (similarity of the best passage, ``0.0`` when nothing was found)
            and ``evidence`` (matching documents, best first).
        """
        # Never ask for more neighbours than there are documents.
        limit = min(top_k, len(self.kb))
        if limit <= 0:
            return {"claim": claim, "supported": False, "confidence": 0.0, "evidence": []}

        scores, indices = self.index.search(self._embed([claim]), limit)
        # A negative index marks an empty result slot; used as a list index it
        # would silently pick the last document, so such slots are dropped.
        hits = [
            (float(score), int(doc_id))
            for score, doc_id in zip(scores[0], indices[0])
            if doc_id >= 0
        ]
        best_score = hits[0][0] if hits else 0.0

        return {
            "claim": claim,
            "supported": bool(best_score > threshold),
            "confidence": best_score,
            "evidence": [self.kb[doc_id] for _, doc_id in hits],
        }

    def check_response(self, query, response):
        """Check the factuality of a complete response.

        Args:
            query: The question that was asked. Currently unused; kept so the
                signature stays stable for callers.
            response: The generated answer to verify.

        Returns:
            A dict with ``total_claims``, ``supported``, ``unsupported``,
            ``factuality_score`` (supported / total, ``1.0`` when the response
            contains no claims) and ``details`` (one result per claim).
        """
        # Split the response into independent claims.
        claims = self._split_claims(response)
        results = [self.check_factuality(claim) for claim in claims]

        supported_count = sum(1 for r in results if r["supported"])
        return {
            "total_claims": len(claims),
            "supported": supported_count,
            "unsupported": len(claims) - supported_count,
            "factuality_score": supported_count / len(claims) if claims else 1.0,
            "details": results,
        }

    def _split_claims(self, text):
        """Split ``text`` into claims at sentence terminators and newlines.

        A fragment made up only of terminators (for example the ``!`` left
        over from ``?!``) is appended to the preceding claim, or dropped when
        there is none, so that it is never scored as a claim of its own.
        """
        for mark in self._SENTENCE_ENDINGS:
            text = text.replace(mark, mark + "\n")

        claims = []
        for piece in text.split("\n"):
            piece = piece.strip()
            if not piece:
                continue
            if not piece.strip(self._SENTENCE_ENDINGS):
                # Punctuation only: belongs to the previous sentence.
                if claims:
                    claims[-1] += piece
                continue
            claims.append(piece)
        return claims

自我一致性检测

通过多次采样生成回复,检测一致性来识别潜在幻觉:

import torch

class SelfConsistencyDetector:
    """Flag likely hallucinations by sampling several answers and comparing them.

    The prompt is sampled ``num_samples`` times; the mean pairwise cosine
    similarity of the sampled answers is the consistency score, and a low
    score is reported as a likely hallucination.
    """

    def __init__(self, model, tokenizer, num_samples=5, embedder=None):
        """Store the generation model, tokenizer and sampling settings.

        Args:
            model: Generative model exposing ``generate(**inputs, ...)``.
            tokenizer: Tokenizer that encodes the prompt and decodes outputs.
            num_samples: Number of answers sampled per prompt.
            embedder: Optional object exposing ``encode(list_of_str)`` that
                returns one vector per string. When omitted, ``model`` itself
                is used and must therefore provide ``encode``.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.embedder = embedder

    def detect(self, prompt, temperature=0.7, threshold=0.6):
        """Sample several answers to ``prompt`` and score their agreement.

        Args:
            prompt: Text to generate from.
            temperature: Sampling temperature passed to ``generate``.
            threshold: Consistency score below which the answer is flagged.

        Returns:
            A dict with ``responses`` (the sampled answers),
            ``consistency_score`` (float) and ``is_hallucination`` (bool).
        """
        # The prompt is identical for every sample, so tokenize it only once.
        inputs = self.tokenizer(prompt, return_tensors="pt")
        device = getattr(self.model, "device", None)
        if device is not None and hasattr(inputs, "to"):
            # Keep the inputs on the same device as the model.
            inputs = inputs.to(device)

        # When the model is not an encoder-decoder, the generated sequence is
        # treated as starting with the prompt tokens; they are cut off so the
        # shared prompt text does not inflate the similarity between samples.
        config = getattr(self.model, "config", None)
        if getattr(config, "is_encoder_decoder", False):
            prompt_length = 0
        else:
            prompt_length = len(inputs["input_ids"][0])

        responses = []
        for _ in range(self.num_samples):
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=temperature,
                do_sample=True,
            )
            answer_tokens = outputs[0][prompt_length:]
            responses.append(
                self.tokenizer.decode(answer_tokens, skip_special_tokens=True)
            )

        # Semantic agreement between the sampled answers.
        consistency_score = self._compute_consistency(responses)

        return {
            "responses": responses,
            "consistency_score": consistency_score,
            "is_hallucination": consistency_score < threshold,
        }

    def _compute_consistency(self, responses):
        """Return the mean pairwise cosine similarity of ``responses``.

        Fewer than two responses cannot disagree, so ``1.0`` is returned for
        them. A response whose embedding is all zeros contributes a similarity
        of ``0.0`` to each of its pairs.

        Raises:
            AttributeError: If neither ``embedder`` nor ``model`` provides
                an ``encode`` method.
        """
        count = len(responses)
        if count < 2:
            return 1.0

        encoder = self.embedder if self.embedder is not None else self.model
        if not hasattr(encoder, "encode"):
            raise AttributeError(
                "SelfConsistencyDetector needs an embedder with an encode() "
                "method; pass embedder=... when the generation model has none"
            )

        vectors = np.asarray(encoder.encode(responses), dtype="float64")
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Avoid dividing by zero for all-zero embeddings.
        norms[norms == 0] = 1.0
        unit = vectors / norms

        # Cosine similarity of every unordered pair of responses.
        similarities = [
            float(unit[i] @ unit[j])
            for i in range(count)
            for j in range(i + 1, count)
        ]
        return float(np.mean(similarities))

接地技术(Grounding)

将模型输出锚定到可信来源:

class GroundedGeneration:
    """Generate answers from retrieved documents and report which were cited."""

    def __init__(self, retriever, generator):
        """Store the collaborators.

        Args:
            retriever: Object exposing ``search(query, top_k=...)`` that returns
                a sequence of document strings.
            generator: Object exposing ``generate(prompt)`` that returns the
                answer text.
        """
        self.retriever = retriever
        self.generator = generator

    def generate_with_grounding(self, query, num_docs=3):
        """Answer ``query`` from retrieved documents.

        Args:
            query: The user question.
            num_docs: Number of documents to request from the retriever.

        Returns:
            A dict with ``response`` (the generated text), ``citations`` (see
            ``_extract_citations``) and ``grounded`` (``True`` when at least
            one retrieved document is cited).
        """
        # Retrieve the relevant documents.
        docs = self.retriever.search(query, top_k=num_docs)

        # Build the grounded prompt; documents are numbered from 1.
        context = "\n".join([f"[{i+1}] {doc}" for i, doc in enumerate(docs)])
        grounded_prompt = f"""基于以下参考文档回答问题。如果文档中没有相关信息,请明确说明。

参考文档:
{context}

问题:{query}
回答:"""

        response = self.generator.generate(grounded_prompt)

        # Work out which documents the answer refers to.
        citations = self._extract_citations(response, docs)

        return {
            "response": response,
            "citations": citations,
            "grounded": bool(citations),
        }

    def _extract_citations(self, response, docs):
        """Return the documents that ``response`` cites.

        A document counts as cited when the response contains its 1-based
        marker (``[1]``, ``[2]``, ...) or quotes the first 20 characters of the
        document verbatim. Empty or whitespace-only openings are ignored for
        the quote test, because an empty string is contained in every response
        and would otherwise produce a false citation.

        Returns:
            A list of dicts with ``doc_id`` (0-based position in ``docs``) and
            ``source`` (first 100 characters of the document).
        """
        citations = []
        for i, doc in enumerate(docs):
            marker = f"[{i+1}]"
            opening = doc[:20]
            quoted = bool(opening.strip()) and opening in response
            if marker in response or quoted:
                citations.append({"doc_id": i, "source": doc[:100]})
        return citations

幻觉评估指标

| 指标 | 描述 | 计算方式 |
| --- | --- | --- |
| FActScore | 细粒度事实准确率 | 每个原子声明的验证 |
| TruthfulQA | 真实性基准 | 多选题准确率 |
| 自我一致性 | 多次采样一致性 | 语义相似度平均值 |
| 引用准确率 | 引用来源的正确性 | 引用与原文匹配度 |

幻觉检测是AI可信度的基础,结合RAG、自我一致性和接地技术可以有效降低幻觉率。