幻觉检测
---
title: "幻觉检测"
description: "大语言模型幻觉检测技术,包括事实性验证、接地技术和可信度评估"
tags: ["幻觉检测", "事实性", "接地技术", "可信度"]
category: "llm"
icon: "🧠"
---
幻觉检测
大语言模型的幻觉(Hallucination)问题是指模型生成看似合理但实际不准确或完全虚构的内容。检测和缓解幻觉是构建可信AI系统的关键挑战。
幻觉类型
事实性幻觉
模型生成与真实世界事实不符的内容:
# Examples of factual hallucinations. Each entry pairs a question with a
# hallucinated answer and the fact that answer should have matched.
hallucination_examples = [
    {
        "query": "爱因斯坦何时获得诺贝尔奖?",
        # The prize itself is right, but the stated reason is wrong.
        "hallucinated": "1921年,因相对论获得诺贝尔物理学奖",
        "fact": "1921年,因光电效应获得诺贝尔物理学奖",
    },
    {
        "query": "北京的人口是多少?",
        # The number is obviously wrong.
        "hallucinated": "约5000万",
        "fact": "约2100万(2020年数据)",
    },
]
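内在幻觉 vs 外在幻觉
内在幻觉(intrinsic)指生成内容与给定的输入或上下文相矛盾;外在幻觉(extrinsic)指生成内容无法依据给定的输入或上下文得到验证(既不能证实,也不能证伪)。下面是区分这两类幻觉的检测器骨架: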
class HallucinationDetector:
    """Skeleton of a detector for intrinsic and extrinsic hallucinations.

    Both detection methods are unimplemented placeholders: they perform no
    check and return ``None``. The class only fixes the intended interface.
    """

    def __init__(self, reference_docs):
        """Keep the reference documents on the instance.

        Args:
            reference_docs: Reference material for later verification. Nothing
                in this class reads it yet.
        """
        self.reference_docs = reference_docs

    def detect内在幻觉(self, query, response):
        """Intrinsic hallucination: the response contradicts the query context.

        Placeholder; always returns ``None``.

        Args:
            query: The user query whose premises the response must respect.
            response: The model output to inspect.
        """
        # TODO: check whether the response contradicts premises in the query.
        return None

    def detect外在幻觉(self, response):
        """Extrinsic hallucination: the response cannot be verified from context.

        Placeholder; always returns ``None``.

        Args:
            response: The model output to inspect.
        """
        # TODO: check whether the response contains unverifiable claims.
        return None

    # ASCII aliases so callers are not forced to type non-ASCII identifiers;
    # the original method names above remain the canonical ones.
    detect_intrinsic = detect内在幻觉
    detect_extrinsic = detect外在幻觉
基于检索的幻觉检测(RAG验证)
通过检索外部知识库验证生成内容的事实性:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
class RAGFactChecker:
    """Check claims against a knowledge base by embedding similarity.

    Documents and claims are embedded with a SentenceTransformer model,
    L2-normalised, and compared through a FAISS inner-product index. Because
    every vector has unit length, each score is a cosine similarity, which is
    what the similarity threshold is meant to be compared against.

    NOTE(review): a high similarity only shows that a claim is close in
    meaning to a document; a claim that contradicts a document on one detail
    can still score highly. Confirm whether an entailment check is needed on
    top of this.
    """

    def __init__(
        self,
        knowledge_base,
        model_name="paraphrase-multilingual-MiniLM-L12-v2",
        similarity_threshold=0.75,
    ):
        """Load the embedding model and index the knowledge base.

        Args:
            knowledge_base: Iterable of document strings to verify against.
            model_name: Name passed to ``SentenceTransformer``.
            similarity_threshold: A claim counts as supported when its best
                cosine similarity is strictly greater than this value.

        Raises:
            ValueError: If ``knowledge_base`` is empty.
        """
        self.model = SentenceTransformer(model_name)
        # Materialise once so evidence lookups by index stay aligned with the
        # vectors stored in the index.
        self.kb = list(knowledge_base)
        self.similarity_threshold = similarity_threshold
        self.index = self._build_index(self.kb)

    def _embed(self, texts):
        """Encode ``texts`` into unit-length float32 row vectors."""
        vectors = np.asarray(self.model.encode(texts), dtype="float32")
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # An all-zero vector stays all-zero instead of triggering 0/0.
        norms[norms == 0] = 1.0
        return np.ascontiguousarray(vectors / norms)

    def _build_index(self, docs):
        """Build the inner-product vector index over ``docs``.

        Raises:
            ValueError: If ``docs`` is empty; there would be no vector
                dimension to size the index with.
        """
        if len(docs) == 0:
            raise ValueError("knowledge_base must contain at least one document")
        embeddings = self._embed(docs)
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        return index

    def check_factuality(self, claim, top_k=5):
        """Verify a single claim against the knowledge base.

        Args:
            claim: The statement to verify.
            top_k: Maximum number of evidence documents to retrieve.

        Returns:
            A dict with ``claim``, ``supported`` (bool), ``confidence`` (best
            cosine similarity as a float) and ``evidence`` (retrieved
            documents, best first).
        """
        # Never ask for more neighbours than exist: FAISS pads missing
        # results with index -1, which would silently pick the last document.
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return {
                "claim": claim,
                "supported": False,
                "confidence": 0.0,
                "evidence": [],
            }

        scores, indices = self.index.search(self._embed([claim]), k)
        best_score = float(scores[0][0])
        return {
            "claim": claim,
            "supported": best_score > self.similarity_threshold,
            "confidence": best_score,
            "evidence": [self.kb[int(i)] for i in indices[0] if i >= 0],
        }

    def check_response(self, query, response):
        """Check the factuality of a complete response.

        Args:
            query: The originating query. Currently unused; kept so the call
                signature stays stable.
            response: The model output, split into claims before checking.

        Returns:
            A dict with ``total_claims``, ``supported``, ``unsupported``,
            ``factuality_score`` (share of supported claims, ``1.0`` when the
            response contains no claims) and ``details`` (per-claim results).
        """
        claims = self._split_claims(response)
        results = [self.check_factuality(claim) for claim in claims]
        supported_count = sum(1 for result in results if result["supported"])
        return {
            "total_claims": len(claims),
            "supported": supported_count,
            "unsupported": len(claims) - supported_count,
            "factuality_score": supported_count / len(claims) if claims else 1.0,
            "details": results,
        }

    def _split_claims(self, text):
        """Split text into individual claims at sentence boundaries.

        A boundary is a run of sentence-ending punctuation (``。``, ASCII or
        full-width ``!`` / ``?``) plus any closing quotes that follow it, or an
        existing newline. ASCII full stops are deliberately not boundaries, so
        decimals and abbreviations stay intact. Segments without any letter or
        digit (stray punctuation) are dropped.
        """
        import re  # local import: the surrounding file does not import re

        marked = re.sub(r"([。!?!?]+[”’」』]*)", r"\1\n", text)
        claims = []
        for segment in marked.split("\n"):
            segment = segment.strip()
            if any(ch.isalnum() for ch in segment):
                claims.append(segment)
        return claims
自我一致性检测
对同一提示多次采样生成回复,再比较各回复之间的语义一致性:一致性越低,回复中含有幻觉的可能性越大。
import torch
class SelfConsistencyDetector:
    """Flag likely hallucinations by sampling several answers and comparing them.

    The same prompt is sampled ``num_samples`` times; the mean pairwise cosine
    similarity of the answers' embeddings is the consistency score. A score
    below the threshold is reported as a likely hallucination.
    """

    def __init__(self, model, tokenizer, num_samples=5, embedder=None,
                 consistency_threshold=0.6):
        """Store the generation and embedding components.

        Args:
            model: Generative model exposing ``generate(**inputs, ...)``.
            tokenizer: Tokenizer paired with ``model``; it is called with
                ``return_tensors="pt"`` and must provide ``decode``.
            num_samples: Number of answers sampled per prompt.
            embedder: Object exposing ``encode(list_of_texts)`` that returns
                one vector per text. Defaults to ``model``, which only works
                if the generative model also offers ``encode``.
            consistency_threshold: Scores strictly below this value are
                reported as hallucinations.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.embedder = embedder if embedder is not None else model
        self.consistency_threshold = consistency_threshold

    def detect(self, prompt, temperature=0.7):
        """Sample the prompt repeatedly and score the agreement of the answers.

        Args:
            prompt: Text prompt given to the model.
            temperature: Sampling temperature forwarded to ``generate``.

        Returns:
            A dict with ``responses`` (decoded answers), ``consistency_score``
            (float) and ``is_hallucination`` (bool).
        """
        # The prompt never changes between samples, so tokenise it once.
        inputs = self.tokenizer(prompt, return_tensors="pt")
        prompt_length = inputs["input_ids"].shape[-1]

        # Assumes a Hugging Face-style ``generate``: decoder-only models return
        # prompt + continuation, encoder-decoder models return only the
        # continuation. Leaving the echoed prompt in would give every sample
        # the same prefix and inflate the similarity.
        config = getattr(self.model, "config", None)
        echoes_prompt = not getattr(config, "is_encoder_decoder", False)

        responses = []
        for _ in range(self.num_samples):
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=temperature,
                do_sample=True,
            )
            sequence = outputs[0]
            if echoes_prompt:
                sequence = sequence[prompt_length:]
            responses.append(
                self.tokenizer.decode(sequence, skip_special_tokens=True)
            )

        consistency_score = self._compute_consistency(responses)
        return {
            "responses": responses,
            "consistency_score": consistency_score,
            "is_hallucination": consistency_score < self.consistency_threshold,
        }

    def _compute_consistency(self, responses):
        """Return the mean pairwise cosine similarity between responses.

        With fewer than two responses there is nothing to compare, so the
        score is ``1.0`` and the embedder is not called.

        Raises:
            AttributeError: If the embedder has no ``encode`` method.
        """
        if len(responses) < 2:
            return 1.0
        encode = getattr(self.embedder, "encode", None)
        if encode is None:
            raise AttributeError(
                "embedder has no 'encode' method; pass a sentence-embedding "
                "model as 'embedder' when constructing SelfConsistencyDetector"
            )
        return self._mean_pairwise_cosine(encode(responses))

    @staticmethod
    def _mean_pairwise_cosine(embeddings):
        """Average the cosine similarity over every unordered pair of rows.

        Rows with zero norm are treated as having similarity 0 to every other
        row rather than producing ``nan``. Returns ``1.0`` when fewer than two
        rows are given.
        """
        vectors = np.asarray(embeddings, dtype="float64")
        if vectors.ndim != 2 or vectors.shape[0] < 2:
            return 1.0
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        unit = vectors / norms
        gram = unit @ unit.T
        # Entries strictly above the diagonal are exactly the i < j pairs.
        pair_rows, pair_cols = np.triu_indices(unit.shape[0], k=1)
        return float(gram[pair_rows, pair_cols].mean())
接地技术(Grounding)
将模型输出锚定到可信来源:
class GroundedGeneration:
    """Generate answers anchored to retrieved documents and report citations."""

    def __init__(self, retriever, generator):
        """Store the collaborators.

        Args:
            retriever: Object exposing ``search(query, top_k=...)``.
            generator: Object exposing ``generate(prompt)``.
        """
        self.retriever = retriever
        self.generator = generator

    def generate_with_grounding(self, query, num_docs=3):
        """Answer ``query`` from retrieved documents.

        Args:
            query: The user question.
            num_docs: Number of documents requested from the retriever.

        Returns:
            A dict with ``response`` (generator output), ``citations`` (see
            ``_extract_citations``) and ``grounded`` (``True`` when at least
            one citation was found).
        """
        # Materialise the results: they are walked once to build the prompt
        # and again to extract citations, which a one-shot iterator would
        # not survive.
        # NOTE(review): assumes each retrieved document is a string; confirm
        # against the retriever in use.
        docs = list(self.retriever.search(query, top_k=num_docs))

        # Number the documents from 1 so the model can cite them as [n].
        context = "\n".join(
            f"[{number}] {doc}" for number, doc in enumerate(docs, start=1)
        )
        grounded_prompt = (
            "基于以下参考文档回答问题。如果文档中没有相关信息,请明确说明。\n"
            "参考文档:\n"
            f"{context}\n"
            f"问题:{query}\n"
            "回答:"
        )

        response = self.generator.generate(grounded_prompt)
        citations = self._extract_citations(response, docs)
        return {
            "response": response,
            "citations": citations,
            "grounded": bool(citations),
        }

    def _extract_citations(self, response, docs):
        """Find which documents the response cites.

        A document counts as cited when the response contains its 1-based
        marker ``[n]`` or quotes the first 20 characters of the document.

        Args:
            response: The generated answer text.
            docs: The documents that were offered to the generator, in prompt
                order.

        Returns:
            A list of dicts with ``doc_id`` (0-based position in ``docs``) and
            ``source`` (first 100 characters of the document).
        """
        citations = []
        for position, doc in enumerate(docs):
            marker = f"[{position + 1}]"
            snippet = doc[:20]
            # An empty or blank snippet is a substring of any text, so it
            # must not count as a quotation.
            quoted = bool(snippet.strip()) and snippet in response
            if marker in response or quoted:
                citations.append({"doc_id": position, "source": doc[:100]})
        return citations
幻觉评估指标
| 指标 | 描述 | 计算方式 |
|---|---|---|
| FActScore | 细粒度事实准确率 | 将回复拆分为原子声明,计算被知识源支持的声明所占比例 |
| TruthfulQA | 真实性基准数据集 | 多选题准确率(MC1/MC2);生成任务另由人工或评判模型打分 |
| 自我一致性 | 多次采样一致性 | 语义相似度平均值 |
| 引用准确率 | 引用来源的正确性 | 引用与原文匹配度 |
幻觉检测是AI可信度的基础,结合RAG、自我一致性和接地技术可以有效降低幻觉率。