LLM端到端测试
---
title: "LLM端到端测试"
description: "全面介绍LLM系统的端到端测试方法,包括完整场景测试、用户交互测试、多轮对话测试、测试脚本编写以及自动化端到端测试框架"
tags: ["端到端测试", "E2E测试", "场景测试", "LLM验证"]
category: "llm"
icon: "🧠"
---
LLM端到端测试
端到端测试的意义
端到端(E2E)测试模拟真实用户行为,验证LLM系统从输入到输出的完整流程。它关注的是用户视角的体验:提交问题后是否得到正确的回答,多轮对话是否连贯,异常情况是否优雅处理。
设计端到端测试场景
测试场景应覆盖核心业务流程和边界情况:
class E2EScenario:
    """A named end-to-end scenario: the steps to run and the outcome to expect.

    Attributes:
        name: Human-readable scenario name.
        steps: Ordered step descriptions; stored as given and not interpreted
            by this class.
        expected_outcome: Mapping of outcome field name to either a literal
            expected value or a one-argument predicate returning truthy/falsy.
    """

    # Errors a predicate typically raises when it is handed a missing field
    # (``None``) or a value of an unexpected shape, e.g. ``len(None)``.
    _PREDICATE_ERRORS = (
        TypeError,
        AttributeError,
        KeyError,
        IndexError,
        ValueError,
    )

    def __init__(self, name, steps, expected_outcome):
        self.name = name
        self.steps = steps
        self.expected_outcome = expected_outcome

    def validate(self, actual_outcome):
        """Compare an actual outcome against every expected field.

        Args:
            actual_outcome: Mapping of field name to the observed value.
                Fields absent from the mapping are looked up as ``None``.

        Returns:
            One dict per expected field, in the order of
            ``expected_outcome``, with the keys ``field``, ``expected``,
            ``actual`` and ``passed``.
        """
        results = []
        for key, expected in self.expected_outcome.items():
            actual = actual_outcome.get(key)
            results.append({
                "field": key,
                "expected": expected,
                "actual": actual,
                "passed": self._check_match(actual, expected),
            })
        return results

    def _check_match(self, actual, expected):
        """Return whether ``actual`` satisfies ``expected``.

        A callable ``expected`` is used as a predicate; anything else is
        compared with ``==``.
        """
        if callable(expected):
            try:
                return bool(expected(actual))
            except self._PREDICATE_ERRORS:
                # A predicate that cannot even process the value (typically
                # because the field is missing) is a failed check, not a
                # reason to abort validation of the remaining fields.
                return False
        return actual == expected
# Scenario catalogue: one single-turn Q&A case and one three-turn dialogue.
scenarios = [
    E2EScenario(
        "简单问答",
        [dict(type="input", content="什么是机器学习?")],
        {
            "status": "success",
            # Predicates receive the value stored under the same key of the
            # actual outcome.
            "response_length": lambda x: 50 < len(x),
            "contains_keywords": lambda x: any(
                word in x for word in ("机器", "学习")
            ),
        },
    ),
    E2EScenario(
        "多轮对话",
        [
            {"type": "input", "content": utterance}
            for utterance in (
                "我想学Python",
                "从哪里开始学?",
                "推荐一些练习项目",
            )
        ],
        dict(status="success", context_preserved=True),
    ),
]
完整场景测试框架
import asyncio
import aiohttp
class LLMEndToEndTester:
    """Drive an LLM chat HTTP API end to end and collect the parsed replies.

    Every request is a JSON POST to ``{base_url}/chat`` carrying a bearer-token
    ``Authorization`` header. Call :meth:`setup` before issuing requests and
    :meth:`teardown` afterwards, or use the instance with ``async with`` so
    the session is always released.
    """

    def __init__(self, base_url, api_key):
        self.base_url = base_url
        self.headers = {"Authorization": f"Bearer {api_key}"}
        self.session = None

    async def __aenter__(self):
        """Open the HTTP session and return this tester."""
        await self.setup()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Close the HTTP session; exceptions from the body propagate."""
        await self.teardown()
        return False

    async def setup(self):
        """Create the HTTP session unless one is already open."""
        # Overwriting a live session would leak its connections.
        if self.session is None:
            self.session = aiohttp.ClientSession()

    async def teardown(self):
        """Close the HTTP session; safe to call when none was opened."""
        if self.session is None:
            return
        session, self.session = self.session, None
        await session.close()

    async def test_single_query(self, query, validators):
        """Send one message and run every validator on the parsed reply.

        Args:
            query: Text sent as the ``message`` field of the request body.
            validators: Callables taking the parsed JSON reply and returning
                a ``(passed, message)`` pair.

        Returns:
            One dict per validator with the keys ``validator`` (its name),
            ``passed`` and ``message``.
        """
        async with self.session.post(
            f"{self.base_url}/chat",
            json={"message": query},
            headers=self.headers,
        ) as resp:
            result = await resp.json()

        validation_results = []
        for validator in validators:
            passed, message = validator(result)
            validation_results.append({
                # functools.partial objects and callable instances have no
                # __name__, so fall back to their repr.
                "validator": getattr(validator, "__name__", repr(validator)),
                "passed": passed,
                "message": message,
            })
        return validation_results

    async def test_conversation_flow(self, messages):
        """Send the messages in order as one conversation.

        The ``conversation_id`` returned by the service is attached to each
        following request once one has been received.

        Args:
            messages: Sequence of dicts whose ``content`` is the text to send.

        Returns:
            The parsed JSON reply for each message, in order.
        """
        conversation_id = None
        results = []
        for msg in messages:
            payload = {"message": msg["content"]}
            if conversation_id:
                payload["conversation_id"] = conversation_id
            async with self.session.post(
                f"{self.base_url}/chat",
                json=payload,
                headers=self.headers,
            ) as resp:
                result = await resp.json()
            # Keep the id already known when a reply omits it; dropping it
            # would silently start a new conversation on the next turn.
            conversation_id = result.get("conversation_id") or conversation_id
            results.append(result)
        return results
多轮对话测试
多轮对话是LLM应用的核心场景,需要测试上下文保持、话题切换、指代消解等能力:
class ConversationTest:
    """Multi-turn conversation checks run against a chat client.

    The client must expose ``chat(history)``; it is called with the
    accumulated list of ``{"role", "content"}`` messages and its return value
    is stored as the assistant turn.
    """

    def __init__(self, llm_client):
        self.client = llm_client
        self.history = []

    def reset(self):
        """Discard the accumulated history so the next turn starts fresh."""
        # Rebind rather than clear: the client was handed the previous list
        # and may still hold a reference to it.
        self.history = []

    def send(self, message):
        """Append a user turn, ask the client, and record its reply.

        Args:
            message: Content of the user turn.

        Returns:
            Whatever ``client.chat`` returned for the updated history.
        """
        self.history.append({"role": "user", "content": message})
        response = self.client.chat(self.history)
        self.history.append({"role": "assistant", "content": response})
        return response

    def test_context_preservation(self):
        """Check that a fact stated earlier is recalled on the next turn."""
        # Each check starts from an empty history so that turns left over
        # from a previous check cannot influence the result.
        self.reset()
        self.send("我叫张三")
        response = self.send("我叫什么名字?")
        assert "张三" in response, "模型应记住用户名字"

    def test_topic_switching(self):
        """Check that the model can return to an earlier topic."""
        self.reset()
        self.send("Python的GIL是什么?")
        self.send("那JavaScript呢?")
        response = self.send("回到刚才的GIL话题")
        assert "GIL" in response or "全局解释器锁" in response

    def test_reference_resolution(self):
        """Check that an elliptical follow-up question still gets a reply."""
        self.reset()
        self.send("苹果和香蕉哪个更贵?")
        response = self.send("那橙子呢?")
        assert len(response) > 0, "应能理解指代关系"
断言策略
LLM输出具有不确定性,同一输入每次得到的文本可能不同,因此断言不应依赖精确的字符串匹配,而应检查关键词、长度范围、禁用词和输出格式等更宽松的条件:
class LLMAssertions:
    """Loose, substring- and pattern-based checks for free-form model output."""

    @staticmethod
    def contains_any(text, keywords):
        """Return True if at least one of ``keywords`` occurs in ``text``."""
        for candidate in keywords:
            if candidate in text:
                return True
        return False

    @staticmethod
    def response_length_in_range(text, min_len, max_len):
        """Return True if ``len(text)`` is within ``[min_len, max_len]``."""
        size = len(text)
        return min_len <= size and size <= max_len

    @staticmethod
    def no_harmful_content(text, banned_words):
        """Return True if none of ``banned_words`` occurs in ``text``."""
        for banned in banned_words:
            if banned in text:
                return False
        return True

    @staticmethod
    def follows_format(text, pattern):
        """Return True if regex ``pattern`` matches anywhere in ``text``."""
        import re

        found = re.search(pattern, text)
        return found is not None
自动化测试执行
将端到端测试集成到CI/CD流水线,每次部署前自动运行。使用标签标记不同优先级的场景,快速通道运行核心场景(5分钟内完成),完整通道运行所有场景。对于关键业务场景,设置自动化监控持续验证。