聊天机器人评估:对话系统评估指标和方法
聊天机器人评估:对话系统评估指标和方法
聊天机器人评估的挑战
对话系统的评估比单轮问答更复杂,需要同时考虑上下文连贯性、多轮交互质量、用户满意度等多个维度。
自动评估指标
对话质量指标
from dataclasses import dataclass
from typing import List
import numpy as np
@dataclass
class DialogueMetrics:
    """Quality scores for a dialogue, one float per evaluation dimension."""

    # Coherence score.
    coherence: float
    # Relevance score.
    relevance: float
    # Informativeness score.
    informativeness: float
    # Safety score.
    safety: float
    # Fluency score.
    fluency: float
class DialogueEvaluator:
    """Score assistant replies along the DialogueMetrics dimensions.

    NOTE(review): ``compute_informativeness``, ``compute_safety``,
    ``compute_fluency``, ``get_embedding`` and ``cosine_similarity`` are
    called by this class but are not defined in this section; presumably a
    subclass or another part of the file provides them -- confirm.
    """

    def __init__(self):
        # Per-dimension weights (they sum to 1.0). Nothing in this class reads
        # them yet; presumably meant for a weighted overall score -- confirm.
        self.metrics_weights = {
            'coherence': 0.25,
            'relevance': 0.25,
            'informativeness': 0.20,
            'safety': 0.15,
            'fluency': 0.15
        }

    def evaluate_turn(self, context: List[str], response: str) -> "DialogueMetrics":
        """Score a single reply against the utterances that precede it.

        Args:
            context: Utterances before the reply, oldest first. The last one
                is used as the query the reply is expected to answer.
            response: The reply to score.

        Returns:
            A DialogueMetrics holding one score per dimension.
        """
        # With no preceding utterance there is no query; use an empty string
        # rather than failing with IndexError on context[-1].
        query = context[-1] if context else ''
        return DialogueMetrics(
            coherence=self.compute_coherence(context, response),
            relevance=self.compute_relevance(query, response),
            informativeness=self.compute_informativeness(response),
            safety=self.compute_safety(response),
            fluency=self.compute_fluency(response)
        )

    def evaluate_dialogue(self, dialogue: List[str]) -> "DialogueMetrics":
        """Score a whole dialogue by aggregating its assistant turns.

        Utterances at odd indices (1, 3, ...) are treated as assistant replies
        and scored against everything that precedes them.

        Args:
            dialogue: Utterances in order.

        Returns:
            A DialogueMetrics where every dimension is the mean over the
            scored turns, except safety, which is the minimum so that a single
            unsafe reply is not averaged away.

        Raises:
            ValueError: If the dialogue has no utterance at an odd index, i.e.
                there is no assistant reply to score.
        """
        turn_metrics = [
            self.evaluate_turn(dialogue[:i], dialogue[i])
            for i in range(1, len(dialogue), 2)
        ]
        if not turn_metrics:
            # Fail early with a clear message instead of letting np.mean([])
            # emit warnings and min([]) raise a cryptic ValueError.
            raise ValueError('dialogue contains no assistant reply to evaluate')

        def mean_of(name: str) -> float:
            # float() keeps the dataclass fields plain floats, not np.float64.
            return float(np.mean([getattr(m, name) for m in turn_metrics]))

        return DialogueMetrics(
            coherence=mean_of('coherence'),
            relevance=mean_of('relevance'),
            informativeness=mean_of('informativeness'),
            safety=min(m.safety for m in turn_metrics),
            fluency=mean_of('fluency')
        )

    def compute_coherence(self, context: List[str], response: str) -> float:
        """Return the embedding similarity between the context and the reply.

        The context utterances are joined with single spaces, embedded via
        ``self.get_embedding``, and compared to the reply embedding with
        ``self.cosine_similarity``.
        """
        context_embedding = self.get_embedding(' '.join(context))
        response_embedding = self.get_embedding(response)
        return self.cosine_similarity(context_embedding, response_embedding)

    def compute_relevance(self, query: str, response: str) -> float:
        """Return the fraction of distinct query tokens found in the reply.

        Tokens are produced by whitespace splitting, so text without spaces
        (for example unsegmented Chinese) counts as a single token.

        Returns:
            A value in [0, 1]; 0.0 when the query has no tokens.
        """
        query_tokens = set(query.split())
        if not query_tokens:
            return 0.0
        response_tokens = set(response.split())
        return len(query_tokens & response_tokens) / len(query_tokens)
对话一致性检测
class ConsistencyChecker:
    """Detect contradictions between factual statements made across turns."""

    def __init__(self, fact_checker=None):
        """Store the optional contradiction checker.

        Args:
            fact_checker: Optional object exposing
                ``check_contradiction(fact1, fact2)``. Without it no
                contradiction is ever reported.
        """
        self.fact_checker = fact_checker
        # Facts collected by the most recent check_consistency call.
        self.facts_extracted = []

    def check_consistency(self, dialogue: List[str]) -> dict:
        """Check a dialogue for statements that contradict earlier turns.

        Args:
            dialogue: Utterances in order.

        Returns:
            A dict with ``consistent`` (bool), ``inconsistencies`` (one entry
            per contradicting pair, holding the turn index, the new fact and
            the earlier fact it contradicts) and ``consistency_score`` in
            [0, 1]. An empty dialogue is reported as fully consistent.
        """
        # Reset per call so facts from a previously checked dialogue cannot
        # produce spurious contradictions in this one.
        self.facts_extracted = []
        inconsistencies = []

        for turn, utterance in enumerate(dialogue):
            new_facts = self.extract_facts(utterance)
            # Compare each new fact individually against facts from earlier
            # turns; contradicts() expects two single statements.
            for prev_fact in self.facts_extracted:
                for fact in new_facts:
                    if self.contradicts(prev_fact, fact):
                        inconsistencies.append({
                            'turn': turn,
                            'fact': fact,
                            'contradicts': prev_fact
                        })
            self.facts_extracted.extend(new_facts)

        if dialogue:
            # Several contradictions can occur in one turn; clamp at zero so
            # the score stays within [0, 1].
            score = max(0.0, 1.0 - len(inconsistencies) / len(dialogue))
        else:
            score = 1.0

        return {
            'consistent': not inconsistencies,
            'inconsistencies': inconsistencies,
            'consistency_score': score
        }

    def extract_facts(self, text: str) -> List[str]:
        """Return the sentences of ``text`` that contain at least one digit.

        Sentences are delimited by the full-width period; each returned
        sentence has surrounding whitespace stripped.
        """
        return [
            sentence.strip()
            for sentence in text.split('。')
            if any(char.isdigit() for char in sentence)
        ]

    def contradicts(self, fact1: str, fact2: str) -> bool:
        """Return whether the two facts contradict, per the fact checker.

        Always False when no fact checker was supplied.
        """
        if self.fact_checker:
            return self.fact_checker.check_contradiction(fact1, fact2)
        return False
多轮对话评估
class MultiTurnEvaluator:
    """Evaluate multi-turn dialogues for task completion and satisfaction.

    NOTE(review): ``GoalTracker``, ``check_goal_status``,
    ``avg_response_length``, ``calculate_engagement``, ``count_topic_shifts``
    and ``sentiment_score`` are used here but not defined in this section;
    presumably they live elsewhere in the file or in a subclass -- confirm.
    """

    def __init__(self):
        self.goal_tracker = GoalTracker()

    def evaluate_task_completion(self, dialogue, task_goals):
        """Report how many of the task goals the dialogue achieved.

        Args:
            dialogue: The dialogue turns.
            task_goals: Goals to check, each passed to ``check_goal_status``.

        Returns:
            A dict with ``completion_rate`` (completed goals / all goals, 0.0
            when there are no goals), the ``completed`` and ``partial`` goal
            lists, and ``efficiency`` from ``calculate_efficiency``.
        """
        completed_goals = []
        partial_goals = []

        for goal in task_goals:
            status = self.check_goal_status(dialogue, goal)
            if status == 'completed':
                completed_goals.append(goal)
            elif status == 'partial':
                partial_goals.append(goal)

        # Guard the division: an empty goal list yields 0.0, following the
        # zero-on-empty convention of calculate_efficiency.
        total_goals = len(task_goals)
        completion_rate = len(completed_goals) / total_goals if total_goals > 0 else 0.0

        return {
            'completion_rate': completion_rate,
            'completed': completed_goals,
            'partial': partial_goals,
            'efficiency': self.calculate_efficiency(dialogue, completed_goals)
        }

    def calculate_efficiency(self, dialogue, completed_goals):
        """Return completed goals per dialogue turn (0 for an empty dialogue)."""
        total_turns = len(dialogue)
        goals_achieved = len(completed_goals)
        return goals_achieved / total_turns if total_turns > 0 else 0

    def evaluate_user_satisfaction(self, dialogue):
        """Estimate user satisfaction from surface features of the dialogue.

        The estimate is a fixed weighted sum: 0.4 * engagement
        + 0.3 * positive sentiment + 0.3 * (1 - topic shifts per turn).

        Returns:
            The satisfaction estimate, or 0.0 for an empty dialogue.
        """
        total_turns = len(dialogue)
        if total_turns == 0:
            # Nothing to measure, and the topic-shift ratio would divide by zero.
            return 0.0

        features = {
            # Collected alongside the others but not part of the score below.
            'avg_response_length': self.avg_response_length(dialogue),
            'user_engagement': self.calculate_engagement(dialogue),
            'topic_shifts': self.count_topic_shifts(dialogue),
            'positive_sentiment': self.sentiment_score(dialogue)
        }

        # Simplified satisfaction model.
        satisfaction = (
            features['user_engagement'] * 0.4 +
            features['positive_sentiment'] * 0.3 +
            (1 - features['topic_shifts'] / total_turns) * 0.3
        )
        return satisfaction
人工评估设计
评估任务模板
class HumanEvaluationTemplate:
    """Build task payloads that are handed to human annotators."""

    def create_pairwise_comparison(self, dialogue_a, dialogue_b):
        """Return a pairwise-comparison task for two dialogues.

        Each dimension offers the same five-point preference scale, and the
        task ends with a free-text prompt asking for the reason.
        """
        preference_scale = ('A好很多', 'A略好', '差不多', 'B略好', 'B好很多')
        dimension_names = ('整体质量', '自然度', '有帮助程度')
        # Every dimension gets its own copy of the option list.
        dimensions = [
            {'name': label, 'options': list(preference_scale)}
            for label in dimension_names
        ]
        return {
            'type': 'pairwise',
            'instruction': '请比较以下两个对话,选择更好的一个:',
            'dialogue_a': dialogue_a,
            'dialogue_b': dialogue_b,
            'dimensions': dimensions,
            'open_comment': '请简要说明选择理由:'
        }

    def create_rating_task(self, dialogue):
        """Return a rating task scoring one dialogue on four 1-5 dimensions."""
        rated_dimensions = (
            ('连贯性', '对话是否逻辑连贯'),
            ('有用性', '助手回复是否有帮助'),
            ('自然度', '对话是否自然流畅'),
            ('安全性', '是否存在不当内容'),
        )
        dimensions = []
        for label, explanation in rated_dimensions:
            dimensions.append({'name': label, 'scale': '1-5', 'description': explanation})
        return {
            'type': 'rating',
            'instruction': '请对以下对话进行评分:',
            'dialogue': dialogue,
            'dimensions': dimensions
        }
自动化测试框架
class ChatbotTestSuite:
    """Run scripted conversations against a chatbot client and check replies."""

    def __init__(self, chatbot_client):
        """Store the client under test.

        Args:
            chatbot_client: Object exposing
                ``send_message(content, conversation_history=...)`` that
                returns the reply text.
        """
        self.client = chatbot_client
        self.test_cases = []

    def add_test_case(self, name, turns, expected_behavior):
        """Register a test case.

        Args:
            name: Label reported in the result.
            turns: List of dicts with ``role`` and ``content`` keys.
            expected_behavior: Dict of expectations; ``evaluate_response``
                understands ``keywords``, ``forbidden`` and ``max_turns``.
        """
        self.test_cases.append({
            'name': name,
            'turns': turns,
            'expected': expected_behavior
        })

    def run_tests(self):
        """Run every registered test case and return the generated report.

        NOTE(review): ``generate_report`` is not defined in this section;
        presumably supplied by a subclass or elsewhere in the file -- confirm.
        """
        results = [self.run_single_test(test) for test in self.test_cases]
        return self.generate_report(results)

    def run_single_test(self, test):
        """Play one test case's user turns against the client and evaluate.

        Only turns whose role is ``user`` are sent; other turns are skipped.

        Returns:
            A dict with ``test_name``, ``passed`` and the full evaluation
            under ``details``.
        """
        # Full transcript handed to the client as context: user messages and
        # assistant replies, in order.
        history = []
        # Assistant replies only; this is what the expectations are checked
        # against.
        conversation = []

        for turn in test['turns']:
            if turn['role'] == 'user':
                response = self.client.send_message(
                    turn['content'],
                    # Pass a copy so later appends do not alter what the
                    # client was given for this call.
                    conversation_history=list(history)
                )
                history.append({'role': 'user', 'content': turn['content']})
                reply = {'role': 'assistant', 'content': response}
                history.append(reply)
                conversation.append(reply)

        evaluation = self.evaluate_response(
            conversation,
            test['expected']
        )

        return {
            'test_name': test['name'],
            'passed': evaluation['passed'],
            'details': evaluation
        }

    def evaluate_response(self, conversation, expected):
        """Check the conversation against the expectations.

        Args:
            conversation: List of message dicts; their ``content`` values are
                joined with spaces for the text checks.
            expected: Dict that may contain ``keywords`` (passes when at least
                one is present), ``forbidden`` (passes when none is present)
                and ``max_turns`` (passes when the message count does not
                exceed it).

        Returns:
            A dict with ``passed`` (True when every check passed, including
            when there were none) and ``checks``, a list of (name, result)
            tuples.
        """
        checks = []

        # Join the text once, and only when a text-based check needs it.
        needs_text = 'keywords' in expected or 'forbidden' in expected
        full_response = ' '.join(m['content'] for m in conversation) if needs_text else ''

        if 'keywords' in expected:
            keyword_present = any(kw in full_response for kw in expected['keywords'])
            checks.append(('keywords', keyword_present))

        if 'forbidden' in expected:
            forbidden_present = any(f in full_response for f in expected['forbidden'])
            checks.append(('forbidden', not forbidden_present))

        if 'max_turns' in expected:
            within_limit = len(conversation) <= expected['max_turns']
            checks.append(('turns', within_limit))

        passed = all(result for _, result in checks)

        return {
            'passed': passed,
            'checks': checks
        }
评估报告模板
def generate_evaluation_report(results):
    """Summarise a list of evaluation results into a report dict.

    Args:
        results: List of dicts, each with a ``passed`` flag and a ``metrics``
            object exposing ``coherence``, ``relevance`` and ``safety``.

    Returns:
        A dict with ``summary`` (total, passed count, pass rate), ``metrics``
        (mean coherence, relevance and safety), ``issues`` (the failed
        results) and ``recommendations`` from ``generate_recommendations``.
        With no results, the pass rate and all means are 0.0.
    """
    total = len(results)
    passed = sum(1 for r in results if r['passed'])

    def average(attr):
        # np.mean([]) returns nan with a RuntimeWarning; report 0.0 instead.
        if total == 0:
            return 0.0
        return float(np.mean([getattr(r['metrics'], attr) for r in results]))

    return {
        'summary': {
            'total_tests': total,
            'passed': passed,
            # Guard the division so an empty run does not raise.
            'pass_rate': passed / total if total > 0 else 0.0
        },
        'metrics': {
            'avg_coherence': average('coherence'),
            'avg_relevance': average('relevance'),
            'avg_safety': average('safety')
        },
        'issues': [
            r for r in results if not r['passed']
        ],
        'recommendations': generate_recommendations(results)
    }
最佳实践
- 多维度评估:综合考虑连贯性、相关性、安全性等
- 自动化+人工:自动指标初筛,人工评估深度检查
- A/B 测试:版本对比评估
- 用户研究:真实用户场景测试
- 回归测试:确保新版本不退化
- 持续监控:上线后持续评估质量