人工评估:LLM输出质量的人工评估方法和最佳实践
人工评估:LLM输出质量的人工评估方法和最佳实践
为什么需要人工评估
自动化指标虽然高效,但在评估主观质量(如对话自然度、创意性、安全性)时存在局限,因此人工评估仍是验证LLM输出质量的金标准。
评估标准设计
多维度评估框架
from dataclasses import dataclass
from enum import Enum
from typing import List, Dict
class RatingScale(Enum):
    """Five-point rating scale.

    Members are declared in ascending order, from the lowest value (1)
    to the highest value (5).
    """

    POOR = 1
    FAIR = 2
    GOOD = 3
    VERY_GOOD = 4
    EXCELLENT = 5
@dataclass
class EvaluationCriteria:
    """A single named dimension along which a response is rated."""

    # Identifier of the criterion (e.g. "relevance").
    name: str
    # Human-readable explanation of what the criterion measures.
    description: str
    # Relative weight of this criterion.
    weight: float
    # A RatingScale member associated with the criterion.
    # NOTE(review): presumably the top of the allowed range - confirm.
    scale: RatingScale
class LLMEvaluationFramework:
    """Weighted multi-criteria scoring for a rated response.

    ``self.criteria`` holds five default criteria (relevance, accuracy,
    completeness, clarity, safety) weighted 0.25 / 0.25 / 0.20 / 0.15 /
    0.15, each tagged with ``RatingScale.EXCELLENT``.
    """

    def __init__(self):
        # (name, description, weight) of every default criterion.
        defaults = [
            ("relevance", "回答与问题的相关程度", 0.25),
            ("accuracy", "信息的准确性和可靠性", 0.25),
            ("completeness", "回答的完整程度", 0.20),
            ("clarity", "表达的清晰度和易理解性", 0.15),
            ("safety", "内容的安全性和合规性", 0.15),
        ]
        self.criteria = [
            EvaluationCriteria(
                name=name,
                description=description,
                weight=weight,
                scale=RatingScale.EXCELLENT,
            )
            for name, description, weight in defaults
        ]

    def calculate_score(self, ratings: Dict[str, int]) -> float:
        """Return the weighted average of the supplied ratings.

        Only criteria whose ``name`` is a key of ``ratings`` contribute,
        and the result is normalised by the summed weight of those
        criteria, so a partial rating is not penalised for the missing
        dimensions. Keys of ``ratings`` that match no criterion are
        ignored.

        Args:
            ratings: Mapping from criterion name to its numeric rating.

        Returns:
            The weighted average as a float, or ``0.0`` when no rated
            criterion carries positive weight.
        """
        # Float accumulators keep the return type a float on every path,
        # including the "nothing rated" fallback.
        total_score = 0.0
        total_weight = 0.0
        for criterion in self.criteria:
            if criterion.name not in ratings:
                continue
            total_score += ratings[criterion.name] * criterion.weight
            total_weight += criterion.weight

        if total_weight > 0:
            return total_score / total_weight
        return 0.0
评估问题设计
class EvaluationQuestionDesigner:
    """Builds plain-dict question payloads for human evaluators.

    Every method returns a new dict whose ``'type'`` key identifies the
    question format (``'pairwise'``, ``'rating'`` or ``'safety'``).
    """

    def design_pairwise_question(self, prompt, response_a, response_b):
        """Build a pairwise-comparison question for two responses.

        Args:
            prompt: The prompt both responses answer.
            response_a: Response stored under key ``'A'``.
            response_b: Response stored under key ``'B'``.

        Returns:
            A dict with the prompt, both responses, the question text,
            the comparison dimensions and five preference options.
        """
        return {
            'type': 'pairwise',
            'prompt': prompt,
            'responses': {
                'A': response_a,
                'B': response_b,
            },
            'question': '哪个回答更好?',
            'dimensions': ['整体质量', '准确性', '有用性'],
            'options': ['A明显更好', 'A略好', '差不多', 'B略好', 'B明显更好'],
        }

    def design_rating_question(self, prompt, response):
        """Build a per-dimension rating question for one response.

        Args:
            prompt: The prompt the response answers.
            response: The response to be rated.

        Returns:
            A dict listing four dimensions, each with the scale label
            ``'1-5'``, and with free-text comments enabled.
        """
        dimension_names = ['相关性', '准确性', '完整性', '清晰度']
        return {
            'type': 'rating',
            'prompt': prompt,
            'response': response,
            # The scale is a quoted label: the bare expression 1-5 is
            # integer subtraction and would store -4.
            'dimensions': [
                {'name': name, 'scale': '1-5'} for name in dimension_names
            ],
            'open_comment': True,
        }

    def design_safety_question(self, prompt, response):
        """Build a safety-review question for one response.

        Args:
            prompt: The prompt the response answers.
            response: The response to be reviewed.

        Returns:
            A dict carrying the prompt, the response and four safety
            check items.
        """
        return {
            'type': 'safety',
            'prompt': prompt,
            'response': response,
            'checks': [
                '是否包含有害内容',
                '是否涉及敏感话题',
                '是否符合伦理规范',
                '是否可能被滥用',
            ],
        }
评估员管理
评估员培训流程
class EvaluatorTraining:
    """Training curriculum and qualification check for evaluators.

    ``check_qualification`` calls ``get_evaluation_results``,
    ``calculate_agreement`` and ``calculate_avg_time``; none of them is
    defined in this class body, so they must be supplied elsewhere.
    """

    def __init__(self):
        self.training_materials = []
        self.qualifications = []

    def create_training_program(self):
        """Return the three-phase training plan.

        Returns:
            A dict keyed ``'phase_1'`` .. ``'phase_3'``; each value holds
            the phase ``'name'``, its ``'duration'`` and a ``'content'``
            list.
        """
        # (name, duration, content) per phase, in curriculum order.
        phases = (
            ('基础培训', '2小时', ['评估标准讲解', '评估工具使用', '案例分析练习']),
            ('校准训练', '1小时', ['与专家评估对比', '讨论分歧案例', '统一评估标准']),
            ('试评估', '2小时', ['完成50个测试案例', '准确率要求>85%', '通过后正式上岗']),
        )
        return {
            f'phase_{index}': {
                'name': name,
                'duration': duration,
                'content': content,
            }
            for index, (name, duration, content) in enumerate(phases, start=1)
        }

    def check_qualification(self, evaluator_id):
        """Decide whether an evaluator meets the qualification bar.

        Args:
            evaluator_id: Identifier passed to ``get_evaluation_results``.

        Returns:
            A dict with ``'qualified'`` (agreement rate above 0.8 and
            average time below 120), plus the two measured values.
        """
        history = self.get_evaluation_results(evaluator_id)
        # Agreement with the expert reference ratings.
        agreement = self.calculate_agreement(history)
        # Average time spent per evaluation.
        mean_seconds = self.calculate_avg_time(history)
        is_qualified = agreement > 0.8 and mean_seconds < 120
        return {
            'qualified': is_qualified,
            'agreement_rate': agreement,
            'avg_time_seconds': mean_seconds,
        }
评估员质量控制
class EvaluatorQualityControl:
    """Ongoing quality monitoring of an evaluator against gold items.

    ``monitor_evaluator`` calls ``is_match`` and ``check_consistency``;
    neither is defined in this class body, so they must be supplied
    elsewhere.
    """

    def __init__(self, gold_standard_dataset):
        # Looked up by item id; each entry is read via its 'scores' key.
        self.gold_standard = gold_standard_dataset

    def monitor_evaluator(self, evaluator_id, recent_evaluations):
        """Summarise an evaluator's recent quality.

        Args:
            evaluator_id: Identifier passed to ``check_consistency``.
            recent_evaluations: Iterable of dicts with ``'item_id'``,
                ``'scores'`` and ``'overall_score'`` keys (``'model'``
                is optional).

        Returns:
            A dict with ``'accuracy'`` (share of gold-standard items
            whose scores match), ``'bias'``, ``'consistency'`` and
            ``'needs_retraining'`` (accuracy below 0.8 or consistency
            below 0.7).
        """
        gold_matches = 0
        total = 0
        for evaluation in recent_evaluations:
            item_id = evaluation['item_id']
            if item_id not in self.gold_standard:
                continue
            # Only evaluations of gold-standard items count toward accuracy.
            total += 1
            gold = self.gold_standard[item_id]
            if self.is_match(evaluation['scores'], gold['scores']):
                gold_matches += 1

        # With no gold items seen, accuracy falls back to 0.0, which also
        # triggers 'needs_retraining' below.
        accuracy = gold_matches / total if total > 0 else 0.0

        bias = self.detect_bias(recent_evaluations)
        consistency = self.check_consistency(evaluator_id)

        return {
            'accuracy': accuracy,
            'bias': bias,
            'consistency': consistency,
            'needs_retraining': accuracy < 0.8 or consistency < 0.7,
        }

    def detect_bias(self, evaluations):
        """Report whether per-model average scores diverge strongly.

        Evaluations are grouped by their ``'model'`` key (``'unknown'``
        when absent) and ``'overall_score'`` is averaged per group.

        Args:
            evaluations: Iterable of dicts with an ``'overall_score'``
                key and an optional ``'model'`` key.

        Returns:
            True when more than one model is present and the population
            variance of the per-model averages exceeds 0.5, else False.
        """
        scores_by_model = {}
        for evaluation in evaluations:
            model = evaluation.get('model', 'unknown')
            scores_by_model.setdefault(model, []).append(
                evaluation['overall_score']
            )

        if len(scores_by_model) <= 1:
            return False

        model_means = [
            sum(scores) / len(scores) for scores in scores_by_model.values()
        ]
        # Compute the grand mean once instead of once per element.
        grand_mean = sum(model_means) / len(model_means)
        variance = (
            sum((mean - grand_mean) ** 2 for mean in model_means)
            / len(model_means)
        )
        return variance > 0.5
评估流程管理
批量评估任务
class EvaluationBatchManager:
    """Creates evaluation batches and aggregates their results.

    ``create_batch`` calls ``calculate_deadline`` and
    ``aggregate_results`` calls ``average_scores`` and
    ``calculate_inter_rater_agreement``; none of them is defined in this
    class body, so they must be supplied elsewhere.
    """

    def __init__(self, db_client, task_queue):
        self.db = db_client
        self.queue = task_queue

    def create_batch(self, items, evaluators_per_item=3):
        """Register a batch and enqueue one task per (item, evaluator).

        Args:
            items: Sequence of dicts with an ``'id'`` key and an
                optional ``'domain'`` key.
            evaluators_per_item: Number of evaluators requested for each
                item.

        Returns:
            The batch id returned by ``self.db.create_batch``.
        """
        batch_id = self.db.create_batch(len(items))
        for item in items:
            # Several evaluators are requested for every item.
            reviewers = self.select_evaluators(
                evaluators_per_item, item.get('domain')
            )
            for reviewer in reviewers:
                task = {
                    'batch_id': batch_id,
                    'item_id': item['id'],
                    'evaluator_id': reviewer['id'],
                    'deadline': self.calculate_deadline(),
                }
                self.queue.enqueue(task)
        return batch_id

    def select_evaluators(self, count, domain=None):
        """Query up to ``count`` active evaluators, best accuracy first.

        Args:
            count: Value bound to the query's ``LIMIT``.
            domain: Value bound to the expertise filter; when it is
                NULL the ``OR %s IS NULL`` arm matches every row.

        Returns:
            Whatever ``self.db.execute`` returns for the query.
        """
        # NOTE(review): `%s::jsonb` needs the bound value to be valid JSON
        # text - confirm how callers encode `domain` before it gets here.
        sql = (
            "\n"
            "SELECT e.id, e.name, e.expertise, q.accuracy_rate\n"
            "FROM evaluators e\n"
            "JOIN evaluator_qualifications q ON e.id = q.evaluator_id\n"
            "WHERE e.status = 'active'\n"
            "AND (e.expertise @> %s::jsonb OR %s IS NULL)\n"
            "ORDER BY q.accuracy_rate DESC\n"
            "LIMIT %s\n"
        )
        params = (domain, domain, count)
        return self.db.execute(sql, params)

    def aggregate_results(self, batch_id):
        """Combine the evaluations of every item in a batch.

        Args:
            batch_id: Identifier passed to ``self.db.get_batch_results``.

        Returns:
            A dict keyed by item id; each value holds the averaged
            ``'scores'``, the inter-rater ``'agreement'`` and
            ``'num_evaluations'``.
        """
        summary = {}
        per_item = self.db.get_batch_results(batch_id)
        for item_id, evaluations in per_item.items():
            score_sets = [evaluation['scores'] for evaluation in evaluations]
            mean_scores = self.average_scores(score_sets)
            rater_agreement = self.calculate_inter_rater_agreement(score_sets)
            summary[item_id] = {
                'scores': mean_scores,
                'agreement': rater_agreement,
                'num_evaluations': len(evaluations),
            }
        return summary
评估报告生成
class EvaluationReportGenerator:
    """Turns aggregated batch results into a report dict.

    ``generate_report`` calls ``calculate_overall_average``,
    ``get_score_distribution``, ``calculate_quality_rate`` and
    ``generate_recommendations``; none of them is defined in this class
    body, so they must be supplied elsewhere.
    """

    def generate_report(self, batch_results):
        """Assemble the full report for a batch.

        Args:
            batch_results: Mapping from item id to a result dict with
                ``'scores'`` and ``'agreement'`` keys.

        Returns:
            A dict with ``'summary'``, ``'quality_metrics'``,
            ``'issues'`` and ``'recommendations'`` sections.
        """
        summary = {
            'total_items': len(batch_results),
            'avg_score': self.calculate_overall_average(batch_results),
            'score_distribution': self.get_score_distribution(batch_results),
        }
        # NOTE(review): both rates come from the same helper with only the
        # threshold changed - confirm it yields a *low*-quality share for 2.0.
        quality_metrics = {
            'high_quality_rate': self.calculate_quality_rate(
                batch_results, threshold=4.0
            ),
            'low_quality_rate': self.calculate_quality_rate(
                batch_results, threshold=2.0
            ),
        }
        issues = self.identify_issues(batch_results)
        recommendations = self.generate_recommendations(batch_results)
        return {
            'summary': summary,
            'quality_metrics': quality_metrics,
            'issues': issues,
            'recommendations': recommendations,
        }

    def identify_issues(self, results):
        """List the items that look inaccurate or ambiguous.

        An item yields a high-severity ``'accuracy'`` issue when its
        accuracy score is below 3, and a medium-severity ``'ambiguity'``
        issue when its agreement is below 0.6; one item can yield both.

        Args:
            results: Mapping from item id to a result dict with
                ``result['scores']['accuracy']`` and
                ``result['agreement']``.

        Returns:
            A list of issue dicts in iteration order of ``results``.
        """
        flagged = []
        for item_id, result in results.items():
            if result['scores']['accuracy'] < 3:
                flagged.append(
                    {'item_id': item_id, 'type': 'accuracy', 'severity': 'high'}
                )
            if result['agreement'] < 0.6:
                flagged.append(
                    {'item_id': item_id, 'type': 'ambiguity', 'severity': 'medium'}
                )
        return flagged
最佳实践
- 明确标准:制定清晰、可操作的评估指南
- 多评估员:每个样本至少由2名评估员独立评估,建议3名
- 定期校准:定期组织评估员校准会议
- 质量监控:持续监控评估员质量
- 金标准:维护高质量的参考答案集
- 迭代优化:根据评估结果优化标准和流程