🧠

人工评估：LLM输出质量的人工评估方法和最佳实践

📂 llm ⏱ 4 min 643 words

人工评估：LLM输出质量的人工评估方法和最佳实践

为什么需要人工评估

自动化指标虽然高效，但在评估主观质量（如对话自然度、创意性、安全性）时存在局限。人工评估是验证LLM输出质量的金标准。

评估标准设计

多维度评估框架

from dataclasses import dataclass
from enum import Enum
from typing import List, Dict

class RatingScale(Enum):
    """Five-point ordinal rating scale; a larger value is a better rating."""

    # Member values run from 1 (worst) to 5 (best).
    POOR = 1
    FAIR = 2
    GOOD = 3
    VERY_GOOD = 4
    EXCELLENT = 5

@dataclass
class EvaluationCriteria:
    """One dimension along which an LLM response is rated."""

    # Identifier of the criterion; LLMEvaluationFramework.calculate_score
    # looks ratings up under this name.
    name: str
    # Human-readable explanation of what the criterion measures.
    description: str
    # Relative weight of the criterion in the weighted score.
    weight: float
    # A RatingScale member.
    # NOTE(review): presumably the top of the allowed scale -- confirm the
    # intended meaning; nothing in the visible code reads this field.
    scale: RatingScale

class LLMEvaluationFramework:
    """Weighted multi-criteria scoring of an LLM response.

    ``criteria`` holds the default rubric: relevance, accuracy, completeness,
    clarity and safety, weighted 0.25 / 0.25 / 0.20 / 0.15 / 0.15.
    """

    def __init__(self):
        # (name, description, weight) of every default criterion, in order.
        rubric = (
            ("relevance", "回答与问题的相关程度", 0.25),
            ("accuracy", "信息的准确性和可靠性", 0.25),
            ("completeness", "回答的完整程度", 0.20),
            ("clarity", "表达的清晰度和易理解性", 0.15),
            ("safety", "内容的安全性和合规性", 0.15),
        )
        self.criteria = [
            EvaluationCriteria(
                name=name,
                description=description,
                weight=weight,
                scale=RatingScale.EXCELLENT,
            )
            for name, description, weight in rubric
        ]

    def calculate_score(self, ratings: Dict[str, int]) -> float:
        """Return the weighted average of the supplied ratings.

        Only criteria whose name is a key of ``ratings`` contribute, and the
        result is normalised by the sum of their weights.  Keys of ``ratings``
        that match no criterion are ignored.  Returns 0 when no criterion
        matched.
        """
        weighted_sum = 0
        weight_sum = 0

        for criterion in self.criteria:
            if criterion.name not in ratings:
                continue
            weighted_sum += ratings[criterion.name] * criterion.weight
            weight_sum += criterion.weight

        if weight_sum > 0:
            return weighted_sum / weight_sum
        return 0

评估问题设计

class EvaluationQuestionDesigner:
    """Builds question payloads (plain dicts) for human evaluation tasks."""

    # Rating range attached to every dimension of a rating question.
    # It must be a string: the bare expression ``1-5`` is integer subtraction
    # and would store -4 instead of the intended range.
    RATING_SCALE = '1-5'

    def design_pairwise_question(self, prompt, response_a, response_b):
        """Build a pairwise-comparison question.

        Args:
            prompt: The prompt both responses answer.
            response_a: Response shown under label ``'A'``.
            response_b: Response shown under label ``'B'``.

        Returns:
            A dict with the two labelled responses, the question text, the
            comparison dimensions and a five-step preference option list.
        """
        return {
            'type': 'pairwise',
            'prompt': prompt,
            'responses': {
                'A': response_a,
                'B': response_b
            },
            'question': '哪个回答更好？',
            'dimensions': ['整体质量', '准确性', '有用性'],
            'options': ['A明显更好', 'A略好', '差不多', 'B略好', 'B明显更好']
        }

    def design_rating_question(self, prompt, response):
        """Build an absolute-rating question.

        Args:
            prompt: The prompt the response answers.
            response: The response to be rated.

        Returns:
            A dict whose ``'dimensions'`` entry lists each rated dimension
            together with the rating range ``RATING_SCALE``; free-text
            comments are enabled via ``'open_comment'``.
        """
        dimension_names = ['相关性', '准确性', '完整性', '清晰度']
        return {
            'type': 'rating',
            'prompt': prompt,
            'response': response,
            'dimensions': [
                {'name': name, 'scale': self.RATING_SCALE}
                for name in dimension_names
            ],
            'open_comment': True
        }

    def design_safety_question(self, prompt, response):
        """Build a safety-review question.

        Args:
            prompt: The prompt the response answers.
            response: The response to be reviewed.

        Returns:
            A dict carrying the list of safety checks the evaluator is asked
            to go through.
        """
        return {
            'type': 'safety',
            'prompt': prompt,
            'response': response,
            'checks': [
                '是否包含有害内容',
                '是否涉及敏感话题',
                '是否符合伦理规范',
                '是否可能被滥用'
            ]
        }

评估员管理

评估员培训流程

class EvaluatorTraining:
    """Training curriculum and qualification check for human evaluators."""

    def __init__(self):
        self.training_materials = []
        self.qualifications = []

    def create_training_program(self):
        """Return the three-phase training plan.

        The result maps ``'phase_1'`` .. ``'phase_3'`` to a dict with the
        phase ``'name'``, its ``'duration'`` and a ``'content'`` list.
        """
        # (name, duration, content) for each phase, in curriculum order.
        phases = (
            ('基础培训', '2小时', [
                '评估标准讲解',
                '评估工具使用',
                '案例分析练习',
            ]),
            ('校准训练', '1小时', [
                '与专家评估对比',
                '讨论分歧案例',
                '统一评估标准',
            ]),
            ('试评估', '2小时', [
                '完成50个测试案例',
                '准确率要求>85%',
                '通过后正式上岗',
            ]),
        )
        return {
            f'phase_{number}': {
                'name': title,
                'duration': length,
                'content': topics,
            }
            for number, (title, length, topics) in enumerate(phases, start=1)
        }

    def check_qualification(self, evaluator_id):
        """Decide whether an evaluator is qualified.

        Relies on ``get_evaluation_results``, ``calculate_agreement`` and
        ``calculate_avg_time``, which are not defined in this class as shown
        and must be supplied elsewhere.

        Returns:
            A dict with ``'qualified'`` (agreement above 0.8 and average time
            below 120), plus the raw ``'agreement_rate'`` and
            ``'avg_time_seconds'`` values.
        """
        history = self.get_evaluation_results(evaluator_id)

        # Agreement with the expert ratings, then evaluation speed.
        agreement = self.calculate_agreement(history)
        mean_seconds = self.calculate_avg_time(history)

        is_qualified = agreement > 0.8 and mean_seconds < 120
        return {
            'qualified': is_qualified,
            'agreement_rate': agreement,
            'avg_time_seconds': mean_seconds,
        }

评估员质量控制

class EvaluatorQualityControl:
    """Ongoing quality monitoring of a human evaluator against gold data."""

    def __init__(self, gold_standard_dataset):
        # Indexed by item id; each entry is read through its 'scores' key.
        self.gold_standard = gold_standard_dataset

    def monitor_evaluator(self, evaluator_id, recent_evaluations):
        """Summarise an evaluator's recent quality.

        Relies on ``is_match`` and ``check_consistency``, which are not
        defined in this class as shown and must be supplied elsewhere.

        Args:
            evaluator_id: Identifier passed to ``check_consistency``.
            recent_evaluations: Iterable of dicts with at least ``'item_id'``,
                ``'scores'`` and ``'overall_score'`` (plus optional
                ``'model'``).

        Returns:
            A dict with ``'accuracy'`` (share of gold-standard items whose
            scores match; 0 when none of the evaluations hit a gold item),
            ``'bias'``, ``'consistency'`` and ``'needs_retraining'``
            (accuracy below 0.8 or consistency below 0.7).
        """
        matched = 0
        compared = 0

        # Only evaluations of items present in the gold standard are scored.
        for evaluation in recent_evaluations:
            item_id = evaluation['item_id']
            if item_id not in self.gold_standard:
                continue
            gold = self.gold_standard[item_id]
            if self.is_match(evaluation['scores'], gold['scores']):
                matched += 1
            compared += 1

        accuracy = matched / compared if compared > 0 else 0

        bias = self.detect_bias(recent_evaluations)
        consistency = self.check_consistency(evaluator_id)

        return {
            'accuracy': accuracy,
            'bias': bias,
            'consistency': consistency,
            'needs_retraining': accuracy < 0.8 or consistency < 0.7
        }

    def detect_bias(self, evaluations, variance_threshold=0.5):
        """Report whether per-model average scores diverge too much.

        Evaluations are grouped by their ``'model'`` key (``'unknown'`` when
        absent) and the population variance of the per-model mean
        ``'overall_score'`` is compared with ``variance_threshold``.

        Args:
            evaluations: Iterable of dicts carrying ``'overall_score'`` and
                optionally ``'model'``.
            variance_threshold: Variance above which bias is reported.

        Returns:
            ``True`` when more than one model is present and the variance of
            the per-model means exceeds the threshold, otherwise ``False``.
        """
        scores_by_model = {}
        for evaluation in evaluations:
            model = evaluation.get('model', 'unknown')
            scores_by_model.setdefault(model, []).append(
                evaluation['overall_score']
            )

        # A single model (or no data) gives nothing to compare against.
        if len(scores_by_model) <= 1:
            return False

        model_means = [
            sum(scores) / len(scores) for scores in scores_by_model.values()
        ]
        # Computed once up front instead of once per element of the sum below.
        overall_mean = sum(model_means) / len(model_means)
        variance = sum(
            (mean - overall_mean) ** 2 for mean in model_means
        ) / len(model_means)
        return variance > variance_threshold

评估流程管理

批量评估任务

class EvaluationBatchManager:
    """Creates evaluation batches, assigns evaluators and aggregates results."""

    def __init__(self, db_client, task_queue):
        self.db = db_client
        self.queue = task_queue

    def create_batch(self, items, evaluators_per_item=3):
        """Register a batch and enqueue one task per (item, evaluator) pair.

        Relies on ``calculate_deadline``, which is not defined in this class
        as shown and must be supplied elsewhere.

        Args:
            items: Sequence of dicts with an ``'id'`` and optional
                ``'domain'``.
            evaluators_per_item: How many evaluators to request per item.

        Returns:
            The batch id returned by ``db.create_batch``.
        """
        batch_id = self.db.create_batch(len(items))

        for item in items:
            # Several evaluators are assigned to every item.
            reviewers = self.select_evaluators(
                evaluators_per_item, item.get('domain')
            )
            for reviewer in reviewers:
                self.queue.enqueue(dict(
                    batch_id=batch_id,
                    item_id=item['id'],
                    evaluator_id=reviewer['id'],
                    deadline=self.calculate_deadline(),
                ))

        return batch_id

    def select_evaluators(self, count, domain=None):
        """Query up to ``count`` active evaluators, best accuracy first.

        The query filters on ``domain`` unless it is NULL.

        NOTE(review): ``domain`` is cast with ``::jsonb``, so it presumably
        has to be valid JSON text -- confirm what callers pass.
        """
        query = """
        SELECT e.id, e.name, e.expertise, q.accuracy_rate
        FROM evaluators e
        JOIN evaluator_qualifications q ON e.id = q.evaluator_id
        WHERE e.status = 'active'
            AND (e.expertise @> %s::jsonb OR %s IS NULL)
        ORDER BY q.accuracy_rate DESC
        LIMIT %s
        """
        # ``domain`` is bound twice: once for the filter, once for the NULL test.
        return self.db.execute(query, (domain, domain, count))

    def aggregate_results(self, batch_id):
        """Combine the evaluations of every item in a batch.

        Relies on ``average_scores`` and ``calculate_inter_rater_agreement``,
        which are not defined in this class as shown.

        Returns:
            A dict mapping item id to its averaged ``'scores'``, the
            inter-rater ``'agreement'`` and ``'num_evaluations'``.
        """
        summary = {}
        for item_id, evaluations in self.db.get_batch_results(batch_id).items():
            per_rater = [entry['scores'] for entry in evaluations]
            mean_scores = self.average_scores(per_rater)
            rater_agreement = self.calculate_inter_rater_agreement(per_rater)
            summary[item_id] = dict(
                scores=mean_scores,
                agreement=rater_agreement,
                num_evaluations=len(evaluations),
            )

        return summary

评估报告生成

class EvaluationReportGenerator:
    """Turns aggregated batch results into a summary report."""

    def generate_report(self, batch_results):
        """Assemble the report dict for one batch.

        Relies on ``calculate_overall_average``, ``get_score_distribution``,
        ``calculate_quality_rate`` and ``generate_recommendations``, which
        are not defined in this class as shown.

        Returns:
            A dict with ``'summary'``, ``'quality_metrics'``, ``'issues'``
            and ``'recommendations'`` sections.
        """
        summary = {
            'total_items': len(batch_results),
            'avg_score': self.calculate_overall_average(batch_results),
            'score_distribution': self.get_score_distribution(batch_results),
        }
        # NOTE(review): both rates come from the same helper with different
        # thresholds -- confirm it handles the "low" direction as intended.
        quality_metrics = {
            'high_quality_rate': self.calculate_quality_rate(
                batch_results, threshold=4.0
            ),
            'low_quality_rate': self.calculate_quality_rate(
                batch_results, threshold=2.0
            ),
        }
        issues = self.identify_issues(batch_results)
        recommendations = self.generate_recommendations(batch_results)

        return {
            'summary': summary,
            'quality_metrics': quality_metrics,
            'issues': issues,
            'recommendations': recommendations,
        }

    def identify_issues(self, results):
        """List the problems found in per-item results.

        An item with an ``'accuracy'`` score below 3 yields a high-severity
        ``'accuracy'`` issue; an item whose ``'agreement'`` is below 0.6
        yields a medium-severity ``'ambiguity'`` issue.  One item can produce
        both, in that order.
        """
        found = []

        for item_id, outcome in results.items():
            low_accuracy = outcome['scores']['accuracy'] < 3
            if low_accuracy:
                found.append(
                    {'item_id': item_id, 'type': 'accuracy', 'severity': 'high'}
                )

            raters_disagree = outcome['agreement'] < 0.6
            if raters_disagree:
                found.append(
                    {'item_id': item_id, 'type': 'ambiguity', 'severity': 'medium'}
                )

        return found

最佳实践

明确标准：制定清晰、可操作的评估指南
多评估员：每个样本至少由2名（推荐3名）评估员独立评估
定期校准：定期组织评估员校准会议
质量监控：持续监控评估员质量
金标准：维护高质量的参考答案集
迭代优化：根据评估结果优化标准和流程