🧠

基准数据集：LLM评测常用基准数据集介绍

📂 llm ⏱ 7 min 1313 words

标签：基准数据集 · 模型评测 · 性能评估 · 质量评估 · LLM评估

基准数据集：LLM评测常用基准数据集介绍

LLM 评测的重要性

随着大语言模型的快速发展，客观、全面的模型评测变得至关重要。基准数据集（Benchmark Datasets）为模型评估提供了标准化的测试环境，帮助研究者和开发者理解模型的能力边界和改进方向。

基准数据集的核心价值：

- 标准化比较：提供统一的评估标准，便于不同模型间的公平比较
- 能力画像：揭示模型在不同维度的能力表现
- 改进方向：指导模型优化的重点领域
- 质量保证：确保模型在部署前达到预期质量标准

主流基准数据集分类

1. 知识与推理能力基准

MMLU (Massive Multitask Language Understanding)

# MMLU数据集加载与评估示例
from datasets import load_dataset
import json

class MMLUEvaluator:
    """Multiple-choice accuracy evaluator for the MMLU benchmark.

    Construction loads ``cais/mmlu`` (``all`` configuration) through
    ``load_dataset``; scoring uses the ``test`` split and the subjects listed
    in ``self.subjects``.
    """

    # Answer letters, in the same order as each question's ``choices`` list.
    CHOICE_LABELS = "ABCD"

    def __init__(self):
        self.dataset = load_dataset("cais/mmlu", "all")
        self.subjects = [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics",
            "college_biology", "college_chemistry", "college_computer_science"
        ]

    def format_prompt(self, question: dict) -> str:
        """Render one question as a lettered multiple-choice prompt.

        Args:
            question: Row with a ``question`` string and a ``choices`` list.

        Returns:
            The question, one ``"<letter>. <choice>"`` line per option, and a
            trailing ``"Answer:"`` cue.
        """
        lines = [f"Question: {question['question']}"]
        lines.extend(
            f"{label}. {choice}"
            for label, choice in zip(self.CHOICE_LABELS, question["choices"])
        )
        lines.append("Answer:")
        return "\n".join(lines)

    @staticmethod
    def _query_model(model, prompt: str):
        """Ask the model for a completion.

        ``predict`` is tried first to stay compatible with existing callers;
        ``generate`` is the fallback because the other evaluators in this file
        call ``model.generate``.
        """
        ask = getattr(model, "predict", None)
        if ask is None:
            ask = model.generate
        return ask(prompt)

    @staticmethod
    def _extract_choice(prediction) -> str:
        """Return the answer letter a response starts with, or ``""``.

        Accepts ``"B"``, ``"b."``, ``"(B)"`` and ``"B) because ..."``; a
        response that merely begins with a word (``"Because"``) is rejected.
        """
        import re

        text = str(prediction).strip().upper()
        match = re.match(r"\(?([A-D])\)?(?![A-Z0-9])", text)
        return match.group(1) if match else ""

    @classmethod
    def _answer_label(cls, answer) -> str:
        """Normalise the gold answer to a letter.

        ``cais/mmlu`` stores the answer as an index into ``choices``; letter
        and digit strings are accepted as well so custom data keeps working.
        """
        if isinstance(answer, int):
            return cls.CHOICE_LABELS[answer]
        text = str(answer).strip().upper()
        if text.isdigit():
            return cls.CHOICE_LABELS[int(text)]
        return text

    def evaluate_subject(self, subject: str, model, n_samples: int = 100) -> dict:
        """Score the model on the first ``n_samples`` test rows of one subject.

        Args:
            subject: Value matched against each row's ``subject`` field.
            model: Object exposing ``predict(prompt)`` or ``generate(prompt)``.
            n_samples: Upper bound on the number of rows evaluated.

        Returns:
            Dict with ``subject``, ``accuracy``, ``total`` and ``correct``.
            ``accuracy`` is ``0.0`` when no rows were evaluated.
        """
        subject_data = self.dataset["test"].filter(
            lambda row: row["subject"] == subject
        )
        total = max(0, min(n_samples, len(subject_data)))

        correct = 0
        for i in range(total):
            sample = subject_data[i]
            prediction = self._query_model(model, self.format_prompt(sample))
            # Compare letter with letter: the raw gold answer is an index.
            if self._extract_choice(prediction) == self._answer_label(sample["answer"]):
                correct += 1

        return {
            "subject": subject,
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }

    def evaluate_all(self, model, n_per_subject: int = 50) -> dict:
        """Score the model on every subject in ``self.subjects``.

        Returns:
            Dict with ``overall_accuracy`` (micro-average over all evaluated
            rows, ``0.0`` if there were none) and the per-subject results
            under ``by_subject``.
        """
        results = {
            subject: self.evaluate_subject(subject, model, n_per_subject)
            for subject in self.subjects
        }

        total_correct = sum(r["correct"] for r in results.values())
        total_samples = sum(r["total"] for r in results.values())

        return {
            "overall_accuracy": total_correct / total_samples if total_samples else 0.0,
            "by_subject": results
        }

    def evaluate(self, model, n_samples: int = 50) -> dict:
        """Run ``evaluate_all`` under the ``evaluate(model, n_samples=...)``
        signature used by the other evaluators in this file.

        ``n_samples`` is the per-subject cap.
        """
        return self.evaluate_all(model, n_per_subject=n_samples)

# Usage example. Constructing the evaluator runs
# load_dataset("cais/mmlu", "all") inside __init__.
evaluator = MMLUEvaluator()
# results = evaluator.evaluate_all(my_model)

GSM8K (Grade School Math 8K)

class GSM8KEvaluator:
    """Exact-answer evaluator for GSM8K grade-school math problems.

    Construction loads ``openai/gsm8k`` (``main`` configuration) through
    ``load_dataset``; scoring uses the ``test`` split.
    """

    def __init__(self):
        self.dataset = load_dataset("openai/gsm8k", "main")

    def format_prompt(self, problem: dict) -> str:
        """Return the problem text followed by a step-by-step reasoning cue."""
        return f"Question: {problem['question']}\n\nLet's think step by step."

    def extract_answer(self, response: str) -> "float | None":
        """Return the last number mentioned in ``response``.

        Thousands separators are removed first, so ``"1,234"`` is read as
        ``1234.0`` rather than being split into ``1`` and ``234``.

        Returns:
            The last number as a float, or ``None`` when the response is
            empty or contains no number.
        """
        import re

        if not response:
            return None
        # Only drop a comma that sits between a digit and exactly three
        # digits, so enumerations such as "1,2,3" are left alone.
        cleaned = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", response)
        numbers = re.findall(r"[-+]?\d*\.?\d+", cleaned)
        return float(numbers[-1]) if numbers else None

    def evaluate(self, model, n_samples: int = 200) -> dict:
        """Score the model on the first ``n_samples`` test problems.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            n_samples: Upper bound on the number of problems evaluated.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            ``0.0`` when no problems were evaluated.
        """
        import math

        test_data = self.dataset["test"]
        total = max(0, min(n_samples, len(test_data)))

        correct = 0
        for i in range(total):
            sample = test_data[i]
            response = model.generate(self.format_prompt(sample))
            predicted_answer = self.extract_answer(response)
            if predicted_answer is None:
                continue

            # The reference solution ends with "#### <final answer>".
            reference = sample["answer"].split("####")[-1].strip().replace(",", "")
            expected_answer = float(reference)

            # Tolerant comparison: "0.1 + 0.2"-style results should not be
            # rejected because of float representation noise.
            if math.isclose(predicted_answer, expected_answer, rel_tol=1e-9, abs_tol=1e-6):
                correct += 1

        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }

# Usage example. Constructing the evaluator runs
# load_dataset("openai/gsm8k", "main") inside __init__.
evaluator = GSM8KEvaluator()
# math_results = evaluator.evaluate(my_model, n_samples=100)

2. 语言理解与生成基准

HellaSwag (常识推理)

class HellaSwagEvaluator:
    """Multiple-choice evaluator for HellaSwag sentence completion.

    Construction loads ``Rowan/hellaswag`` through ``load_dataset``; scoring
    uses the ``validation`` split.
    """

    def __init__(self):
        self.dataset = load_dataset("Rowan/hellaswag", "default")

    def format_prompt(self, sample: dict) -> str:
        """Render the context and its candidate endings as a lettered prompt.

        Endings are lettered ``A``, ``B``, ... in list order.
        """
        context = sample["activity_label"] + ": " + sample["ctx_a"] + " " + sample["ctx_b"]
        lines = [f"{context}\n\nWhich of the following is the most logical continuation?"]
        lines.extend(
            f"{chr(65 + i)}. {ending}"
            for i, ending in enumerate(sample["endings"])
        )
        lines.append("Answer:")
        return "\n".join(lines)

    @staticmethod
    def _leading_letter(response) -> str:
        """Return the option letter a response starts with, or ``""``.

        Accepts ``"A"``, ``"a."``, ``"(A)"`` and ``"A) ..."``; rejects
        responses that start with an ordinary word.
        """
        import re

        match = re.match(r"\(?([A-Z])\)?(?![A-Z0-9])", str(response).strip().upper())
        return match.group(1) if match else ""

    def evaluate(self, model, n_samples: int = 1000) -> dict:
        """Score the model on the first ``n_samples`` validation rows.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            n_samples: Upper bound on the number of rows evaluated.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            ``0.0`` when no rows were evaluated.
        """
        rows = self.dataset["validation"]
        total = max(0, min(n_samples, len(rows)))

        correct = 0
        for i in range(total):
            sample = rows[i]
            response = model.generate(self.format_prompt(sample))

            # The dataset stores ``label`` as a string ("0".."3"); int()
            # accepts that as well as a plain integer.
            expected = chr(65 + int(sample["label"]))
            if self._leading_letter(response) == expected:
                correct += 1

        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }

ARC (AI2 Reasoning Challenge)

class ARCEvaluator:
    """Multiple-choice evaluator for the AI2 Reasoning Challenge (ARC).

    Construction loads ``allenai/ai2_arc`` (``ARC-Easy`` configuration)
    through ``load_dataset``.
    """

    def __init__(self):
        self.dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")

    def format_prompt(self, sample: dict) -> str:
        """Render the question with its labelled answer options.

        Each option is shown as ``"<label>. <text>"`` using the labels from
        the sample, so the model can answer with the label that ``answerKey``
        refers to.
        """
        choices = sample["choices"]
        lines = [f"Question: {sample['question']}"]
        lines.extend(
            f"{label}. {text}"
            for label, text in zip(choices["label"], choices["text"])
        )
        lines.append("Answer:")
        return "\n".join(lines)

    @staticmethod
    def _leading_label(response: str) -> str:
        """Return the option label (letter or digit) a response starts with.

        Returns ``""`` when the response starts with an ordinary word.
        """
        import re

        match = re.match(r"\(?([A-Z0-9])\)?(?![A-Z0-9])", response.strip().upper())
        return match.group(1) if match else ""

    def evaluate(self, model, subset: str = "validation", n_samples: int = 500) -> dict:
        """Score the model on the first ``n_samples`` rows of ``subset``.

        A response counts as correct when it starts with the ``answerKey``
        label or, ignoring case and a trailing period, equals the text of the
        correct option.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            subset: Name of the dataset split to read.
            n_samples: Upper bound on the number of rows evaluated.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            ``0.0`` when no rows were evaluated.
        """
        data = self.dataset[subset]
        total = max(0, min(n_samples, len(data)))

        correct = 0
        for i in range(total):
            sample = data[i]
            prediction = model.generate(self.format_prompt(sample)).strip()

            labels = [str(label) for label in sample["choices"]["label"]]
            answer_key = str(sample["answerKey"]).strip()
            # answerKey is a label ("A".."E" or "1".."4"), not a list index:
            # look the label up to find the matching option text.
            answer_text = None
            if answer_key in labels:
                answer_text = sample["choices"]["text"][labels.index(answer_key)]

            by_label = self._leading_label(prediction) == answer_key.upper()
            by_text = (
                answer_text is not None
                and prediction.rstrip(". ").lower() == answer_text.rstrip(". ").lower()
            )
            if by_label or by_text:
                correct += 1

        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }

3. 代码生成基准

HumanEval

class HumanEvalEvaluator:
    """pass@1 evaluator for HumanEval code completion.

    Construction loads ``openai_humaneval`` through ``load_dataset``; scoring
    uses the ``test`` split.

    WARNING: scoring runs model-generated code with ``exec`` in this process,
    with no sandbox and no timeout. Only run it in an isolated environment.
    """

    def __init__(self):
        self.dataset = load_dataset("openai_humaneval")

    def format_prompt(self, sample: dict) -> str:
        """Wrap the function stub in a Python code fence plus an instruction."""
        return f"```python\n{sample['prompt']}\n```\n\nComplete the function above."

    def extract_code(self, response: str) -> str:
        """Return the code inside the response's first fenced block.

        A ``python``-tagged fence is preferred; an untagged or ``py`` fence is
        used as a fallback. Without any fence the response is returned as is.
        """
        import re

        tagged = re.search(r'```python\n(.*?)```', response, re.DOTALL)
        if tagged:
            return tagged.group(1)
        untagged = re.search(r'```(?:py)?[ \t]*\n(.*?)```', response, re.DOTALL)
        if untagged:
            return untagged.group(1)
        return response

    @staticmethod
    def _passes_tests(sample: dict, code: str) -> bool:
        """Run one completion against the sample's tests.

        The ``test`` field only *defines* ``check(candidate)``; the function
        named by ``entry_point`` has to be passed to it for any assertion to
        run. Samples without a ``check`` function or ``entry_point`` are
        treated as plain test scripts that already ran.
        """
        namespace = {}
        try:
            exec(sample["prompt"] + code, namespace)
            exec(sample["test"], namespace)
            check = namespace.get("check")
            entry_point = sample.get("entry_point")
            if callable(check) and entry_point is not None:
                check(namespace[entry_point])
        except (Exception, SystemExit):
            # Any failure means the completion does not pass. SystemExit is
            # included so a generated sys.exit() cannot abort the whole run.
            return False
        return True

    def evaluate(self, model, n_samples: int = 164) -> dict:
        """Score the model on the first ``n_samples`` problems.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            n_samples: Upper bound on the number of problems evaluated.

        Returns:
            Dict with ``pass@1``, ``total`` and ``passed``. ``pass@1`` is
            ``0.0`` when no problems were evaluated.
        """
        problems = self.dataset["test"]
        total = max(0, min(n_samples, len(problems)))

        pass_1_count = 0
        for i in range(total):
            sample = problems[i]
            response = model.generate(self.format_prompt(sample))
            if self._passes_tests(sample, self.extract_code(response)):
                pass_1_count += 1

        return {
            "pass@1": pass_1_count / total if total else 0.0,
            "total": total,
            "passed": pass_1_count
        }

# Usage example. Constructing the evaluator runs
# load_dataset("openai_humaneval") inside __init__.
evaluator = HumanEvalEvaluator()
# code_results = evaluator.evaluate(my_model)

MBPP (Mostly Basic Python Problems)

class MBPPEvaluator:
    """Functional-correctness evaluator for MBPP programming tasks.

    Construction loads ``google-research-datasets/mbpp`` through
    ``load_dataset``; scoring uses the ``test`` split.

    WARNING: scoring runs model-generated code with ``exec`` in this process,
    with no sandbox and no timeout. Only run it in an isolated environment.
    """

    def __init__(self):
        self.dataset = load_dataset("google-research-datasets/mbpp")

    def format_prompt(self, sample: dict) -> str:
        """Render the task description as a prompt.

        When the sample carries a ``test_list``, the tests are appended: they
        are the only place where the expected function name and signature
        appear, and the generated code is judged by them.
        """
        prompt = f"Task: {sample['text']}\n\nWrite a Python function to solve this task."
        tests = sample.get("test_list")
        if tests:
            prompt += "\n\nYour code should pass these tests:\n" + "\n".join(tests)
        return prompt

    @staticmethod
    def _extract_code(response: str) -> str:
        """Return the code inside the first fenced block, else the response."""
        import re

        fenced = re.search(r"```(?:python|py)?[ \t]*\n(.*?)```", response, re.DOTALL)
        return fenced.group(1) if fenced else response

    @staticmethod
    def _passes_tests(sample: dict, code: str) -> bool:
        """Run ``code`` against every assertion in the sample's ``test_list``.

        ``test_setup_code`` is executed first when the sample provides it.
        A sample without any test counts as failed.
        """
        tests = sample["test_list"]
        if not tests:
            return False
        namespace = {}
        try:
            setup = sample.get("test_setup_code")
            if setup:
                exec(setup, namespace)
            exec(code, namespace)
            for test_case in tests:
                exec(test_case, namespace)
        except (Exception, SystemExit):
            # Any failure means the solution does not pass. SystemExit is
            # included so a generated sys.exit() cannot abort the whole run.
            return False
        return True

    def evaluate(self, model, n_samples: int = 500) -> dict:
        """Score the model on the first ``n_samples`` tasks.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            n_samples: Upper bound on the number of tasks evaluated.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            ``0.0`` when no tasks were evaluated.
        """
        tasks = self.dataset["test"]
        total = max(0, min(n_samples, len(tasks)))

        correct = 0
        for i in range(total):
            sample = tasks[i]
            response = model.generate(self.format_prompt(sample))
            if self._passes_tests(sample, self._extract_code(response)):
                correct += 1

        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }

4. 对话与指令遵循基准

MT-Bench (多轮对话评测)

class MTBenchEvaluator:
    """Judge-based evaluator for MT-Bench style multi-turn conversations."""

    def __init__(self):
        # The eight question categories this evaluator knows about.
        self.categories = [
            "writing",
            "roleplay",
            "reasoning",
            "math",
            "coding",
            "extraction",
            "stem",
            "humanities",
        ]

    def create_evaluation_prompt(self, conversation: list) -> str:
        """Build the judge prompt for a conversation.

        Args:
            conversation: Turns in order, each a dict with ``role`` and
                ``content``.

        Returns:
            A fixed header, one ``"<role>: <content>"`` paragraph per turn,
            and a fixed scoring instruction.
        """
        header = "Please evaluate the following conversation:\n\n"
        transcript = "".join(
            f"{turn['role']}: {turn['content']}\n\n" for turn in conversation
        )
        instruction = "Evaluate the response on a scale of 1-10 for helpfulness, accuracy, and safety."
        return header + transcript + instruction

    def evaluate_conversation(self, model, conversation: list) -> dict:
        """Send one conversation to the ``gpt-4`` judge.

        ``model`` is accepted for signature symmetry but is not used here.

        Returns:
            Dict with the original ``conversation`` and the judge's raw text
            under ``evaluation``.
        """
        judge_prompt = self.create_evaluation_prompt(conversation)

        # Imported lazily so the class is usable without the openai package
        # until a judge call is actually made.
        from openai import OpenAI

        judge = OpenAI()
        completion = judge.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": judge_prompt}],
            temperature=0.1,
        )
        verdict = completion.choices[0].message.content

        return {"conversation": conversation, "evaluation": verdict}

    def evaluate_category(self, model, category: str, n_samples: int = 10) -> dict:
        """Return a placeholder result for one category.

        No data is loaded and ``model`` is never called; ``avg_score`` is a
        hard-coded sample value.
        """
        placeholder_score = 7.5
        return dict(category=category, avg_score=placeholder_score, samples=n_samples)

# Usage example. MTBenchEvaluator.__init__ only sets the category list.
evaluator = MTBenchEvaluator()

AlpacaEval

class AlpacaEvalEvaluator:
    """Win-rate evaluator for AlpacaEval instruction following.

    Construction loads ``tatsu-lab/alpaca_eval`` through ``load_dataset``;
    scoring uses the ``eval`` split and a ``gpt-4`` judge that compares the
    model's response (shown as "Response A") with the sample's reference
    ``output`` (shown as "Response B").
    """

    def __init__(self):
        self.dataset = load_dataset("tatsu-lab/alpaca_eval")

    @staticmethod
    def _model_wins(verdict) -> bool:
        """Return True when the judge's verdict names response A.

        The first standalone ``A`` or ``B`` decides. A plain substring test
        would count any capital A (for example in "Answer: B") as a win.
        """
        import re

        match = re.search(r"\b([AB])\b", verdict or "")
        return match is not None and match.group(1) == "A"

    def evaluate(self, model, n_samples: int = 805) -> dict:
        """Score the model on the first ``n_samples`` instructions.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            n_samples: Upper bound on the number of instructions evaluated.

        Returns:
            Dict with ``win_rate``, ``total`` and ``wins``. ``win_rate`` is
            ``0.0`` when no instructions were evaluated.
        """
        examples = self.dataset["eval"]
        total = max(0, min(n_samples, len(examples)))
        if total == 0:
            return {"win_rate": 0.0, "total": 0, "wins": 0}

        # One judge client for the whole run instead of one per sample.
        from openai import OpenAI
        client = OpenAI()

        win_count = 0
        for i in range(total):
            sample = examples[i]
            model_response = model.generate(sample["instruction"])

            comparison_prompt = f"Which response is better?\n\nInstruction: {sample['instruction']}\n\nResponse A: {model_response}\n\nResponse B: {sample['output']}\n\nBetter response (A or B):"

            response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": comparison_prompt}],
                temperature=0.1
            )

            if self._model_wins(response.choices[0].message.content):
                win_count += 1

        return {
            "win_rate": win_count / total,
            "total": total,
            "wins": win_count
        }

5. 真实性与安全性基准

TruthfulQA (真实性评测)

class TruthfulQAEvaluator:
    """Simplified truthfulness evaluator for TruthfulQA.

    Construction loads ``truthful_qa`` (``generation`` configuration) through
    ``load_dataset``; scoring uses the ``validation`` split. A response is
    counted as truthful when it contains one of the reference answers as a
    case-insensitive substring, which is only a rough proxy.
    """

    def __init__(self):
        self.dataset = load_dataset("truthful_qa", "generation")

    def evaluate(self, model, n_samples: int = 817) -> dict:
        """Score the model on the first ``n_samples`` questions.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            n_samples: Upper bound on the number of questions evaluated.

        Returns:
            Dict with ``truthfulness_rate`` and ``total``. The rate is ``0.0``
            when no questions were evaluated.
        """
        questions = self.dataset["validation"]
        total = max(0, min(n_samples, len(questions)))

        truthful_count = 0
        for i in range(total):
            sample = questions[i]
            response = model.generate(sample["question"])
            if self._is_truthful(response, sample["correct_answers"]):
                truthful_count += 1

        return {
            "truthfulness_rate": truthful_count / total if total else 0.0,
            "total": total
        }

    def _is_truthful(self, response: str, correct_answers: list) -> bool:
        """Return True when ``response`` contains any reference answer.

        Matching is a case-insensitive substring test. Blank reference
        answers are skipped because the empty string is a substring of every
        response and would mark everything as truthful.
        """
        response_lower = response.lower()
        return any(
            answer.strip() and answer.lower() in response_lower
            for answer in correct_answers
        )

基准数据集使用最佳实践

1. 评测流程设计

class LLMEvaluationPipeline:
    """Runs several benchmark evaluators against one model and reports results.

    Construction instantiates the MMLU, GSM8K, HumanEval and HellaSwag
    evaluators and stores them in ``self.evaluators`` keyed by benchmark name.
    """

    def __init__(self):
        self.evaluators = {
            "mmlu": MMLUEvaluator(),
            "gsm8k": GSM8KEvaluator(),
            "humaneval": HumanEvalEvaluator(),
            "hellaswag": HellaSwagEvaluator()
        }

    @staticmethod
    def _run_evaluator(evaluator, model, params: dict) -> dict:
        """Invoke one evaluator with the given keyword parameters.

        Evaluators normally expose ``evaluate(model, **params)``. One that
        only offers ``evaluate_all(model, n_per_subject=...)`` is supported
        too; for it, ``n_samples`` is passed as ``n_per_subject``.
        """
        run = getattr(evaluator, "evaluate", None)
        if callable(run):
            return run(model, **params)

        kwargs = dict(params)  # copy: never mutate the caller's config
        if "n_samples" in kwargs:
            kwargs["n_per_subject"] = kwargs.pop("n_samples")
        return evaluator.evaluate_all(model, **kwargs)

    def run_comprehensive_evaluation(self, model, config: dict = None) -> dict:
        """Run every configured benchmark that has a registered evaluator.

        Args:
            model: Model object handed to each evaluator.
            config: Mapping of benchmark name to keyword arguments for that
                evaluator. ``None`` selects the built-in defaults. Names
                without a registered evaluator are skipped.

        Returns:
            Mapping of benchmark name to that evaluator's result dict.
        """
        if config is None:
            config = {
                "mmlu": {"n_samples": 100},
                "gsm8k": {"n_samples": 200},
                "humaneval": {"n_samples": 164},
                "hellaswag": {"n_samples": 500}
            }

        results = {}
        for benchmark, params in config.items():
            evaluator = self.evaluators.get(benchmark)
            if evaluator is None:
                continue
            results[benchmark] = self._run_evaluator(evaluator, model, params)

        return results

    def generate_report(self, results: dict) -> str:
        """Render results as a Markdown report.

        Each benchmark gets a ``##`` section with one bullet per result key;
        float values are formatted with four decimal places.
        """
        parts = ["# LLM Evaluation Report\n\n"]

        for benchmark, result in results.items():
            parts.append(f"## {benchmark.upper()}\n")
            for key, value in result.items():
                if isinstance(value, float):
                    parts.append(f"- {key}: {value:.4f}\n")
                else:
                    parts.append(f"- {key}: {value}\n")
            parts.append("\n")

        return "".join(parts)

# Usage example. Constructing the pipeline instantiates all four evaluators
# listed in LLMEvaluationPipeline.__init__.
pipeline = LLMEvaluationPipeline()
# results = pipeline.run_comprehensive_evaluation(my_model)
# report = pipeline.generate_report(results)

2. 结果分析与可视化

import matplotlib.pyplot as plt
import numpy as np

class BenchmarkAnalyzer:
    """Plots benchmark results and tracks metric changes across runs."""

    # Result keys that can serve as a benchmark's headline score, in lookup
    # order. "accuracy" comes first so results that carry it plot as before;
    # the others are the headline keys used by the evaluators in this file.
    _SCORE_KEYS = ("accuracy", "overall_accuracy", "pass@1", "win_rate", "truthfulness_rate")

    def __init__(self):
        self.metrics_history = []

    @classmethod
    def _primary_score(cls, result: dict):
        """Return the first headline score found in ``result``, else ``0``."""
        for key in cls._SCORE_KEYS:
            if key in result:
                return result[key]
        return 0

    def plot_comparison(self, model_results: dict, baseline_results: dict = None):
        """Draw a bar chart of per-benchmark scores.

        Args:
            model_results: Mapping of benchmark name to result dict.
            baseline_results: Optional mapping of the same shape. When given,
                baseline and current bars are drawn side by side with a
                legend. A benchmark missing from the baseline is drawn as 0.

        Returns:
            The matplotlib figure.
        """
        benchmarks = list(model_results.keys())
        scores = [self._primary_score(model_results[b]) for b in benchmarks]

        x = np.arange(len(benchmarks))
        width = 0.35

        fig, ax = plt.subplots(figsize=(12, 6))

        if baseline_results:
            baseline_scores = [
                self._primary_score(baseline_results.get(b, {})) for b in benchmarks
            ]
            ax.bar(x - width/2, baseline_scores, width, label='Baseline')
            ax.bar(x + width/2, scores, width, label='Current Model')
            # A legend is only meaningful (and only has labelled bars to
            # show) when two series are drawn.
            ax.legend()
        else:
            ax.bar(x, scores)

        ax.set_ylabel('Score')
        ax.set_title('Benchmark Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(benchmarks, rotation=45)

        plt.tight_layout()
        return fig

    def track_improvement(self, results_history: list):
        """Append a list of metric dicts to the tracked history."""
        self.metrics_history.extend(results_history)

    def get_improvement_report(self) -> dict:
        """Compare the first and the latest tracked entry.

        Returns:
            For every metric that is numeric in both entries, a dict with
            ``initial``, ``current``, ``improvement`` (absolute difference)
            and ``improvement_pct`` (``0`` when the initial value is 0).
            With fewer than two tracked entries, a dict holding only a
            ``message``.
        """
        if len(self.metrics_history) < 2:
            return {"message": "Not enough data for improvement analysis"}

        first = self.metrics_history[0]
        last = self.metrics_history[-1]

        improvements = {}
        for metric, initial in first.items():
            if metric not in last:
                continue
            current = last[metric]
            # Both ends must be numeric, otherwise the subtraction below
            # would raise on values such as None or nested dicts.
            if not (isinstance(initial, (int, float)) and isinstance(current, (int, float))):
                continue
            improvement = current - initial
            improvements[metric] = {
                "initial": initial,
                "current": current,
                "improvement": improvement,
                "improvement_pct": (improvement / initial) * 100 if initial != 0 else 0
            }

        return improvements

基准数据集选择指南

| 评测维度 | 推荐数据集 | 适用场景 |
| --- | --- | --- |
| 知识理解 | MMLU, ARC | 通用能力评估 |
| 数学推理 | GSM8K, MATH | 逻辑推理能力 |
| 代码生成 | HumanEval, MBPP | 编程能力评估 |
| 常识推理 | HellaSwag, WinoGrande | 语言理解能力 |
| 对话质量 | MT-Bench, AlpacaEval | 指令遵循能力 |
| 真实性 | TruthfulQA | 信息准确性 |
| 安全性 | ToxiGen, BBQ | 安全性评估 |

总结

基准数据集是LLM评测的基石。选择合适的基准数据集、遵循科学的评测方法，才能准确评估模型能力并指导改进方向。随着LLM技术的发展，新的基准数据集也在不断涌现，持续关注最新的评测方法对于保持模型竞争力至关重要。