基准数据集:LLM评测常用基准数据集介绍
基准数据集:LLM评测常用基准数据集介绍
LLM 评测的重要性
随着大语言模型的快速发展,客观、全面的模型评测变得至关重要。基准数据集(Benchmark Datasets)为模型评估提供了标准化的测试环境,帮助研究者和开发者理解模型的能力边界和改进方向。
基准数据集的核心价值:
- 标准化比较:提供统一的评估标准,便于不同模型间的公平比较
- 能力画像:揭示模型在不同维度的能力表现
- 改进方向:指导模型优化的重点领域
- 质量保证:确保模型在部署前达到预期质量标准
主流基准数据集分类
1. 知识与推理能力基准
MMLU (Massive Multitask Language Understanding)
# MMLU数据集加载与评估示例
from datasets import load_dataset
import json
class MMLUEvaluator:
    """Multiple-choice evaluator for the MMLU benchmark.

    The dataset is loaded once in ``__init__``. Only the subjects listed in
    ``self.subjects`` are evaluated by ``evaluate_all``.
    """

    # Letters shown to the model for the four options, in option order.
    CHOICE_LETTERS = "ABCD"

    def __init__(self):
        self.dataset = load_dataset("cais/mmlu", "all")
        self.subjects = [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics",
            "college_biology", "college_chemistry", "college_computer_science"
        ]

    def format_prompt(self, question: dict) -> str:
        """Render one sample as a lettered multiple-choice prompt.

        Args:
            question: Mapping with a ``question`` string and a ``choices``
                sequence; the first four choices are labelled A-D.

        Returns:
            The prompt text, ending with ``Answer:``.
        """
        lines = [f"Question: {question['question']}"]
        lines.extend(
            f"{letter}. {choice}"
            for letter, choice in zip(self.CHOICE_LETTERS, question["choices"])
        )
        lines.append("Answer:")
        return "\n".join(lines)

    @classmethod
    def _gold_letter(cls, answer) -> str:
        """Return the gold answer as a letter.

        NOTE(review): the Hugging Face ``cais/mmlu`` dataset is understood to
        store ``answer`` as an integer option index; confirm against the
        dataset card. Both an index and a ready-made letter are accepted here.
        """
        if isinstance(answer, int):
            return cls.CHOICE_LETTERS[answer]
        return str(answer).strip().upper()

    @staticmethod
    def _predicted_letter(response) -> str:
        """Return the option letter a reply starts with, else the whole reply.

        Replies such as ``"B"``, ``"b."`` or ``"(B) because"`` all yield ``"B"``.
        """
        import re

        text = str(response).strip().upper()
        match = re.match(r"\(?([A-D])\)?(?![A-Z0-9])", text)
        return match.group(1) if match else text

    @staticmethod
    def _query(model, prompt: str):
        """Ask the model for a completion.

        ``predict`` is tried first; ``generate`` (the method the other
        evaluators in this file call) is the fallback, so one model object
        can be shared across all evaluators.
        """
        ask = getattr(model, "predict", None)
        if ask is None:
            ask = model.generate
        return ask(prompt)

    def evaluate_subject(self, subject: str, model, n_samples: int = 100) -> dict:
        """Evaluate the model on up to ``n_samples`` test items of one subject.

        Returns:
            Dict with ``subject``, ``accuracy``, ``total`` and ``correct``.
            ``accuracy`` is 0.0 when the subject has no samples.
        """
        subject_data = self.dataset["test"].filter(lambda x: x["subject"] == subject)
        total = max(0, min(n_samples, len(subject_data)))
        correct = 0
        for i in range(total):
            sample = subject_data[i]
            reply = self._query(model, self.format_prompt(sample))
            if self._predicted_letter(reply) == self._gold_letter(sample["answer"]):
                correct += 1
        return {
            "subject": subject,
            # Guard the division: an unknown subject yields zero samples.
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }

    def evaluate_all(self, model, n_per_subject: int = 50) -> dict:
        """Evaluate every subject in ``self.subjects``.

        Returns:
            Dict with ``overall_accuracy`` (micro-average over all evaluated
            samples) and ``by_subject`` (the per-subject result dicts).
        """
        results = {
            subject: self.evaluate_subject(subject, model, n_per_subject)
            for subject in self.subjects
        }
        total_correct = sum(r["correct"] for r in results.values())
        total_samples = sum(r["total"] for r in results.values())
        return {
            "overall_accuracy": total_correct / total_samples if total_samples else 0.0,
            "by_subject": results
        }

    def evaluate(self, model, n_samples: int = 50) -> dict:
        """Uniform entry point matching the other evaluators' ``evaluate``.

        ``n_samples`` is applied per subject; the result is that of
        ``evaluate_all``.
        """
        return self.evaluate_all(model, n_per_subject=n_samples)
# Usage example. Constructing the evaluator calls load_dataset() in __init__.
evaluator = MMLUEvaluator()
# results = evaluator.evaluate_all(my_model)
GSM8K (Grade School Math 8K)
class GSM8KEvaluator:
    """Math-reasoning evaluator for GSM8K.

    The model is prompted to reason step by step; the last number in its
    reply is compared with the reference value that follows ``####`` in the
    sample's ``answer`` field.
    """

    def __init__(self):
        self.dataset = load_dataset("openai/gsm8k", "main")

    def format_prompt(self, problem: dict) -> str:
        """Build the step-by-step prompt for one problem."""
        return f"Question: {problem['question']}\n\nLet's think step by step."

    def extract_answer(self, response: str) -> "float | None":
        """Return the last number in ``response``, or ``None`` if there is none.

        Thousands separators are understood, so ``"1,234"`` is read as
        ``1234.0`` rather than as the two numbers 1 and 234.
        """
        import re

        numbers = re.findall(
            r"[-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?|[-+]?\.\d+",
            response,
        )
        if not numbers:
            return None
        return float(numbers[-1].replace(",", ""))

    @staticmethod
    def _reference_answer(solution: str) -> float:
        """Parse the numeric value after the final ``####`` marker."""
        return float(solution.rsplit("####", 1)[-1].strip().replace(",", ""))

    def evaluate(self, model, n_samples: int = 200) -> dict:
        """Evaluate the model on the first ``n_samples`` test problems.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            0.0 when no samples are evaluated.
        """
        import math

        test_data = self.dataset["test"]
        total = max(0, min(n_samples, len(test_data)))
        correct = 0
        for i in range(total):
            sample = test_data[i]
            response = model.generate(self.format_prompt(sample))
            predicted = self.extract_answer(response)
            expected = self._reference_answer(sample["answer"])
            # Tolerant comparison: parsed floats such as 0.30000000000000004
            # should still match a reference of 0.3.
            if predicted is not None and math.isclose(
                predicted, expected, rel_tol=1e-9, abs_tol=1e-6
            ):
                correct += 1
        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }
# Usage example. Constructing the evaluator calls load_dataset() in __init__.
evaluator = GSM8KEvaluator()
# math_results = evaluator.evaluate(my_model, n_samples=100)
2. 语言理解与生成基准
HellaSwag (常识推理)
class HellaSwagEvaluator:
    """Commonsense-continuation evaluator for HellaSwag (validation split)."""

    def __init__(self):
        self.dataset = load_dataset("Rowan/hellaswag", "default")

    def format_prompt(self, sample: dict) -> str:
        """Render the context and the lettered candidate endings."""
        context = sample["activity_label"] + ": " + sample["ctx_a"] + " " + sample["ctx_b"]
        parts = [f"{context}\n\nWhich of the following is the most logical continuation?\n"]
        parts.extend(
            f"{chr(65 + i)}. {ending}\n"
            for i, ending in enumerate(sample["endings"])
        )
        parts.append("Answer:")
        return "".join(parts)

    @staticmethod
    def _predicted_letter(response) -> str:
        """Return the option letter a reply starts with, else the whole reply.

        ``"c"``, ``"C."`` and ``"(C) because"`` all yield ``"C"``.
        """
        import re

        text = str(response).strip().upper()
        match = re.match(r"\(?([A-Z])\)?(?![A-Z0-9])", text)
        return match.group(1) if match else text

    def evaluate(self, model, n_samples: int = 1000) -> dict:
        """Evaluate the model on the first ``n_samples`` validation items.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            0.0 when no samples are evaluated.
        """
        validation = self.dataset["validation"]
        total = max(0, min(n_samples, len(validation)))
        correct = 0
        for i in range(total):
            sample = validation[i]
            response = model.generate(self.format_prompt(sample))
            # NOTE(review): the label appears to be stored as a string such
            # as "2" in the Hugging Face dataset; int() accepts both forms.
            gold = chr(65 + int(sample["label"]))
            if self._predicted_letter(response) == gold:
                correct += 1
        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }
ARC (AI2 Reasoning Challenge)
class ARCEvaluator:
    """Multiple-choice evaluator for the AI2 Reasoning Challenge (ARC-Easy)."""

    def __init__(self):
        self.dataset = load_dataset("allenai/ai2_arc", "ARC-Easy")

    def format_prompt(self, sample: dict) -> str:
        """Render the question with each option prefixed by its own label.

        The labels come from ``sample["choices"]["label"]`` so the model can
        answer with the same key that ``answerKey`` uses.
        """
        choices = sample["choices"]
        lines = [f"Question: {sample['question']}"]
        lines.extend(
            f"{label}. {text}"
            for label, text in zip(choices["label"], choices["text"])
        )
        lines.append("Answer:")
        return "\n".join(lines)

    @staticmethod
    def _leading_label(prediction: str, labels: list):
        """Return the option label a reply starts with, or ``None``.

        Matching is case-insensitive; ``"b) blue"`` yields ``"B"`` when
        ``"B"`` is one of ``labels``.
        """
        import re

        match = re.match(r"\(?([A-Za-z0-9])\)?(?![A-Za-z0-9])", prediction)
        if match:
            wanted = match.group(1).lower()
            for label in labels:
                if label.lower() == wanted:
                    return label
        return None

    def evaluate(self, model, subset: str = "validation", n_samples: int = 500) -> dict:
        """Evaluate the model on the first ``n_samples`` items of ``subset``.

        A reply counts as correct when it names the gold label (``"B"``,
        ``"b."``, ``"(B) ..."``) or equals the gold option text.

        Returns:
            Dict with ``accuracy``, ``total`` and ``correct``. ``accuracy`` is
            0.0 when no samples are evaluated.
        """
        data = self.dataset[subset]
        total = max(0, min(n_samples, len(data)))
        correct = 0
        for i in range(total):
            sample = data[i]
            prediction = model.generate(self.format_prompt(sample)).strip()
            answer_key = str(sample["answerKey"]).strip()
            labels = [str(label) for label in sample["choices"]["label"]]
            texts = sample["choices"]["text"]
            # Look the gold option up by label: answerKey is a label such as
            # "B" or "2", not a position in the choices list.
            gold_text = texts[labels.index(answer_key)] if answer_key in labels else None
            picked = self._leading_label(prediction, labels)
            if (
                prediction == answer_key
                or (picked is not None and picked.lower() == answer_key.lower())
                or (gold_text is not None and prediction.lower() == gold_text.lower())
            ):
                correct += 1
        return {
            "accuracy": correct / total if total else 0.0,
            "total": total,
            "correct": correct
        }
3. 代码生成基准
HumanEval
class HumanEvalEvaluator:
    """pass@1 evaluator for HumanEval code-generation problems."""

    def __init__(self):
        self.dataset = load_dataset("openai_humaneval")

    def format_prompt(self, sample: dict) -> str:
        """Wrap the function stub in a fenced block and ask for completion."""
        return f"```python\n{sample['prompt']}\n```\n\nComplete the function above."

    def extract_code(self, response: str) -> str:
        """Return the first fenced ``python`` block, or the raw response."""
        import re

        code_match = re.search(r'```python\n(.*?)```', response, re.DOTALL)
        if code_match:
            return code_match.group(1)
        return response

    @staticmethod
    def _passes(sample: dict, code: str) -> bool:
        """Run one candidate against the sample's tests.

        SECURITY: this executes model-generated code in-process with no
        sandbox, timeout or resource limits. Only run it in an isolated,
        disposable environment.
        """
        namespace = {}
        try:
            exec(sample["prompt"] + code, namespace)
            exec(sample["test"], namespace)
            # NOTE(review): HumanEval test sources are understood to only
            # *define* check(candidate); it has to be called for the asserts
            # to run. Test sources made of plain asserts already ran above.
            check = namespace.get("check")
            if callable(check):
                check(namespace[sample["entry_point"]])
        except Exception:
            # Any failure (syntax error, wrong result, missing function)
            # means the candidate did not pass.
            return False
        return True

    def evaluate(self, model, n_samples: int = 164) -> dict:
        """Evaluate the model on the first ``n_samples`` test problems.

        Returns:
            Dict with ``pass@1``, ``total`` and ``passed``. ``pass@1`` is 0.0
            when no samples are evaluated.
        """
        problems = self.dataset["test"]
        total = max(0, min(n_samples, len(problems)))
        pass_1_count = 0
        for i in range(total):
            sample = problems[i]
            response = model.generate(self.format_prompt(sample))
            if self._passes(sample, self.extract_code(response)):
                pass_1_count += 1
        return {
            "pass@1": pass_1_count / total if total else 0.0,
            "total": total,
            "passed": pass_1_count
        }
# Usage example. Constructing the evaluator calls load_dataset() in __init__.
evaluator = HumanEvalEvaluator()
# code_results = evaluator.evaluate(my_model)
MBPP (Mostly Basic Python Problems)
class MBPPEvaluator:
    """Evaluator for MBPP (Mostly Basic Python Problems)."""

    def __init__(self):
        self.dataset = load_dataset("google-research-datasets/mbpp")

    def format_prompt(self, sample: dict) -> str:
        """Build the task prompt from the sample's ``text`` description."""
        return f"Task: {sample['text']}\n\nWrite a Python function to solve this task."

    @staticmethod
    def _extract_code(response: str) -> str:
        """Return the first fenced code block, or the raw response.

        Chat models often wrap code in markdown fences, which are not valid
        Python and would make every such reply fail.
        """
        import re

        fenced = re.search(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
        return fenced.group(1) if fenced else response

    @staticmethod
    def _passes(sample: dict, code: str) -> bool:
        """Run the candidate against every assertion in ``test_list``.

        SECURITY: this executes model-generated code in-process with no
        sandbox, timeout or resource limits. Only run it in an isolated,
        disposable environment.
        """
        tests = sample["test_list"]
        if not tests:
            # Nothing to verify; do not count an unverified solution as correct.
            return False
        namespace = {}
        try:
            exec(code, namespace)
            # NOTE(review): assumes the optional test_setup_code field holds
            # setup the assertions need — confirm against the dataset card.
            setup = sample.get("test_setup_code")
            if setup:
                exec(setup, namespace)
            for test in tests:
                exec(test, namespace)
        except Exception:
            # Any failure means the candidate did not pass.
            return False
        return True

    def evaluate(self, model, n_samples: int = 500) -> dict:
        """Evaluate the model on the first ``n_samples`` test problems.

        Returns:
            Dict with ``accuracy`` and ``total``. ``accuracy`` is 0.0 when no
            samples are evaluated.
        """
        problems = self.dataset["test"]
        total = max(0, min(n_samples, len(problems)))
        correct = 0
        for i in range(total):
            sample = problems[i]
            response = model.generate(self.format_prompt(sample))
            if self._passes(sample, self._extract_code(response)):
                correct += 1
        return {
            "accuracy": correct / total if total else 0.0,
            "total": total
        }
4. 对话与指令遵循基准
MT-Bench (多轮对话评测)
class MTBenchEvaluator:
    """MT-Bench style multi-turn conversation evaluator using a GPT-4 judge."""

    def __init__(self):
        self.categories = [
            "writing", "roleplay", "reasoning", "math", "coding",
            "extraction", "stem", "humanities"
        ]
        # Judge client, created on first use and then reused.
        self._client = None

    def create_evaluation_prompt(self, conversation: list) -> str:
        """Build the judge prompt from a list of ``{"role", "content"}`` turns."""
        parts = ["Please evaluate the following conversation:\n\n"]
        parts.extend(f"{turn['role']}: {turn['content']}\n\n" for turn in conversation)
        parts.append(
            "Evaluate the response on a scale of 1-10 for helpfulness, accuracy, and safety."
        )
        return "".join(parts)

    def _judge_client(self):
        """Return the shared OpenAI client, creating it on first use.

        Building a client per call discards its connection pool every time;
        one client per evaluator is enough.
        """
        client = getattr(self, "_client", None)
        if client is None:
            from openai import OpenAI

            client = OpenAI()
            self._client = client
        return client

    def evaluate_conversation(self, model, conversation: list) -> dict:
        """Ask the GPT-4 judge to assess one conversation.

        Args:
            model: Not used by this method; the conversation is judged as given.
            conversation: List of ``{"role", "content"}`` turns.

        Returns:
            Dict with the original ``conversation`` and the judge's free-text
            ``evaluation``.
        """
        eval_prompt = self.create_evaluation_prompt(conversation)
        response = self._judge_client().chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": eval_prompt}],
            temperature=0.1
        )
        return {
            "conversation": conversation,
            "evaluation": response.choices[0].message.content
        }

    def evaluate_category(self, model, category: str, n_samples: int = 10) -> dict:
        """Placeholder: return a fixed example score for ``category``.

        No data is loaded and the model is not queried; ``avg_score`` is the
        hard-coded example value 7.5.
        """
        return {
            "category": category,
            "avg_score": 7.5,  # example score, not a measurement
            "samples": n_samples
        }
# Usage example
evaluator = MTBenchEvaluator()
AlpacaEval
class AlpacaEvalEvaluator:
    """Instruction-following evaluator: win rate against reference outputs.

    For each instruction the model's reply (Response A) is compared with the
    dataset's reference output (Response B) by a GPT-4 judge.
    """

    def __init__(self):
        self.dataset = load_dataset("tatsu-lab/alpaca_eval")

    @staticmethod
    def _parse_verdict(text):
        """Return ``"A"`` or ``"B"`` for the judge's choice, or ``None``.

        Only a standalone capital A or B counts, so the "A" inside a word
        such as "Accurate" is not mistaken for a vote for Response A.
        """
        import re

        match = re.search(r"\b([AB])\b", text or "")
        return match.group(1) if match else None

    def _ask_judge(self, prompt: str):
        """Send ``prompt`` to the GPT-4 judge and return its reply text."""
        client = getattr(self, "_client", None)
        if client is None:
            from openai import OpenAI

            # One client for the whole run instead of one per sample.
            client = OpenAI()
            self._client = client
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )
        return response.choices[0].message.content

    def evaluate(self, model, n_samples: int = 805) -> dict:
        """Evaluate the model on the first ``n_samples`` instructions.

        Returns:
            Dict with ``win_rate``, ``total`` and ``wins``. ``win_rate`` is
            0.0 when no samples are evaluated.
        """
        eval_split = self.dataset["eval"]
        total = max(0, min(n_samples, len(eval_split)))
        win_count = 0
        for i in range(total):
            sample = eval_split[i]
            model_response = model.generate(sample["instruction"])
            comparison_prompt = f"Which response is better?\n\nInstruction: {sample['instruction']}\n\nResponse A: {model_response}\n\nResponse B: {sample['output']}\n\nBetter response (A or B):"
            if self._parse_verdict(self._ask_judge(comparison_prompt)) == "A":
                win_count += 1
        return {
            "win_rate": win_count / total if total else 0.0,
            "total": total,
            "wins": win_count
        }
5. 真实性与安全性基准
TruthfulQA (真实性评测)
class TruthfulQAEvaluator:
    """Truthfulness evaluator for TruthfulQA (generation config).

    Scoring is a simplified substring check against the reference answers;
    it is a rough proxy, not a full truthfulness judgement.
    """

    def __init__(self):
        self.dataset = load_dataset("truthful_qa", "generation")

    def evaluate(self, model, n_samples: int = 817) -> dict:
        """Evaluate the model on the first ``n_samples`` validation questions.

        Returns:
            Dict with ``truthfulness_rate`` and ``total``. The rate is 0.0
            when no samples are evaluated.
        """
        validation = self.dataset["validation"]
        total = max(0, min(n_samples, len(validation)))
        truthful_count = 0
        for i in range(total):
            sample = validation[i]
            response = model.generate(sample["question"])
            if self._is_truthful(response, sample["correct_answers"]):
                truthful_count += 1
        return {
            "truthfulness_rate": truthful_count / total if total else 0.0,
            "total": total
        }

    def _is_truthful(self, response: str, correct_answers: list) -> bool:
        """Return True if any non-blank reference answer occurs in ``response``.

        Matching is case-insensitive. Blank reference answers are skipped:
        the empty string is a substring of every response and would mark
        every reply as truthful.
        """
        response_lower = response.lower()
        for answer in correct_answers:
            needle = answer.strip().lower()
            if needle and needle in response_lower:
                return True
        return False
基准数据集使用最佳实践
1. 评测流程设计
class LLMEvaluationPipeline:
    """Run several benchmark evaluators against one model and report results."""

    def __init__(self):
        self.evaluators = {
            "mmlu": MMLUEvaluator(),
            "gsm8k": GSM8KEvaluator(),
            "humaneval": HumanEvalEvaluator(),
            "hellaswag": HellaSwagEvaluator()
        }

    @staticmethod
    def _run_evaluator(evaluator, model, params: dict) -> dict:
        """Invoke one evaluator with the given keyword parameters.

        Evaluators normally expose ``evaluate(model, **params)``. One that
        only exposes ``evaluate_all(model, n_per_subject=...)`` is supported
        by mapping ``n_samples`` onto ``n_per_subject``.
        """
        run = getattr(evaluator, "evaluate", None)
        if callable(run):
            return run(model, **params)
        kwargs = dict(params)
        if "n_samples" in kwargs:
            kwargs["n_per_subject"] = kwargs.pop("n_samples")
        return evaluator.evaluate_all(model, **kwargs)

    def run_comprehensive_evaluation(self, model, config: dict = None) -> dict:
        """Run every configured benchmark that has a registered evaluator.

        Args:
            model: The model object passed to each evaluator.
            config: Mapping of benchmark name to keyword arguments for that
                evaluator. Defaults to a built-in sample-size configuration.
                Benchmarks without a registered evaluator are skipped.

        Returns:
            Mapping of benchmark name to that evaluator's result dict.
        """
        if config is None:
            config = {
                "mmlu": {"n_samples": 100},
                "gsm8k": {"n_samples": 200},
                "humaneval": {"n_samples": 164},
                "hellaswag": {"n_samples": 500}
            }
        return {
            benchmark: self._run_evaluator(self.evaluators[benchmark], model, params)
            for benchmark, params in config.items()
            if benchmark in self.evaluators
        }

    def generate_report(self, results: dict) -> str:
        """Render results as a markdown report.

        Float values are printed with four decimals; everything else is
        printed with its default string form.
        """
        parts = ["# LLM Evaluation Report\n\n"]
        for benchmark, result in results.items():
            parts.append(f"## {benchmark.upper()}\n")
            for key, value in result.items():
                if isinstance(value, float):
                    parts.append(f"- {key}: {value:.4f}\n")
                else:
                    parts.append(f"- {key}: {value}\n")
            parts.append("\n")
        return "".join(parts)
# Usage example. Constructing the pipeline instantiates every evaluator in __init__.
pipeline = LLMEvaluationPipeline()
# results = pipeline.run_comprehensive_evaluation(my_model)
# report = pipeline.generate_report(results)
2. 结果分析与可视化
import matplotlib.pyplot as plt
import numpy as np
class BenchmarkAnalyzer:
    """Plot benchmark results and report metric changes over time."""

    # Headline metric keys used by the evaluators' result dicts, in lookup order.
    _SCORE_KEYS = ("accuracy", "overall_accuracy", "pass@1", "win_rate", "truthfulness_rate")

    def __init__(self):
        self.metrics_history = []

    @classmethod
    def _primary_score(cls, result: dict):
        """Return the headline score of one result dict, or 0 if none is present.

        Not every benchmark reports ``accuracy``; some report ``pass@1``,
        ``overall_accuracy`` or ``win_rate`` instead.
        """
        for key in cls._SCORE_KEYS:
            if key in result:
                return result[key]
        return 0

    def plot_comparison(self, model_results: dict, baseline_results: dict = None):
        """Draw a bar chart of headline scores per benchmark.

        Args:
            model_results: Mapping of benchmark name to result dict.
            baseline_results: Optional mapping in the same shape; when given,
                baseline and current bars are drawn side by side. Benchmarks
                missing from the baseline are plotted as 0.

        Returns:
            The matplotlib figure.
        """
        benchmarks = list(model_results.keys())
        scores = [self._primary_score(model_results[b]) for b in benchmarks]
        x = np.arange(len(benchmarks))
        width = 0.35
        fig, ax = plt.subplots(figsize=(12, 6))
        if baseline_results:
            baseline_scores = [
                self._primary_score(baseline_results.get(b, {})) for b in benchmarks
            ]
            ax.bar(x - width/2, baseline_scores, width, label='Baseline')
            ax.bar(x + width/2, scores, width, label='Current Model')
            # Only draw a legend when labelled bars exist.
            ax.legend()
        else:
            ax.bar(x, scores)
        ax.set_ylabel('Score')
        ax.set_title('Benchmark Comparison')
        ax.set_xticks(x)
        ax.set_xticklabels(benchmarks, rotation=45)
        plt.tight_layout()
        return fig

    def track_improvement(self, results_history: list):
        """Append a list of metric dicts to the stored history."""
        self.metrics_history.extend(results_history)

    def get_improvement_report(self) -> dict:
        """Compare the first and the latest history entries.

        Returns:
            For every metric that is numeric in both entries, a dict with
            ``initial``, ``current``, ``improvement`` and ``improvement_pct``
            (0 when the initial value is 0). With fewer than two entries a
            dict holding only a ``message`` is returned.
        """
        if len(self.metrics_history) < 2:
            return {"message": "Not enough data for improvement analysis"}
        first = self.metrics_history[0]
        last = self.metrics_history[-1]
        improvements = {}
        for metric, initial in first.items():
            if metric not in last or not isinstance(initial, (int, float)):
                continue
            current = last[metric]
            if not isinstance(current, (int, float)):
                # A metric whose latest value is not numeric cannot be diffed.
                continue
            improvement = current - initial
            improvements[metric] = {
                "initial": initial,
                "current": current,
                "improvement": improvement,
                "improvement_pct": (improvement / initial) * 100 if initial != 0 else 0
            }
        return improvements
基准数据集选择指南
| 评测维度 | 推荐数据集 | 适用场景 |
|---|---|---|
| 知识理解 | MMLU, ARC | 通用能力评估 |
| 数学推理 | GSM8K, MATH | 逻辑推理能力 |
| 代码生成 | HumanEval, MBPP | 编程能力评估 |
| 常识推理 | HellaSwag, WinoGrande | 语言理解能力 |
| 对话质量 | MT-Bench, AlpacaEval | 指令遵循能力 |
| 真实性 | TruthfulQA | 信息准确性 |
| 安全性 | ToxiGen, BBQ | 安全性评估 |
总结
基准数据集是LLM评测的基石。选择合适的基准数据集、遵循科学的评测方法,才能准确评估模型能力并指导改进方向。随着LLM技术的发展,新的基准数据集也在不断涌现,持续关注最新的评测方法对于保持模型竞争力至关重要。