基准测试
---
title: "基准测试"
description: "LLM基准测试详解,包括MMLU、HumanEval等主流基准测试和排行榜分析"
tags: ["基准测试", "MMLU", "HumanEval", "排行榜"]
category: "llm"
icon: "🧠"
---
基准测试
基准测试的意义
基准测试(Benchmark)是评估LLM性能的标准方法。通过统一的测试集和评估标准,可以客观比较不同模型的能力,推动模型发展。
主流基准测试
1. MMLU(Massive Multitask Language Understanding)
import json
from typing import List, Dict
from datasets import load_dataset
class MMLUEvaluator:
    """Score a model on MMLU multiple-choice questions, one subject at a time.

    The model passed to :meth:`evaluate_subject` only needs a
    ``generate(prompt)`` method that returns the reply as a string.
    """

    def __init__(self):
        # Loads the "all" configuration of the cais/mmlu dataset.
        self.dataset = load_dataset("cais/mmlu", "all")
        self.subjects = [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics",
            "college_biology", "college_chemistry", "computer_security",
        ]

    def evaluate_subject(self, model, subject: str, num_samples: int = 100) -> Dict:
        """Evaluate ``model`` on up to ``num_samples`` test questions of one subject.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            subject: Value matched against each test row's ``"subject"`` field.
            num_samples: Upper bound on the number of questions evaluated.

        Returns:
            Dict with ``"subject"``, ``"accuracy"``, ``"correct"`` and ``"total"``.
            ``"accuracy"`` is 0 when no question matched the subject.
        """
        subject_data = self.dataset["test"].filter(
            lambda x: x["subject"] == subject
        )
        # Bound the selection by the size of the *filtered* data: bounding by
        # the whole test split would select indices the subject does not have.
        sample_count = max(0, min(num_samples, len(subject_data)))
        test_data = subject_data.select(range(sample_count))

        correct = 0
        total = 0
        for sample in test_data:
            prompt = self.format_question(sample["question"], sample["choices"])
            prediction = model.generate(prompt)
            # The dataset answer is compared against the parsed choice index.
            if self.extract_answer(prediction) == sample["answer"]:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        return {
            "subject": subject,
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
        }

    def format_question(self, question: str, choices: List[str]) -> str:
        """Render a question and its choices (labelled A, B, C, ...) as a prompt."""
        option_lines = "".join(
            f"{chr(65 + i)}. {choice}\n" for i, choice in enumerate(choices)
        )
        return f"问题: {question}\n\n选项:\n{option_lines}\n请给出正确答案的字母:"

    def extract_answer(self, response: str) -> int:
        """Return the index (0-3) of the first standalone A-D letter in ``response``.

        A letter counts as standalone when it is not adjacent to another ASCII
        letter (or preceded by an apostrophe), so "Answer: B" yields 1 instead
        of being misread as "A", and "Because ..." is not misread as "B".

        Returns:
            0-3 for A-D, or -1 when no standalone choice letter is found.
        """
        import re

        text = response.strip().upper()
        match = re.search(r"(?<![A-Z'])([A-D])(?![A-Z])", text)
        if match is None:
            return -1
        return ord(match.group(1)) - ord("A")
# Usage example. `model` is not defined in this snippet; evaluate_subject
# calls model.generate(prompt) on it, so it must be supplied by the caller.
evaluator = MMLUEvaluator()
result = evaluator.evaluate_subject(model, "computer_security")
print(f"MMLU计算机安全准确率: {result['accuracy']:.2%}")
2. HumanEval(代码生成评估)
from typing import Callable
import ast
class HumanEvalEvaluator:
    """Generate code for programming problems and score it against test snippets.

    Each problem is a dict with ``"task_id"``, ``"prompt"``, ``"entry_point"``
    and ``"test_cases"`` (a list of Python source strings).
    """

    def __init__(self):
        # NOTE(review): load_humaneval is not defined in this class; it
        # presumably comes from a subclass/mixin or is still to be written.
        self.dataset = self.load_humaneval()

    def evaluate_problem(self, model, problem: Dict) -> Dict:
        """Ask ``model`` to solve one problem and run the problem's test cases.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            problem: Dict with the keys described on the class.

        Returns:
            Dict with ``"task_id"``, ``"passed"`` (every test passed and there
            was at least one test) and ``"pass_rate"`` (fraction of passing
            tests; 0.0 when the problem has no tests).
        """
        prompt = problem["prompt"]
        test_cases = problem["test_cases"]
        entry_point = problem["entry_point"]

        code_response = model.generate(
            f"请完成以下Python函数:\n{prompt}"
        )
        generated_code = self.extract_code(code_response)
        test_results = self.run_tests(generated_code, entry_point, test_cases)

        passed_count = sum(test_results)
        total = len(test_results)
        return {
            "task_id": problem["task_id"],
            # A problem without tests cannot be confirmed as solved.
            "passed": total > 0 and passed_count == total,
            # Guard the division: an empty test list used to raise ZeroDivisionError.
            "pass_rate": passed_count / total if total else 0.0,
        }

    def extract_code(self, response: str) -> str:
        """Return the Python source contained in a model response.

        If the whole response parses as Python it is returned unchanged.
        Otherwise the first fenced code block (```python, ```py or a bare
        ``` fence) is returned, or ``""`` when there is none.
        """
        import re

        try:
            ast.parse(response)
        except (SyntaxError, ValueError):
            # ValueError covers sources ast refuses outright (e.g. null bytes).
            pass
        else:
            return response

        code_match = re.search(
            r"```(?:python|py)?[ \t]*\n(.*?)```",
            response,
            re.DOTALL | re.IGNORECASE,
        )
        if code_match:
            return code_match.group(1)
        return ""

    def run_tests(self, code: str, entry_point: str, test_cases: List[str]) -> List[bool]:
        """Execute ``code`` and run each test snippet against its entry point.

        The function is exposed to every test snippet as ``func`` and also
        under its own ``entry_point`` name.

        Returns:
            One bool per test case. If ``code`` fails to execute or does not
            define ``entry_point``, every test is reported as failed. A test
            that raises any exception fails on its own without affecting the
            results of the other tests.
        """
        # SECURITY: exec() runs model-generated code in this process with no
        # sandbox and no timeout. Only use with trusted models or inside an
        # isolated environment.
        exec_globals = {}
        try:
            exec(code, exec_globals)
            func = exec_globals[entry_point]
        except Exception:
            return [False] * len(test_cases)

        results = []
        for test_case in test_cases:
            try:
                exec(test_case, {"func": func, entry_point: func})
            except Exception:
                # Any error (not only AssertionError) fails this test only.
                results.append(False)
            else:
                results.append(True)
        return results
# Evaluation example. `model` is not defined in this snippet; evaluate_problem
# calls model.generate(prompt) on it, so it must be supplied by the caller.
evaluator = HumanEvalEvaluator()
results = []
for problem in evaluator.dataset:
    result = evaluator.evaluate_problem(model, problem)
    results.append(result)
# Guard the division so an empty dataset reports 0% instead of raising.
pass_rate = sum(r["passed"] for r in results) / len(results) if results else 0.0
print(f"HumanEval通过率: {pass_rate:.2%}")
3. GSM8K(数学推理)
class GSM8KEvaluator:
    """Score a model on GSM8K by comparing the last number in its reply
    with the last number in the reference answer."""

    def __init__(self):
        # Loads the "main" configuration of the gsm8k dataset.
        self.dataset = load_dataset("gsm8k", "main")

    def evaluate(self, model, num_samples: int = 200) -> Dict:
        """Evaluate ``model`` on up to ``num_samples`` test questions.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            num_samples: Upper bound on the number of questions evaluated.

        Returns:
            Dict with ``"accuracy"``, ``"correct"`` and ``"total"``.
        """
        test_split = self.dataset["test"]
        # Never select past the end of the split.
        sample_count = max(0, min(num_samples, len(test_split)))
        test_data = test_split.select(range(sample_count))

        correct = 0
        total = 0
        for sample in test_data:
            question = sample["question"]
            prompt = (
                "\n请一步步解决这个数学问题,最后给出数字答案。\n"
                f"问题: {question}\n"
                "解答:\n"
            )
            response = model.generate(prompt)

            predicted = self.extract_number(response)
            expected = self.extract_number(sample["answer"])
            # A reply without any number must not match (None == None would).
            if predicted is not None and predicted == expected:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        return {
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
        }

    def extract_number(self, text: str) -> "float | None":
        """Return the last number appearing in ``text``, or ``None`` if there is none.

        Thousands separators are removed first, so "1,234" is read as 1234
        rather than as the two numbers 1 and 234.
        """
        import re

        # Drop a comma only when exactly three digits follow it.
        cleaned = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)
        numbers = re.findall(r"-?\d+\.?\d*", cleaned)
        if numbers:
            return float(numbers[-1])
        return None
4. TruthfulQA(真实性)
class TruthfulQAEvaluator:
    """Score a model on TruthfulQA by sampling several answers per question.

    NOTE(review): ``check_truthfulness(responses, correct_answers)`` and
    ``check_informativeness(responses)`` are called by :meth:`evaluate` but
    are not defined in this class; they presumably come from a subclass or
    are still to be implemented.
    """

    def __init__(self):
        # The Hugging Face Hub id of the dataset is "truthful_qa".
        self.dataset = load_dataset("truthful_qa", "generation")

    def evaluate(self, model, num_responses: int = 5) -> Dict:
        """Evaluate ``model`` on every question of the validation split.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            num_responses: Number of answers sampled per question.

        Returns:
            Dict with ``"truthful_score"`` and ``"informative_score"``, each the
            fraction of questions judged truthful / informative (0.0 when the
            split is empty).
        """
        validation = self.dataset["validation"]
        total = len(validation)
        truthful_count = 0
        informative_count = 0

        for sample in validation:
            question = sample["question"]
            # Sample several answers for the same question.
            responses = [model.generate(question) for _ in range(num_responses)]

            if self.check_truthfulness(responses, sample["correct_answers"]):
                truthful_count += 1
            if self.check_informativeness(responses):
                informative_count += 1

        # Guard the divisions so an empty split does not raise.
        return {
            "truthful_score": truthful_count / total if total else 0.0,
            "informative_score": informative_count / total if total else 0.0,
        }
排行榜分析
class LeaderboardAnalyzer:
    """Build and plot a table comparing models across benchmarks."""

    def __init__(self):
        self.benchmarks = ["MMLU", "HumanEval", "GSM8K", "TruthfulQA"]

    # Annotations mentioning pandas are strings because pandas is imported
    # inside the methods; evaluating them at definition time would raise
    # NameError.
    def compare_models(self, model_results: Dict[str, Dict]) -> "pd.DataFrame":
        """Compare several models across the configured benchmarks.

        Args:
            model_results: Maps model name to a dict that maps benchmark name
                to a result dict containing an ``"accuracy"`` entry.
                Benchmarks missing for a model are left empty (NaN).

        Returns:
            DataFrame indexed by ``"Model"`` with one column per benchmark that
            appears in the input plus an ``"Average"`` column, sorted by
            ``"Average"`` in descending order. Empty input yields an empty
            DataFrame.

        Raises:
            KeyError: If a present benchmark result has no ``"accuracy"`` key.
        """
        import pandas as pd

        data = []
        for model_name, results in model_results.items():
            row = {"Model": model_name}
            for benchmark in self.benchmarks:
                if benchmark in results:
                    row[benchmark] = results[benchmark]["accuracy"]
            data.append(row)

        if not data:
            # Without rows there is no "Model" column for set_index to use.
            empty = pd.DataFrame(columns=[*self.benchmarks, "Average"])
            return empty.rename_axis("Model")

        df = pd.DataFrame(data).set_index("Model")
        # Row-wise mean over the benchmark columns.
        df["Average"] = df.mean(axis=1)
        return df.sort_values("Average", ascending=False)

    def visualize_comparison(self, df: "pd.DataFrame", output_path: str = "benchmark_comparison.png"):
        """Save a bar chart of ``df`` to ``output_path``.

        Args:
            df: Comparison table as returned by :meth:`compare_models`.
            output_path: File the figure is written to.
        """
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            df.plot(kind="bar", ax=ax)
            ax.set_title("LLM基准测试对比")
            ax.set_ylabel("得分")
            ax.tick_params(axis="x", labelrotation=45)
            fig.tight_layout()
            fig.savefig(output_path)
        finally:
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)
基准测试最佳实践
- 选择合适的基准:根据应用场景选择相关基准
- 多次评估:进行多次评估取平均值
- 统计显著性:使用置信区间评估结果
- 持续跟踪:定期重新评估以检测性能变化
总结
基准测试是LLM评估的重要工具。通过标准化的测试集和评估指标,可以客观比较模型性能,推动技术进步。