← 返回首页
🧠

基准测试

📂 llm ⏱ 4 min 610 words

---
title: "基准测试"
description: "LLM基准测试详解,包括MMLU、HumanEval等主流基准测试和排行榜分析"
tags: ["基准测试", "MMLU", "HumanEval", "排行榜"]
category: "llm"
icon: "🧠"
---

基准测试

基准测试的意义

基准测试(Benchmark)是评估LLM性能的标准方法。通过统一的测试集和评估标准,可以客观比较不同模型的能力,推动模型发展。

主流基准测试

1. MMLU(Massive Multitask Language Understanding)

import json
from typing import Dict, List, Optional

from datasets import load_dataset

class MMLUEvaluator:
    """Score a model on multiple-choice questions from the MMLU benchmark.

    The model handed to :meth:`evaluate_subject` only needs a
    ``generate(prompt)`` method that returns the response text.
    """

    def __init__(self):
        # The "all" configuration is loaded once; evaluate_subject narrows it
        # down to a single subject on demand.
        self.dataset = load_dataset("cais/mmlu", "all")
        self.subjects = [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics",
            "college_biology", "college_chemistry", "computer_security"
        ]

    def evaluate_subject(self, model, subject: str, num_samples: int = 100) -> Dict:
        """Evaluate ``model`` on up to ``num_samples`` test questions of one subject.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            subject: Value matched against each sample's ``"subject"`` field.
            num_samples: Upper bound on the number of questions evaluated.

        Returns:
            Dict with keys ``"subject"``, ``"accuracy"``, ``"correct"`` and
            ``"total"``. ``"accuracy"`` is 0 when no question was evaluated.
        """
        subject_data = self.dataset["test"].filter(
            lambda x: x["subject"] == subject
        )
        # Cap by the size of the *filtered* subset: capping by the whole test
        # split would request indices that do not exist for small subjects.
        test_data = subject_data.select(
            range(min(num_samples, len(subject_data)))
        )

        correct = 0
        total = 0
        for sample in test_data:
            prompt = self.format_question(sample["question"], sample["choices"])
            prediction = model.generate(prompt)

            # assumes sample["answer"] is the 0-based index of the correct
            # choice, matching what extract_answer returns -- TODO confirm
            if self.extract_answer(prediction) == sample["answer"]:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0

        return {
            "subject": subject,
            "accuracy": accuracy,
            "correct": correct,
            "total": total
        }

    def format_question(self, question: str, choices: List[str]) -> str:
        """Render a question and its choices as a prompt with lettered options.

        Choices are labelled ``A``, ``B``, ``C``, ... in the order given.
        """
        option_lines = "".join(
            f"{chr(65+i)}. {choice}\n" for i, choice in enumerate(choices)
        )
        return (
            f"问题: {question}\n\n选项:\n"
            + option_lines
            + "\n请给出正确答案的字母:"
        )

    def extract_answer(self, response: str) -> int:
        """Map the model's reply to a choice index.

        The first letter ``A``-``D`` (case-insensitive) that is not part of a
        longer Latin word is taken as the answer, so replies such as
        ``"Answer: B"`` or ``"答案是D"`` are understood, and a word that merely
        starts with one of the letters (``"Because ..."``) is not mistaken for
        an answer.

        Returns:
            0-3 for ``A``-``D``, or -1 when no standalone letter is found.
        """
        import re

        normalized = response.strip().upper()
        match = re.search(r"(?<![A-Z])[A-D](?![A-Z])", normalized)
        if match is None:
            return -1
        return ord(match.group()) - ord("A")

# Usage example: score one MMLU subject and print its accuracy as a percentage.
# NOTE(review): `model` is not defined in this snippet; it has to be created
# beforehand and expose generate(prompt), which evaluate_subject calls.
evaluator = MMLUEvaluator()
result = evaluator.evaluate_subject(model, "computer_security")
print(f"MMLU计算机安全准确率: {result['accuracy']:.2%}")

2. HumanEval(代码生成评估)

from typing import Callable
import ast

class HumanEvalEvaluator:
    """Score model-generated Python functions against per-problem test snippets.

    NOTE(review): ``load_humaneval`` is called by ``__init__`` but is not
    defined in this class as shown; it has to be supplied (for example by a
    subclass) before the evaluator can be instantiated.
    """

    def __init__(self):
        self.dataset = self.load_humaneval()

    def evaluate_problem(self, model, problem: Dict) -> Dict:
        """Generate a solution for one problem and run its tests.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            problem: Mapping with the keys ``"task_id"``, ``"prompt"``,
                ``"test_cases"`` and ``"entry_point"``.

        Returns:
            Dict with ``"task_id"``, ``"passed"`` (True only when there is at
            least one test and all of them pass) and ``"pass_rate"`` (fraction
            of passing tests, 0.0 when there are no tests).
        """
        prompt = problem["prompt"]
        test_cases = problem["test_cases"]
        entry_point = problem["entry_point"]

        code_response = model.generate(
            f"请完成以下Python函数:\n{prompt}"
        )
        generated_code = self.extract_code(code_response)
        test_results = self.run_tests(generated_code, entry_point, test_cases)

        # With zero tests there is nothing to divide by, and all([]) would
        # otherwise report the problem as solved.
        num_tests = len(test_results)
        return {
            "task_id": problem["task_id"],
            "passed": num_tests > 0 and all(test_results),
            "pass_rate": sum(test_results) / num_tests if num_tests else 0.0
        }

    def extract_code(self, response: str) -> str:
        """Pull Python source out of a model response.

        A response that already parses as Python is returned unchanged.
        Otherwise the first fenced block tagged ``python`` is used, then the
        first untagged (or ``py``/``python3``-tagged) fenced block.

        Returns:
            The extracted code, or ``""`` when none is found.
        """
        import re

        try:
            ast.parse(response)
        except (SyntaxError, ValueError):
            # ValueError covers sources the parser rejects outright
            # (e.g. embedded null bytes on some Python versions).
            pass
        else:
            return response

        # Prefer an explicitly tagged block so that an earlier non-Python
        # fence in the same response is never picked up by mistake.
        fence_patterns = (
            r'```python\n(.*?)```',
            r'```(?:py|python3)?[ \t]*\n(.*?)```',
        )
        for pattern in fence_patterns:
            code_match = re.search(pattern, response, re.DOTALL)
            if code_match:
                return code_match.group(1)
        return ""

    def run_tests(self, code: str, entry_point: str, test_cases: List[str]) -> List[bool]:
        """Execute ``code`` and run each test snippet against its entry point.

        Each test snippet runs in a fresh namespace in which the function is
        available both as ``func`` and under its own ``entry_point`` name.

        Args:
            code: Source defining the function under test.
            entry_point: Name of the function defined by ``code``.
            test_cases: Python snippets (typically ``assert`` statements).

        Returns:
            One boolean per test case; all False when ``code`` fails to
            execute or does not define ``entry_point``.
        """
        # SECURITY: exec() runs model-generated code inside this process with
        # no sandbox, timeout or resource limits. Only use this with output
        # you are prepared to execute, or move it into an isolated process.
        exec_globals = {}
        try:
            exec(code, exec_globals)
            func = exec_globals[entry_point]
        except Exception:
            return [False] * len(test_cases)

        results = []
        for test_case in test_cases:
            try:
                exec(test_case, {"func": func, entry_point: func})
            except Exception:
                # Any failure (not just AssertionError) fails this test only;
                # results of the other tests are kept.
                results.append(False)
            else:
                results.append(True)
        return results

# Evaluation example: run every HumanEval problem and report the share of
# problems whose tests all pass.
# NOTE(review): `model` is not defined in this snippet; it has to be created
# beforehand and expose generate(prompt), which evaluate_problem calls.
evaluator = HumanEvalEvaluator()
results = []
for problem in evaluator.dataset:
    result = evaluator.evaluate_problem(model, problem)
    results.append(result)

# An empty dataset yields 0.00% instead of raising ZeroDivisionError.
pass_rate = sum(r["passed"] for r in results) / len(results) if results else 0.0
print(f"HumanEval通过率: {pass_rate:.2%}")

3. GSM8K(数学推理)

class GSM8KEvaluator:
    """Score a model on GSM8K word problems by comparing final numeric answers.

    The last number found in the model's response is compared with the last
    number found in the reference answer text.
    """

    def __init__(self):
        self.dataset = load_dataset("gsm8k", "main")

    def evaluate(self, model, num_samples: int = 200) -> Dict:
        """Evaluate ``model`` on up to ``num_samples`` test problems.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            num_samples: Upper bound on the number of problems evaluated.

        Returns:
            Dict with keys ``"accuracy"``, ``"correct"`` and ``"total"``.
            ``"accuracy"`` is 0 when no problem was evaluated.
        """
        test_split = self.dataset["test"]
        # Never ask for more rows than the split holds.
        test_data = test_split.select(range(min(num_samples, len(test_split))))

        correct = 0
        total = 0

        for sample in test_data:
            question = sample["question"]
            answer = sample["answer"]

            # Ask for step-by-step reasoning followed by a numeric answer.
            prompt = f"""
            请一步步解决这个数学问题,最后给出数字答案。
            
            问题: {question}
            
            解答:
            """

            response = model.generate(prompt)

            predicted = self.extract_number(response)
            expected = self.extract_number(answer)

            # None means "no number found"; two failed extractions must not
            # be counted as a correct answer.
            if predicted is not None and predicted == expected:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "correct": correct,
            "total": total
        }

    def extract_number(self, text: str) -> Optional[float]:
        """Return the last number that appears in ``text``.

        Numbers written with thousands separators (``"1,000"``,
        ``"12,345.50"``) are read as a single value rather than being cut at
        the comma.

        Returns:
            The number as a float, or None when ``text`` contains no digits.
        """
        import re

        # First alternative: comma-grouped number; second: plain number.
        numbers = re.findall(
            r'-?\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|-?\d+\.?\d*', text
        )
        if not numbers:
            return None
        return float(numbers[-1].replace(",", ""))

4. TruthfulQA(真实性)

class TruthfulQAEvaluator:
    """Score a model on the TruthfulQA generation task.

    NOTE(review): ``check_truthfulness`` and ``check_informativeness`` are
    called by :meth:`evaluate` but are not defined in this class as shown;
    they have to be supplied (for example by a subclass).
    """

    def __init__(self):
        # "truthful_qa" is the dataset id on the Hugging Face Hub; the
        # spelling without the underscore does not resolve to a dataset.
        self.dataset = load_dataset("truthful_qa", "generation")

    def evaluate(self, model, num_responses: int = 5) -> Dict:
        """Evaluate ``model`` on every question of the validation split.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            num_responses: Number of responses sampled per question.

        Returns:
            Dict with ``"truthful_score"`` and ``"informative_score"``, each
            the fraction of questions judged truthful / informative. Both are
            0.0 when the validation split is empty.
        """
        validation = self.dataset["validation"]
        total = len(validation)
        truthful_count = 0
        informative_count = 0

        for sample in validation:
            question = sample["question"]

            # Sample several responses for the same question.
            responses = [model.generate(question) for _ in range(num_responses)]

            is_truthful = self.check_truthfulness(responses, sample["correct_answers"])
            is_informative = self.check_informativeness(responses)

            if is_truthful:
                truthful_count += 1
            if is_informative:
                informative_count += 1

        if total == 0:
            # Nothing was evaluated; avoid dividing by zero.
            return {"truthful_score": 0.0, "informative_score": 0.0}

        return {
            "truthful_score": truthful_count / total,
            "informative_score": informative_count / total
        }

排行榜分析

class LeaderboardAnalyzer:
    """Build and plot a comparison table of models across benchmarks."""

    def __init__(self):
        self.benchmarks = ["MMLU", "HumanEval", "GSM8K", "TruthfulQA"]

    # The DataFrame annotations are strings because pandas is imported lazily
    # inside the methods; an unquoted ``pd.DataFrame`` would be evaluated when
    # the class is defined, before ``pd`` exists.
    def compare_models(self, model_results: Dict[str, Dict]) -> "pd.DataFrame":
        """Compare several models across the known benchmarks.

        Args:
            model_results: Maps a model name to a dict of
                ``benchmark name -> result``, where each result provides an
                ``"accuracy"`` entry. Benchmarks missing for a model are left
                empty (NaN) in its row.

        Returns:
            DataFrame indexed by ``"Model"`` with one column per benchmark
            that was reported, plus an ``"Average"`` column (row mean, NaN
            ignored), sorted by ``"Average"`` in descending order. Empty
            input yields an empty frame with the same index name and an
            ``"Average"`` column.
        """
        import pandas as pd

        if not model_results:
            # Without rows there is no "Model" column to index on.
            return pd.DataFrame(
                columns=["Average"], index=pd.Index([], name="Model")
            )

        data = []
        for model_name, results in model_results.items():
            row = {"Model": model_name}
            row.update(
                {
                    benchmark: results[benchmark]["accuracy"]
                    for benchmark in self.benchmarks
                    if benchmark in results
                }
            )
            data.append(row)

        df = pd.DataFrame(data).set_index("Model")

        # Row-wise mean over whichever benchmarks each model reported.
        df["Average"] = df.mean(axis=1)

        return df.sort_values("Average", ascending=False)

    def visualize_comparison(self, df: "pd.DataFrame"):
        """Draw ``df`` as a grouped bar chart and save it to disk.

        The chart is written to ``benchmark_comparison.png`` in the current
        working directory.
        """
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(12, 6))
        df.plot(kind="bar", ax=ax)
        plt.title("LLM基准测试对比")
        plt.ylabel("得分")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig("benchmark_comparison.png")
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

基准测试最佳实践

  1. 选择合适的基准:根据应用场景选择相关基准
  2. 多次评估:进行多次评估取平均值
  3. 统计显著性:报告置信区间(例如对测试样本做 bootstrap 重采样),以判断模型之间的差异是否超出随机波动
  4. 持续跟踪:定期重新评估以检测性能变化

总结

基准测试是LLM评估的重要工具。通过标准化的测试集和评估指标,可以客观比较模型性能,推动技术进步。