LLM基准测试:标准化的模型性能测试和对比方法
LLM基准测试:标准化的模型性能测试和对比方法
基准测试的重要性
基准测试提供标准化的评估框架,让你能够客观地比较不同模型的性能,为模型选择和优化提供数据支撑。
测试集设计
构建高质量测试集
import json
import time
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class BenchmarkItem:
    """One benchmark test case: a prompt, its reference answer, and scoring criteria."""

    # Identifier of the item; copied into per-item result records as 'item_id'.
    id: str
    # Category label; used to group item counts and per-category scores.
    category: str
    difficulty: str  # easy, medium, hard
    # Text passed to the model under test.
    prompt: str
    # Reference answer that the model output is compared against.
    expected_output: str
    # Keyed by metric name: a metric is only scored for this item when its
    # name appears here. The float values are not read by the code visible
    # in this file (presumably thresholds or weights -- TODO confirm).
    evaluation_criteria: Dict[str, float]
class BenchmarkDataset:
    """In-memory list of benchmark items with summary statistics and JSON export."""

    def __init__(self):
        self.items: List[BenchmarkItem] = []

    def add_item(self, item: BenchmarkItem):
        """Append ``item`` to the end of the dataset."""
        self.items += [item]

    def get_statistics(self):
        """Return the item count plus per-category and per-difficulty tallies.

        Returns:
            A dict with keys ``'total'``, ``'categories'`` and
            ``'difficulties'``; the latter two map each label to the number
            of items carrying it, in first-seen order.
        """
        per_category = {}
        per_difficulty = {}
        for entry in self.items:
            per_category.setdefault(entry.category, 0)
            per_category[entry.category] += 1
            per_difficulty.setdefault(entry.difficulty, 0)
            per_difficulty[entry.difficulty] += 1
        return dict(
            total=len(self.items),
            categories=per_category,
            difficulties=per_difficulty,
        )

    def export(self, filepath: str):
        """Write the dataset and its statistics to ``filepath`` as UTF-8 JSON.

        The item's ``evaluation_criteria`` field is stored under the shorter
        key ``'criteria'`` in the exported file.
        """
        serialized = []
        for entry in self.items:
            serialized.append({
                'id': entry.id,
                'category': entry.category,
                'difficulty': entry.difficulty,
                'prompt': entry.prompt,
                'expected_output': entry.expected_output,
                'criteria': entry.evaluation_criteria,
            })
        payload = {
            'metadata': self.get_statistics(),
            'items': serialized,
        }
        # ensure_ascii=False keeps non-ASCII prompts human-readable in the file.
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
测试集分类体系
class BenchmarkCategories:
    """Static catalogue of benchmark categories and their scoring weights."""

    # Each entry carries a display name, its subcategories and the weight the
    # category contributes to a weighted total. The five weights sum to 1.0.
    CATEGORIES = {
        'knowledge': {
            'name': '知识问答',
            'subcategories': ['science', 'history', 'geography', 'culture'],
            'weight': 0.25
        },
        'reasoning': {
            'name': '逻辑推理',
            'subcategories': ['math', 'logic', 'common_sense', 'causal'],
            'weight': 0.25
        },
        'language': {
            'name': '语言理解',
            'subcategories': ['comprehension', 'summarization', 'translation'],
            'weight': 0.20
        },
        'creativity': {
            'name': '创意生成',
            'subcategories': ['writing', 'brainstorming', 'storytelling'],
            'weight': 0.15
        },
        'safety': {
            'name': '安全性',
            'subcategories': ['harmful', 'biased', 'misinformation'],
            'weight': 0.15
        }
    }

    @classmethod
    def get_category_weight(cls, category: str) -> float:
        """Return the weight of ``category``, or 0 if the category is unknown."""
        return cls.CATEGORIES.get(category, {}).get('weight', 0)

    @classmethod
    def get_all_weights(cls) -> Dict[str, float]:
        """Return a new ``{category_key: weight}`` dict covering every category.

        A fresh dict is built on each call, so callers may mutate the result
        without affecting ``CATEGORIES``.
        """
        return {
            key: spec.get('weight', 0)
            for key, spec in cls.CATEGORIES.items()
        }
评估指标实现
标准化评估指标
import numpy as np
from collections import defaultdict
class BenchmarkEvaluator:
    """Score model output against a reference answer with simple text metrics.

    All token-based metrics split on whitespace (``str.split``). Text written
    without spaces between words is therefore treated as a single token.
    """

    def __init__(self):
        # Metric name -> scorer(expected, predicted) returning a float in [0, 1].
        self.scorers = {
            'accuracy': self.accuracy_scorer,
            'f1': self.f1_scorer,
            'bleu': self.bleu_scorer,
            'rouge': self.rouge_scorer,
            'exact_match': self.exact_match_scorer
        }

    def evaluate_item(self, item: "BenchmarkItem", model_output: str) -> Dict[str, float]:
        """Evaluate a single benchmark item.

        Only metrics whose name appears in ``item.evaluation_criteria`` are
        computed; the result may be empty if none of them is registered.

        Args:
            item: The benchmark item supplying the reference answer and the
                set of requested metric names.
            model_output: Text produced by the model under test.

        Returns:
            Mapping of metric name to score.
        """
        return {
            metric_name: scorer(item.expected_output, model_output)
            for metric_name, scorer in self.scorers.items()
            if metric_name in item.evaluation_criteria
        }

    def accuracy_scorer(self, expected: str, predicted: str) -> float:
        """Return 1.0 if both strings are equal after trimming outer whitespace, else 0.0."""
        return 1.0 if expected.strip() == predicted.strip() else 0.0

    def exact_match_scorer(self, expected: str, predicted: str) -> float:
        """Return 1.0 only if the two strings are identical, else 0.0.

        Unlike ``accuracy_scorer`` no whitespace is trimmed, so this is the
        strictest of the registered metrics.
        """
        return 1.0 if expected == predicted else 0.0

    def f1_scorer(self, expected: str, predicted: str) -> float:
        """Return the F1 score over the sets of unique whitespace tokens.

        Returns 0.0 when either side has no tokens or when there is no
        overlap, instead of dividing by zero.
        """
        expected_tokens = set(expected.split())
        predicted_tokens = set(predicted.split())
        # Guard both denominators: precision uses the prediction size,
        # recall uses the reference size.
        if not expected_tokens or not predicted_tokens:
            return 0.0
        common = expected_tokens & predicted_tokens
        precision = len(common) / len(predicted_tokens)
        recall = len(common) / len(expected_tokens)
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    def bleu_scorer(self, expected: str, predicted: str) -> float:
        """Return a simplified BLEU score: unigram precision only.

        This is the fraction of predicted tokens that occur anywhere in the
        reference. There is no count clipping, no higher-order n-grams and
        no brevity penalty. Returns 0.0 for an empty prediction.
        """
        predicted_tokens = predicted.split()
        if not predicted_tokens:
            return 0.0
        # A set gives O(1) membership tests; the result is unchanged because
        # only presence in the reference matters, not the count.
        reference_vocab = set(expected.split())
        matches = sum(1 for token in predicted_tokens if token in reference_vocab)
        return matches / len(predicted_tokens)

    def rouge_scorer(self, expected: str, predicted: str) -> float:
        """Return a simplified ROUGE score: unigram recall.

        The numerator counts unique shared tokens while the denominator is
        the full reference token count (duplicates included). Returns 0.0
        for an empty reference.
        """
        expected_tokens = expected.split()
        predicted_tokens = predicted.split()
        if not expected_tokens:
            return 0.0
        common = set(expected_tokens) & set(predicted_tokens)
        return len(common) / len(expected_tokens)
综合评分系统
class BenchmarkScorer:
    """Aggregate per-item metric scores into per-category and weighted totals."""

    def __init__(self, category_weights: Dict[str, float]):
        """Store the category weights used for the weighted total.

        Args:
            category_weights: Mapping of category key to its weight.
        """
        self.category_weights = category_weights
        self.evaluator = BenchmarkEvaluator()

    def score_model(self, results: List[Dict]) -> Dict:
        """Compute a model's overall score from its per-item results.

        Each item's score is the mean of its metric scores; each category's
        score is the mean of its item scores; the total is the weighted sum
        of category scores. Categories with a weight but no results
        contribute 0, and categories without a weight are reported but do not
        affect the total.

        Args:
            results: Per-item records, each with a ``'category'`` key and a
                ``'scores'`` dict of metric name to score.

        Returns:
            A dict with ``'total_score'``, ``'category_scores'`` and
            ``'detailed_results'`` (the ``results`` list, passed through).
        """
        category_scores = defaultdict(list)
        for result in results:
            metric_values = list(result['scores'].values())
            # An item with no computed metrics has no defined mean: np.mean([])
            # yields NaN, which would poison the category average and the
            # total. Such items are left out of the aggregation.
            if not metric_values:
                continue
            category_scores[result['category']].append(np.mean(metric_values))

        # Average score per category, as plain Python floats.
        category_averages = {
            cat: float(np.mean(scores))
            for cat, scores in category_scores.items()
        }

        # Weighted total across the configured categories.
        total_score = sum(
            category_averages.get(cat, 0) * weight
            for cat, weight in self.category_weights.items()
        )

        return {
            'total_score': total_score,
            'category_scores': category_averages,
            'detailed_results': results
        }
模型对比测试
对比测试框架
class ModelComparison:
    """Run several models over one benchmark dataset and compare their scores."""

    def __init__(self, benchmark_dataset: "BenchmarkDataset", evaluator=None):
        """Set up a comparison over ``benchmark_dataset``.

        Args:
            benchmark_dataset: Dataset whose ``items`` are run against each model.
            evaluator: Optional object exposing
                ``evaluate_item(item, model_output)``. Defaults to a new
                ``BenchmarkEvaluator``.
        """
        self.dataset = benchmark_dataset
        # test_model() reads self.evaluator, so it has to exist on every instance.
        self.evaluator = evaluator if evaluator is not None else BenchmarkEvaluator()
        # Model name -> list of per-item result records from test_model().
        self.results = {}

    def test_model(self, model_name: str, model_client) -> Dict:
        """Run every dataset item through one model and score the outputs.

        Args:
            model_name: Key under which the results are stored in ``self.results``.
            model_client: Object exposing
                ``generate(prompt=..., max_tokens=..., temperature=...)``;
                presumably returns the generated text -- confirm against the
                client in use.

        Returns:
            The list of per-item result records, also stored in
            ``self.results[model_name]``.
        """
        results = []
        for item in self.dataset.items:
            # temperature=0.0 keeps generation settings identical across models.
            response = model_client.generate(
                prompt=item.prompt,
                max_tokens=500,
                temperature=0.0
            )
            scores = self.evaluator.evaluate_item(item, response)
            results.append({
                'item_id': item.id,
                'category': item.category,
                'difficulty': item.difficulty,
                'model_output': response,
                'expected_output': item.expected_output,
                'scores': scores
            })
        self.results[model_name] = results
        return results

    def compare_models(self, model_names: List[str]) -> Dict:
        """Score the stored results of each named model and build a report.

        Models that have not been run through ``test_model`` are scored on an
        empty result list.
        """
        # Read the weights straight from the category table so this method
        # only depends on the CATEGORIES attribute.
        weights = {
            category: spec['weight']
            for category, spec in BenchmarkCategories.CATEGORIES.items()
        }
        # One scorer serves every model; the weights do not vary per model.
        scorer = BenchmarkScorer(weights)
        comparison = {
            model_name: scorer.score_model(self.results.get(model_name, []))
            for model_name in model_names
        }
        return self.generate_comparison_report(comparison)

    def generate_comparison_report(self, comparison: Dict) -> Dict:
        """Rank models by total score and find the leader of each category.

        Args:
            comparison: Mapping of model name to a scored result containing
                ``'total_score'`` and ``'category_scores'``.

        Returns:
            A dict with ``'rankings'`` (``(name, total_score)`` tuples, best
            first), ``'category_leaders'`` and ``'detailed_comparison'``.
        """
        rankings = sorted(
            comparison.items(),
            key=lambda x: x[1]['total_score'],
            reverse=True
        )

        # On a tie the model seen first keeps the lead (strict comparison).
        category_leaders = {}
        for model_name, scores in comparison.items():
            for category, score in scores['category_scores'].items():
                if category not in category_leaders or score > category_leaders[category]['score']:
                    category_leaders[category] = {
                        'model': model_name,
                        'score': score
                    }

        return {
            'rankings': [(name, data['total_score']) for name, data in rankings],
            'category_leaders': category_leaders,
            'detailed_comparison': comparison
        }
性能测试
class PerformanceBenchmark:
    """Measure latency, throughput and memory use of a model client."""

    def __init__(self):
        # Buckets for collected measurements; the methods below return their
        # results and do not write to these lists.
        self.metrics = {
            'latency': [],
            'throughput': [],
            'memory': []
        }

    def measure_latency(self, model_client, prompt: str, n_runs: int = 10) -> Dict:
        """Time ``n_runs`` sequential ``generate`` calls.

        Args:
            model_client: Object exposing ``generate(prompt)``.
            prompt: Prompt sent on every run.
            n_runs: Number of timed calls; must be at least 1.

        Returns:
            Latency statistics in seconds: mean, p50, p95, p99 and standard
            deviation.

        Raises:
            ValueError: If ``n_runs`` is less than 1, since the statistics
                are undefined for an empty sample.
        """
        if n_runs < 1:
            raise ValueError("n_runs must be at least 1")

        latencies = []
        for _ in range(n_runs):
            # perf_counter is monotonic, so a system clock adjustment during
            # the run cannot distort the measured interval.
            started = time.perf_counter()
            model_client.generate(prompt)
            latencies.append(time.perf_counter() - started)

        return {
            'avg_latency': np.mean(latencies),
            'p50_latency': np.percentile(latencies, 50),
            'p95_latency': np.percentile(latencies, 95),
            'p99_latency': np.percentile(latencies, 99),
            'std_latency': np.std(latencies)
        }

    def measure_throughput(self, model_client, prompt: str, duration: int = 60) -> Dict:
        """Issue sequential ``generate`` calls for about ``duration`` seconds.

        Args:
            model_client: Object exposing ``generate(prompt)``.
            prompt: Prompt sent on every request.
            duration: Time budget in seconds; must be positive.

        Returns:
            ``'requests_per_second'`` and ``'total_requests'``.

        Raises:
            ValueError: If ``duration`` is not positive.
        """
        if duration <= 0:
            raise ValueError("duration must be positive")

        count = 0
        started = time.perf_counter()
        deadline = started + duration
        while time.perf_counter() < deadline:
            model_client.generate(prompt)
            count += 1
        # The last request usually finishes after the deadline. Dividing by
        # the time actually spent avoids overstating the rate.
        elapsed = time.perf_counter() - started

        return {
            'requests_per_second': count / elapsed if elapsed > 0 else 0.0,
            'total_requests': count
        }

    def measure_memory(self, model_client, prompt: str) -> Dict:
        """Report the resident memory of this process around one ``generate`` call.

        Only the current process is sampled, before and after the call, so
        peaks during the call and memory held by other processes are not
        captured.

        Returns:
            RSS before, RSS after and their difference, all in MB.
        """
        import psutil
        process = psutil.Process()

        mem_before = process.memory_info().rss / 1024 / 1024  # bytes -> MB
        model_client.generate(prompt)
        mem_after = process.memory_info().rss / 1024 / 1024

        return {
            'memory_before_mb': mem_before,
            'memory_after_mb': mem_after,
            'memory_delta_mb': mem_after - mem_before
        }
基准测试报告
def generate_benchmark_report(comparison_results, performance_results):
    """Combine accuracy comparison and performance results into one report.

    Args:
        comparison_results: Dict with ``'rankings'`` (a best-first sequence
            whose entries start with the model name) and
            ``'category_leaders'``.
        performance_results: Performance measurements, passed through
            unchanged.

    Returns:
        A dict with ``'executive_summary'``, ``'accuracy_comparison'``,
        ``'performance_comparison'`` and ``'recommendations'``.
        ``'best_overall'`` is ``None`` when no model was ranked.
    """
    rankings = comparison_results['rankings']
    # An empty ranking (no models compared) has no winner to report.
    best_overall = rankings[0][0] if rankings else None

    return {
        'executive_summary': {
            'best_overall': best_overall,
            'best_by_category': comparison_results['category_leaders']
        },
        'accuracy_comparison': {
            'rankings': rankings
        },
        'performance_comparison': performance_results,
        # NOTE(review): generate_recommendations is not defined in the part
        # of the file visible here -- confirm it exists before use.
        'recommendations': generate_recommendations(
            comparison_results,
            performance_results
        )
    }
最佳实践
- 测试集多样性:覆盖多个领域和难度级别
- 可重复性:固定随机种子,并将采样温度(temperature)设为 0,确保结果可复现
- 多维度评估:准确性、延迟、吞吐量和内存占用都要测
- 公平对比:相同参数设置下对比
- 持续更新:定期更新测试集
- 公开透明:公开测试方法和结果