LLM评估方法:如何科学评价大语言模型
LLM评估方法:如何科学评价大语言模型
为什么需要评估LLM?
评估是理解LLM能力、选择合适模型、指导优化方向的关键。
评估的挑战
- 开放式输出:LLM生成的答案可能多种多样
- 主观性:很多任务没有唯一正确答案
- 多维度:需要评估多个方面的能力
- 数据泄露:测试数据可能在训练集中
自动评估指标
1. BLEU(Bilingual Evaluation Understudy)
用于评估机器翻译和文本生成质量。
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
def _split_tokens(sentence):
    """Split a sentence into BLEU tokens.

    Whitespace separates tokens. In addition, every CJK ideograph
    (U+4E00..U+9FFF) becomes its own token, because Chinese text has no
    spaces between words and would otherwise collapse into one token.
    For text without CJK ideographs this is identical to ``sentence.split()``.
    """
    tokens = []
    for chunk in sentence.split():
        pending = []  # run of non-CJK characters seen so far in this chunk
        for char in chunk:
            if "\u4e00" <= char <= "\u9fff":
                if pending:
                    tokens.append("".join(pending))
                    pending = []
                tokens.append(char)
            else:
                pending.append(char)
        if pending:
            tokens.append("".join(pending))
    return tokens


def compute_bleu(references, hypotheses, tokenize=None):
    """Compute corpus-level BLEU-4 with NLTK's ``corpus_bleu``.

    Args:
        references: list with one entry per hypothesis; each entry is a
            list of reference sentences (strings).
        hypotheses: list of hypothesis sentences (strings).
        tokenize: optional callable turning a sentence into a list of
            tokens. Defaults to whitespace splitting that additionally
            separates CJK ideographs into single-character tokens.

    Returns:
        The value returned by ``corpus_bleu`` for uniform 1- to 4-gram
        weights with ``SmoothingFunction().method1`` smoothing.
    """
    if tokenize is None:
        tokenize = _split_tokens

    # Tokenize every reference and hypothesis
    references_tokenized = [[tokenize(ref) for ref in refs] for refs in references]
    hypotheses_tokenized = [tokenize(hyp) for hyp in hypotheses]

    # Smoothing keeps short sentences with no higher-order n-gram matches
    # from collapsing the geometric mean to zero
    smoothie = SmoothingFunction().method1
    return corpus_bleu(
        references_tokenized,
        hypotheses_tokenized,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothie,
    )
# Example: one hypothesis scored against two acceptable references.
# `references` is a list (one entry per hypothesis) of lists of references.
references = [["我喜欢吃苹果", "我爱吃苹果"]]
hypotheses = ["我喜欢吃苹果"]
bleu = compute_bleu(references, hypotheses)
print(f"BLEU: {bleu:.4f}")
2. ROUGE(Recall-Oriented Understudy for Gisting Evaluation)
用于评估文本摘要质量。
from rouge_score import rouge_scorer
def compute_rouge(references, hypotheses):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L over reference/hypothesis pairs.

    Args:
        references: iterable of reference strings.
        hypotheses: iterable of hypothesis strings, paired with
            ``references`` by position (pairing stops at the shorter one).

    Returns:
        A dict mapping each metric name ('rouge1', 'rouge2', 'rougeL') to a
        dict with the mean 'precision', 'recall' and 'fmeasure'. Every
        value is 0.0 when there are no pairs to score.
    """
    metrics = ('rouge1', 'rouge2', 'rougeL')
    stats = ('precision', 'recall', 'fmeasure')
    # NOTE(review): the scorer is configured with English stemming; confirm
    # the tokenizer is suitable before scoring non-English text.
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)

    totals = {metric: dict.fromkeys(stats, 0.0) for metric in metrics}
    pair_count = 0
    for ref, hyp in zip(references, hypotheses):
        scores = scorer.score(ref, hyp)
        for metric in metrics:
            for stat in stats:
                # Each score is a namedtuple, so its fields are read as
                # attributes; indexing it with a string raises TypeError.
                totals[metric][stat] += getattr(scores[metric], stat)
        pair_count += 1

    # Nothing to average: return the zero-filled table instead of dividing by 0
    if pair_count == 0:
        return totals

    return {
        metric: {stat: totals[metric][stat] / pair_count for stat in stats}
        for metric in metrics
    }
3. 困惑度(Perplexity)
衡量语言模型预测下一个词的能力。
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def compute_perplexity(model, tokenizer, text):
    """Return the perplexity of ``text`` under a causal language model.

    The text is tokenized, fed to the model with the input ids as labels,
    and the exponential of the returned loss is reported.

    Args:
        model: callable accepting the tokenizer's outputs as keyword
            arguments plus ``labels``, returning an object with ``.loss``.
        tokenizer: callable producing a mapping of PyTorch tensors that
            contains ``"input_ids"``.
        text: the string to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # Put the inputs on the model's device; without this a model that was
    # moved to an accelerator fails with a device-mismatch error.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Scoring only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])

    return torch.exp(outputs.loss).item()
# Example: score one sentence with the pretrained "gpt2" checkpoint
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
text = "人工智能正在改变我们的世界。"
ppl = compute_perplexity(model, tokenizer, text)
print(f"困惑度: {ppl:.2f}")
4. BERTScore
使用BERT计算语义相似度。
from bert_score import score
def compute_bertscore(references, hypotheses):
    """Return mean BERTScore precision, recall and F1 for the given pairs.

    Hypotheses are scored against references with ``bert_score.score``
    using ``lang="zh"`` and the ``bert-base-chinese`` model; each of the
    three returned tensors is averaged into a Python float.
    """
    per_pair = score(hypotheses, references, lang="zh", model_type="bert-base-chinese")
    labels = ("precision", "recall", "f1")
    # `score` yields (P, R, F1) in that order; pair each with its label
    return {label: values.mean().item() for label, values in zip(labels, per_pair)}
基准测试
1. MMLU(Massive Multitask Language Understanding)
评估模型在57个学科上的知识和推理能力。
def evaluate_mmlu(model, tokenizer, dataset):
    """Return the model's accuracy on four-option multiple-choice examples.

    Args:
        model: object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: callable returning tensors (including ``"input_ids"``)
            and exposing ``decode``.
        dataset: iterable of examples with ``"question"``, ``"choices"``
            (at least four entries) and ``"answer"`` keys.

    Returns:
        ``correct / total`` as a float, or 0.0 when the dataset is empty.
    """
    correct = 0
    total = 0
    for example in dataset:
        question = example["question"]
        choices = example["choices"]
        answer = example["answer"]

        # Build the prompt
        prompt = f"""
{question}
A. {choices[0]}
B. {choices[1]}
C. {choices[2]}
D. {choices[3]}
答案是:
"""
        # Generate a single new token
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=1)

        # Decoder-only models return prompt + continuation. Decode only the
        # continuation so the option letters inside the prompt cannot be
        # mistaken for the model's answer.
        # NOTE(review): assumes parse_answer works on the generated text
        # alone — confirm against its implementation.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        # Parse the answer and compare with the expected one
        predicted = parse_answer(response)
        if predicted == answer:
            correct += 1
        total += 1

    # Avoid dividing by zero on an empty dataset
    if total == 0:
        return 0.0
    return correct / total
2. HumanEval
评估代码生成能力。
def evaluate_humaneval(model, tokenizer, problems, num_samples=20):
    """Sample code completions per problem and average pass@1 / pass@10.

    Args:
        model: object exposing ``generate``.
        tokenizer: callable returning tensors and exposing ``decode``.
        problems: iterable of dicts with ``"prompt"`` and ``"test_cases"``.
        num_samples: number of completions sampled per problem
            (default 20, must be at least 10 so pass@10 is defined).

    Returns:
        ``{"pass@1": ..., "pass@10": ...}`` averaged over all problems;
        both values are 0.0 when ``problems`` is empty.

    Raises:
        ValueError: if ``num_samples`` is smaller than 10.
    """
    if num_samples < 10:
        raise ValueError("num_samples must be at least 10 to compute pass@10")

    pass_at_k_results = []
    for problem in problems:
        prompt = problem["prompt"]
        test_cases = problem["test_cases"]

        # The prompt is the same for every sample, so tokenize it once
        inputs = tokenizer(prompt, return_tensors="pt")

        # Sample several candidate solutions
        solutions = []
        for _ in range(num_samples):
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.8,
                do_sample=True,
            )
            # Drop special tokens so markers such as end-of-text do not end
            # up inside the code that is later executed
            code = tokenizer.decode(outputs[0], skip_special_tokens=True)
            solutions.append(code)

        # Compute pass@k for this problem
        pass_at_k_results.append({
            "pass@1": compute_pass_at_k(solutions, test_cases, k=1),
            "pass@10": compute_pass_at_k(solutions, test_cases, k=10),
        })

    # No problems: report zeros instead of dividing by zero
    if not pass_at_k_results:
        return {"pass@1": 0.0, "pass@10": 0.0}

    problem_count = len(pass_at_k_results)
    return {
        "pass@1": sum(r["pass@1"] for r in pass_at_k_results) / problem_count,
        "pass@10": sum(r["pass@10"] for r in pass_at_k_results) / problem_count,
    }
def compute_pass_at_k(solutions, test_cases, k):
    """Estimate pass@k from a set of sampled solutions.

    pass@k is the probability that at least one of k samples passes the
    tests. With n samples of which c pass, the unbiased estimate is
    ``1 - C(n - c, k) / C(n, k)``. Merely averaging the first k samples
    would measure the per-sample pass rate instead.

    Args:
        solutions: list of candidate solutions (n = ``len(solutions)``).
        test_cases: passed through to ``run_test_cases`` for every solution.
        k: number of samples the metric is defined over (1 <= k <= n).

    Returns:
        The pass@k estimate as a float in [0, 1].

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number
            of solutions.
    """
    import math

    n = len(solutions)
    if k < 1:
        raise ValueError("k must be at least 1")
    if n < k:
        raise ValueError("need at least k solutions to estimate pass@k")

    # Use every sample, not only the first k, to count passing solutions
    correct = sum(1 for solution in solutions if run_test_cases(solution, test_cases))

    # C(n - c, k) is 0 when fewer than k samples fail, giving exactly 1.0
    return 1.0 - math.comb(n - correct, k) / math.comb(n, k)
3. 其他基准测试
- GSM8K:小学数学推理
- HellaSwag:常识推理
- TruthfulQA:真实性
- WinoGrande:指代消解
- ARC:科学推理
人工评估
1. 直接评分
def human_evaluation_prompt(question, answer):
    """Build the 1-5 rating prompt shown to a human grader.

    The prompt contains the question, the AI answer and the five-level
    scoring rubric, and starts and ends with a newline.
    """
    lines = [
        "",  # leading newline
        "请评估以下AI回答的质量(1-5分):",
        f"问题:{question}",
        f"AI回答:{answer}",
        "评分标准:",
        "1分 - 完全不相关或错误",
        "2分 - 部分相关但有明显错误",
        "3分 - 基本正确但不够完善",
        "4分 - 正确且较为完善",
        "5分 - 完美回答",
        "请给出评分和理由:",
        "",  # trailing newline
    ]
    return "\n".join(lines)
2. A/B比较
def ab_comparison_prompt(question, answer_a, answer_b):
    """Build the side-by-side comparison prompt for two answers.

    The grader is asked to pick A, B or "about the same" and to justify
    the choice; the prompt starts and ends with a newline.
    """
    lines = [
        "",  # leading newline
        "请比较以下两个AI回答的质量:",
        f"问题:{question}",
        f"回答A:{answer_a}",
        f"回答B:{answer_b}",
        "请选择更好的回答(A/B/差不多),并说明理由:",
        "",  # trailing newline
    ]
    return "\n".join(lines)
3. LLM-as-Judge
使用强模型评估弱模型。
def llm_judge(question, answer, reference_answer=None):
    """Ask a judge model to rate an answer on four dimensions.

    The prompt is made of an introduction with the question and answer, an
    optional reference-answer section (only when ``reference_answer`` is
    truthy) and the scoring rubric. The assembled prompt is passed to
    ``call_llm`` and its result is returned.
    """
    intro = "\n".join([
        "",
        "你是一个AI评估专家。请评估以下AI回答的质量。",
        f"问题:{question}",
        f"AI回答:{answer}",
        "",
    ])

    # The reference section is skipped for None and for empty values
    reference = f"\n参考答案:{reference_answer}\n" if reference_answer else ""

    rubric = "\n".join([
        "",
        "请从以下维度评分(1-5分):",
        "1. 相关性:回答是否与问题相关",
        "2. 准确性:信息是否正确",
        "3. 完整性:是否涵盖了关键点",
        "4. 清晰度:表达是否清楚易懂",
        "请给出每个维度的分数和总评:",
        "",
    ])

    return call_llm(intro + reference + rubric)
评估中的陷阱
1. 数据泄露
def check_data_leakage(test_set, training_sets, threshold=0.9):
    """Return the test items that closely match at least one training item.

    Args:
        test_set: iterable of test items.
        training_sets: iterable of training sets, each an iterable of
            items; it is traversed once per test item, so it must be
            re-iterable (for example a list of lists).
        threshold: similarity above which an item counts as leaked
            (default 0.9).

    Returns:
        A list of leaked test items in their original order. Each test
        item is reported at most once, however many training items or
        training sets it matches.
    """
    leaked = []
    for test_item in test_set:
        # any() stops at the first match across all training sets, so a
        # test item found in several sets is not appended repeatedly
        is_leaked = any(
            compute_similarity(test_item, train_item) > threshold
            for train_set in training_sets
            for train_item in train_set
        )
        if is_leaked:
            leaked.append(test_item)
    return leaked
2. 过度优化指标
# Anti-pattern: over-optimizing for one specific metric
def bad_evaluation():
    """Placeholder for the anti-pattern; does nothing and returns None.

    Illustrates a model that scores high on BLEU while its actual
    generation quality is poor.
    """
    return None
# Recommended: combine several evaluation dimensions
def good_evaluation():
    """Return a report grouping automatic, human and task-specific results.

    The ``...`` arguments are literal placeholders for the real inputs of
    each evaluation call.
    """
    # Automatic metrics
    automatic = {
        "bleu": compute_bleu(...),
        "rouge": compute_rouge(...),
        "perplexity": compute_perplexity(...),
    }
    # Human judgement
    human = {
        "quality_score": human_evaluation(...),
        "preference_rate": ab_comparison(...),
    }
    # Task-level measurements
    task_specific = {
        "accuracy": task_accuracy(...),
        "latency": inference_latency(...),
    }
    return {
        "automatic": automatic,
        "human": human,
        "task_specific": task_specific,
    }
3. 评估集大小
def determine_eval_size(confidence=0.95, margin=0.03, p=0.5):
    """Compute the minimum number of evaluation samples.

    Uses the sample-size formula for estimating a proportion,
    ``n = z^2 * p * (1 - p) / margin^2``, where ``z`` is the two-sided
    normal critical value for the requested confidence level.

    Args:
        confidence: confidence level, strictly between 0 and 1.
        margin: acceptable margin of error, greater than 0.
        p: expected proportion; 0.5 gives the largest sample size.

    Returns:
        The required sample count, rounded up to an integer.

    Raises:
        ValueError: if ``confidence`` is not in (0, 1) or ``margin`` is
            not positive.
    """
    import math
    from statistics import NormalDist

    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    if margin <= 0:
        raise ValueError("margin must be positive")

    # Derive z from the confidence level (about 1.96 for 95%) instead of
    # hard-coding it, so the `confidence` argument actually takes effect
    z = NormalDist().inv_cdf((1 + confidence) / 2)
    n = (z**2 * p * (1 - p)) / (margin**2)
    return math.ceil(n)
评估框架
OpenAI Evals
from evals import Eval
class MyEval(Eval):
    """Custom eval that scores every test example and records the scores."""

    def run(self, recorder):
        """Score all loaded examples and hand the score list to ``recorder``.

        For each example the completion function is called on its
        ``"prompt"``, the output is scored with ``self.score_fn``, and the
        collected scores are passed to ``recorder.record_final_score``.
        """
        # Load the test data
        test_data = self.load_test_data()

        # Run the model on each prompt and score its output
        results = [
            self.score_fn(example, self.completion_fn(example["prompt"]))
            for example in test_data
        ]

        # Record the results
        recorder.record_final_score(results)
lm-evaluation-harness
# 使用EleutherAI的评估框架
python eval.py \
--model gpt2 \
--tasks humaneval,gsm8k,mmlu \
--num_fewshot 0
总结
LLM评估是一个复杂但至关重要的任务。通过结合自动评估指标、基准测试和人工评估,可以全面了解模型的能力。同时,需要注意评估中的陷阱,确保评估结果的可靠性。