BLEU与ROUGE:文本生成评估指标
---
title: "BLEU与ROUGE:文本生成评估指标"
description: "掌握BLEU和ROUGE指标的原理、计算方法和在LLM评估中的应用"
tags: ["BLEU", "ROUGE", "文本评估", "机器翻译"]
category: "llm"
icon: "🧠"
---
BLEU与ROUGE:文本生成评估指标
BLEU指标
BLEU原理
BLEU(Bilingual Evaluation Understudy)是机器翻译中最常用的自动评估指标。它通过计算候选文本与参考文本之间的n-gram重叠来评估翻译质量。
BLEU的核心思想:
- 精度导向:衡量候选文本中有多少n-gram出现在参考文本中
- 长度惩罚(Brevity Penalty):当候选文本短于参考文本时降低得分,避免过短的翻译凭借高精度获得虚高的分数
- 几何平均:对1到N阶n-gram的修正精度取几何平均,综合考虑不同阶的匹配情况
BLEU计算
import math
from collections import Counter

import numpy as np
def ngrams(tokens, n):
    """Return every contiguous window of ``n`` items from ``tokens``.

    Windows are returned as tuples, in left-to-right order. The result is an
    empty list when ``tokens`` holds fewer than ``n`` items.
    """
    last_start = len(tokens) - n
    windows = []
    for start in range(last_start + 1):
        windows.append(tuple(tokens[start:start + n]))
    return windows
def _modified_precision(cand_tokens, ref_token_lists, n):
    """Return the clipped n-gram precision of a candidate against its references.

    Each candidate n-gram count is capped at the largest count that n-gram
    reaches in any single reference. Returns 0.0 when the candidate has no
    n-grams of order ``n``.
    """
    # zip over n staggered slices yields every contiguous n-token tuple.
    cand_counts = Counter(zip(*(cand_tokens[i:] for i in range(n))))
    total = sum(cand_counts.values())
    if total == 0:
        return 0.0

    max_ref_counts = Counter()
    for ref_tokens in ref_token_lists:
        # Counter union keeps, per n-gram, the larger of the two counts.
        max_ref_counts |= Counter(zip(*(ref_tokens[i:] for i in range(n))))

    # Counter intersection keeps, per n-gram, the smaller of the two counts.
    clipped = sum((cand_counts & max_ref_counts).values())
    return clipped / total


def bleu_score(candidate, references, max_n=4):
    """Compute a sentence-level BLEU score.

    Args:
        candidate: Candidate sentence; tokenized with ``str.split()``, so the
            text must already be whitespace-segmented.
        references: Iterable of reference sentences, tokenized the same way.
        max_n: Highest n-gram order included in the geometric mean.

    Returns:
        The brevity penalty times the geometric mean of the clipped n-gram
        precisions for orders 1..max_n. Returns 0.0 when the candidate has no
        tokens or when any order has zero precision.

    Raises:
        ValueError: If ``references`` is empty or ``max_n`` is less than 1.
    """
    if max_n < 1:
        raise ValueError("max_n must be at least 1")

    ref_token_lists = [ref.split() for ref in references]
    if not ref_token_lists:
        raise ValueError("at least one reference is required")

    cand_tokens = candidate.split()
    if not cand_tokens:
        # Nothing to match; also avoids dividing by a zero candidate length
        # in the brevity penalty below.
        return 0.0

    precisions = [
        _modified_precision(cand_tokens, ref_token_lists, n)
        for n in range(1, max_n + 1)
    ]

    # log(0) is undefined, so any zero precision makes the score zero.
    if min(precisions) <= 0:
        return 0.0
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / len(precisions))

    # Brevity penalty, measured against the shortest reference.
    cand_len = len(cand_tokens)
    ref_len = min(len(ref_tokens) for ref_tokens in ref_token_lists)
    if cand_len > ref_len:
        bp = 1
    else:
        bp = math.exp(1 - ref_len / cand_len)

    return bp * geo_mean
# Example
# The sentences are pre-segmented with spaces because bleu_score() tokenizes
# with str.split(); unsegmented Chinese text would collapse into one token
# per sentence and the score would always be 0.
candidate = "机器 学习 是 人工智能 的 一个 分支"
references = [
    "机器 学习 是 人工智能 的 一个 子集",
    "机器 学习 属于 人工智能 领域",
]
score = bleu_score(candidate, references)
print(f"BLEU分数: {score:.4f}")
使用NLTK计算
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
# Sentence-level BLEU
# sentence_bleu() takes a list of tokenized references, so the single
# reference is wrapped in an outer list.
reference = [['机器', '学习', '是', '人工智能', '的', '一个', '分支']]
candidate = ['机器', '学习', '属于', '人工智能', '领域']

# Weights over the 1- to 4-gram precisions
weights_1 = (1, 0, 0, 0)  # BLEU-1
weights_2 = (0.5, 0.5, 0, 0)  # BLEU-2
weights_4 = (0.25, 0.25, 0.25, 0.25)  # BLEU-4

# Compute
bleu1 = sentence_bleu(reference, candidate, weights=weights_1)
bleu4 = sentence_bleu(reference, candidate, weights=weights_4)
print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-4: {bleu4:.4f}")

# With smoothing, so that a missing higher-order match does not zero the score
smoother = SmoothingFunction()
bleu_smooth = sentence_bleu(
    reference,
    candidate,
    weights=weights_4,
    smoothing_function=smoother.method1,
)
print(f"平滑BLEU-4: {bleu_smooth:.4f}")

# Corpus-level BLEU
# corpus_bleu() takes one list of tokenized references per hypothesis, and a
# list of tokenized hypotheses.
references_corpus = [[['机器', '学习', '是', '人工智能', '的', '子集']]]
candidates_corpus = [['机器', '学习', '属于', '人工智能', '领域']]
corpus_score = corpus_bleu(references_corpus, candidates_corpus)
print(f"语料库BLEU: {corpus_score:.4f}")
ROUGE指标
ROUGE原理
ROUGE(Recall-Oriented Understudy for Gisting Evaluation)是面向召回率的评估指标,常用于文本摘要任务。
ROUGE主要变体:
- ROUGE-1:unigram重叠
- ROUGE-2:bigram重叠
- ROUGE-L:最长公共子序列
- ROUGE-Lsum:按句分割的ROUGE-L
ROUGE计算
from rouge_score import rouge_scorer
def calculate_rouge(references, hypotheses):
    """Average ROUGE precision, recall and F-measure over paired texts.

    Args:
        references: Sequence of reference texts.
        hypotheses: Sequence of generated texts, paired with ``references``
            by position; extra items in the longer sequence are ignored.

    Returns:
        A dict keyed by metric name ('rouge1', 'rouge2', 'rougeL',
        'rougeLsum'), each mapping to a dict with the mean 'precision',
        'recall' and 'fmeasure' across all pairs.
    """
    # One list drives both the scorer and the averaging loop, so the metric
    # names requested and the keys read back cannot drift apart.
    metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    all_scores = [
        scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)
    ]

    # Average each component of each metric over all pairs.
    avg_scores = {}
    for metric in metrics:
        avg_scores[metric] = {
            'precision': np.mean([s[metric].precision for s in all_scores]),
            'recall': np.mean([s[metric].recall for s in all_scores]),
            'fmeasure': np.mean([s[metric].fmeasure for s in all_scores]),
        }
    return avg_scores
# Usage example
# NOTE(review): rouge_score's default tokenizer appears to keep only ASCII
# letters and digits, so these Chinese sentences may score 0 — confirm, and
# supply a suitable tokenizer if so.
references = [
    "机器学习是人工智能的一个分支,它使计算机能够从数据中学习。",
    "深度学习是机器学习的一个子集,使用多层神经网络。",
]
hypotheses = [
    "机器学习属于人工智能领域,让计算机从数据学习。",
    "深度学习使用多层神经网络,是机器学习的一部分。",
]
scores = calculate_rouge(references, hypotheses)

# One line per metric: precision, recall and F-measure to three decimals.
for metric in scores:
    values = scores[metric]
    print(f"{metric}: P={values['precision']:.3f}, R={values['recall']:.3f}, F={values['fmeasure']:.3f}")
手动实现ROUGE-L
def lcs_length(x, y):
    """Return the length of the longest common subsequence of ``x`` and ``y``.

    Dynamic programming that keeps only the previous and the current row of
    the table.
    """
    prev_row = [0] * (len(y) + 1)
    for x_item in x:
        cur_row = [0]
        for col, y_item in enumerate(y, start=1):
            if x_item == y_item:
                # Extend the common subsequence that ends just before both items.
                cur_row.append(prev_row[col - 1] + 1)
            else:
                # Carry over the better of skipping x_item or skipping y_item.
                cur_row.append(max(prev_row[col], cur_row[col - 1]))
        prev_row = cur_row
    return prev_row[-1]
def rouge_l_score(reference, hypothesis):
    """Compute ROUGE-L precision, recall and F-measure for one text pair.

    Both texts are tokenized with ``str.split()``. Precision is the LCS
    length over the hypothesis length, recall is the LCS length over the
    reference length, and the F-measure is their harmonic mean. Any value
    whose denominator would be zero is reported as 0.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    overlap = lcs_length(ref_words, hyp_words)

    if hyp_words:
        precision = overlap / len(hyp_words)
    else:
        precision = 0

    if ref_words:
        recall = overlap / len(ref_words)
    else:
        recall = 0

    fmeasure = (
        2 * precision * recall / (precision + recall)
        if precision + recall > 0
        else 0
    )

    return dict(precision=precision, recall=recall, fmeasure=fmeasure)
# Usage
# The sentences are pre-segmented with spaces because rouge_l_score()
# tokenizes with str.split(); unsegmented Chinese text would become a single
# token per sentence and every score would be 0.
reference = "机器 学习 是 人工智能 的 一个 分支"
hypothesis = "机器 学习 属于 人工智能 领域"
score = rouge_l_score(reference, hypothesis)
print(f"ROUGE-L: {score}")
BLEU vs ROUGE对比
# Which metric to prefer for which task, and why.
# Each row is (task, primary metric, reason).
comparison = {
    task: {"主要指标": metric, "原因": reason}
    for task, metric, reason in (
        ("机器翻译", "BLEU", "BLEU注重精度,适合评估翻译准确性"),
        ("文本摘要", "ROUGE", "ROUGE注重召回率,确保摘要覆盖原文要点"),
        ("文本生成", "两者结合", "综合考虑精度和召回率"),
    )
}
# Code example: using both metrics together
def comprehensive_evaluation(references, hypotheses):
    """Score paired texts with both sentence-level BLEU and ROUGE.

    Returns a dict with the mean and standard deviation of the per-pair BLEU
    scores under "bleu", and the averaged ROUGE scores from
    ``calculate_rouge`` under "rouge".
    """
    # Each hypothesis is scored against its single whitespace-tokenized reference.
    bleu_scores = [
        sentence_bleu([ref.split()], hyp.split())
        for hyp, ref in zip(hypotheses, references)
    ]

    rouge_scores = calculate_rouge(references, hypotheses)

    bleu_summary = {
        "mean": np.mean(bleu_scores),
        "std": np.std(bleu_scores),
    }
    return {"bleu": bleu_summary, "rouge": rouge_scores}
实际应用
机器翻译评估
def evaluate_mt_system(test_data, translate_func):
    """Evaluate a translation function with corpus BLEU and ROUGE.

    Args:
        test_data: Iterable of items with a "source" and a "reference" entry.
        translate_func: Callable applied to each item's "source" to produce
            the hypothesis translation.

    Returns:
        A dict with the corpus-level BLEU under "corpus_bleu" and the averaged
        ROUGE scores from ``calculate_rouge`` under "rouge".
    """
    # Translate every source once, keeping it paired with its reference.
    pairs = [
        (item["reference"], translate_func(item["source"]))
        for item in test_data
    ]
    references = [ref for ref, _ in pairs]
    hypotheses = [hyp for _, hyp in pairs]

    # corpus_bleu expects a list of references per hypothesis; each
    # hypothesis here has exactly one.
    tokenized_refs = [[ref.split()] for ref in references]
    tokenized_hyps = [hyp.split() for hyp in hypotheses]
    bleu = corpus_bleu(tokenized_refs, tokenized_hyps)

    rouge = calculate_rouge(references, hypotheses)

    return {"corpus_bleu": bleu, "rouge": rouge}
文本摘要评估
def evaluate_summarization(articles, summaries, reference_summaries):
    """Average ROUGE F-measures of generated summaries against references.

    Args:
        articles: Source articles. Not scored; they only take part in the
            pairing, so iteration stops at the shortest of the three inputs.
        summaries: Generated summaries.
        reference_summaries: Reference summaries, paired by position.

    Returns:
        A dict with the mean F-measure for "rouge1", "rouge2" and "rougeL".
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    # Build the scorer once; it does not depend on the item being scored.
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    f_measures = {metric: [] for metric in metrics}
    for _article, summary, ref_summary in zip(articles, summaries, reference_summaries):
        scores = scorer.score(ref_summary, summary)
        for metric in metrics:
            f_measures[metric].append(scores[metric].fmeasure)

    # Mean F-measure per metric
    return {metric: np.mean(values) for metric, values in f_measures.items()}
局限性与改进
# 1. 不考虑语义相似性
# 解决方案:结合嵌入相似度
from sentence_transformers import SentenceTransformer
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_embedding_model():
    """Load the sentence-embedding model once and reuse it on later calls."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def semantic_bleu(reference, hypothesis):
    """Blend sentence-level BLEU with embedding cosine similarity.

    Args:
        reference: Reference text.
        hypothesis: Generated text.

    Returns:
        The equal-weight average of the BLEU score (computed on
        whitespace-split tokens) and the cosine similarity of the two
        sentence embeddings.
    """
    # The model is cached so repeated calls do not reload it.
    model = _load_embedding_model()

    # Semantic similarity: cosine of the two embeddings
    ref_embedding = model.encode(reference)
    hyp_embedding = model.encode(hypothesis)
    norm_product = np.linalg.norm(ref_embedding) * np.linalg.norm(hyp_embedding)
    if norm_product == 0:
        # A zero-length embedding has no direction; treat it as dissimilar
        # instead of dividing by zero.
        semantic_similarity = 0.0
    else:
        semantic_similarity = np.dot(ref_embedding, hyp_embedding) / norm_product

    # Conventional BLEU
    traditional_bleu = sentence_bleu([reference.split()], hypothesis.split())

    # Combined score
    combined = 0.5 * traditional_bleu + 0.5 * semantic_similarity
    return combined
BLEU和ROUGE作为经典的文本评估指标,在LLM评估中仍然发挥着重要作用,但需要结合其他指标综合评估。