← 返回首页
🧠

BLEU与ROUGE:文本生成评估指标

📂 llm ⏱ 4 min 716 words

---
title: "BLEU与ROUGE:文本生成评估指标"
description: "掌握BLEU和ROUGE指标的原理、计算方法和在LLM评估中的应用"
tags: ["BLEU", "ROUGE", "文本评估", "机器翻译"]
category: "llm"
icon: "🧠"
---

BLEU与ROUGE:文本生成评估指标

BLEU指标

BLEU原理

BLEU(Bilingual Evaluation Understudy)是机器翻译中最常用的自动评估指标。它通过计算候选文本与参考文本之间的n-gram重叠来评估翻译质量。

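BLEU的核心思想:

- 修正的n-gram精度:统计候选文本中同时出现在参考文本里的n-gram比例,每个n-gram的匹配次数截断到它在参考文本中的最大出现次数。
- 几何平均:对1到N阶(通常N=4)的精度取几何平均。
- 简短惩罚(Brevity Penalty):候选文本短于参考文本时按比例降低分数,防止过短的输出获得高分。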

BLEU计算

from collections import Counter
import math

def ngrams(tokens, n):
    """Return every contiguous n-gram of ``tokens`` as a tuple, in order.

    The result is an empty list when ``tokens`` holds fewer than ``n`` items.
    """
    window_count = len(tokens) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(tuple(tokens[start:start + n]))
    return grams

def bleu_score(candidate, references, max_n=4):
    """Compute sentence-level BLEU for one candidate against several references.

    The candidate and every reference are tokenized with ``str.split()``, so
    text in languages written without spaces must be segmented (tokens
    separated by whitespace) before it is passed in.

    Args:
        candidate: Candidate sentence as a whitespace-tokenized string.
        references: Non-empty sequence of reference sentences (strings).
        max_n: Highest n-gram order; orders 1..max_n are weighted uniformly.

    Returns:
        The BLEU score as a float in [0, 1]. It is 0.0 when the candidate is
        empty or when any n-gram order has no clipped match.

    Raises:
        ValueError: If ``references`` is empty or ``max_n`` is below 1.
    """
    if not references:
        raise ValueError("references must contain at least one sentence")
    if max_n < 1:
        raise ValueError("max_n must be at least 1")

    cand_tokens = candidate.split()
    cand_len = len(cand_tokens)
    if cand_len == 0:
        # Nothing can match, and the brevity penalty would divide by zero.
        return 0.0

    # Tokenize each reference once instead of once per n-gram order.
    ref_token_lists = [ref.split() for ref in references]

    # Modified (clipped) n-gram precision for every order.
    precisions = []
    for n in range(1, max_n + 1):
        cand_counts = Counter(ngrams(cand_tokens, n))

        # Counter union keeps, per n-gram, the maximum count seen in any
        # single reference.
        max_ref_counts = Counter()
        for ref_tokens in ref_token_lists:
            max_ref_counts |= Counter(ngrams(ref_tokens, n))

        # Counter intersection clips each candidate count to that maximum.
        clipped_counts = cand_counts & max_ref_counts

        total = sum(cand_counts.values())
        precisions.append(sum(clipped_counts.values()) / max(total, 1))

    # Geometric mean of the precisions; a single zero precision zeroes it.
    if min(precisions) <= 0:
        return 0.0
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / len(precisions))

    # Brevity penalty against the reference whose length is closest to the
    # candidate's (ties go to the shorter reference). Using the shortest
    # reference instead would let a candidate escape the penalty just
    # because some unrelated reference happens to be short.
    ref_len = min(
        (len(ref_tokens) for ref_tokens in ref_token_lists),
        key=lambda length: (abs(length - cand_len), length),
    )
    if cand_len > ref_len:
        bp = 1.0
    else:
        bp = math.exp(1 - ref_len / cand_len)

    return bp * geo_mean

# Example. bleu_score() splits on whitespace, so the Chinese sentences are
# written pre-segmented; left unsegmented, each sentence would be a single
# token and the score would be 0.
candidate = "机器 学习 是 人工智能 的 一个 分支"
references = [
    "机器 学习 是 人工智能 的 一个 子集",
    "机器 学习 属于 人工智能 领域",
]
score = bleu_score(candidate, references)
print(f"BLEU分数: {score:.4f}")

使用NLTK计算

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction

# Sentence-level BLEU: sentence_bleu() takes a list of tokenized references
# and a single tokenized hypothesis.
reference = [['机器', '学习', '是', '人工智能', '的', '一个', '分支']]
candidate = ['机器', '学习', '属于', '人工智能', '领域']

# n-gram weights for different BLEU orders
weights_1 = (1, 0, 0, 0)  # BLEU-1
weights_2 = (0.5, 0.5, 0, 0)  # BLEU-2
weights_4 = (0.25, 0.25, 0.25, 0.25)  # BLEU-4

# Compute the scores
bleu1 = sentence_bleu(reference, candidate, weights=weights_1)
bleu4 = sentence_bleu(reference, candidate, weights=weights_4)

print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-4: {bleu4:.4f}")

# With smoothing, so a missing higher-order n-gram match does not force
# the score to zero
smoother = SmoothingFunction()
bleu_smooth = sentence_bleu(reference, candidate,
                           weights=weights_4,
                           smoothing_function=smoother.method1)
print(f"平滑BLEU-4: {bleu_smooth:.4f}")

# Corpus-level BLEU: one list of tokenized references per hypothesis, and
# one tokenized hypothesis per sentence.
references_corpus = [[['机器', '学习', '是', '人工智能', '的', '子集']]]
candidates_corpus = [['机器', '学习', '属于', '人工智能', '领域']]

corpus_score = corpus_bleu(references_corpus, candidates_corpus)
print(f"语料库BLEU: {corpus_score:.4f}")

ROUGE指标

ROUGE原理

ROUGE(Recall-Oriented Understudy for Gisting Evaluation)是面向召回率的评估指标,常用于文本摘要任务。

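ROUGE主要变体:

- ROUGE-N:基于n-gram重叠,常用ROUGE-1(unigram)和ROUGE-2(bigram)。
- ROUGE-L:基于最长公共子序列(LCS)。
- ROUGE-Lsum:按句子拆分后计算的摘要级ROUGE-L。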

ROUGE计算

import numpy as np
from rouge_score import rouge_scorer

def calculate_rouge(references, hypotheses):
    """Average ROUGE-1/2/L/Lsum precision, recall and F-measure over text pairs.

    Args:
        references: Sequence of reference texts.
        hypotheses: Sequence of generated texts, aligned with ``references``.
            Pairs are formed with ``zip``, so surplus items in the longer
            sequence are ignored.

    Returns:
        Dict mapping each metric name ('rouge1', 'rouge2', 'rougeL',
        'rougeLsum') to a dict holding the mean 'precision', 'recall' and
        'fmeasure' across all pairs.
    """
    # One list shared by the scorer and the averaging step, so the metric
    # names requested and the metric names read back can never drift apart.
    metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']

    # NOTE(review): the default rouge_score tokenizer is believed to drop
    # non-ASCII characters, which would score Chinese text as 0 - confirm,
    # and pass a custom tokenizer if so.
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    all_scores = [
        scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)
    ]

    # Mean of every field of every metric across all scored pairs.
    return {
        metric: {
            field: np.mean([getattr(s[metric], field) for s in all_scores])
            for field in ('precision', 'recall', 'fmeasure')
        }
        for metric in metrics
    }

# Usage example: two reference/hypothesis pairs scored together.
references = [
    "机器学习是人工智能的一个分支,它使计算机能够从数据中学习。",
    "深度学习是机器学习的一个子集,使用多层神经网络。",
]
hypotheses = [
    "机器学习属于人工智能领域,让计算机从数据学习。",
    "深度学习使用多层神经网络,是机器学习的一部分。",
]

scores = calculate_rouge(references, hypotheses)
# Print one line per metric with its averaged precision / recall / F-measure.
for metric in scores:
    values = scores[metric]
    print(f"{metric}: P={values['precision']:.3f}, R={values['recall']:.3f}, F={values['fmeasure']:.3f}")

手动实现ROUGE-L

def lcs_length(x, y):
    """Return the length of the longest common subsequence of ``x`` and ``y``."""
    # Dynamic programming over prefixes, keeping only the previous row:
    # prev_row[col] is the LCS length of the x-prefix processed so far and
    # the first ``col`` items of y.
    prev_row = [0] * (len(y) + 1)
    for x_item in x:
        curr_row = [0]
        for col, y_item in enumerate(y, start=1):
            if x_item == y_item:
                curr_row.append(prev_row[col - 1] + 1)
            else:
                curr_row.append(max(prev_row[col], curr_row[col - 1]))
        prev_row = curr_row
    return prev_row[-1]

def rouge_l_score(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` with LCS-based ROUGE-L.

    Both strings are tokenized with ``str.split()``. The result is a dict
    with 'precision' (LCS length / hypothesis length), 'recall' (LCS length
    / reference length) and 'fmeasure' (their harmonic mean). Each value is
    0 when its denominator would be zero.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    common = lcs_length(ref_words, hyp_words)

    if hyp_words:
        precision = common / len(hyp_words)
    else:
        precision = 0

    if ref_words:
        recall = common / len(ref_words)
    else:
        recall = 0

    denominator = precision + recall
    fmeasure = 2 * precision * recall / denominator if denominator > 0 else 0

    return dict(precision=precision, recall=recall, fmeasure=fmeasure)

# Usage. rouge_l_score() splits on whitespace, so the Chinese sentences are
# written pre-segmented; left unsegmented, each would be a single token and
# every score would be 0.
reference = "机器 学习 是 人工智能 的 一个 分支"
hypothesis = "机器 学习 属于 人工智能 领域"
score = rouge_l_score(reference, hypothesis)
print(f"ROUGE-L: {score}")

BLEU vs ROUGE对比

# Which metric to favor per task, and why: (task, primary metric, reason).
comparison = {
    task: {"主要指标": primary_metric, "原因": reason}
    for task, primary_metric, reason in (
        ("机器翻译", "BLEU", "BLEU注重精度,适合评估翻译准确性"),
        ("文本摘要", "ROUGE", "ROUGE注重召回率,确保摘要覆盖原文要点"),
        ("文本生成", "两者结合", "综合考虑精度和召回率"),
    )
}

# 代码示例:同时使用两者
def comprehensive_evaluation(references, hypotheses):
    """Evaluate hypotheses with both sentence-level BLEU and averaged ROUGE.

    Returns a dict with 'bleu' (mean and standard deviation of the per-pair
    sentence BLEU scores) and 'rouge' (the result of ``calculate_rouge``).
    """
    # One sentence-level BLEU per pair, each with a single whitespace-
    # tokenized reference.
    bleu_scores = [
        sentence_bleu([ref_text.split()], hyp_text.split())
        for hyp_text, ref_text in zip(hypotheses, references)
    ]

    rouge_scores = calculate_rouge(references, hypotheses)

    bleu_summary = {"mean": np.mean(bleu_scores), "std": np.std(bleu_scores)}
    return {"bleu": bleu_summary, "rouge": rouge_scores}

实际应用

机器翻译评估

def evaluate_mt_system(test_data, translate_func):
    """Translate every test item and score the output with corpus BLEU and ROUGE.

    Args:
        test_data: Iterable of mappings, each with a "source" text and its
            "reference" translation.
        translate_func: Callable mapping a source text to its translation.

    Returns:
        Dict with "corpus_bleu" (result of ``corpus_bleu`` on whitespace-
        tokenized text) and "rouge" (result of ``calculate_rouge``).
    """
    # Read the reference, then translate, item by item.
    pairs = [
        (item["reference"], translate_func(item["source"]))
        for item in test_data
    ]
    references = [ref for ref, _ in pairs]
    hypotheses = [hyp for _, hyp in pairs]

    # corpus_bleu expects a list of references per hypothesis.
    reference_sets = [[ref.split()] for ref in references]
    hypothesis_tokens = [hyp.split() for hyp in hypotheses]

    return {
        "corpus_bleu": corpus_bleu(reference_sets, hypothesis_tokens),
        "rouge": calculate_rouge(references, hypotheses),
    }

文本摘要评估

def evaluate_summarization(articles, summaries, reference_summaries):
    """Average ROUGE-1/2/L F-measures of summaries against reference summaries.

    Args:
        articles: Source articles. They only take part in the ``zip`` that
            aligns the three inputs; their text is not scored.
        summaries: Generated summaries.
        reference_summaries: Reference summaries, aligned with ``summaries``.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to the mean F-measure
        over all scored triples.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    # Build the scorer once: it does not depend on the item being scored.
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    results = []
    # articles stays in the zip so iteration still stops at the shortest of
    # the three inputs.
    for _article, summary, ref_summary in zip(articles, summaries, reference_summaries):
        scores = scorer.score(ref_summary, summary)
        results.append({metric: scores[metric].fmeasure for metric in metrics})

    # Mean F-measure per metric.
    return {
        metric: np.mean([result[metric] for result in results])
        for metric in metrics
    }

局限性与改进

# 1. 不考虑语义相似性
# 解决方案:结合嵌入相似度
from sentence_transformers import SentenceTransformer

def semantic_bleu(reference, hypothesis, model=None, bleu_weight=0.5):
    """Blend sentence-level BLEU with embedding cosine similarity.

    Args:
        reference: Reference text.
        hypothesis: Generated text.
        model: Optional preloaded encoder exposing ``encode(text)``. When
            omitted, 'all-MiniLM-L6-v2' is loaded for this call; pass a
            shared instance when scoring many pairs so the model is not
            reloaded every time.
        bleu_weight: Weight of the BLEU term, in [0, 1]. The similarity term
            receives ``1 - bleu_weight``.

    Returns:
        ``bleu_weight * BLEU + (1 - bleu_weight) * cosine_similarity``.

    Raises:
        ValueError: If ``bleu_weight`` is outside [0, 1].
    """
    if not 0 <= bleu_weight <= 1:
        raise ValueError("bleu_weight must be between 0 and 1")

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Cosine similarity between the two sentence embeddings.
    ref_embedding = model.encode(reference)
    hyp_embedding = model.encode(hypothesis)
    norm_product = np.linalg.norm(ref_embedding) * np.linalg.norm(hyp_embedding)
    if norm_product == 0:
        # A zero vector has no direction; treat it as no similarity rather
        # than dividing by zero.
        semantic_similarity = 0.0
    else:
        semantic_similarity = np.dot(ref_embedding, hyp_embedding) / norm_product

    # Conventional BLEU on whitespace tokens.
    traditional_bleu = sentence_bleu([reference.split()], hypothesis.split())

    # Weighted blend of the two signals.
    return bleu_weight * traditional_bleu + (1 - bleu_weight) * semantic_similarity

BLEU和ROUGE作为经典的文本评估指标,在LLM评估中仍然发挥着重要作用,但需要结合其他指标综合评估。