← 返回首页
🤖

序列标注详解

📂 ai ⏱ 3 min 408 words

序列标注详解

什么是序列标注

序列标注是为序列中的每个元素分配标签的任务,典型应用包括命名实体识别(NER)和词性标注。

BIO标注方案

BIO是最常用的序列标注方案:B(Begin)表示实体的起始位置,I(Inside)表示实体的内部位置,O(Outside)表示不属于任何实体。下面的示例按字符对句子进行标注:

def bio_tagging(sentence, entities):
    """Label every character of ``sentence`` with a BIO tag.

    Each character is treated as one token. Every non-overlapping occurrence
    of an entity string gets ``B-<type>`` on its first character and
    ``I-<type>`` on the remaining ones; all other characters keep ``O``.
    When entities overlap, entries later in ``entities`` overwrite earlier
    ones.

    Args:
        sentence: The text to label.
        entities: Iterable of ``(entity_text, entity_type)`` pairs.

    Returns:
        A list of ``(character, tag)`` tuples, one per character of
        ``sentence``.
    """
    tokens = list(sentence)
    tags = ['O'] * len(tokens)

    for entity, entity_type in entities:
        if not entity:
            # str.find("") returns 0, which would mislabel the first character
            # (or index past the end of an empty sentence).
            continue

        start = sentence.find(entity)
        while start != -1:
            end = start + len(entity)
            tags[start] = f'B-{entity_type}'
            tags[start + 1:end] = [f'I-{entity_type}'] * (len(entity) - 1)
            # Resume after this match so repeated mentions are labelled too.
            start = sentence.find(entity, end)

    return list(zip(tokens, tags))

# Demo: tag a sample sentence that contains one location entity and print the
# resulting (character, tag) pairs.
sentence = "北京是中国的首都"
entities = [("北京", "LOC")]
tags = bio_tagging(sentence, entities)
print(tags)

HMM序列标注

隐马尔可夫模型(HMM)是经典的序列标注方法,下面的示例使用维特比(Viterbi)算法解码最优状态序列:

import numpy as np

class HMM:
    """Discrete hidden Markov model with randomly initialised parameters."""

    def __init__(self, n_states, n_observations):
        """Draw random transition, emission and initial distributions.

        Args:
            n_states: Number of hidden states.
            n_observations: Number of distinct observation symbols.
        """
        self.n_states = n_states
        self.n_observations = n_observations
        # Every row is a sample from a flat Dirichlet, i.e. a probability
        # distribution. transition[i, j] is used as P(state j | state i) and
        # emission[s, o] as P(symbol o | state s) in viterbi().
        self.transition = np.random.dirichlet(np.ones(n_states), n_states)
        self.emission = np.random.dirichlet(np.ones(n_observations), n_states)
        self.initial = np.random.dirichlet(np.ones(n_states))

    def viterbi(self, observations):
        """Return the most likely hidden-state sequence for ``observations``.

        Args:
            observations: Sequence of observation symbol indices, each used
                to index a column of ``self.emission``.

        Returns:
            A list of state indices (plain ``int``) with one entry per
            observation; an empty list for an empty input.
        """
        n_obs = len(observations)
        if n_obs == 0:
            return []

        # Work in log space to avoid underflow on long sequences; the small
        # epsilon keeps log() finite for zero probabilities. The logs do not
        # depend on the time step, so they are computed once.
        eps = 1e-10
        log_initial = np.log(self.initial + eps)
        log_transition = np.log(self.transition + eps)
        log_emission = np.log(self.emission + eps)

        dp = np.zeros((self.n_states, n_obs))
        backpointers = np.zeros((self.n_states, n_obs), dtype=int)
        dp[:, 0] = log_initial + log_emission[:, observations[0]]

        state_ids = np.arange(self.n_states)
        for t in range(1, n_obs):
            # scores[i, j]: best log-probability of any path that is in state
            # i at t-1 and then moves to state j.
            scores = dp[:, t - 1][:, np.newaxis] + log_transition
            best_prev = np.argmax(scores, axis=0)
            backpointers[:, t] = best_prev
            dp[:, t] = (scores[best_prev, state_ids]
                        + log_emission[:, observations[t]])

        # Follow the back-pointers from the best final state; appending and
        # reversing once avoids repeated O(n) inserts at the list head.
        path = [int(np.argmax(dp[:, -1]))]
        for t in range(n_obs - 1, 0, -1):
            path.append(int(backpointers[path[-1], t]))
        path.reverse()
        return path

# Demo: decode a five-step observation sequence with a randomly initialised
# 3-state model. No random seed is set in this snippet, so the printed path
# can differ between runs.
hmm = HMM(n_states=3, n_observations=5)
observations = [0, 1, 2, 3, 4]
path = hmm.viterbi(observations)
print("最优状态序列:", path)

CRF条件随机场

CRF考虑标签间的依赖关系:

import torch
import torch.nn as nn

class CRF(nn.Module):
    """Linear-chain conditional random field layer.

    Learns a score for every tag-to-tag transition plus scores for starting
    and ending a sequence in each tag. ``transitions[i, j]`` is the score of
    moving from tag ``i`` to tag ``j``.
    """

    def __init__(self, num_tags):
        """Create randomly initialised transition parameters.

        Args:
            num_tags: Number of distinct tags.
        """
        super(CRF, self).__init__()
        self.num_tags = num_tags
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))

    def forward(self, emissions, tags, mask=None):
        """Return the mean negative log-likelihood of ``tags``.

        Args:
            emissions: Tensor of shape (batch, seq_len, num_tags) holding
                per-position tag scores.
            tags: Long tensor of shape (batch, seq_len) with the gold tags.
            mask: Tensor of shape (batch, seq_len); non-zero marks real
                tokens, zero marks padding. Valid positions must come first
                and position 0 must be valid. ``None`` means no padding.

        Returns:
            A scalar tensor: ``log Z - score(tags)`` averaged over the batch.
            It is non-negative and is the quantity to minimise.
        """
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2])

        numerator = self._compute_score(emissions, tags, mask)
        denominator = self._compute_normalizer(emissions, mask)
        # Callers treat the result as a loss, so return the negative
        # log-likelihood rather than the log-likelihood itself.
        return (denominator - numerator).mean()

    def _compute_score(self, emissions, tags, mask):
        """Score the given tag paths; returns a tensor of shape (batch,)."""
        batch_size, seq_len, num_tags = emissions.shape

        first_tags = tags[:, 0]
        score = self.start_transitions[first_tags]
        score = score + emissions[:, 0].gather(1, first_tags.unsqueeze(1)).squeeze(1)

        for t in range(1, seq_len):
            step_mask = mask[:, t]
            current = tags[:, t]
            transition_score = self.transitions[tags[:, t - 1], current]
            emission_score = emissions[:, t].gather(1, current.unsqueeze(1)).squeeze(1)
            # Padded positions contribute nothing to the path score.
            score = score + (transition_score + emission_score) * step_mask

        # The end transition belongs to the last *real* token, which is not
        # position -1 when the sequence is padded.
        last_index = (mask.long().sum(dim=1) - 1).clamp(min=0)
        last_tags = tags.gather(1, last_index.unsqueeze(1)).squeeze(1)
        score = score + self.end_transitions[last_tags]
        return score

    def _compute_normalizer(self, emissions, mask):
        """Log-partition function via the forward algorithm.

        Returns a tensor of shape (batch,) holding, for each sequence, the
        log of the summed exponentiated scores of all possible tag paths.
        """
        seq_len = emissions.shape[1]

        # alpha[b, j]: log-sum-exp of the scores of all partial paths that
        # end in tag j at the current position.
        alpha = self.start_transitions + emissions[:, 0]

        for t in range(1, seq_len):
            # (batch, prev, 1) + (prev, next) + (batch, 1, next)
            candidates = (alpha.unsqueeze(2)
                          + self.transitions
                          + emissions[:, t].unsqueeze(1))
            advanced = torch.logsumexp(candidates, dim=1)
            # Keep the previous alpha on padded positions.
            is_token = mask[:, t].bool().unsqueeze(1)
            alpha = torch.where(is_token, advanced, alpha)

        return torch.logsumexp(alpha + self.end_transitions, dim=1)

# Demo: call the CRF on random inputs shaped (batch=32, seq_len=50, num_tags=5)
# with an all-ones mask (no padded positions) and print the returned scalar.
crf = CRF(num_tags=5)
emissions = torch.randn(32, 50, 5)
tags = torch.randint(0, 5, (32, 50))
mask = torch.ones(32, 50)
loss = crf(emissions, tags, mask)
print("CRF损失:", loss.item())

BiLSTM-CRF模型

结合BiLSTM和CRF进行序列标注:

class BiLSTM_CRF(nn.Module):
    """Sequence tagger: embedding -> bidirectional LSTM -> linear -> CRF."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        """Build the sub-modules.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the concatenated forward/backward LSTM
                output; must be even because each direction gets half.
            num_tags: Number of output tags.

        Raises:
            ValueError: If ``hidden_dim`` is odd.
        """
        super(BiLSTM_CRF, self).__init__()
        if hidden_dim % 2 != 0:
            # Two directions of hidden_dim // 2 would yield hidden_dim - 1
            # features and break hidden2tag at forward time.
            raise ValueError(
                f"hidden_dim must be even for a bidirectional LSTM, got {hidden_dim}"
            )
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            batch_first=True, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x, tags=None, mask=None):
        """Compute the CRF output for training, or emissions for inference.

        Args:
            x: Tensor of token indices, batch first.
            tags: Optional gold tags with the same leading shape as ``x``.
            mask: Optional mask marking real tokens; when omitted while
                ``tags`` is given, every position is treated as a real token.

        Returns:
            The value returned by the CRF layer when ``tags`` is given;
            otherwise the per-position tag scores produced by ``hidden2tag``.
        """
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        emissions = self.hidden2tag(lstm_out)

        if tags is None:
            return emissions

        if mask is None:
            # The CRF indexes and multiplies by the mask, so None cannot be
            # passed through; default to "no padding".
            mask = emissions.new_ones(tags.shape)
        return self.crf(emissions, tags, mask)

# Demo: build a tagger with a 10,000-entry vocabulary, 128-dim embeddings,
# a 256-dim BiLSTM output and 5 tags.
model = BiLSTM_CRF(vocab_size=10000, embedding_dim=128, 
                    hidden_dim=256, num_tags=5)

评估指标

def evaluate_ner(true_tags, pred_tags):
    """Return token-level accuracy of ``pred_tags`` against ``true_tags``.

    Args:
        true_tags: Sequence of gold tags.
        pred_tags: Sequence of predicted tags, aligned with ``true_tags``.

    Returns:
        The fraction of positions where both tags are equal, or ``0.0`` when
        both sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    total = len(true_tags)
    if total != len(pred_tags):
        # zip() would silently drop the unmatched tail and skew the ratio.
        raise ValueError(
            f"length mismatch: {total} true tags vs {len(pred_tags)} predicted tags"
        )
    if total == 0:
        return 0.0

    correct = sum(t == p for t, p in zip(true_tags, pred_tags))
    return correct / total

# Demo: one of the five tags is mispredicted (B-LOC -> O), so this prints 0.80.
true = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']
pred = ['B-PER', 'I-PER', 'O', 'O', 'O']
accuracy = evaluate_ner(true, pred)
print(f"准确率: {accuracy:.2f}")

总结

序列标注是NLP的重要任务。CRF考虑标签依赖关系,结合BiLSTM可以获得更好的标注效果。