序列标注详解
序列标注详解
什么是序列标注
序列标注是为序列中的每个元素分配标签的任务,典型应用包括命名实体识别(NER)和词性标注。
BIO标注方案
BIO是最常用的序列标注方案:B(Begin)标记实体的第一个字,I(Inside)标记实体内部的其余字,O(Outside)标记不属于任何实体的字:
def bio_tagging(sentence, entities):
    """Produce character-level BIO tags for ``sentence``.

    Every character of ``sentence`` is one token. Each non-overlapping
    occurrence of an entity string is tagged ``B-<type>`` on its first
    character and ``I-<type>`` on the remaining ones; all other characters
    keep the tag ``'O'``.

    Args:
        sentence: The text to tag.
        entities: Iterable of ``(entity_text, entity_type)`` pairs. Pairs are
            applied in order, so a later entity overwrites the tags written
            by an earlier one where their spans overlap.

    Returns:
        A list of ``(character, tag)`` tuples, one per character.
    """
    tokens = list(sentence)
    tags = ['O'] * len(tokens)
    for entity, entity_type in entities:
        if not entity:
            # str.find("") returns 0, which would wrongly tag position 0
            # (and index past the end of an empty sentence).
            continue
        start = sentence.find(entity)
        while start != -1:
            end = start + len(entity)
            tags[start] = f'B-{entity_type}'
            # Same-length slice assignment keeps len(tags) == len(tokens).
            tags[start + 1:end] = [f'I-{entity_type}'] * (len(entity) - 1)
            # Resume after this match so occurrences never overlap.
            start = sentence.find(entity, end)
    return list(zip(tokens, tags))
# Example: tag a short sentence that contains a single location entity.
sentence = "北京是中国的首都"
entities = [
    ("北京", "LOC"),
]
tags = bio_tagging(sentence=sentence, entities=entities)
print(tags)
HMM序列标注
隐马尔可夫模型是经典的序列标注方法:
import numpy as np
class HMM:
    """Discrete hidden Markov model with random parameters and Viterbi decoding.

    Attributes:
        n_states: Number of hidden states.
        n_observations: Number of distinct observation symbols.
        transition: ``(n_states, n_states)`` array; ``transition[i, j]`` is
            used as the probability of moving from state ``i`` to state ``j``.
        emission: ``(n_states, n_observations)`` array; ``emission[s, o]`` is
            used as the probability of state ``s`` emitting symbol ``o``.
        initial: ``(n_states,)`` array of initial-state probabilities.
    """

    def __init__(self, n_states, n_observations):
        """Draw all parameters from flat Dirichlet distributions.

        Args:
            n_states: Number of hidden states.
            n_observations: Number of distinct observation symbols.
        """
        self.n_states = n_states
        self.n_observations = n_observations
        # Each Dirichlet sample is a probability vector, so every row sums to 1.
        self.transition = np.random.dirichlet(np.ones(n_states), n_states)
        self.emission = np.random.dirichlet(np.ones(n_observations), n_states)
        self.initial = np.random.dirichlet(np.ones(n_states))

    def viterbi(self, observations):
        """Return the most likely hidden-state sequence for ``observations``.

        Args:
            observations: Sequence of integer observation indices, each used
                as a column index into ``self.emission``.

        Returns:
            A list of Python ``int`` state indices, one per observation.
            An empty input yields an empty list.
        """
        n_obs = len(observations)
        if n_obs == 0:
            return []

        # Work in log space; the small epsilon avoids log(0).
        eps = 1e-10
        log_transition = np.log(self.transition + eps)
        log_emission = np.log(self.emission + eps)

        dp = np.zeros((self.n_states, n_obs))
        backpointers = np.zeros((self.n_states, n_obs), dtype=int)
        dp[:, 0] = np.log(self.initial + eps) + log_emission[:, observations[0]]

        for t in range(1, n_obs):
            # scores[prev, cur] = best score ending in `prev` at t-1, then prev -> cur.
            scores = dp[:, t - 1, np.newaxis] + log_transition
            backpointers[:, t] = np.argmax(scores, axis=0)
            dp[:, t] = np.max(scores, axis=0) + log_emission[:, observations[t]]

        # Follow the back-pointers from the best final state, then reverse once
        # (appending avoids the quadratic cost of repeated insert(0, ...)).
        state = int(np.argmax(dp[:, -1]))
        reversed_path = [state]
        for t in range(n_obs - 1, 0, -1):
            state = int(backpointers[state, t])
            reversed_path.append(state)
        reversed_path.reverse()
        return reversed_path
# Example: decode a five-step observation sequence with a randomly initialised HMM.
hmm = HMM(n_states=3, n_observations=5)
observations = [0, 1, 2, 3, 4]
path = hmm.viterbi(observations=observations)
print("最优状态序列:", path)
CRF条件随机场
CRF考虑标签间的依赖关系:
import torch
import torch.nn as nn
class CRF(nn.Module):
    """Linear-chain conditional random field layer.

    Learnable parameters:
        transitions: ``(num_tags, num_tags)``; ``transitions[i, j]`` scores
            moving from tag ``i`` to tag ``j``.
        start_transitions: ``(num_tags,)`` score of starting in each tag.
        end_transitions: ``(num_tags,)`` score of ending in each tag.
    """

    def __init__(self, num_tags):
        """Create randomly initialised transition parameters for ``num_tags`` tags."""
        super(CRF, self).__init__()
        self.num_tags = num_tags
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))

    def forward(self, emissions, tags, mask=None):
        """Return the mean negative log-likelihood of ``tags``.

        Args:
            emissions: ``(batch, seq_len, num_tags)`` per-position tag scores.
            tags: ``(batch, seq_len)`` integer (long) gold tag indices.
            mask: ``(batch, seq_len)`` tensor, non-zero where a position is
                valid. The first position of every sequence is always treated
                as valid. Assumes padding is on the right. ``None`` means
                every position is valid.

        Returns:
            A scalar tensor: ``mean(log Z - score(tags))``. It is non-negative
            and can be minimised directly as a training loss.
        """
        if mask is None:
            mask = torch.ones_like(tags, dtype=emissions.dtype)
        else:
            mask = mask.to(emissions.dtype)
        numerator = self._compute_score(emissions, tags, mask)
        denominator = self._compute_normalizer(emissions, mask)
        # Callers use this value as a loss, so return the *negative* log-likelihood.
        return (denominator - numerator).mean()

    def _compute_score(self, emissions, tags, mask):
        """Return the unnormalised score of each gold tag path, shape ``(batch,)``."""
        batch_size, seq_len, _ = emissions.shape
        mask = mask.to(emissions.dtype)
        rows = torch.arange(batch_size, device=emissions.device)

        score = self.start_transitions[tags[:, 0]] + emissions[rows, 0, tags[:, 0]]
        for t in range(1, seq_len):
            step = (
                self.transitions[tags[:, t - 1], tags[:, t]]
                + emissions[rows, t, tags[:, t]]
            )
            # Masked (padding) positions contribute nothing.
            score = score + step * mask[:, t]

        # The end transition must use the last *valid* tag, not the tag at the
        # final (possibly padded) position.
        last_idx = (mask.sum(dim=1).long() - 1).clamp(min=0)
        last_tags = tags[rows, last_idx]
        return score + self.end_transitions[last_tags]

    def _compute_normalizer(self, emissions, mask):
        """Return ``log Z`` per sequence via the forward algorithm, shape ``(batch,)``."""
        seq_len = emissions.shape[1]
        # alpha[b, j]: log-sum-exp of the scores of all partial paths ending in tag j.
        alpha = self.start_transitions + emissions[:, 0]
        for t in range(1, seq_len):
            # (batch, prev, 1) + (prev, cur) + (batch, 1, cur) -> (batch, prev, cur)
            scores = (
                alpha.unsqueeze(2)
                + self.transitions
                + emissions[:, t].unsqueeze(1)
            )
            next_alpha = torch.logsumexp(scores, dim=1)
            # Keep the previous alpha wherever position t is padding.
            valid = mask[:, t].unsqueeze(1).bool()
            alpha = torch.where(valid, next_alpha, alpha)
        return torch.logsumexp(alpha + self.end_transitions, dim=1)
# Example: evaluate the CRF on one random batch (32 sequences, 50 steps, 5 tags).
crf = CRF(num_tags=5)
emissions = torch.randn(32, 50, 5)
tags = torch.randint(low=0, high=5, size=(32, 50))
mask = torch.ones((32, 50))
loss = crf(emissions, tags, mask)
print("CRF损失:", loss.item())
BiLSTM-CRF模型
结合BiLSTM和CRF进行序列标注:
class BiLSTM_CRF(nn.Module):
    """Embedding -> bidirectional LSTM -> linear projection -> CRF tagger."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        """Build the sub-modules.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Total BiLSTM width; each direction gets
                ``hidden_dim // 2`` units.
            num_tags: Number of output tags.
        """
        super(BiLSTM_CRF, self).__init__()
        per_direction = hidden_dim // 2
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, per_direction,
                            batch_first=True, bidirectional=True)
        # Size the projection from the real BiLSTM output width so that an odd
        # hidden_dim does not produce a shape mismatch.
        self.hidden2tag = nn.Linear(2 * per_direction, num_tags)
        self.crf = CRF(num_tags)

    def forward(self, x, tags=None, mask=None):
        """Compute emission scores, or the CRF output when ``tags`` is given.

        Args:
            x: ``(batch, seq_len)`` token indices (the LSTM is batch-first).
            tags: Optional ``(batch, seq_len)`` gold tag indices.
            mask: Optional ``(batch, seq_len)`` validity mask; when omitted
                every position is treated as valid.

        Returns:
            The value returned by ``self.crf(emissions, tags, mask)`` when
            ``tags`` is provided, otherwise the ``(batch, seq_len, num_tags)``
            emission scores.
        """
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        emissions = self.hidden2tag(lstm_out)
        if tags is None:
            return emissions
        if mask is None:
            # The CRF indexes into the mask, so substitute an all-ones mask
            # instead of forwarding None.
            mask = torch.ones_like(tags, dtype=emissions.dtype)
        return self.crf(emissions, tags, mask)
# Example: a tagger over a 10k-word vocabulary with 5 output tags.
model = BiLSTM_CRF(
    vocab_size=10000,
    embedding_dim=128,
    hidden_dim=256,
    num_tags=5,
)
评估指标
def evaluate_ner(true_tags, pred_tags):
    """Return token-level tagging accuracy.

    This is the fraction of positions whose predicted tag equals the gold
    tag. It is not an entity-level precision/recall/F1 score.

    Args:
        true_tags: Sequence of gold tags; its length is the denominator.
        pred_tags: Iterable of predicted tags, compared position by position
            with ``true_tags``. Gold positions with no matching prediction
            count as incorrect, and extra predictions are ignored.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when ``true_tags`` is empty.
    """
    total = len(true_tags)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for t, p in zip(true_tags, pred_tags) if t == p)
    return correct / total
# Example: one missed location tag out of five tokens.
true = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']
pred = ['B-PER', 'I-PER', 'O', 'O', 'O']
accuracy = evaluate_ner(true_tags=true, pred_tags=pred)
print(f"准确率: {accuracy:.2f}")
总结
序列标注是NLP的重要任务。CRF考虑标签依赖关系,结合BiLSTM可以获得更好的标注效果。