自然语言处理基础
自然语言处理基础
自然语言处理(NLP)是让计算机理解、分析和生成人类语言的技术。本篇将介绍NLP的核心基础任务:分词、词性标注和命名实体识别。
中文分词
中文没有天然的词边界,分词是中文NLP的第一步。
基于规则的分词(基于词典的正向最大匹配)
class SimpleSegmenter:
    """Dictionary-based word segmenter using forward maximum matching.

    At every position the longest dictionary word that starts there is
    emitted; when no dictionary word matches, the single character at that
    position is emitted on its own.
    """

    def __init__(self, dictionary):
        """Build the segmenter from an iterable of known words.

        Args:
            dictionary: Iterable of words. It may be empty or a one-shot
                iterable such as a generator.
        """
        # Materialise the words first: computing the maximum length from the
        # stored set (not from the argument) keeps generators working, and
        # ``default=0`` keeps an empty dictionary from raising ValueError.
        self.dictionary = set(dictionary)
        self.max_word_length = max(
            (len(word) for word in self.dictionary), default=0
        )

    def segment(self, text):
        """Split *text* into a list of words.

        Args:
            text: The string to segment.

        Returns:
            A list of substrings of *text*, in order, whose concatenation is
            *text* itself.
        """
        result = []
        i = 0
        text_length = len(text)
        while i < text_length:
            # Never probe past the end of the text: a longer window would
            # only yield the same truncated slice again.
            longest = min(self.max_word_length, text_length - i)
            for length in range(longest, 0, -1):
                word = text[i:i + length]
                if word in self.dictionary:
                    break
            else:
                # No dictionary word starts here; fall back to one character.
                word = text[i]
            result.append(word)
            i += len(word)
        return result
# Toy vocabulary for the demo; "深度" and "学习" are not used by the sample
# sentence below.
dictionary = [
    "我",
    "爱",
    "自然",
    "语言",
    "处理",
    "深度",
    "学习",
]
segmenter = SimpleSegmenter(dictionary=dictionary)
# Segment a sample sentence and print the resulting word list.
print(segmenter.segment(text="我爱自然语言处理"))
使用jieba分词
import jieba
import jieba.posseg as pseg
# Sample sentence shared by both jieba calls below.
text = "我爱自然语言处理和深度学习"

# Plain segmentation: a list of word strings.
words = jieba.lcut(text)
print("分词结果:", words)

# Segmentation with part-of-speech tags: each item unpacks to (word, flag).
words_with_pos = pseg.lcut(text)
for tagged in words_with_pos:
    word, flag = tagged
    print(f"{word} ({flag})")
词性标注
词性标注是为文本中的每个词分配一个语法类别(如名词、动词等)。
基于HMM的词性标注(简化示例:仅包含初始概率和转移概率,未建模发射概率)
import numpy as np
class HMMTagger:
    """Toy HMM part-of-speech tagger with fixed start/transition tables.

    The model has no emission probabilities, so the decoded tag sequence
    depends only on the *number* of observations, not on their content.
    """

    def __init__(self):
        """Set up the tag set and the hard-coded probability tables."""
        self.states = ['N', 'V', 'ADJ', 'ADV']
        # P(first tag)
        self.start_prob = {'N': 0.4, 'V': 0.3, 'ADJ': 0.2, 'ADV': 0.1}
        # transition_prob[prev][curr] = P(curr | prev)
        self.transition_prob = {
            'N': {'N': 0.3, 'V': 0.4, 'ADJ': 0.2, 'ADV': 0.1},
            'V': {'N': 0.3, 'V': 0.2, 'ADJ': 0.3, 'ADV': 0.2},
            'ADJ': {'N': 0.5, 'V': 0.2, 'ADJ': 0.2, 'ADV': 0.1},
            'ADV': {'N': 0.2, 'V': 0.5, 'ADJ': 0.2, 'ADV': 0.1}
        }

    @staticmethod
    def _log(prob):
        """Return log(prob), mapping a zero probability to -inf."""
        import math  # local import: keeps this class self-contained

        return math.log(prob) if prob > 0 else float("-inf")

    def viterbi(self, observations):
        """Return the most probable tag sequence for *observations*.

        Scores are accumulated in log space: multiplying raw probabilities
        underflows to 0.0 on long inputs, after which no predecessor can be
        selected. Back-pointers are stored instead of copying whole paths
        at every step.

        Args:
            observations: Sequence of tokens; only its length is used.

        Returns:
            A list of tags from ``self.states``, one per observation. An
            empty input yields an empty list. Ties are broken in favour of
            the state listed first in ``self.states``.
        """
        n_steps = len(observations)
        if n_steps == 0:
            return []

        log = self._log
        states = self.states
        log_trans = {
            prev: {curr: log(self.transition_prob[prev][curr]) for curr in states}
            for prev in states
        }

        scores = {state: log(self.start_prob[state]) for state in states}
        backpointers = []  # backpointers[t - 1][state] = best tag at t - 1
        for _ in range(1, n_steps):
            step_scores = {}
            step_back = {}
            for state in states:
                best_prev = states[0]
                best_score = scores[best_prev] + log_trans[best_prev][state]
                for prev in states[1:]:
                    candidate = scores[prev] + log_trans[prev][state]
                    if candidate > best_score:
                        best_score = candidate
                        best_prev = prev
                step_scores[state] = best_score
                step_back[state] = best_prev
            scores = step_scores
            backpointers.append(step_back)

        # max() returns the first maximal state, matching the tie rule above.
        last_state = max(states, key=scores.__getitem__)
        path = [last_state]
        for step_back in reversed(backpointers):
            last_state = step_back[last_state]
            path.append(last_state)
        path.reverse()
        return path
命名实体识别(NER)
NER旨在从文本中识别出人名、地名、机构名等实体。
基于BiLSTM-CRF的NER模型
import torch
import torch.nn as nn
class BiLSTMCRF(nn.Module):
    """BiLSTM encoder with a linear-chain CRF output layer for tagging.

    ``transitions[i, j]`` is the score of moving *to* tag ``i`` *from* tag
    ``j``. ``tag_to_ix`` must contain the special keys ``'START'`` and
    ``'STOP'``. The CRF routines process one sentence at a time.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim=128,
                 hidden_dim=256):
        """Create the embedding, BiLSTM, projection and transition table.

        Args:
            vocab_size: Number of rows in the embedding table.
            tag_to_ix: Mapping from tag name to tag index; must include
                ``'START'`` and ``'STOP'``.
            embedding_dim: Size of each token embedding.
            hidden_dim: Total BiLSTM output size (split over two directions).

        Raises:
            KeyError: If ``'START'`` or ``'STOP'`` is missing from tag_to_ix.
        """
        super().__init__()
        num_tags = len(tag_to_ix)
        lstm_hidden = hidden_dim // 2
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden,
                            num_layers=2, bidirectional=True, batch_first=True)
        # The BiLSTM emits 2 * lstm_hidden features; sizing the projection
        # from that keeps an odd hidden_dim from causing a shape mismatch.
        self.hidden2tag = nn.Linear(2 * lstm_hidden, num_tags)
        self.tag_to_ix = tag_to_ix
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))
        # Forbid transitions into START and out of STOP so neither special
        # tag can be decoded as a token label.
        with torch.no_grad():
            self.transitions[tag_to_ix['START'], :] = -10000.
            self.transitions[:, tag_to_ix['STOP']] = -10000.

    def _forward_alg(self, feats):
        """Return the log partition function over all tag paths.

        Args:
            feats: Emission scores; iterated row by row, each row holding
                one score per tag.

        Returns:
            A tensor of shape (1,) with the log-sum-exp of all path scores.
        """
        num_tags = len(self.tag_to_ix)
        forward_var = feats.new_full((1, num_tags), -10000.)
        forward_var[0][self.tag_to_ix['START']] = 0.
        for feat in feats:
            # Rows index the next tag and columns the previous tag, so the
            # emission must vary along rows (column vector), matching
            # feat[tags[i + 1]] in _score_sentence.
            emit_score = feat.view(-1, 1)
            next_tag_var = forward_var + self.transitions + emit_score
            forward_var = torch.logsumexp(next_tag_var, dim=1).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix['STOP']]
        return torch.logsumexp(terminal_var, dim=1)

    def _score_sentence(self, feats, tags):
        """Return the score of one given tag sequence.

        Args:
            feats: Emission scores, one row per token.
            tags: 1-D integer tensor with the tag index of each token.

        Returns:
            A tensor of shape (1,): transition plus emission scores along
            the path START -> tags -> STOP.
        """
        score = feats.new_zeros(1)
        start = torch.tensor([self.tag_to_ix['START']], dtype=torch.long,
                             device=tags.device)
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix['STOP'], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return the highest-scoring tag index sequence for *feats*.

        Args:
            feats: Emission scores, one row per token.

        Returns:
            A list of tag indices, one per row of *feats*.
        """
        num_tags = len(self.tag_to_ix)
        backpointers = []
        # Decoding only produces integer indices, so no gradients are needed.
        with torch.no_grad():
            viterbi_var = feats.new_full((1, num_tags), -10000.)
            viterbi_var[0][self.tag_to_ix['START']] = 0.
            for feat in feats:
                # scores[next, prev]: best score ending in prev, then moving
                # to next.
                scores = viterbi_var + self.transitions
                best_scores, best_prev = scores.max(dim=1)
                backpointers.append(best_prev.tolist())
                viterbi_var = (best_scores + feat).view(1, -1)
            terminal_var = viterbi_var + self.transitions[self.tag_to_ix['STOP']]
            best_tag_id = terminal_var.argmax(1).item()

        path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            path.append(best_tag_id)
        # The last entry followed is the START tag; drop it and restore
        # left-to-right order.
        return path[:-1][::-1]

    def neg_log_likelihood(self, sentence, tags):
        """Return the CRF training loss for one sentence.

        Args:
            sentence: 1-D integer tensor of token indices.
            tags: 1-D integer tensor of gold tag indices, same length.

        Returns:
            A tensor of shape (1,): log partition minus gold path score.
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def _get_lstm_features(self, sentence):
        """Return per-token emission scores for *sentence*.

        Args:
            sentence: Integer tensor of token indices. A leading batch
                dimension of size one is removed, because the CRF routines
                handle a single sequence.

        Returns:
            The ``hidden2tag`` projection of the BiLSTM outputs.
        """
        if sentence.dim() == 2 and sentence.size(0) == 1:
            sentence = sentence.squeeze(0)
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        return self.hidden2tag(lstm_out)

    def forward(self, sentence):
        """Decode and return the best tag index list for *sentence*."""
        lstm_feats = self._get_lstm_features(sentence)
        return self._viterbi_decode(lstm_feats)
实用工具与库
from transformers import BertTokenizer, BertForTokenClassification
import torch
# Loaded (tokenizer, model) pairs keyed by model name, so repeated calls do
# not reload the weights every time.
_NER_MODEL_CACHE = {}


def _load_ner_components(model_name):
    """Return the cached (tokenizer, model) pair for *model_name*."""
    if model_name not in _NER_MODEL_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForTokenClassification.from_pretrained(model_name)
        model.eval()  # inference only: make sure dropout is disabled
        _NER_MODEL_CACHE[model_name] = (tokenizer, model)
    return _NER_MODEL_CACHE[model_name]


def ner_with_bert(text, model_name="bert-base-chinese"):
    """Tag each token of *text* with a BERT token-classification model.

    NOTE(review): the default "bert-base-chinese" checkpoint is a base
    language model, so its token-classification head is presumably newly
    initialised and the predicted ids are not meaningful until a
    fine-tuned NER checkpoint is passed as ``model_name`` - confirm.

    Args:
        text: Input string to tag.
        model_name: Name or path handed to ``from_pretrained``.

    Returns:
        A list of ``(token, label_id)`` pairs, one per token produced by
        the tokenizer (including any special tokens it adds).
    """
    tokenizer, model = _load_ner_components(model_name)
    # Truncate to the tokenizer's maximum length so over-long inputs do not
    # fail inside the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return list(zip(tokens, predictions[0].tolist()))
总结
NLP基础任务是构建复杂语言应用的基石。掌握分词、词性标注和NER技术,为后续的情感分析、文本分类、机器翻译等高级任务打下坚实基础。随着预训练语言模型的发展,这些任务的实现方式也在不断演进。