词向量详解
词向量详解
什么是词向量
词向量是将词语映射到连续向量空间的技术，使得语义相似的词语在向量空间中距离相近。
Word2Vec原理
Word2Vec通过神经网络学习词语的分布式表示。下面的Skip-gram模型根据中心词预测其上下文词：
import torch
import torch.nn as nn
class SkipGram(nn.Module):
    """Skip-gram network: embeds center-word indices and projects to vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output features of the projection layer.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Embedding table first, then the projection back to vocabulary size.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        """Return the linear projection of the embedded ``center_word`` indices.

        No softmax is applied here; the result is the raw output of the
        linear layer.
        """
        return self.output(self.embeddings(center_word))
# Example configuration: a 10000-word vocabulary with 300-dimensional
# embeddings, used to instantiate the skip-gram model.
vocab_size = 10000
embedding_dim = 300
model = SkipGram(vocab_size, embedding_dim)
CBOW模型
CBOW根据上下文预测中心词:
class CBOW(nn.Module):
    """Continuous bag-of-words network: averages context embeddings and
    projects the average to vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output features of the projection layer.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        """Score every vocabulary word given the context word indices.

        Args:
            context_words: Integer index tensor of shape ``(context,)`` or
                ``(batch, context)``.

        Returns:
            Tensor of shape ``(vocab_size,)`` for a single context window or
            ``(batch, vocab_size)`` for a batch. No softmax is applied.
        """
        embeds = self.embeddings(context_words)
        # The context axis is always the second-to-last axis of ``embeds``
        # (the last one is the embedding dimension). Averaging over dim=0
        # would mix different samples together when the input is batched.
        mean_embed = embeds.mean(dim=-2)
        return self.output(mean_embed)
# Instantiate the CBOW model from the module-level vocab_size and
# embedding_dim values.
cbow_model = CBOW(vocab_size, embedding_dim)
训练数据准备
def create_skipgram_pairs(tokens, window_size=2):
    """Build (center, context) training pairs for skip-gram.

    Every token is paired with each other token at most ``window_size``
    positions away on either side. A token is never paired with its own
    position, though equal tokens at different positions are paired.

    Args:
        tokens: Indexable sequence of tokens.
        window_size: Maximum distance between center and context positions.

    Returns:
        List of ``(center, context)`` tuples, ordered by center position and
        then by context position.
    """
    total = len(tokens)
    return [
        (tokens[pos], tokens[ctx])
        for pos in range(total)
        for ctx in range(max(0, pos - window_size), min(total, pos + window_size + 1))
        if ctx != pos
    ]
# Example: generate (center, context) pairs from a short token list with the
# default window size and print the first five pairs.
tokens = ["我", "喜欢", "学习", "深度", "学习", "技术"]
pairs = create_skipgram_pairs(tokens)
print("训练对:", pairs[:5])
GloVe词向量
GloVe基于全局词-词共现矩阵学习词向量，下面先构建共现矩阵：
import numpy as np
def build_cooccurrence_matrix(corpus, window_size=2):
    """Count word-word co-occurrences within a sliding window.

    Args:
        corpus: Sequence of tokens.
        window_size: Maximum distance between two positions for them to
            count as co-occurring.

    Returns:
        A tuple ``(cooccurrence, vocab)`` where ``vocab`` maps each distinct
        word to its row/column index (assigned in order of first appearance)
        and ``cooccurrence`` is a ``(len(vocab), len(vocab))`` float array
        whose ``[a, b]`` entry counts how often word ``b`` appears within
        ``window_size`` positions of word ``a``.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so
    # the word -> index mapping is reproducible. Iterating a set of strings
    # can yield a different order on every interpreter run.
    vocab = {word: idx for idx, word in enumerate(dict.fromkeys(corpus))}
    vocab_size = len(vocab)
    cooccurrence = np.zeros((vocab_size, vocab_size))
    n_tokens = len(corpus)
    for i, word in enumerate(corpus):
        row = vocab[word]
        window_start = max(0, i - window_size)
        window_end = min(n_tokens, i + window_size + 1)
        for j in range(window_start, window_end):
            if i != j:
                cooccurrence[row, vocab[corpus[j]]] += 1
    return cooccurrence, vocab
# Example: build the co-occurrence matrix for a six-token corpus and print
# its shape (one row and one column per distinct word).
corpus = ["the", "cat", "sat", "on", "the", "mat"]
cooc, vocab = build_cooccurrence_matrix(corpus)
print("共现矩阵形状:", cooc.shape)
FastText子词向量
FastText通过子词单元（字符n-gram）处理未登录词：
def get_subwords(word, min_n=3, max_n=6):
    """Return all character n-grams of ``word`` with ``min_n <= n <= max_n``.

    Args:
        word: String to split into subword units.
        min_n: Shortest n-gram length to emit.
        max_n: Longest n-gram length to emit (inclusive).

    Returns:
        List of substrings ordered by length, then by start position. The
        list is empty when ``word`` is shorter than ``min_n``.
    """
    # Clamp to the word length first, then add 1 so that max_n itself is
    # included; range() excludes its upper bound.
    longest = min(max_n, len(word))
    return [
        word[start:start + n]
        for n in range(min_n, longest + 1)
        for start in range(len(word) - n + 1)
    ]
# Example: list the character n-grams generated for a single word using the
# default length bounds.
word = "unhappy"
subwords = get_subwords(word)
print(f"'{word}'的子词:", subwords)
使用预训练词向量
import torch
def load_pretrained_embeddings(word_to_index, embedding_dim=300):
    """Create an embedding layer with one row per entry of ``word_to_index``.

    NOTE: no vectors are read from disk. The weight matrix is filled with
    standard-normal samples scaled by 0.1 as a stand-in for real pretrained
    vectors.

    Args:
        word_to_index: Mapping whose length determines the number of rows.
        embedding_dim: Width of each embedding vector.

    Returns:
        An ``nn.Embedding`` built with ``nn.Embedding.from_pretrained`` using
        its default arguments.
    """
    weight_shape = (len(word_to_index), embedding_dim)
    placeholder_weights = torch.randn(*weight_shape) * 0.1
    return nn.Embedding.from_pretrained(placeholder_weights)
# Example: build an embedding layer for a three-word vocabulary with the
# default embedding width and print the shape of its weight matrix.
word_to_index = {"hello": 0, "world": 1, "python": 2}
embedding = load_pretrained_embeddings(word_to_index)
print("嵌入层权重形状:", embedding.weight.shape)
词向量评估
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D tensors as a 0-D tensor.

    A small constant (1e-8) is added to the product of the norms so that a
    zero vector yields 0 instead of a division by zero.
    """
    denominator = vec1.norm() * vec2.norm() + 1e-8
    return torch.dot(vec1, vec2) / denominator
# Example: cosine similarity of two random 300-dimensional vectors. No seed
# is set in this snippet, so the printed value differs between runs.
vec1 = torch.randn(300)
vec2 = torch.randn(300)
sim = cosine_similarity(vec1, vec2)
print("余弦相似度:", sim.item())
总结
词向量是NLP的基础技术。Word2Vec、GloVe和FastText各有优势，选择合适的词向量方法对下游任务至关重要。