← 返回首页
🤖

词向量详解

📂 ai ⏱ 2 min 288 words

词向量详解

什么是词向量

词向量是将词语映射到连续向量空间的技术,使得语义相似的词语在向量空间中距离相近。

Word2Vec原理

Word2Vec通过神经网络学习词语的分布式表示。下面是根据中心词预测上下文词的Skip-gram模型:

import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores produced per input index.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Lookup table: word index -> dense vector.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Projects an embedding to one unnormalized score per vocabulary word.
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        """Return unnormalized vocabulary scores for the given word indices.

        No softmax is applied here; the caller gets the raw output of the
        linear layer, with a trailing dimension of size ``vocab_size``.
        """
        return self.output(self.embeddings(center_word))

# Example hyper-parameters: vocabulary size and embedding width.
vocab_size = 10000
embedding_dim = 300
# Instantiate the skip-gram model with those sizes.
model = SkipGram(vocab_size, embedding_dim)

CBOW模型

CBOW根据上下文预测中心词:

class CBOW(nn.Module):
    """CBOW network: average the context embeddings, then project to the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores produced per example.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Lookup table: word index -> dense vector.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Projects the averaged context vector to one score per vocabulary word.
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        """Return unnormalized vocabulary scores for the given context.

        Args:
            context_words: Integer index tensor whose last dimension
                enumerates the context words, e.g. ``(context,)`` for a
                single example or ``(batch, context)`` for a batch.

        Returns:
            Tensor of shape ``(vocab_size,)`` for a single example, or
            ``(batch, vocab_size)`` for a batch.
        """
        embeds = self.embeddings(context_words)
        # Average over the context axis (second to last after the lookup).
        # dim=0 would average across the batch when a batch is passed in;
        # dim=-2 gives the same result for 1-D input and is correct for batches.
        mean_embed = embeds.mean(dim=-2)
        return self.output(mean_embed)

# Instantiate the CBOW model with the module-level vocab_size / embedding_dim.
cbow_model = CBOW(vocab_size, embedding_dim)

训练数据准备

def create_skipgram_pairs(tokens, window_size=2):
    """Build (center, context) training pairs from a token sequence.

    Args:
        tokens: Sequence of tokens.
        window_size: Maximum distance, on each side, between a center
            token and the context tokens paired with it.

    Returns:
        List of ``(center, context)`` tuples, ordered by center position
        and then by context position. A token is never paired with its
        own position.
    """
    total = len(tokens)
    return [
        (center, tokens[ctx])
        for pos, center in enumerate(tokens)
        # Clamp the window to the sequence bounds.
        for ctx in range(max(0, pos - window_size), min(total, pos + window_size + 1))
        if ctx != pos
    ]

# Demo: generate skip-gram pairs for a short tokenized sentence and show the first five.
tokens = ["我", "喜欢", "学习", "深度", "学习", "技术"]
pairs = create_skipgram_pairs(tokens)
print("训练对:", pairs[:5])

GloVe词向量

GloVe基于全局词-词共现矩阵学习词向量。下面先构建共现矩阵:

import numpy as np

def build_cooccurrence_matrix(corpus, window_size=2):
    """Count word-word co-occurrences within a sliding window.

    Args:
        corpus: Sequence of tokens.
        window_size: Maximum distance, on each side, at which two tokens
            are counted as co-occurring.

    Returns:
        A tuple ``(cooccurrence, vocab)`` where ``vocab`` maps each distinct
        word to a row/column index (assigned in order of first appearance)
        and ``cooccurrence`` is a square NumPy array whose ``[a, b]`` entry
        is the number of times word ``b`` appears within the window around
        an occurrence of word ``a``.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # word -> index mapping is reproducible. Iterating a set of strings is not:
    # its order can change between interpreter runs.
    vocab = {word: idx for idx, word in enumerate(dict.fromkeys(corpus))}
    vocab_size = len(vocab)
    cooccurrence = np.zeros((vocab_size, vocab_size))

    for i, word in enumerate(corpus):
        row = vocab[word]
        # Clamp the window to the corpus bounds.
        start = max(0, i - window_size)
        stop = min(len(corpus), i + window_size + 1)
        for j in range(start, stop):
            if i != j:
                cooccurrence[row, vocab[corpus[j]]] += 1

    return cooccurrence, vocab

# Demo: build the co-occurrence matrix for a tiny corpus and print its shape.
corpus = ["the", "cat", "sat", "on", "the", "mat"]
cooc, vocab = build_cooccurrence_matrix(corpus)
print("共现矩阵形状:", cooc.shape)

FastText子词向量

FastText将词拆分为字符n-gram(子词),未登录词的向量可以由其子词向量组合得到:

def get_subwords(word, min_n=3, max_n=6):
    """Return the character n-grams of ``word`` with lengths min_n..max_n.

    Args:
        word: The string to split.
        min_n: Smallest n-gram length to emit.
        max_n: Largest n-gram length to emit (inclusive).

    Returns:
        List of substrings, ordered by length and then by start position.
        Lengths greater than ``len(word)`` are skipped, so a word shorter
        than ``min_n`` yields an empty list.
    """
    # The upper bound is inclusive: cap it at the word length first, then add 1
    # for range(). Capping at len(word) + 1 and omitting the + 1 dropped every
    # n-gram of length max_n.
    longest = min(max_n, len(word))
    return [
        word[start:start + n]
        for n in range(min_n, longest + 1)
        for start in range(len(word) - n + 1)
    ]

# Demo: print the character n-grams extracted from a sample word.
word = "unhappy"
subwords = get_subwords(word)
print(f"'{word}'的子词:", subwords)

使用预训练词向量

import torch

def load_pretrained_embeddings(word_to_index, embedding_dim=300):
    """Wrap a weight matrix in an ``nn.Embedding`` via ``from_pretrained``.

    NOTE: no vectors are read from disk here. The weight matrix is a random
    placeholder (standard-normal samples scaled by 0.1) standing in for real
    pretrained vectors.

    Args:
        word_to_index: Mapping of words to indices; only its length is
            used, as the number of embedding rows.
        embedding_dim: Width of each embedding vector.

    Returns:
        An ``nn.Embedding`` whose weight has shape
        ``(len(word_to_index), embedding_dim)``. ``from_pretrained`` is
        called with its defaults, which freeze the weights.
    """
    row_count = len(word_to_index)
    # Placeholder weights; replace with real pretrained vectors in practice.
    weights = torch.randn(row_count, embedding_dim) * 0.1
    return nn.Embedding.from_pretrained(weights)

# Demo: build an embedding layer for a three-word vocabulary and print its weight shape.
word_to_index = {"hello": 0, "world": 1, "python": 2}
embedding = load_pretrained_embeddings(word_to_index)
print("嵌入层权重形状:", embedding.weight.shape)

词向量评估

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D tensors as a 0-dim tensor.

    A small constant is added to the denominator so that an all-zero
    vector gives 0 instead of a division by zero.
    """
    eps = 1e-8
    denominator = vec1.norm() * vec2.norm() + eps
    return vec1.dot(vec2) / denominator

# Demo: cosine similarity between two random 300-dimensional vectors.
vec1 = torch.randn(300)
vec2 = torch.randn(300)
sim = cosine_similarity(vec1, vec2)
print("余弦相似度:", sim.item())

总结

词向量是NLP的基础技术。Word2Vec、GloVe和FastText各有优势,选择合适的词向量方法对下游任务至关重要。