Tokenizer训练:构建分词器
---
title: "Tokenizer训练:构建分词器"
description: "掌握Tokenizer的训练方法和实现,为LLM构建高效的分词器"
tags: ["Tokenizer", "分词器", "BPE", "词表构建"]
category: "llm"
icon: "🧠"
---
Tokenizer训练:构建分词器
Tokenizer简介
Tokenizer(分词器)是将文本转换为模型可处理的token序列的工具。训练自定义Tokenizer对于特定领域或语言的LLM至关重要,可以提高模型效率和性能。
Tokenizer的核心价值:
- 高效表示:将文本压缩为紧凑的token序列
- 词表控制:控制词表大小和覆盖率
- 领域适配:针对特定领域优化分词
- 多语言支持:处理多种语言的分词
分词算法
BPE(Byte Pair Encoding)
import re
from collections import Counter, defaultdict
class SimpleBPE:
    """Minimal byte-pair-encoding (BPE) trainer.

    Words are represented as space-separated symbol strings that end with the
    ``</w>`` end-of-word marker, e.g. ``"l o w </w>"``.

    Attributes:
        vocab_size: Target number of tokens (base symbols plus merges).
        merges: Maps each merged pair ``(left, right)`` to its merge rank.
        vocab: Maps each token string to an integer id; filled by ``train``.
    """

    def __init__(self, vocab_size=30000):
        self.vocab_size = vocab_size
        self.merges = {}
        self.vocab = {}

    def get_pairs(self, word_freqs):
        """Count adjacent symbol pairs, weighted by word frequency.

        Args:
            word_freqs: Mapping of space-separated symbol strings to counts.

        Returns:
            A ``Counter`` keyed by ``(left, right)`` symbol tuples.
        """
        pairs = Counter()
        for word, freq in word_freqs.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[(left, right)] += freq
        return pairs

    def merge_pair(self, pair, word_freqs):
        """Merge every occurrence of ``pair`` into a single symbol.

        The merge works on whole symbols, so the pair ``('a', 'b')`` is not
        merged inside ``"xa b"`` (a plain substring replace would do that).

        Args:
            pair: ``(left, right)`` tuple of adjacent symbols to merge.
            word_freqs: Mapping of space-separated symbol strings to counts.

        Returns:
            A new dict with the pair merged in every word; counts of words
            that end up with the same segmentation are summed.
        """
        left, right = pair
        merged_symbol = ''.join(pair)
        new_word_freqs = {}
        for word, freq in word_freqs.items():
            symbols = word.split()
            merged = []
            i = 0
            while i < len(symbols):
                is_match = (
                    i + 1 < len(symbols)
                    and symbols[i] == left
                    and symbols[i + 1] == right
                )
                if is_match:
                    merged.append(merged_symbol)
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            new_word = ' '.join(merged)
            new_word_freqs[new_word] = new_word_freqs.get(new_word, 0) + freq
        return new_word_freqs

    def train(self, corpus):
        """Learn BPE merges from ``corpus`` and build the token vocabulary.

        Any merges from a previous call are discarded. The vocabulary holds
        the base symbols (sorted, so ids are deterministic) followed by the
        merged tokens in the order they were learned.

        Args:
            corpus: Iterable of text strings; words are split on whitespace.
        """
        self.merges = {}

        # Initial word frequencies: each word split into characters + marker.
        word_freqs = Counter()
        for text in corpus:
            for word in text.split():
                word_freqs[' '.join(word) + ' </w>'] += 1

        base_symbols = sorted(
            {symbol for word in word_freqs for symbol in word.split()}
        )
        vocab = {symbol: idx for idx, symbol in enumerate(base_symbols)}

        # Base symbols already use part of the vocabulary budget. A negative
        # budget yields an empty range, i.e. no merges.
        num_merges = self.vocab_size - len(base_symbols)
        for rank in range(num_merges):
            pairs = self.get_pairs(word_freqs)
            if not pairs:
                break
            # Most frequent pair; ties go to the pair counted first.
            best_pair = max(pairs, key=pairs.get)
            word_freqs = self.merge_pair(best_pair, word_freqs)
            self.merges[best_pair] = rank
            # Different pairs can concatenate to the same string, so only
            # assign a new id the first time a token appears.
            vocab.setdefault(''.join(best_pair), len(vocab))

        self.vocab = vocab
SentencePiece
import sentencepiece as spm
def train_sentencepiece(input_file, model_prefix, vocab_size=30000):
    """Train a SentencePiece model and return a processor loaded from it.

    Args:
        input_file: Path of the training corpus passed to the trainer.
        model_prefix: Output prefix; the model is loaded afterwards from
            ``{model_prefix}.model``.
        vocab_size: Target vocabulary size.

    Returns:
        A ``SentencePieceProcessor`` with the trained model loaded.
    """
    training_options = dict(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type='bpe',  # 'unigram' is the alternative
        character_coverage=0.9995,
        num_threads=16,
        split_digits=True,
        byte_fallback=True,
        normalization_rule_name='identity',
    )
    spm.SentencePieceTrainer.train(**training_options)

    # Load the model file that training is expected to have produced.
    model_path = f'{model_prefix}.model'
    processor = spm.SentencePieceProcessor()
    processor.load(model_path)
    return processor
# Usage: train on a local corpus file and get back the loaded processor
corpus_file = 'corpus.txt'
sp = train_sentencepiece(corpus_file, 'my_tokenizer', vocab_size=30000)
# Quick check: encode a sample sentence as string pieces and print them
text = "Hello, how are you?"
tokens = sp.encode(text, out_type=str)
print(tokens)
Hugging Face Tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
def train_hf_tokenizer(files, vocab_size=30000):
    """Train a BPE tokenizer with the Hugging Face ``tokenizers`` library.

    Args:
        files: Training files handed to ``Tokenizer.train``.
        vocab_size: Target vocabulary size for the trainer.

    Returns:
        The trained ``Tokenizer``.
    """
    # Trainer configuration: special tokens plus a minimum frequency of 2.
    bpe_trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        min_frequency=2,
    )

    # BPE model with "[UNK]" as the unknown token and the Whitespace
    # pre-tokenizer.
    hf_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    hf_tokenizer.pre_tokenizer = Whitespace()

    hf_tokenizer.train(files, bpe_trainer)
    return hf_tokenizer
# Usage: train on a local corpus file with the default vocabulary size
tokenizer = train_hf_tokenizer(['corpus.txt'])
# Encode a sample sentence and print its tokens
encoding = tokenizer.encode("Hello, how are you?")
print(encoding.tokens)
多语言Tokenizer
def train_multilingual_tokenizer(corpus_files, vocab_size=50000):
    """Train a byte-level BPE tokenizer for multilingual text.

    Args:
        corpus_files: Training files handed to ``Tokenizer.train``.
        vocab_size: Target vocabulary size for the trainer.

    Returns:
        The trained ``Tokenizer``.
    """
    from tokenizers import Tokenizer
    from tokenizers.decoders import ByteLevel as ByteLevelDecoder
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import ByteLevel
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    # Pair the byte-level pre-tokenizer with its decoder so decoding maps the
    # byte-level symbols back to readable text.
    tokenizer.decoder = ByteLevelDecoder()

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<s>", "</s>", "<pad>", "<unk>"],
        # Seed the vocabulary with the full byte-level alphabet so bytes
        # that never occur in the training files are still representable.
        initial_alphabet=ByteLevel.alphabet(),
        show_progress=True,
    )
    tokenizer.train(corpus_files, trainer)
    return tokenizer
# Multilingual corpus: the file names suggest English, Chinese, Japanese
# and source-code text
corpus_files = [
'en_corpus.txt',
'zh_corpus.txt',
'ja_corpus.txt',
'code_corpus.txt'
]
# Train with a 64000-token vocabulary instead of the function default
tokenizer = train_multilingual_tokenizer(corpus_files, vocab_size=64000)
词表设计
# Vocabulary composition: category -> description of what it contains
vocab_composition = {
"基础词": "常见单词和子词",  # base words: common words and subwords
"特殊token": "[PAD], [UNK], [CLS], [SEP], [MASK]",  # special tokens
"数字": "0-9的单个数字",  # digits: the single digits 0-9
"代码token": "编程语言相关token",  # code tokens: programming-language tokens
"多语言": "各语言的基础字符"  # multilingual: basic characters of each language
}
# Vocabulary size guidance per model scale. A single int is a fixed size;
# a (low, high) tuple is a range. The ranges used to be written as
# `32000-64000`, which Python evaluates as a subtraction (-32000).
vocab_sizes = {
    "小模型(1-3B)": 32000,
    "中模型(7-13B)": (32000, 64000),
    "大模型(30B+)": (64000, 128000),
    "多语言模型": (64000, 128000),
}
评估Tokenizer
def evaluate_tokenizer(tokenizer, test_corpus, unk_token="[UNK]"):
    """Compute simple quality statistics for a tokenizer on a test corpus.

    Args:
        tokenizer: Object whose ``encode(text, out_type=str)`` returns a list
            of token strings.
        test_corpus: Iterable of text strings to encode.
        unk_token: Token string counted as "unknown".

    Returns:
        A dict with four entries:
        "压缩比" (characters per token), "未知词率" (unknown tokens divided
        by all tokens), "词表覆盖率" (number of distinct tokens observed; it
        is a count, since the vocabulary size is not known here) and
        "平均token长度" (characters per token, same value as "压缩比").
        All ratios are 0.0 when no tokens were produced.
    """
    total_tokens = 0
    total_chars = 0
    unk_count = 0
    unique_tokens = set()

    for text in test_corpus:
        tokens = tokenizer.encode(text, out_type=str)
        total_tokens += len(tokens)
        total_chars += len(text)
        unk_count += tokens.count(unk_token)
        unique_tokens.update(tokens)

    # Guard against an empty corpus or texts that produce no tokens.
    if total_tokens:
        chars_per_token = total_chars / total_tokens
        # Normalize by token count so the value is a rate, not a
        # per-document average of unknown-token counts.
        unk_rate = unk_count / total_tokens
    else:
        chars_per_token = 0.0
        unk_rate = 0.0

    return {
        "压缩比": chars_per_token,
        "未知词率": unk_rate,
        "词表覆盖率": len(unique_tokens),
        "平均token长度": chars_per_token,
    }
与LLM集成
from transformers import LlamaTokenizer, LlamaTokenizerFast
# 自定义LLaMA Tokenizer
class CustomLlamaTokenizer(LlamaTokenizerFast):
    """LlamaTokenizerFast subclass that registers pad/bos/eos special tokens
    and can train a replacement tokenizer with SentencePiece."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Register the special tokens on every instance.
        self.add_special_tokens({
            "pad_token": "<pad>",
            "bos_token": "<s>",
            "eos_token": "</s>"
        })

    def train_new_from_iterator(self, text_iterator, vocab_size=32000):
        """Train a new SentencePiece BPE model and wrap it in this class.

        The texts are written one per line to a temporary UTF-8 file, which
        is deleted once training finishes. Training uses the fixed prefix
        ``custom_tokenizer``, and the result is loaded from
        ``custom_tokenizer.model`` in the current working directory.

        NOTE(review): the base class may define a method of this name with a
        different signature - confirm the override is intended.

        Args:
            text_iterator: Iterable of text strings.
            vocab_size: Target vocabulary size.

        Returns:
            A new ``CustomLlamaTokenizer`` built from the trained model file.
        """
        import os
        import tempfile

        from sentencepiece import SentencePieceTrainer

        # Explicit UTF-8 so non-ASCII corpora do not depend on the locale
        # encoding. delete=False because the trainer reopens the file by
        # name after it has been closed here.
        with tempfile.NamedTemporaryFile(
            'w', encoding='utf-8', suffix='.txt', delete=False
        ) as f:
            temp_file = f.name
            for text in text_iterator:
                f.write(text + '\n')

        try:
            SentencePieceTrainer.train(
                input=temp_file,
                model_prefix='custom_tokenizer',
                vocab_size=vocab_size,
                model_type='bpe'
            )
        finally:
            # Remove the temporary corpus even if training fails.
            os.remove(temp_file)

        return CustomLlamaTokenizer(
            vocab_file='custom_tokenizer.model'
        )
最佳实践
# Tokenizer training best practices: topic -> recommendation
best_practices = {
"数据质量": "使用干净、多样化的训练语料",  # data quality: clean, diverse training corpus
"词表大小": "根据模型大小和任务选择",  # vocab size: choose by model size and task
"特殊token": "设计合适的特殊token",  # special tokens: design suitable ones
"压缩比": "追求高压缩比,减少序列长度",  # compression: aim high to shorten sequences
"未知词处理": "使用byte-level fallback"  # unknown words: use byte-level fallback
}
使用LLaMA Tokenizer
from transformers import AutoTokenizer
# Load the pretrained Llama 2 tokenizer.
# trust_remote_code is left at its default: it allows code from the model
# repository to be executed locally, and it should not be needed for a
# tokenizer class that transformers itself provides.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# 自定义训练
def customize_tokenizer(base_tokenizer, new_corpus, added_tokens=1000):
    """Extend an existing tokenizer with placeholder tokens.

    Builds ``added_tokens`` names of the form ``token_0``, ``token_1``, ...
    and registers them through ``base_tokenizer.add_tokens``.
    ``new_corpus`` is accepted but not used.

    Returns:
        The same ``base_tokenizer`` object that was passed in.
    """
    placeholder_tokens = []
    for index in range(added_tokens):
        placeholder_tokens.append(f"token_{index}")

    base_tokenizer.add_tokens(placeholder_tokens)
    return base_tokenizer
# Smoke test: encode a Chinese sample sentence, then print the original
# text, the number of encoded items, and the tokens they map back to
text = "这是一个测试"
tokens = tokenizer.encode(text)
print(f"原始: {text}")
print(f"Token数量: {len(tokens)}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(tokens)}")
Tokenizer是LLM的基础组件,训练合适的Tokenizer对于模型性能和效率至关重要。