← 返回首页
🧠

Tokenizer训练:构建分词器

📂 llm ⏱ 3 min 557 words

---
title: "Tokenizer训练:构建分词器"
description: "掌握Tokenizer的训练方法和实现,为LLM构建高效的分词器"
tags: ["Tokenizer", "分词器", "BPE", "词表构建"]
category: "llm"
icon: "🧠"
---

Tokenizer训练:构建分词器

Tokenizer简介

Tokenizer(分词器)是将文本转换为模型可处理的token序列的工具。训练自定义Tokenizer对于特定领域或语言的LLM至关重要,可以提高模型效率和性能。

Tokenizer的核心价值:把文本高效地转换为token序列,针对特定领域或语言进行适配,从而提升模型的效率和性能。

分词算法

BPE(Byte Pair Encoding)

import re
from collections import Counter, defaultdict

class SimpleBPE:
    """Minimal byte-pair-encoding (BPE) trainer.

    Words are represented as space-separated symbol strings that end with
    the ``</w>`` end-of-word marker, e.g. ``"l o w </w>"``.

    Attributes:
        vocab_size: Target number of symbols in the vocabulary.
        merges: Maps each learned ``(left, right)`` pair to its merge rank
            (0 is the first merge learned).
        vocab: Maps each symbol (base character or merged symbol) to an
            integer id.
    """

    def __init__(self, vocab_size=30000):
        self.vocab_size = vocab_size
        self.merges = {}
        self.vocab = {}

    def get_pairs(self, word_freqs):
        """Count adjacent symbol pairs, weighted by word frequency.

        Args:
            word_freqs: Mapping of space-separated symbol strings to counts.

        Returns:
            A ``Counter`` keyed by ``(left, right)`` symbol tuples.
        """
        pairs = Counter()
        for word, freq in word_freqs.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[(left, right)] += freq
        return pairs

    def merge_pair(self, pair, word_freqs):
        """Merge every occurrence of ``pair`` into a single symbol.

        The merge works on whole symbols. A plain substring replace on the
        joined string would also match across symbol boundaries (merging
        ``('a', 'b')`` would wrongly turn ``"xa b"`` into ``"xab"``).

        Args:
            pair: ``(left, right)`` tuple of symbols to merge.
            word_freqs: Mapping of space-separated symbol strings to counts.

        Returns:
            A new dict with the pair merged in every word.
        """
        left, right = pair
        merged = left + right
        new_word_freqs = {}

        for word, freq in word_freqs.items():
            symbols = word.split()
            out = []
            i = 0
            last = len(symbols) - 1
            while i < len(symbols):
                if i < last and symbols[i] == left and symbols[i + 1] == right:
                    out.append(merged)
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            new_word = ' '.join(out)
            # Accumulate instead of overwrite so counts are never lost if
            # two entries end up with the same symbol sequence.
            new_word_freqs[new_word] = new_word_freqs.get(new_word, 0) + freq

        return new_word_freqs

    def train(self, corpus):
        """Learn BPE merges from ``corpus`` and build the symbol vocabulary.

        Args:
            corpus: Iterable of text strings; words are split on whitespace.

        After the call, ``self.merges`` holds the learned merges in rank
        order and ``self.vocab`` maps each base symbol and merged symbol to
        an id. Any state from a previous ``train`` call is discarded.
        """
        # Initial word frequencies: each word becomes its characters plus
        # the end-of-word marker.
        word_freqs = Counter()
        for text in corpus:
            for word in text.split():
                word_freqs[' '.join(word) + ' </w>'] += 1

        # Base alphabet, sorted so that ids are deterministic.
        alphabet = sorted({s for word in word_freqs for s in word.split()})
        vocab = {symbol: idx for idx, symbol in enumerate(alphabet)}

        self.merges = {}
        # The alphabet may already exceed the target size; never go negative.
        num_merges = max(0, self.vocab_size - len(alphabet))

        for i in range(num_merges):
            pairs = self.get_pairs(word_freqs)
            if not pairs:
                break

            # Most frequent pair; ties go to the pair seen first.
            best_pair = max(pairs, key=pairs.get)

            word_freqs = self.merge_pair(best_pair, word_freqs)
            self.merges[best_pair] = i
            # Different merges can produce the same string, so only assign
            # a new id the first time a symbol appears.
            vocab.setdefault(''.join(best_pair), len(vocab))

        # The vocabulary is the set of symbols, not the set of whole words.
        self.vocab = vocab

SentencePiece

import sentencepiece as spm

def train_sentencepiece(input_file, model_prefix, vocab_size=30000):
    """Train a SentencePiece model and return a processor loaded from it.

    Args:
        input_file: Path of the training corpus passed as ``input``.
        model_prefix: Output prefix; the model is read back from
            ``f'{model_prefix}.model'``.
        vocab_size: Vocabulary size passed to the trainer.

    Returns:
        A ``SentencePieceProcessor`` with the trained model loaded.
    """
    # All trainer options are collected in one place and forwarded as-is.
    trainer_options = {
        "input": input_file,
        "model_prefix": model_prefix,
        "vocab_size": vocab_size,
        "model_type": 'bpe',  # 'unigram' is the alternative
        "character_coverage": 0.9995,
        "num_threads": 16,
        "split_digits": True,
        "byte_fallback": True,
        "normalization_rule_name": 'identity',
    }
    spm.SentencePieceTrainer.train(**trainer_options)

    # Load the model file the trainer just produced.
    model_path = f'{model_prefix}.model'
    processor = spm.SentencePieceProcessor()
    processor.load(model_path)
    return processor

# Usage: train a SentencePiece model on 'corpus.txt'; the model prefix is
# 'my_tokenizer' and the vocabulary size is 30000.
corpus_file = 'corpus.txt'
sp = train_sentencepiece(corpus_file, 'my_tokenizer', vocab_size=30000)

# Test: encode a sample sentence, asking for str output, and print it.
text = "Hello, how are you?"
tokens = sp.encode(text, out_type=str)
print(tokens)

Hugging Face Tokenizers

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

def train_hf_tokenizer(files, vocab_size=30000):
    """Train a BPE tokenizer with the Hugging Face ``tokenizers`` library.

    Args:
        files: Training files passed to ``Tokenizer.train``.
        vocab_size: Vocabulary size passed to the trainer.

    Returns:
        The trained ``Tokenizer``.
    """
    reserved_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    # Trainer configuration: pairs below min_frequency are not considered.
    bpe_trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved_tokens,
        min_frequency=2,
    )

    # BPE model with an explicit unknown token and whitespace pre-tokenization.
    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    bpe_tokenizer.pre_tokenizer = Whitespace()

    bpe_tokenizer.train(files, bpe_trainer)
    return bpe_tokenizer

# Usage: train on 'corpus.txt' with the default vocabulary size.
tokenizer = train_hf_tokenizer(['corpus.txt'])

# Encode a sample sentence and print the resulting token strings.
encoding = tokenizer.encode("Hello, how are you?")
print(encoding.tokens)

多语言Tokenizer

def train_multilingual_tokenizer(corpus_files, vocab_size=50000):
    """Train a byte-level BPE tokenizer for multilingual corpora.

    Args:
        corpus_files: List of training file paths.
        vocab_size: Vocabulary size passed to the trainer.

    Returns:
        The trained ``Tokenizer`` with a byte-level pre-tokenizer and the
        matching byte-level decoder attached.
    """
    from tokenizers import Tokenizer
    from tokenizers.decoders import ByteLevel as ByteLevelDecoder
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    from tokenizers.pre_tokenizers import ByteLevel

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    # Without the byte-level decoder, decode() returns the byte-mapped
    # surrogate characters instead of the original text.
    tokenizer.decoder = ByteLevelDecoder()

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<s>", "</s>", "<pad>", "<unk>"],
        # Seed the vocabulary with the full byte alphabet. The model has no
        # unk_token, so a byte never seen during training could otherwise
        # not be encoded.
        initial_alphabet=ByteLevel.alphabet(),
        show_progress=True,
    )

    tokenizer.train(corpus_files, trainer)

    return tokenizer

# Multilingual corpora: one file each for English, Chinese, Japanese and
# source code (as the file names suggest).
corpus_files = [
    'en_corpus.txt',
    'zh_corpus.txt',
    'ja_corpus.txt',
    'code_corpus.txt'
]

# Train with a 64000-entry vocabulary instead of the function's 50000 default.
tokenizer = train_multilingual_tokenizer(corpus_files, vocab_size=64000)

词表设计

# Vocabulary composition: maps each category of vocabulary entry to a short
# description of what belongs in it. Purely descriptive data.
vocab_composition = {
    "基础词": "常见单词和子词",
    "特殊token": "[PAD], [UNK], [CLS], [SEP], [MASK]",
    "数字": "0-9的单个数字",
    "代码token": "编程语言相关token",
    "多语言": "各语言的基础字符"
}

# Recommended vocabulary sizes by model scale. An int is a single
# recommended size; a (min, max) tuple is a recommended range.
# The ranges must be tuples: written as `32000-64000` they are subtractions
# and evaluate to negative numbers.
vocab_sizes = {
    "小模型(1-3B)": 32000,
    "中模型(7-13B)": (32000, 64000),
    "大模型(30B+)": (64000, 128000),
    "多语言模型": (64000, 128000),
}

评估Tokenizer

def evaluate_tokenizer(tokenizer, test_corpus, unk_token="[UNK]"):
    """Compute simple quality statistics for a tokenizer.

    Args:
        tokenizer: Object exposing ``encode(text, out_type=str)`` that
            returns a list of token strings.
        test_corpus: Iterable of text strings to evaluate on. It is
            consumed once, so a generator is accepted.
        unk_token: Token string counted as "unknown".

    Returns:
        A dict with these keys:
            "压缩比": characters per token over the whole corpus.
            "未知词率": fraction of all tokens equal to ``unk_token``.
            "词表覆盖率": number of distinct tokens observed.
            "平均token长度": characters per token (same value as "压缩比").
        Both ratios are 0.0 when no tokens were produced.
    """
    total_tokens = 0
    total_chars = 0
    unk_count = 0
    unique_tokens = set()

    for text in test_corpus:
        tokens = tokenizer.encode(text, out_type=str)
        total_tokens += len(tokens)
        total_chars += len(text)
        unique_tokens.update(tokens)
        unk_count += tokens.count(unk_token)

    # Guard against an empty corpus or texts that produce no tokens.
    if total_tokens:
        chars_per_token = total_chars / total_tokens
        # A rate is unknown tokens over all tokens, not over documents.
        unk_rate = unk_count / total_tokens
    else:
        chars_per_token = 0.0
        unk_rate = 0.0

    return {
        "压缩比": chars_per_token,
        "未知词率": unk_rate,
        "词表覆盖率": len(unique_tokens),
        "平均token长度": chars_per_token
    }

与LLM集成

from transformers import LlamaTokenizer, LlamaTokenizerFast

# 自定义LLaMA Tokenizer
class CustomLlamaTokenizer(LlamaTokenizerFast):
    """LLaMA fast tokenizer with fixed pad/bos/eos special tokens."""

    def __init__(self, *args, **kwargs):
        """Build the base tokenizer, then register the special tokens."""
        super().__init__(*args, **kwargs)

        # Register the special tokens.
        self.add_special_tokens({
            "pad_token": "<pad>",
            "bos_token": "<s>",
            "eos_token": "</s>"
        })

    def train_new_from_iterator(self, text_iterator, vocab_size=32000):
        """Train a new SentencePiece BPE model and wrap it in a tokenizer.

        Args:
            text_iterator: Iterable of text strings; each is written as one
                line of the training corpus.
            vocab_size: Vocabulary size passed to the trainer.

        Returns:
            A new ``CustomLlamaTokenizer`` loaded from
            ``custom_tokenizer.model`` in the current working directory.

        NOTE(review): the base class may define ``train_new_from_iterator``
        with a different signature; confirm the override is intended.
        """
        import os
        import tempfile

        from sentencepiece import SentencePieceTrainer

        # Write the corpus into a private temp directory so nothing is left
        # behind in the working directory and concurrent runs cannot clash.
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_file = os.path.join(tmp_dir, "temp_corpus.txt")
            # Explicit UTF-8: the platform default encoding may not be able
            # to represent non-ASCII text.
            with open(temp_file, 'w', encoding='utf-8') as f:
                for text in text_iterator:
                    f.write(text + '\n')

            # Train; the model is written next to the caller as before.
            SentencePieceTrainer.train(
                input=temp_file,
                model_prefix='custom_tokenizer',
                vocab_size=vocab_size,
                model_type='bpe'
            )

        return CustomLlamaTokenizer(
            vocab_file='custom_tokenizer.model'
        )

最佳实践

# Tokenizer training best practices: maps each topic to a one-line
# recommendation. Purely descriptive data.
best_practices = {
    "数据质量": "使用干净、多样化的训练语料",
    "词表大小": "根据模型大小和任务选择",
    "特殊token": "设计合适的特殊token",
    "压缩比": "追求高压缩比,减少序列长度",
    "未知词处理": "使用byte-level fallback"
}

使用LLaMA Tokenizer

from transformers import AutoTokenizer

# Load the pretrained Llama 2 tokenizer.
# trust_remote_code is deliberately not enabled: it lets code from the model
# repository run locally, and this checkpoint is loaded through the
# library's built-in tokenizer classes.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
)

# 自定义训练
def customize_tokenizer(base_tokenizer, new_corpus, added_tokens=1000):
    """Extend an existing tokenizer with placeholder tokens.

    Args:
        base_tokenizer: Tokenizer exposing ``add_tokens``; modified in place.
        new_corpus: Currently unused by this function.
        added_tokens: Number of placeholder tokens to add.

    Returns:
        The same ``base_tokenizer`` object.
    """
    # Placeholder names are token_0, token_1, ... up to added_tokens - 1.
    placeholder_names = []
    for index in range(added_tokens):
        placeholder_names.append(f"token_{index}")

    base_tokenizer.add_tokens(placeholder_names)
    return base_tokenizer

# Test: encode a Chinese sample sentence and report the original text, the
# number of token ids, and the ids converted back to token strings.
text = "这是一个测试"
tokens = tokenizer.encode(text)
print(f"原始: {text}")
print(f"Token数量: {len(tokens)}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(tokens)}")

Tokenizer是LLM的基础组件,训练合适的Tokenizer对于模型性能和效率至关重要。