← 返回首页
🧠

Tokenizer训练指南

📂 llm ⏱ 2 min 364 words

---
title: "Tokenizer训练指南"
description: "介绍如何训练自定义分词器,满足特定领域的需求"
tags: ["Tokenizer", "BPE", "分词器训练", "NLP"]
category: "llm"
icon: "🧠"
---

Tokenizer训练指南

为什么需要自定义Tokenizer

预训练模型的分词器可能不适合特定领域(如医学、法律、代码)。在领域语料上自定义训练,可以让词表更好地覆盖领域术语,从而减少未登录词,并缩短编码后的序列长度。

使用Tokenizers库

安装

pip install tokenizers

BPE训练示例

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Build a tokenizer around a BPE model whose unknown token is "[UNK]",
# and attach the Whitespace pre-tokenizer to it.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Training corpus: a list of plain-text file paths.
files = ["corpus_1.txt", "corpus_2.txt", "corpus_3.txt"]

# Trainer settings: target vocabulary size, minimum frequency, the special
# tokens to reserve, and a progress display during training.
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
)

# Train on the corpus files, then persist the result as a single JSON file.
tokenizer.train(files, trainer)
tokenizer.save("my_tokenizer.json")

# Encode a sample sentence and show both the token strings and their ids.
output = tokenizer.encode("Hello, how are you?")
print(f"Tokens: {output.tokens}")
print(f"IDs: {output.ids}")

WordPiece训练

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# WordPiece model with "[UNK]" as its unknown token, using the Whitespace
# pre-tokenizer.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Trainer settings: reserved special tokens, minimum frequency, and the
# target vocabulary size.
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"],
    min_frequency=2,
    vocab_size=30000,
)

# Train on a single corpus file and save the tokenizer as JSON.
tokenizer.train(files=["corpus.txt"], trainer=trainer)
tokenizer.save("wordpiece_tokenizer.json")

Unigram训练

from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

# Start from an empty Unigram model; the trainer below fills in the vocabulary.
tokenizer = Tokenizer(Unigram())

# Listing "[UNK]" in special_tokens only reserves the token. It must also be
# passed as unk_token so the trained model knows which token to fall back to
# for input it cannot represent.
trainer = UnigramTrainer(
    vocab_size=30000,
    special_tokens=["[UNK]", "[PAD]"],
    unk_token="[UNK]",
)

# Train on a single corpus file.
tokenizer.train(["corpus.txt"], trainer)

中文分词器训练

from tokenizers import Regex, Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Split

# Character-level pre-tokenization for Chinese.
# `tokenizers.pre_tokenizers` has no `Character` class, so the same effect is
# obtained with `Split`: every character in the CJK Unified Ideographs range
# (U+4E00-U+9FFF) is isolated as its own piece, and the text between those
# characters is kept as-is.
# NOTE(review): with each Chinese character isolated, BPE merges are not
# expected to span multiple Chinese characters - confirm this is the intended
# granularity before training on a large corpus.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Split(Regex("[\u4e00-\u9fff]"), behavior="isolated")

trainer = BpeTrainer(
    vocab_size=20000,
    min_frequency=2,
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)

# Train on the Chinese corpus file.
tokenizer.train(["chinese_corpus.txt"], trainer)

代码分词器训练

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# Source code is tokenized at the byte level.
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
# Pair the byte-level pre-tokenizer with the matching decoder so that
# tokenizer.decode() turns the byte-level symbols back into the original text.
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=50000,
    min_frequency=2,
    special_tokens=["<pad>", "<sos>", "<eos>"],
    # Seed the vocabulary with the full byte-level alphabet. The model has no
    # unk_token, so any byte that never occurs in the training files would
    # otherwise have no token to map to.
    initial_alphabet=ByteLevel.alphabet(),
)

# Train on the source-code corpus files.
tokenizer.train(["code_corpus.py", "code_corpus.js"], trainer)

与Transformers集成

转换为Transformers格式

from transformers import PreTrainedTokenizerFast

# Load the tokenizer that was trained and saved with the `tokenizers` library.
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("my_tokenizer.json")

# Wrap it as a Transformers fast tokenizer.
# The special-token strings must be the ones the tokenizer was trained with.
# "my_tokenizer.json" comes from the BPE example in this guide, which reserves
# "[UNK]", "[PAD]", "[CLS]", "[SEP]" and "[MASK]". Declaring different strings
# here (such as "<unk>" or "<pad>") would name tokens that are not in the
# trained vocabulary.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Save in the Transformers directory layout.
fast_tokenizer.save_pretrained("./my_tokenizer")

# Reload it through the standard Transformers entry point.
from transformers import AutoTokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained("./my_tokenizer")

训练后的效果验证

def evaluate_tokenizer(tokenizer, test_texts):
    """Measure the average number of characters per token over ``test_texts``.

    Args:
        tokenizer: Any object with an ``encode(text)`` method. The result may
            be an object exposing the token ids as ``.ids`` or a plain
            sequence of token ids; both are accepted.
        test_texts: Iterable of strings to encode.

    Returns:
        Total character count divided by total token count, or ``0.0`` when
        no tokens were produced (for example, when ``test_texts`` is empty).
    """
    total_tokens = 0
    total_chars = 0

    for text in test_texts:
        encoded = tokenizer.encode(text)
        # Use `.ids` when the result has it; otherwise treat the result itself
        # as the sequence of token ids.
        token_ids = getattr(encoded, "ids", encoded)
        total_tokens += len(token_ids)
        total_chars += len(text)

    # Guard against division by zero when nothing was tokenized.
    compression_ratio = total_chars / total_tokens if total_tokens else 0.0
    print(f"平均压缩率: {compression_ratio:.2f} 字符/token")

    return compression_ratio

# Sample sentences for a quick check of the tokenizer.
test_texts = [
    "机器学习是人工智能的一个分支",
    "深度学习在计算机视觉中表现出色",
    "自然语言处理处理文本数据"
]

# Example call, left commented out so the snippet does not require a trained
# tokenizer to be present:
# evaluate_tokenizer(loaded_tokenizer, test_texts)

训练数据准备

# Corpus preparation
def prepare_corpus(texts, output_file):
    """Write every text to ``output_file`` as one whitespace-normalized line.

    Each text is split on whitespace and re-joined with single spaces, which
    removes leading/trailing whitespace and collapses internal runs (including
    newlines). The file is created or overwritten, encoded as UTF-8.

    Args:
        texts: Iterable of strings.
        output_file: Path of the file to write.
    """
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.writelines(" ".join(raw.split()) + "\n" for raw in texts)

# Merge corpora from several sources into a single training file.
corpus_files = ["data/book1.txt", "data/article1.txt", "data/wiki.txt"]
with open("merged_corpus.txt", "w", encoding="utf-8") as outfile:
    for corpus_file in corpus_files:
        with open(corpus_file, "r", encoding="utf-8") as infile:
            # Copy line by line so a large corpus file is never held in
            # memory all at once.
            last_line = ""
            for line in infile:
                outfile.write(line)
                last_line = line
            # If a file does not end with a newline, add one. Otherwise its
            # last line would be fused with the first line of the next file.
            if last_line and not last_line.endswith("\n"):
                outfile.write("\n")

常见问题

# Q: 词汇表大小如何选择?
# A: 一般30K-50K,中文可适当减小到20K

# Q: 如何处理罕见词?
# A: 使用UNK token,或增加训练数据覆盖

# Q: 训练需要多长时间?
# A: 取决于语料大小,通常几分钟到几小时

总结

训练自定义Tokenizer可以更好地适应特定领域和语言的需求。通过tokenizers库和Transformers的集成,可以方便地创建和使用自定义分词器。