← 返回首页
🧠

Transformers库完全指南

📂 llm ⏱ 3 min 444 words

---
title: "Transformers库完全指南"
description: "深入介绍HuggingFace Transformers库的使用方法和高级特性"
tags: ["Transformers", "HuggingFace", "模型加载", "推理"]
category: "llm"
icon: "🧠"
---

Transformers库完全指南

Transformers库概述

Transformers是HuggingFace的核心库,提供了数千种预训练模型的统一接口,支持PyTorch、TensorFlow和JAX。

模型加载与使用

Auto类自动加载

from transformers import AutoModel, AutoTokenizer, AutoConfig

# The Auto* classes pick the correct model class automatically from the
# checkpoint name.
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

# Total number of elements across all parameter tensors of the model.
param_count = 0
for param in model.parameters():
    param_count += param.numel()

print(f"模型类型: {config.model_type}")
print(f"参数量: {param_count / 1e6:.1f}M")

不同任务的模型类

from transformers import (
    AutoModelForCausalLM,           # 文本生成
    AutoModelForSequenceClassification,  # 文本分类
    AutoModelForTokenClassification,     # 命名实体识别
    AutoModelForQuestionAnswering,       # 问答
    AutoModelForMaskedLM,                # 掩码语言模型
    AutoModelForSeq2SeqLM               # 序列到序列
)

# Pick the Auto class that matches the task. The table lists every task class
# imported above; the values are class names (plain strings) used only for
# display.
task_models = {
    "文本生成": "AutoModelForCausalLM",
    "文本分类": "AutoModelForSequenceClassification",
    "命名实体识别": "AutoModelForTokenClassification",
    "问答": "AutoModelForQuestionAnswering",
    "掩码语言模型": "AutoModelForMaskedLM",
    "序列到序列": "AutoModelForSeq2SeqLM",
}

for task, model_class in task_models.items():
    print(f"{task}: {model_class}")

Pipeline API

基础Pipeline

from transformers import pipeline

# Text classification (no model argument is passed to pipeline() here).
classifier = pipeline("sentiment-analysis")
result = classifier("I love this product!")
print(result)

# Text generation with the "gpt2" checkpoint; the first returned candidate's
# "generated_text" entry is printed.
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_length=50)
print(result[0]["generated_text"])

# Question answering over the supplied context; the result is read through
# its "answer" and "score" entries.
qa = pipeline("question-answering")
result = qa(question="What is Python?", context="Python is a programming language.")
print(f"答案: {result['answer']}, 置信度: {result['score']:.2f}")

高级Pipeline配置

# Use a different model by naming the checkpoint explicitly.
classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=0  # GPU device ID
)

# Batch processing: a list of texts is passed in a single call.
results = classifier([
    "Great product!",
    "Terrible experience.",
    "It's okay."
])

# Streaming output. Pipeline calls have no `stream` argument; incremental
# output is produced by handing generate() a streamer object. The
# text-generation pipeline forwards extra keyword arguments to generate(),
# and TextStreamer prints the decoded text to stdout as tokens arrive.
from transformers import TextStreamer

generator = pipeline("text-generation", model="gpt2", device=0)
streamer = TextStreamer(generator.tokenizer)
generator("Hello", max_length=20, streamer=streamer)

分词器详解

编码与解码

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode a single sentence, then show the tokens and the ids they came from.
encoded = tokenizer("Hello, how are you?")
token_ids = encoded["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")

# Decode the ids back into text.
decoded = tokenizer.decode(token_ids)
print(f"Decoded: {decoded}")

# Encode a batch with padding and truncation enabled and max_length=10.
sentences = ["Hello world", "How are you?"]
batch = tokenizer(sentences, padding=True, truncation=True, max_length=10)
batch_ids = batch["input_ids"]
print(f"Batch shapes: {len(batch_ids)} x {len(batch_ids[0])}")

特殊Token处理

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Special tokens to register on the tokenizer.
special_tokens = {
    "pad_token": "[PAD]",
    "bos_token": "<bos>",
    "eos_token": "<eos>"
}

# NOTE(review): this call changes the tokenizer only. If any of these strings
# are new to the vocabulary, a model used with this tokenizer presumably needs
# its embeddings resized to len(tokenizer) — confirm against the library docs.
tokenizer.add_special_tokens(special_tokens)

# Encode a text with add_special_tokens=True, returning PyTorch tensors.
text = "Hello, world!"
encoded = tokenizer(
    text,
    add_special_tokens=True,
    return_tensors="pt"
)

模型推理

基础推理

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare the input as PyTorch tensors.
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate. temperature and top_p only take effect when sampling is enabled;
# under the default greedy decoding they are ignored, so do_sample=True is
# set explicitly. GPT-2 defines no padding token, so the end-of-sequence id
# is passed as pad_token_id.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

生成参数

# Commonly tuned generation options, collected in one mapping and unpacked
# into the generate() call.
generation_options = {
    "max_length": 100,          # maximum generated length
    "min_length": 10,           # minimum generated length
    "temperature": 0.7,         # temperature
    "top_k": 50,                # top-k sampling
    "top_p": 0.9,               # nucleus sampling
    "repetition_penalty": 1.2,  # repetition penalty
    "do_sample": True,          # enable sampling
    "num_beams": 5,             # beam search
    "early_stopping": True,     # stop early
}
outputs = model.generate(**inputs, **generation_options)

微调训练

Trainer API

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset.
dataset = load_dataset("glue", "sst2")

# Load the model with num_labels=2 for the classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

# Tokenizer used for preprocessing. AutoTokenizer is imported together with
# the other names above so this section does not rely on an import made by
# an earlier snippet.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Sequences are padded to, and truncated at, a fixed length of 128.
    """
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)


tokenized_dataset = dataset.map(preprocess, batched=True)

# Training arguments.
# NOTE: the evaluation schedule is set with `eval_strategy`. The older
# spelling `evaluation_strategy` was deprecated and later removed, so it
# fails on current transformers releases (releases that predate the rename
# accept only the old spelling).
# load_best_model_at_end needs checkpoints to line up with evaluations:
# save_steps (1000) is a multiple of eval_steps (500).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True
)

# Training: build the Trainer from the model, the training arguments and the
# tokenized "train" / "validation" splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

# The training call itself is left commented out in this example.
# trainer.train()

模型保存与导出

# Save the model and the tokenizer into the same directory.
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

# Load the custom model back from that directory.
# NOTE(review): the reload uses AutoModelForCausalLM, so it assumes the model
# saved above is a causal LM — confirm which `model` is in scope at this point.
model = AutoModelForCausalLM.from_pretrained("./my_model")

# Export to ONNX
# pip install optimum
from optimum.onnxruntime import ORTModelForCausalLM

# The exported checkpoint is the one named by model_name, not ./my_model.
ort_model = ORTModelForCausalLM.from_pretrained(model_name, export=True)
ort_model.save_pretrained("./onnx_model")

多GPU与分布式

from transformers import AutoModelForCausalLM
import torch

# Automatically distribute the model across multiple GPUs; weights are
# requested as torch.float16.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    device_map="auto",
    torch_dtype=torch.float16
)

# Manually specify the device.
model = AutoModelForCausalLM.from_pretrained(
    "model-name",
    device_map={"": 0}  # place on the first GPU
)

最佳实践

# 1. Use device_map="auto" to manage device placement automatically
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# 2. Use torch_dtype to save GPU memory
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# 3. Give the tokenizer a padding token by reusing its end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# 4. Use batched=True to speed up data processing. map() returns a new
#    dataset instead of modifying the original in place, so the result must
#    be kept.
dataset = dataset.map(process_fn, batched=True)

总结

Transformers库是使用预训练模型的标准工具,通过统一的API简化了模型加载、推理和微调的流程。掌握其核心功能可以高效地构建NLP应用。