Transformers库完全指南
---
title: "Transformers库完全指南"
description: "深入介绍HuggingFace Transformers库的使用方法和高级特性"
tags: ["Transformers", "HuggingFace", "模型加载", "推理"]
category: "llm"
icon: "🧠"
---
Transformers库完全指南
Transformers库概述
Transformers是HuggingFace的核心库,提供了数千种预训练模型的统一接口,支持PyTorch、TensorFlow和JAX。
模型加载与使用
Auto类自动加载
from transformers import AutoModel, AutoTokenizer, AutoConfig
# Let the Auto* factories resolve the concrete classes from the checkpoint name.
model_name = "bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Report the architecture family and the parameter count in millions.
print(f"模型类型: {config.model_type}")
param_count = sum(p.numel() for p in model.parameters())
print(f"参数量: {param_count / 1e6:.1f}M")
不同任务的模型类
from transformers import (
AutoModelForCausalLM, # 文本生成
AutoModelForSequenceClassification, # 文本分类
AutoModelForTokenClassification, # 命名实体识别
AutoModelForQuestionAnswering, # 问答
AutoModelForMaskedLM, # 掩码语言模型
AutoModelForSeq2SeqLM # 序列到序列
)
# Map each task to the name of the Auto class that loads a model with the
# matching head. Every task-specific class imported above has an entry,
# including the masked-LM and sequence-to-sequence heads.
task_models = {
    "文本生成": "AutoModelForCausalLM",
    "文本分类": "AutoModelForSequenceClassification",
    "命名实体识别": "AutoModelForTokenClassification",
    "问答": "AutoModelForQuestionAnswering",
    "掩码语言模型": "AutoModelForMaskedLM",
    "序列到序列": "AutoModelForSeq2SeqLM",
}

# Print the task -> class table, one entry per line.
for task, model_class in task_models.items():
    print(f"{task}: {model_class}")
Pipeline API
基础Pipeline
from transformers import pipeline
# Text classification: no model is named, so the pipeline picks its default
# checkpoint for the task.
classifier = pipeline("sentiment-analysis")
result = classifier("I love this product!")
print(result)

# Text generation with GPT-2; the result is a list, so take the first entry.
generator = pipeline("text-generation", model="gpt2")
story_prompt = "Once upon a time"
result = generator(story_prompt, max_length=50)
print(result[0]["generated_text"])

# Question answering: the answer is looked up in the supplied context.
qa = pipeline("question-answering")
qa_question = "What is Python?"
qa_context = "Python is a programming language."
result = qa(question=qa_question, context=qa_context)
print(f"答案: {result['answer']}, 置信度: {result['score']:.2f}")
高级Pipeline配置
from transformers import TextStreamer

# Use a specific checkpoint instead of the task default, and run it on a GPU.
classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=0,  # GPU device index
)

# Batch processing: pass a list of texts and get one result per input.
results = classifier([
    "Great product!",
    "Terrible experience.",
    "It's okay.",
])

# Streaming output: pipelines have no `stream=True` option. Extra keyword
# arguments of a text-generation pipeline call are forwarded to
# `model.generate`, so a TextStreamer is attached instead; it prints the
# decoded text to stdout as tokens are produced.
generator = pipeline("text-generation", model="gpt2", device=0)
streamer = TextStreamer(generator.tokenizer)
result = generator("Hello", max_length=20, streamer=streamer)
分词器详解
编码与解码
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode a single sentence and show both the token strings and their ids.
encoded = tokenizer("Hello, how are you?")
token_ids = encoded["input_ids"]
print(f"Tokens: {tokenizer.convert_ids_to_tokens(token_ids)}")
print(f"Token IDs: {token_ids}")

# Decode the ids back into text.
decoded = tokenizer.decode(token_ids)
print(f"Decoded: {decoded}")

# Encode a batch, padding and truncating with a cap of 10 tokens.
sentences = ["Hello world", "How are you?"]
batch = tokenizer(sentences, padding=True, truncation=True, max_length=10)
batch_ids = batch["input_ids"]
print(f"Batch shapes: {len(batch_ids)} x {len(batch_ids[0])}")
特殊Token处理
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Register padding / begin-of-sequence / end-of-sequence markers as special
# tokens.
# NOTE(review): if any of these are new vocabulary entries, a model paired with
# this tokenizer presumably needs
# `model.resize_token_embeddings(len(tokenizer))` -- confirm before training.
special_tokens = dict(
    pad_token="[PAD]",
    bos_token="<bos>",
    eos_token="<eos>",
)
tokenizer.add_special_tokens(special_tokens)

# Encode with special tokens enabled and return PyTorch tensors.
text = "Hello, world!"
encoded = tokenizer(text, add_special_tokens=True, return_tensors="pt")
模型推理
基础推理
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare the prompt as PyTorch tensors.
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a continuation without tracking gradients.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        # temperature / top_p only take effect when sampling is enabled;
        # without do_sample=True generation is greedy and they are ignored.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        # GPT-2 defines no pad token, so reuse EOS explicitly.
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
生成参数
# The most commonly tuned generation options, collected in one place.
generation_options = dict(
    max_length=100,          # maximum length of the generated sequence
    min_length=10,           # minimum length of the generated sequence
    temperature=0.7,         # sampling temperature
    top_k=50,                # top-k sampling
    top_p=0.9,               # nucleus sampling
    repetition_penalty=1.2,  # penalty applied to repeated tokens
    do_sample=True,          # enable sampling
    num_beams=5,             # beam search width
    early_stopping=True,     # stop early
)
outputs = model.generate(**inputs, **generation_options)
微调训练
Trainer API
from transformers import (
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
from datasets import load_dataset
# Load the "sst2" configuration of the "glue" dataset.
dataset = load_dataset("glue", "sst2")

# Load BERT with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess(examples):
    """Tokenize the ``sentence`` field, padded/truncated to 128 tokens."""
    sentences = examples["sentence"]
    return tokenizer(
        sentences,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Apply the tokenization to every split, in batches.
tokenized_dataset = dataset.map(preprocess, batched=True)
# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    # `evaluation_strategy` is the deprecated spelling; current releases of
    # transformers name this argument `eval_strategy`. On a release that
    # predates the rename, use the old name instead.
    eval_strategy="steps",
    eval_steps=500,
    # Kept at a multiple of eval_steps so that every checkpoint coincides
    # with an evaluation, which load_best_model_at_end relies on.
    save_steps=1000,
    load_best_model_at_end=True,
)

# Wire the model, arguments and tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Uncomment to start fine-tuning:
# trainer.train()
模型保存与导出
# Save the model and its tokenizer into the same directory.
save_dir = "./my_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Load the custom model back from that local directory.
model = AutoModelForCausalLM.from_pretrained(save_dir)

# Export to ONNX (requires: pip install optimum[onnxruntime]).
from optimum.onnxruntime import ORTModelForCausalLM

onnx_dir = "./onnx_model"
ort_model = ORTModelForCausalLM.from_pretrained(model_name, export=True)
ort_model.save_pretrained(onnx_dir)
多GPU与分布式
from transformers import AutoModelForCausalLM
import torch
# Distribute the model across multiple GPUs automatically, loading in fp16.
llama_checkpoint = "meta-llama/Llama-2-70b-hf"
model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Specify the device manually: place the model on the first GPU.
first_gpu_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(
    "model-name",
    device_map=first_gpu_map,
)
最佳实践
# 1. Use device_map="auto" to manage device placement automatically.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# 2. Use torch_dtype to load in half precision and save GPU memory.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# 3. Give the tokenizer a pad token (reusing EOS) so padding works.
tokenizer.pad_token = tokenizer.eos_token

# 4. Use batched=True to speed up data processing. `map` returns a new dataset
#    rather than modifying `dataset` in place, so the result must be kept.
#    `preprocess` is the tokenization function defined in the Trainer example.
tokenized_dataset = dataset.map(preprocess, batched=True)
总结
Transformers库是使用预训练模型的标准工具,通过统一的API简化了模型加载、推理和微调的流程。掌握其核心功能可以高效地构建NLP应用。