LLM预训练:从零构建大模型
---
title: "LLM预训练:从零构建大模型"
description: "掌握LLM预训练的完整流程,包括数据准备、训练策略和评估方法"
tags: ["预训练", "Pretraining", "语言模型训练", "基础模型"]
category: "llm"
icon: "🧠"
---
LLM预训练:从零构建大模型
预训练概述
预训练(Pretraining)是构建大语言模型的第一阶段,通过在大规模无标注文本上进行自回归或掩码语言建模,学习语言的通用表示。预训练模型是后续微调和对齐的基础。
预训练的核心目标:
- 语言理解:学习语法、语义和世界知识
- 通用表示:为下游任务提供基础
- 知识注入:将大量知识编码到模型中
- 规模定律:遵循Chinchilla scaling laws
训练目标
因果语言模型(Causal LM)
import torch
import torch.nn as nn
class CausalLanguageModel(nn.Module):
    """GPT-style language model trained with next-token prediction.

    The model is an embedding table, a ``TransformerEncoder`` stack and a
    linear projection back to the vocabulary.

    NOTE(review): ``TransformerEncoder`` is not defined in this snippet; it is
    called without a ``causal`` argument here, so it presumably applies causal
    masking by default - confirm against its definition.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = TransformerEncoder(hidden_size, num_layers, num_heads)
        self.head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, labels=None):
        """Return ``{"logits"}`` and, when ``labels`` is given, also ``"loss"``.

        Args:
            input_ids: Token ids fed to the embedding layer.
            labels: Optional target ids aligned with ``input_ids``; they are
                shifted by one position internally.
        """
        hidden_states = self.transformer(self.embedding(input_ids))
        logits = self.head(hidden_states)

        if labels is None:
            return {"logits": logits}

        # Position t predicts token t + 1: drop the last prediction and the
        # first label so both tensors line up.
        predictions = logits[..., :-1, :].contiguous()
        targets = labels[..., 1:].contiguous()
        vocab_dim = predictions.size(-1)
        loss = nn.functional.cross_entropy(
            predictions.view(-1, vocab_dim),
            targets.view(-1),
        )
        return {"loss": loss, "logits": logits}
掩码语言模型(Masked LM)
class MaskedLanguageModel(nn.Module):
    """BERT-style masked language model.

    The model is an embedding table, a ``TransformerEncoder`` stack invoked
    with ``causal=False`` and a linear projection back to the vocabulary.

    NOTE(review): ``TransformerEncoder`` is not defined in this snippet; its
    ``causal`` keyword is assumed to switch off causal masking - confirm.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = TransformerEncoder(hidden_size, num_layers, num_heads)
        self.head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, labels=None):
        """Return ``{"logits"}`` and, when ``labels`` is given, also ``"loss"``.

        Args:
            input_ids: Token ids fed to the embedding layer.
            labels: Optional target ids with the same leading shape as
                ``input_ids``; positions equal to ``-100`` are excluded from
                the loss.

        Returns:
            dict with ``"logits"`` and, if ``labels`` was passed, ``"loss"``:
            the mean cross-entropy over the supervised positions, or ``0`` if
            no position is supervised.
        """
        x = self.embedding(input_ids)
        x = self.transformer(x, causal=False)
        logits = self.head(x)

        if labels is None:
            return {"logits": logits}

        mask = labels != -100
        if mask.any():
            # Boolean indexing already yields (num_supervised, vocab_size).
            loss = nn.functional.cross_entropy(logits[mask], labels[mask])
        else:
            # A mean over an empty selection is NaN and would poison the
            # optimizer state; return a zero that is still attached to the
            # graph so backward() keeps working.
            loss = logits.sum() * 0.0
        return {"loss": loss, "logits": logits}
数据准备
数据格式
# Shape of a single pretraining record: the raw text plus provenance metadata.
pretraining_data = {
"text": "长文本内容...",
"metadata": {
"source": "wikipedia",
"language": "en",
"timestamp": "2024-01-01"
}
}
# Tokenization: load the tokenizer registered under the hub id below.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Every sequence is truncated to at most 2048 tokens and padded up to that
    same length, so all outputs share one fixed size.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 2048,
        "padding": "max_length",
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)
数据配比
# Sampling weight per data source; the five weights sum to 1.0.
data_mixture = {
"web": 0.6, # web pages
"books": 0.15, # books
"code": 0.1, # source code
"wikipedia": 0.1, # Wikipedia
"academic": 0.05 # academic papers
}
# Mixture-weighted sampling
def sample_with_mixture(dataset, mixture_weights, num_samples):
    """Draw examples from each source in proportion to its mixture weight.

    Args:
        dataset: Mapping from source name to an object exposing ``.shuffle()``
            and ``.select(indices)`` (presumably a Hugging Face ``Dataset`` -
            confirm against the caller).
        mixture_weights: Mapping from source name to its share of the sample.
        num_samples: Target size of the combined sample.

    Returns:
        A list with each source's examples appended in the iteration order of
        ``mixture_weights``. Each quota is truncated with ``int()``, so the
        total can be smaller than ``num_samples``.
    """
    picked = []
    for name in mixture_weights:
        quota = int(num_samples * mixture_weights[name])
        shuffled = dataset[name].shuffle()
        picked += shuffled.select(range(quota))
    return picked
训练配置
模型配置
# LLaMA-7B pretraining configuration (architecture hyperparameters).
# NOTE(review): max_position_embeddings is 4096 while the training config in
# this document uses max_seq_length 2048 - confirm the intended context length.
model_config = {
"vocab_size": 32000,
"hidden_size": 4096,
"intermediate_size": 11008,
"num_hidden_layers": 32,
"num_attention_heads": 32,
"num_key_value_heads": 32,
"max_position_embeddings": 4096,
"rope_theta": 10000.0,
"rms_norm_eps": 1e-6,
"tie_word_embeddings": False
}
训练配置
# Pretraining hyperparameters
training_config = {
# Model
"model_name": "llama-7b",
"vocab_size": 32000,
# Optimizer
"optimizer": "adamw",
"learning_rate": 3e-4,
"min_lr": 3e-5,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
# Learning-rate schedule
"schedule": "cosine",
"warmup_steps": 2000,
"total_steps": 300000,
# Batch size
# NOTE(review): 4 * 8 = 32 sequences per replica per optimizer step, so a
# global batch of 1024 presumably assumes 32 data-parallel replicas - confirm
# against the launch topology.
"micro_batch_size": 4,
"gradient_accumulation_steps": 8,
"global_batch_size": 1024,
# Precision and gradient clipping
"fp16": True,
"grad_clip": 1.0,
# Sequence length
"max_seq_length": 2048
}
使用Megatron-LM训练
# Megatron-LM pretraining launch: 16 nodes x 8 processes per node.
# --lr-warmup-iters mirrors the 2000 warmup steps listed in the training config.
torchrun --nproc_per_node=8 \
    --nnodes=16 \
    pretrain_gpt.py \
    --num-layers 32 \
    --hidden-size 4096 \
    --num-attention-heads 32 \
    --seq-length 2048 \
    --max-position-embeddings 4096 \
    --micro-batch-size 4 \
    --global-batch-size 1024 \
    --train-iters 300000 \
    --lr 3e-4 \
    --min-lr 3e-5 \
    --lr-decay-style cosine \
    --lr-warmup-iters 2000 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --clip-grad 1.0 \
    --fp16 \
    --tensor-model-parallel-size 2 \
    --pipeline-model-parallel-size 4
评估方法
def evaluate_pretraining(model, tokenizer, eval_dataset):
    """Compute the mean evaluation loss and the corresponding perplexity.

    Args:
        model: Module invoked as ``model(**batch)``. Its output must expose
            the loss either as ``outputs["loss"]`` (the plain dicts returned
            by the model classes in this document) or as ``outputs.loss``.
        tokenizer: Unused; kept so existing call sites keep working.
        eval_dataset: Iterable of batches, each a mapping of keyword
            arguments for ``model``.

    Returns:
        dict with ``"loss"`` (unweighted mean of the per-batch losses) and
        ``"perplexity"`` (``exp`` of that mean).

    Raises:
        ValueError: If ``eval_dataset`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in eval_dataset:
            outputs = model(**batch)
            # Dict-style outputs have no ``.loss`` attribute; objects that only
            # offer attribute access still take the second branch.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs.loss
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("eval_dataset is empty; cannot compute loss or perplexity")

    avg_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(avg_loss))
    return {
        "loss": avg_loss,
        "perplexity": perplexity.item(),
    }
Scaling Laws
# Chinchilla scaling laws
def chinchilla_optimal(model_size_gb, tokens_per_param=20):
    """Derive the parameter count and training-token budget from checkpoint size.

    Args:
        model_size_gb: Checkpoint size in GB, assuming FP16 storage
            (2 bytes per parameter).
        tokens_per_param: Training tokens budgeted per parameter.

    Returns:
        dict with ``"model_params_b"`` (parameters, in billions) and
        ``"optimal_tokens_t"`` (training tokens, in trillions).
    """
    # Two bytes per FP16 weight, so halve the byte count to get parameters.
    param_count = model_size_gb * 1e9 / 2
    token_budget = param_count * tokens_per_param
    return dict(
        model_params_b=param_count / 1e9,
        optimal_tokens_t=token_budget / 1e12,
    )
# Examples: the argument is the FP16 checkpoint size in GB (2 bytes per
# parameter), so a 7B-parameter model is 14 GB, not 0.014.
configs = [
    chinchilla_optimal(14),   # 7B
    chinchilla_optimal(26),   # 13B
    chinchilla_optimal(140),  # 70B
]
for config in configs:
    print(f"模型: {config['model_params_b']:.1f}B, 最优tokens: {config['optimal_tokens_t']:.1f}T")
常见问题
训练不稳定
# Remedies keyed by the training-instability symptom they address.
solutions = {
"Loss spike": "降低学习率,增加warmup",
"梯度爆炸": "使用梯度裁剪",
"NaN loss": "检查数据质量,使用BF16",
"收敛慢": "调整学习率和批次大小"
}
资源优化
# Resource estimation
def estimate_training_resources(model_size_b, tokens_t):
    """Give a back-of-the-envelope estimate of the compute a run needs.

    Args:
        model_size_b: Model size in billions of parameters.
        tokens_t: Training tokens in trillions.

    Returns:
        dict with the estimated GPU-hours (``"GPU小时"``), a suggested GPU
        count (``"A100数量"``, at least 1) and the resulting wall-clock days
        (``"训练天数"``).

    NOTE(review): the ``1000 / 8`` factor is an unexplained heuristic; verify
    it against a FLOPs-based estimate before using it for planning.
    """
    # GPU-hours (A100 80GB), rough heuristic.
    gpu_hours = model_size_b * tokens_t * 1000 / 8

    # One GPU per 1000 GPU-hours, clamped to at least one: small runs
    # (under 1000 GPU-hours) used to truncate to 0 and divide by zero below.
    num_gpus = max(1, int(gpu_hours / 1000))

    return {
        "GPU小时": gpu_hours,
        "A100数量": num_gpus,
        "训练天数": gpu_hours / (num_gpus * 24),
    }
最佳实践
- 数据质量:高质量数据是预训练成功的关键
- 规模定律:遵循Chinchilla scaling laws
- 训练稳定性:使用warmup和梯度裁剪
- 评估监控:定期评估验证集损失
- 检查点保存:定期保存训练检查点
预训练是构建大语言模型的基础,需要大量计算资源和精心的工程实践。