← 返回首页
🧠

LLM预训练:从零构建大模型

📂 llm ⏱ 3 min 547 words

---
title: "LLM预训练:从零构建大模型"
description: "掌握LLM预训练的完整流程,包括数据准备、训练策略和评估方法"
tags: ["预训练", "Pretraining", "语言模型训练", "基础模型"]
category: "llm"
icon: "🧠"
---

LLM预训练:从零构建大模型

预训练概述

预训练(Pretraining)是构建大语言模型的第一阶段,通过在大规模无标注文本上进行自回归或掩码语言建模,学习语言的通用表示。预训练模型是后续微调和对齐的基础。

预训练的核心目标:让模型从大规模无标注文本中学到通用的语言表示,为后续的微调和对齐提供基础。

训练目标

因果语言模型(Causal LM)

import torch
import torch.nn as nn

class CausalLanguageModel(nn.Module):
    """GPT-style language model trained with a next-token prediction loss.

    The model is an embedding table, a ``TransformerEncoder`` backbone and a
    linear projection back to the vocabulary. ``TransformerEncoder`` is not
    defined in this snippet; it is presumably a causally masked stack when
    called with its defaults -- TODO confirm against its definition.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = TransformerEncoder(hidden_size, num_layers, num_heads)
        self.head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, labels=None):
        """Return ``{"logits": ...}`` and, when ``labels`` is given, ``"loss"`` too.

        Args:
            input_ids: Token ids fed to the embedding table.
            labels: Optional target ids aligned with ``input_ids``; the shift
                by one position is done here, so callers pass unshifted ids.
        """
        hidden_states = self.transformer(self.embedding(input_ids))
        logits = self.head(hidden_states)

        if labels is None:
            return {"logits": logits}

        # The logits at position t are scored against the token at t + 1, so
        # drop the last prediction and the first label before flattening.
        predictions = logits[..., :-1, :].contiguous()
        targets = labels[..., 1:].contiguous()
        vocab_dim = predictions.size(-1)
        loss = nn.functional.cross_entropy(
            predictions.view(-1, vocab_dim),
            targets.view(-1),
        )
        return {"loss": loss, "logits": logits}

掩码语言模型(Masked LM)

class MaskedLanguageModel(nn.Module):
    """BERT-style language model trained with a masked-token prediction loss.

    The model is an embedding table, a ``TransformerEncoder`` backbone called
    with ``causal=False`` and a linear projection back to the vocabulary.
    ``TransformerEncoder`` is not defined in this snippet.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.transformer = TransformerEncoder(hidden_size, num_layers, num_heads)
        self.head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, labels=None):
        """Return ``{"logits": ...}`` and, when ``labels`` is given, ``"loss"`` too.

        Args:
            input_ids: Token ids fed to the embedding table.
            labels: Optional target ids with the same leading shape as
                ``input_ids``; positions equal to ``-100`` are excluded from
                the loss.

        Returns:
            A dict with ``"logits"``; with labels it also contains ``"loss"``,
            the mean cross-entropy over the positions whose label is not
            ``-100`` (zero when there is no such position).
        """
        # Embed the tokens and encode them bidirectionally.
        x = self.embedding(input_ids)
        x = self.transformer(x, causal=False)
        logits = self.head(x)

        if labels is None:
            return {"logits": logits}

        # Only the masked positions (label != -100) contribute to the loss.
        mask = labels != -100
        if mask.any():
            loss = nn.functional.cross_entropy(logits[mask], labels[mask])
        else:
            # A mean over zero targets is NaN and would poison the optimizer
            # step; return a zero that is still connected to the graph so
            # backward() keeps working for such a batch.
            loss = logits.sum() * 0.0
        return {"loss": loss, "logits": logits}

数据准备

数据格式

# Layout of one pretraining record: the raw text plus provenance metadata.
pretraining_data = dict(
    text="长文本内容...",
    metadata=dict(
        source="wikipedia",
        language="en",
        timestamp="2024-01-01",
    ),
)

# Tokenization
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# The Llama-2 tokenizer ships without a padding token, so the
# padding="max_length" request below would raise. Reuse EOS as the pad token
# when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    """Tokenize a batch of raw examples into fixed-length model inputs.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        The tokenizer output for ``examples["text"]``, truncated and padded
        to 2048 tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=2048,
        padding="max_length",
    )

数据配比

# Share of the training mix drawn from each data domain.
data_mixture = dict(
    web=0.6,         # web pages
    books=0.15,      # books
    code=0.1,        # source code
    wikipedia=0.1,   # Wikipedia
    academic=0.05,   # academic papers
)

# Mixture sampling strategy
def sample_with_mixture(dataset, mixture_weights, num_samples):
    """Draw examples from several sources in proportion to mixture weights.

    Args:
        dataset: Mapping from source name to an object exposing ``shuffle()``
            and ``select(indices)`` (the two calls made here).
        mixture_weights: Mapping from source name to its share of the sample.
        num_samples: Target total number of examples.

    Returns:
        A list of examples, grouped by source in the iteration order of
        ``mixture_weights``. Each source contributes
        ``round(num_samples * weight)`` examples, so the total can differ
        from ``num_samples`` by rounding.
    """
    sampled_data = []

    for source, weight in mixture_weights.items():
        # round() instead of int(): a product such as 100 * 0.29 evaluates to
        # 28.999999999999996, and truncation would silently drop an example.
        source_samples = round(num_samples * weight)
        source_data = dataset[source].shuffle().select(range(source_samples))
        sampled_data.extend(source_data)

    return sampled_data

训练配置

模型配置

# LLaMA-7B pretraining model configuration
model_config = dict(
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=32,
    max_position_embeddings=4096,
    rope_theta=10000.0,
    rms_norm_eps=1e-6,
    tie_word_embeddings=False,
)

训练配置

# Pretraining hyperparameters
training_config = dict(
    # Model
    model_name="llama-7b",
    vocab_size=32000,

    # Optimizer
    optimizer="adamw",
    learning_rate=3e-4,
    min_lr=3e-5,
    weight_decay=0.1,
    beta1=0.9,
    beta2=0.95,

    # Learning-rate schedule
    schedule="cosine",
    warmup_steps=2000,
    total_steps=300000,

    # Batch size
    micro_batch_size=4,
    gradient_accumulation_steps=8,
    global_batch_size=1024,

    # Precision and gradient clipping
    fp16=True,
    grad_clip=1.0,

    # Sequence length
    max_seq_length=2048,
)

使用Megatron-LM训练

# Megatron-LM pretraining launch command.
# Starts pretrain_gpt.py with 8 processes per node on 16 nodes, using
# tensor-parallel size 2 and pipeline-parallel size 4.
# --lr-warmup-iters 2000 mirrors warmup_steps in the training config, so the
# launch actually applies the warmup that config specifies.
# NOTE(review): data/tokenizer paths and multi-node rendezvous options are not
# shown here -- confirm the full argument list before running.
torchrun --nproc_per_node=8 \
    --nnodes=16 \
    pretrain_gpt.py \
    --num-layers 32 \
    --hidden-size 4096 \
    --num-attention-heads 32 \
    --seq-length 2048 \
    --max-position-embeddings 4096 \
    --micro-batch-size 4 \
    --global-batch-size 1024 \
    --train-iters 300000 \
    --lr 3e-4 \
    --min-lr 3e-5 \
    --lr-decay-style cosine \
    --lr-warmup-iters 2000 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --clip-grad 1.0 \
    --fp16 \
    --tensor-model-parallel-size 2 \
    --pipeline-model-parallel-size 4

评估方法

def evaluate_pretraining(model, tokenizer, eval_dataset):
    """Compute the mean evaluation loss and the corresponding perplexity.

    Args:
        model: Object with an ``eval()`` method that is called as
            ``model(**batch)``. Its output must expose the loss either as
            ``outputs["loss"]`` (dict-like outputs) or as ``outputs.loss``.
        tokenizer: Not used by this function; kept so existing callers that
            pass it keep working.
        eval_dataset: Iterable of batches, each a mapping of keyword
            arguments for ``model``.

    Returns:
        A dict with ``"loss"`` (unweighted mean of the per-batch losses) and
        ``"perplexity"`` (``exp`` of that mean).

    Raises:
        ValueError: If ``eval_dataset`` yields no batches.
    """
    model.eval()

    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in eval_dataset:
            outputs = model(**batch)
            # Accept plain-dict outputs as well as objects that only expose
            # the loss as an attribute.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs.loss
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("eval_dataset is empty; cannot compute evaluation loss")

    avg_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(avg_loss))

    return {
        "loss": avg_loss,
        "perplexity": perplexity.item(),
    }

Scaling Laws

# Chinchilla scaling laws
def chinchilla_optimal(model_size_gb, tokens_per_param=20):
    """Estimate the Chinchilla-optimal token budget for a model.

    Args:
        model_size_gb: Size of the FP16 weights in gigabytes (2 bytes per
            parameter), e.g. 14 for a 7B-parameter model.
        tokens_per_param: Training tokens per parameter; defaults to 20.

    Returns:
        A dict with ``"model_params_b"`` (parameters, in billions) and
        ``"optimal_tokens_t"`` (training tokens, in trillions).
    """
    # Parameter count: FP16 stores each parameter in 2 bytes.
    model_params = model_size_gb * 1e9 / 2

    # Token budget at the requested tokens-per-parameter ratio.
    optimal_tokens = model_params * tokens_per_param

    return {
        "model_params_b": model_params / 1e9,
        "optimal_tokens_t": optimal_tokens / 1e12
    }


# Examples. The argument is the FP16 checkpoint size in GB, i.e. twice the
# parameter count in billions.
configs = [
    chinchilla_optimal(14),   # 7B
    chinchilla_optimal(26),   # 13B
    chinchilla_optimal(140),  # 70B
]

for config in configs:
    print(f"模型: {config['model_params_b']:.1f}B, 最优tokens: {config['optimal_tokens_t']:.1f}T")

常见问题

训练不稳定

# Training-instability symptoms mapped to their suggested remedies.
solutions = dict([
    ("Loss spike", "降低学习率,增加warmup"),
    ("梯度爆炸", "使用梯度裁剪"),
    ("NaN loss", "检查数据质量,使用BF16"),
    ("收敛慢", "调整学习率和批次大小"),
])

资源优化

# Resource estimation
def estimate_training_resources(model_size_b, tokens_t):
    """Give a rough estimate of the GPU resources needed for a training run.

    Args:
        model_size_b: Model size in billions of parameters.
        tokens_t: Number of training tokens in trillions.

    Returns:
        A dict with the estimated GPU-hours (``"GPU小时"``), the GPU count
        (``"A100数量"``, one GPU per 1000 GPU-hours, at least 1) and the
        resulting training time in days (``"训练天数"``).
    """
    # GPU-hour estimate (A100 80GB); a coarse rule of thumb.
    # NOTE(review): the 1000 / 8 factor is a heuristic carried over unchanged;
    # sanity-check it against a FLOPs-based estimate before relying on it.
    gpu_hours = model_size_b * tokens_t * 1000 / 8

    # Below 1000 GPU-hours the truncated count would be 0 and the days
    # computation would divide by zero, so always assume at least one GPU.
    num_gpus = max(1, int(gpu_hours / 1000))

    return {
        "GPU小时": gpu_hours,
        "A100数量": num_gpus,
        "训练天数": gpu_hours / (num_gpus * 24)
    }

最佳实践

  1. 数据质量:高质量数据是预训练成功的关键
  2. 规模定律:遵循Chinchilla scaling laws
  3. 训练稳定性:使用warmup和梯度裁剪
  4. 评估监控:定期评估验证集损失
  5. 检查点保存:定期保存训练检查点

预训练是构建大语言模型的基础,需要大量计算资源和精心的工程实践。