← 返回首页
🧠

学习率调度:训练收敛的关键

📂 llm ⏱ 3 min 465 words

---
title: "学习率调度:训练收敛的关键"
description: "掌握学习率调度的原理和实现,优化LLM训练过程"
tags: ["学习率调度", "LR Scheduler", "训练优化", "收敛"]
category: "llm"
icon: "🧠"
---

学习率调度:训练收敛的关键

学习率调度简介

学习率调度(Learning Rate Scheduling)是训练过程中动态调整学习率的技术。合适的学习率调度策略可以加速收敛、避免局部最优、提高模型性能。

学习率调度的核心价值:加速收敛、避免陷入较差的局部最优、提高最终模型性能。

常用调度策略

Warmup + Cosine Decay

import torch
from torch.optim.lr_scheduler import LambdaLR

def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Build a LambdaLR with linear warmup followed by cosine decay.

    The multiplicative factor handed to ``LambdaLR`` rises linearly from 0 to
    1 over ``num_warmup_steps`` steps and then follows half a cosine wave
    from 1 down to 0 at ``num_training_steps``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of steps spent in the linear warmup.
        num_training_steps: Total number of scheduler steps, warmup included.

    Returns:
        A ``LambdaLR`` instance wrapping ``optimizer``.
    """
    # Function-scope import: the cosine branch needs ``math``, which the
    # snippet's top-level imports (torch, LambdaLR) do not provide.
    import math

    def lr_lambda(current_step):
        # Warmup phase: factor grows linearly; max(1, ...) guards against
        # division by zero when num_warmup_steps is 0.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))

        # Cosine decay phase: progress runs from 0 to 1 over the steps that
        # remain after warmup.
        progress = float(current_step - num_warmup_steps) / float(
            max(1, num_training_steps - num_warmup_steps)
        )
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return LambdaLR(optimizer, lr_lambda)

# Usage example
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
steps_per_epoch = len(dataloader)
num_training_steps = num_epochs * steps_per_epoch
num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Training loop: iterate over every epoch so the number of scheduler.step()
# calls matches num_training_steps (num_epochs * steps_per_epoch). A single
# pass over the dataloader would stop the schedule a third of the way in.
for epoch in range(num_epochs):
    for batch in dataloader:
        loss = model(batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # step the scheduler once per optimizer update
        optimizer.zero_grad()

Linear Decay

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Create a LambdaLR with linear warmup followed by linear decay.

    The factor passed to ``LambdaLR`` climbs from 0 to 1 during the first
    ``num_warmup_steps`` steps, then falls linearly and is floored at 0 once
    ``num_training_steps`` is reached.
    """
    def factor(step):
        if step >= num_warmup_steps:
            # Decay phase: fraction of post-warmup steps still remaining.
            remaining = float(num_training_steps - step)
            decay_span = float(max(1, num_training_steps - num_warmup_steps))
            return max(0.0, remaining / decay_span)
        # Warmup phase: fraction of the warmup already completed.
        return float(step) / float(max(1, num_warmup_steps))

    return LambdaLR(optimizer, factor)

Constant with Warmup

def get_constant_schedule_with_warmup(optimizer, num_warmup_steps):
    """Create a LambdaLR with linear warmup followed by a constant factor of 1.

    During the first ``num_warmup_steps`` steps the factor passed to
    ``LambdaLR`` grows linearly from 0; afterwards it stays at 1.0.
    """
    def factor(step):
        # Past the warmup window the factor no longer changes.
        if step >= num_warmup_steps:
            return 1.0
        return float(step) / float(max(1, num_warmup_steps))

    return LambdaLR(optimizer, factor)

多阶段调度

from torch.optim.lr_scheduler import MultiStepLR

# Multi-stage (step) decay: the learning rate starts at 2e-4 and is scaled by
# `gamma` each time the scheduler's step count reaches one of the milestones.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = MultiStepLR(
    optimizer,
    milestones=[3000, 6000, 9000],  # scheduler step counts at which the LR decays
    gamma=0.1  # multiplicative decay factor applied at each milestone
)

Hugging Face实现

from transformers import get_scheduler

# Create the scheduler through the Hugging Face helper
# NOTE(review): num_warmup_steps / num_training_steps are hard-coded here;
# confirm they match the real number of optimizer updates in the loop below.
scheduler = get_scheduler(
    name="cosine",  # scheduling strategy
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=10000
)

# Training loop: the scheduler is stepped once after every optimizer update
for batch in dataloader:
    loss = model(batch).loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

调度策略对比

# Choosing a scheduling strategy: each row is
# (name, advantage, drawback, typical use case); the comprehension maps the
# rows onto the per-strategy dictionaries.
strategies = {
    name: {"优点": pros, "缺点": cons, "适用": use_case}
    for name, pros, cons, use_case in [
        ("Cosine", "平滑衰减,效果好", "需要预设总步数", "大多数场景"),
        ("Linear", "简单,易于理解", "可能不如Cosine", "快速原型"),
        ("Constant", "简单", "后期学习率可能过大", "短训练"),
        ("MultiStep", "灵活控制衰减点", "需要手动设置", "已知最佳衰减点"),
    ]
}

学习率查找

def lr_finder(model, dataloader, optimizer, start_lr=1e-7, end_lr=10, num_steps=100):
    """Sweep the learning rate exponentially and record the loss at each step.

    The optimizer's learning rate is set to ``start_lr`` and multiplied by a
    constant factor after every batch, chosen so that it reaches ``end_lr``
    after ``num_steps`` scheduler steps. The sweep stops early when the loss
    becomes non-finite or exceeds four times the best loss seen so far.

    Note:
        ``model`` and ``optimizer`` are modified in place: real optimizer
        updates are applied and the optimizer's learning rate is left at the
        last swept value. Run this on a throwaway copy or reload the state
        afterwards.

    Args:
        model: Callable such that ``model(batch)`` returns an object exposing
            a scalar ``.loss`` tensor.
        dataloader: Iterable of batches; at most ``num_steps`` are consumed.
        optimizer: Optimizer whose parameter groups are swept.
        start_lr: Learning rate used for the first batch.
        end_lr: Learning rate the sweep grows towards.
        num_steps: Maximum number of batches to evaluate. A value <= 0
            returns two empty lists without touching the optimizer.

    Returns:
        A ``(lrs, losses)`` tuple of equal-length lists: the learning rate
        used for each evaluated batch and the loss measured on it.
    """
    import math

    lrs = []
    losses = []

    # Nothing to sweep; this also avoids a division by zero in the gamma
    # exponent below.
    if num_steps <= 0:
        return lrs, losses

    # ExponentialLR scales the optimizer's *current* learning rate, so the
    # sweep only starts at start_lr if it is written into the groups first.
    for group in optimizer.param_groups:
        group['lr'] = start_lr

    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=(end_lr / start_lr) ** (1 / num_steps),
    )

    best_loss = float('inf')

    for i, batch in enumerate(dataloader):
        if i >= num_steps:
            break

        # Forward pass; read the scalar once and reuse it.
        loss = model(batch).loss
        loss_value = loss.item()

        # Record the point, including the one that triggers the early stop.
        lrs.append(optimizer.param_groups[0]['lr'])
        losses.append(loss_value)

        # Divergence check happens before the update so that no step is
        # taken from a diverged or NaN/inf loss.
        if not math.isfinite(loss_value) or loss_value > best_loss * 4:
            break
        best_loss = min(best_loss, loss_value)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Move on to the next (larger) learning rate.
        lr_scheduler.step()

    return lrs, losses

# Usage: run the sweep with the default range and step count, then inspect
# the returned learning rates and losses.
lrs, losses = lr_finder(model, dataloader, optimizer)

训练可视化

import matplotlib.pyplot as plt

def plot_lr_schedule(scheduler, num_steps):
    """Plot the learning rate reported by ``scheduler`` over ``num_steps`` steps.

    The scheduler is advanced ``num_steps`` times while sampling (so it is
    left in its post-sweep state), the curve is written to
    ``lr_schedule.png`` and then shown.
    """
    history = []
    for _ in range(num_steps):
        # Sample the current LR of the first param group, then advance.
        current = scheduler.get_last_lr()
        history.append(current[0])
        scheduler.step()

    # Same figure as the pyplot state-machine calls, built through the
    # explicit Figure/Axes objects.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history)
    ax.set_xlabel('Step')
    ax.set_ylabel('Learning Rate')
    ax.set_title('Learning Rate Schedule')
    ax.grid(True)
    fig.savefig('lr_schedule.png')
    plt.show()

与批次大小关系

def adjust_lr_for_batch_size(base_lr, batch_size, reference_batch_size=32):
    """Scale ``base_lr`` by the linear scaling rule.

    The learning rate is multiplied by the ratio of ``batch_size`` to
    ``reference_batch_size``.
    """
    # Compute the ratio first so the floating-point result is unchanged.
    ratio = batch_size / reference_batch_size
    return base_lr * ratio

# Example: print the scaled learning rate for several batch sizes, using the
# function's default reference batch size.
base_lr = 1e-4
batch_sizes = [32, 64, 128, 256]

for bs in batch_sizes:
    adjusted_lr = adjust_lr_for_batch_size(base_lr, bs)
    print(f"Batch {bs}: LR = {adjusted_lr}")

常见问题

训练不稳定

# Remedies: (symptom, suggested adjustment) pairs
solutions = dict(
    [
        ("Loss震荡", "增加warmup步数"),
        ("收敛慢", "增大学习率"),
        ("发散", "减小学习率,增加warmup"),
        ("过拟合", "使用学习率衰减"),
    ]
)

参考配置:LLaMA

# Learning-rate configuration for LLaMA training
llama_lr_config = dict(
    peak_lr=3e-4,
    min_lr=3e-5,
    warmup_steps=2000,
    total_steps=300000,
    schedule="cosine",
    weight_decay=0.1,
)

性能对比

# Convergence speed of the different scheduling strategies
convergence = dict(
    Cosine="最快收敛",
    Linear="中等收敛",
    Constant="较慢收敛",
    Step="依赖衰减点",
)

最佳实践

  1. 使用Warmup:几乎所有场景都需要warmup
  2. Cosine衰减:默认选择,效果好
  3. 学习率查找:确定最佳学习率范围
  4. 监控学习率:记录实际使用的学习率
  5. 调整Batch:根据Batch大小调整学习率

学习率调度是LLM训练成功的关键因素,合适的调度策略能显著提升训练效果。