学习率调度:训练收敛的关键
---
title: "学习率调度:训练收敛的关键"
description: "掌握学习率调度的原理和实现,优化LLM训练过程"
tags: ["学习率调度", "LR Scheduler", "训练优化", "收敛"]
category: "llm"
icon: "🧠"
---
学习率调度:训练收敛的关键
学习率调度简介
学习率调度(Learning Rate Scheduling)是在训练过程中按训练步数动态调整学习率的技术。合适的学习率调度策略可以加速收敛、降低陷入较差局部最优的风险,并提升模型的最终性能。
学习率调度的核心价值:
- 加速收敛:前期大学习率快速收敛
- 避免震荡:后期小学习率精细调整
- 跳出局部最优:适当的学习率变化帮助探索
- 提高泛化:合适的学习率有助于泛化
常用调度策略
Warmup + Cosine Decay
import math

import torch
from torch.optim.lr_scheduler import LambdaLR
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Create a LambdaLR with linear warmup followed by half-cosine decay.

    The returned scheduler multiplies the optimizer's base learning rate by
    a factor that rises linearly from 0 to 1 over ``num_warmup_steps`` and
    then follows ``0.5 * (1 + cos(pi * progress))`` down to 0 at
    ``num_training_steps``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of scheduler steps in the linear warmup.
        num_training_steps: Total number of scheduler steps planned.

    Returns:
        A ``LambdaLR`` wrapping ``optimizer``.
    """
    # Guard against a zero-length decay phase (warmup >= total steps).
    decay_steps = max(1, num_training_steps - num_warmup_steps)

    def lr_lambda(current_step):
        # Warmup phase: linear ramp from 0 to 1.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        # Cosine decay phase. Progress is clamped to 1.0 so that stepping
        # past num_training_steps keeps the factor at 0 instead of letting
        # the cosine swing back up.
        progress = min(1.0, float(current_step - num_warmup_steps) / float(decay_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

    return LambdaLR(optimizer, lr_lambda)
# Usage example. `model` and `dataloader` are assumed to be defined by the
# surrounding training script.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
steps_per_epoch = len(dataloader)
num_training_steps = num_epochs * steps_per_epoch
num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Training loop. The schedule above is sized for num_epochs passes over the
# data, so the loop must run that many passes; a single pass would stop
# one third of the way through the schedule.
for epoch in range(num_epochs):
    for batch in dataloader:
        loss = model(batch).loss
        loss.backward()
        optimizer.step()
        # Advance the schedule once per optimizer update.
        scheduler.step()
        optimizer.zero_grad()
Linear Decay
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Create a LambdaLR with linear warmup followed by linear decay.

    The multiplier ramps from 0 to 1 over ``num_warmup_steps`` and then
    falls linearly, reaching 0 at ``num_training_steps`` and staying there.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of scheduler steps in the linear warmup.
        num_training_steps: Total number of scheduler steps planned.

    Returns:
        A ``LambdaLR`` wrapping ``optimizer``.
    """

    def _multiplier(step):
        in_warmup = step < num_warmup_steps
        if in_warmup:
            numerator = step
            denominator = max(1, num_warmup_steps)
        else:
            numerator = num_training_steps - step
            denominator = max(1, num_training_steps - num_warmup_steps)
        ratio = float(numerator) / float(denominator)
        # Only the decay branch is floored at zero.
        return ratio if in_warmup else max(0.0, ratio)

    return LambdaLR(optimizer, _multiplier)
Constant with Warmup
def get_constant_schedule_with_warmup(optimizer, num_warmup_steps):
    """Create a LambdaLR with linear warmup followed by a constant rate.

    The multiplier ramps from 0 to 1 over ``num_warmup_steps`` and is 1.0
    for every step after that.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of scheduler steps in the linear warmup.

    Returns:
        A ``LambdaLR`` wrapping ``optimizer``.
    """
    # Never divide by zero when no warmup is requested.
    ramp_length = float(max(1, num_warmup_steps))

    def _multiplier(step):
        return float(step) / ramp_length if step < num_warmup_steps else 1.0

    return LambdaLR(optimizer, _multiplier)
多阶段调度
from torch.optim.lr_scheduler import MultiStepLR
# Multi-stage (step-wise) decay. `model` is assumed to be defined by the
# surrounding script.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
scheduler = MultiStepLR(
    optimizer,
    # Decay points, counted in scheduler.step() calls.
    milestones=[3000, 6000, 9000],
    # Decay factor: per the PyTorch MultiStepLR docs, the learning rate is
    # multiplied by gamma at each milestone.
    gamma=0.1
)
Hugging Face实现
from transformers import get_scheduler
# Build the scheduler through the Hugging Face factory; the first argument
# selects the schedule by name. `optimizer`, `model` and `dataloader` are
# assumed to be defined by the surrounding script.
scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=10000,
)

# Training loop: update parameters, advance the schedule, then clear grads.
for batch in dataloader:
    loss = model(batch).loss
    loss.backward()

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
调度策略对比
# Strategy selection guide. Each row lists, in order, the strategy's
# advantage ("优点"), drawback ("缺点") and typical use case ("适用").
strategies = {
    name: dict(zip(("优点", "缺点", "适用"), traits))
    for name, traits in (
        ("Cosine", ("平滑衰减,效果好", "需要预设总步数", "大多数场景")),
        ("Linear", ("简单,易于理解", "可能不如Cosine", "快速原型")),
        ("Constant", ("简单", "后期学习率可能过大", "短训练")),
        ("MultiStep", ("灵活控制衰减点", "需要手动设置", "已知最佳衰减点")),
    )
}
学习率查找
def lr_finder(model, dataloader, optimizer, start_lr=1e-7, end_lr=10, num_steps=100,
              restore_state=True):
    """Run a learning-rate range test and record the loss at each rate.

    The learning rate is swept exponentially from ``start_lr`` towards
    ``end_lr`` over ``num_steps`` batches, training the model for one step
    at each rate. The sweep stops early when the loss exceeds four times
    the best loss seen so far.

    Args:
        model: Callable model; ``model(batch)`` must return an object with
            a ``.loss`` tensor, and the model must support
            ``state_dict()`` / ``load_state_dict()`` when
            ``restore_state`` is true.
        dataloader: Iterable of batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        start_lr: Learning rate used for the first batch.
        end_lr: Learning rate the sweep grows towards.
        num_steps: Maximum number of batches to process.
        restore_state: When true (default), the model weights and the
            optimizer state are put back to their pre-sweep values before
            returning. This holds a deep copy of both in memory for the
            duration of the sweep; pass ``False`` to skip the copy.

    Returns:
        A ``(lrs, losses)`` tuple of equal-length lists: the learning rate
        used for each processed batch and the loss measured on it.
    """
    import copy  # local import keeps this snippet self-contained

    if restore_state:
        # The sweep deliberately drives the lr up until training diverges,
        # so snapshot everything it is about to damage.
        saved_model_state = copy.deepcopy(model.state_dict())
        saved_optimizer_state = copy.deepcopy(optimizer.state_dict())

    lrs = []
    losses = []
    try:
        # ExponentialLR only multiplies the optimizer's *current* lr, so the
        # sweep must be anchored at start_lr explicitly.
        for group in optimizer.param_groups:
            group['lr'] = start_lr

        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer,
            gamma=(end_lr / start_lr) ** (1 / num_steps),
        )

        best_loss = float('inf')
        for i, batch in enumerate(dataloader):
            if i >= num_steps:
                break

            # Forward pass.
            loss = model(batch).loss
            # Read the scalar once; each .item() call forces a sync.
            loss_value = loss.item()

            # Record the rate that produced this loss.
            lrs.append(optimizer.param_groups[0]['lr'])
            losses.append(loss_value)

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Move to the next (larger) learning rate.
            lr_scheduler.step()

            # Stop once the loss has clearly diverged.
            if loss_value > best_loss * 4:
                break
            best_loss = min(best_loss, loss_value)
    finally:
        if restore_state:
            model.load_state_dict(saved_model_state)
            optimizer.load_state_dict(saved_optimizer_state)

    return lrs, losses
# Usage: run the range test with the default sweep settings. `model`,
# `dataloader` and `optimizer` are assumed to be defined by the surrounding
# script. The two returned lists can be plotted against each other.
lrs, losses = lr_finder(model, dataloader, optimizer)
训练可视化
import matplotlib.pyplot as plt
def plot_lr_schedule(scheduler, num_steps):
    """Plot the learning-rate curve a scheduler produces over ``num_steps``.

    The scheduler is stepped ``num_steps`` times to collect the curve and is
    then restored, together with the learning rates of its optimizer's
    parameter groups, so plotting does not consume the schedule that
    training is about to use. The figure is saved to ``lr_schedule.png`` and
    shown.

    Args:
        scheduler: A PyTorch learning-rate scheduler exposing
            ``get_last_lr()``, ``step()``, ``state_dict()``,
            ``load_state_dict()`` and an ``optimizer`` attribute.
        num_steps: Number of scheduler steps to plot.
    """
    import copy  # local import keeps this snippet self-contained

    # Snapshot what stepping is about to mutate.
    optimizer = scheduler.optimizer
    saved_scheduler_state = copy.deepcopy(scheduler.state_dict())
    saved_group_lrs = [group['lr'] for group in optimizer.param_groups]

    lrs = []
    try:
        # NOTE: stepping a scheduler before any optimizer.step() call may
        # make PyTorch emit an ordering warning; it is harmless here.
        for _ in range(num_steps):
            lrs.append(scheduler.get_last_lr()[0])
            scheduler.step()
    finally:
        scheduler.load_state_dict(saved_scheduler_state)
        for group, lr in zip(optimizer.param_groups, saved_group_lrs):
            group['lr'] = lr

    plt.figure(figsize=(10, 6))
    plt.plot(lrs)
    plt.xlabel('Step')
    plt.ylabel('Learning Rate')
    plt.title('Learning Rate Schedule')
    plt.grid(True)
    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig('lr_schedule.png')
    plt.show()
与批次大小关系
def adjust_lr_for_batch_size(base_lr, batch_size, reference_batch_size=32):
    """Scale a learning rate with the batch size (linear scaling rule).

    Args:
        base_lr: Learning rate tuned for ``reference_batch_size``.
        batch_size: Batch size that will actually be used.
        reference_batch_size: Batch size ``base_lr`` corresponds to.

    Returns:
        ``base_lr`` multiplied by ``batch_size / reference_batch_size``.
    """
    return base_lr * (batch_size / reference_batch_size)
# Example: print the scaled learning rate for several batch sizes, using
# the default reference batch size.
base_lr, batch_sizes = 1e-4, [32, 64, 128, 256]

for bs in batch_sizes:
    adjusted_lr = adjust_lr_for_batch_size(base_lr, batch_size=bs)
    print(f"Batch {bs}: LR = {adjusted_lr}")
常见问题
训练不稳定
# Troubleshooting lookup: maps an observed training symptom (key) to the
# suggested learning-rate remedy (value).
solutions = {
    "Loss震荡": "增加warmup步数",
    "收敛慢": "增大学习率",
    "发散": "减小学习率,增加warmup",
    "过拟合": "使用学习率衰减"
}
最佳实践示例:LLaMA学习率配置
# Learning-rate configuration for LLaMA training.
# NOTE(review): the values are not verified by anything in this file;
# confirm them against the original training recipe before reuse.
llama_lr_config = {
    "peak_lr": 3e-4,  # presumably the rate reached at the end of warmup
    "min_lr": 3e-5,  # 10% of peak_lr; presumably the floor of the decay
    "warmup_steps": 2000,
    "total_steps": 300000,
    "schedule": "cosine",
    "weight_decay": 0.1  # optimizer setting, not a scheduler setting
}
性能对比
# Qualitative convergence speed of each scheduling strategy. Keys use the
# same strategy names as the `strategies` table so the two can be looked
# up together (the step-wise entry was previously keyed "Step").
convergence = {
    "Cosine": "最快收敛",
    "Linear": "中等收敛",
    "Constant": "较慢收敛",
    "MultiStep": "依赖衰减点",
}
最佳实践
- 使用Warmup:几乎所有场景都需要warmup
- Cosine衰减:默认选择,效果好
- 学习率查找:确定最佳学习率范围
- 监控学习率:记录实际使用的学习率
- 调整Batch:Batch大小变化时,按线性缩放规则同步调整学习率
学习率调度是LLM训练成功的关键因素,合适的调度策略能显著提升训练效果。