RLHF基础:人类反馈强化学习
---
title: "RLHF基础:人类反馈强化学习"
description: "理解RLHF的核心原理、三阶段训练流程和在LLM对齐中的应用"
tags: ["RLHF", "人类反馈", "强化学习", "模型对齐"]
category: "llm"
icon: "🧠"
---
RLHF基础:人类反馈强化学习
RLHF简介
RLHF(Reinforcement Learning from Human Feedback)是将人类偏好引入模型训练的强化学习方法。它是GPT-4、Claude等大型语言模型实现人类对齐的核心技术。RLHF通过人类反馈来优化模型行为,使其更符合人类价值观。
RLHF的核心优势:
- 人类对齐:让模型行为符合人类偏好
- 安全控制:减少有害、偏见和虚假内容
- 质量提升:提高回答的有用性和准确性
- 灵活调整:通过反馈持续优化模型
三阶段训练流程
阶段一:监督微调(SFT)
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the pretrained model and its tokenizer. Both are loaded from the same
# name; "base_model" is presumably a placeholder for a real checkpoint id.
model = AutoModelForCausalLM.from_pretrained("base_model")
tokenizer = AutoTokenizer.from_pretrained("base_model")
# Stage 1: supervised fine-tuning (SFT) on high-quality instruction data.
def sft_training(model, dataset, epochs=3):
    """Run the supervised fine-tuning stage and return the trained model.

    Args:
        model: Model to fine-tune; handed to ``transformers.Trainer`` as-is.
        dataset: Training data passed to ``Trainer`` as ``train_dataset``.
            NOTE(review): no tokenizer or data collator is supplied here, so
            the dataset presumably has to be tokenized already -- confirm.
        epochs: Number of training epochs.

    Returns:
        The same ``model`` object after ``trainer.train()`` has run.
    """
    # Imported locally: the file-level import only brings in
    # AutoModelForCausalLM and AutoTokenizer, so these two names would
    # otherwise be undefined when this function runs.
    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./sft_output",
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        learning_rate=2e-5,
        # NOTE(review): half-precision training normally needs a GPU --
        # confirm the target hardware before relying on this flag.
        fp16=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()
    return model
阶段二:训练奖励模型(Reward Model)
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
class RewardModel(nn.Module):
    """Scalar reward model: a backbone followed by a linear head.

    The backbone is called as ``base_model(input_ids, attention_mask=...)``
    and must return an object exposing ``last_hidden_state``. The hidden
    state of one token per sequence is projected to a single reward value.
    """

    def __init__(self, base_model, hidden_size=4096):
        """Store the backbone and build the reward head.

        Args:
            base_model: Backbone module producing ``last_hidden_state``.
            hidden_size: Width of the backbone's hidden states; it must match
                the last dimension of ``last_hidden_state``.
        """
        super().__init__()
        self.base_model = base_model
        self.reward_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Return one reward per sequence, shaped ``(batch, 1)``.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Optional mask with non-zero entries on real
                tokens. When given, the reward is read from the last attended
                position of each row, so padding on either side is skipped.
                When omitted, the final position is used.
        """
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        if attention_mask is None:
            summary = hidden[:, -1, :]
        else:
            # Index of the last attended token per row: multiplying the mask
            # by the position index zeroes out padding, so argmax lands on
            # the right-most real token. With an all-ones mask this equals
            # the final position.
            positions = torch.arange(hidden.size(1), device=hidden.device)
            mask = attention_mask.to(device=hidden.device, dtype=positions.dtype)
            last_index = (mask * positions).argmax(dim=1)
            rows = torch.arange(hidden.size(0), device=hidden.device)
            summary = hidden[rows, last_index]

        return self.reward_head(summary)
def train_reward_model(model, preference_data, lr=1e-5):
    """Train a reward model on pairwise preference batches.

    Each batch is a mapping with the keys ``chosen_input_ids``,
    ``chosen_attention_mask``, ``rejected_input_ids`` and
    ``rejected_attention_mask``. The model is called once on the chosen
    inputs and once on the rejected inputs, and is optimized so that the
    chosen reward exceeds the rejected reward.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        preference_data: Iterable of batches as described above.
        lr: Learning rate for the AdamW optimizer.

    Returns:
        List with the scalar loss of every processed batch (empty when
        ``preference_data`` yields nothing).
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    losses = []

    for batch in preference_data:
        chosen_rewards = model(
            batch["chosen_input_ids"], batch["chosen_attention_mask"]
        )
        rejected_rewards = model(
            batch["rejected_input_ids"], batch["rejected_attention_mask"]
        )

        # Pairwise preference loss: -log(sigmoid(chosen - rejected)).
        # logsigmoid evaluates this stably; the naive log(sigmoid(x)) hits
        # log(0) = -inf for large negative margins and yields NaN gradients.
        margin = chosen_rewards - rejected_rewards
        loss = -torch.nn.functional.logsigmoid(margin).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    return losses
阶段三:PPO强化学习
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
# PPO configuration.
# NOTE(review): PPOConfig keyword names differ between TRL releases -- confirm
# every option below against the installed version.
ppo_config = PPOConfig(
learning_rate=1.41e-5,
batch_size=64,
mini_batch_size=16,
ppo_epochs=4,
kl_penalty="kl", # KL-divergence penalty
init_kl_coef=0.2,
target_kl=6.0
)
# Build the PPO trainer.
# The policy (ppo_model) and the reference model (ref_model) are both loaded
# from the same "sft_model" name.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained("sft_model")
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained("sft_model")
# `tokenizer` is not created in this snippet; the only module-level assignment
# in this file loads it from "base_model".
# NOTE(review): confirm that is the tokenizer intended for "sft_model".
ppo_trainer = PPOTrainer(
config=ppo_config,
model=ppo_model,
ref_model=ref_model,
tokenizer=tokenizer
)
# PPO training loop
def ppo_training(ppo_trainer, queries, reward_model):
    """Run one PPO update per query and return the statistics of every step.

    For each query the trainer generates a response (at most 256 new tokens),
    the reward model scores that response, and ``ppo_trainer.step`` performs
    the policy update.

    Args:
        ppo_trainer: Object providing ``generate(list, max_new_tokens=...)``
            and ``step(queries, responses, rewards)``.
        queries: Iterable of queries, processed one at a time.
            NOTE(review): TRL documents tensors for these arguments --
            confirm the queries are already tokenized.
        reward_model: Callable applied to the generated responses.

    Returns:
        List with the value returned by ``ppo_trainer.step`` for each query,
        in order. The original loop discarded these statistics, leaving the
        caller with nothing to monitor.
    """
    history = []
    for query in queries:
        # 1. Generate a response for this query.
        response_tensors = ppo_trainer.generate([query], max_new_tokens=256)
        # 2. Score the generated response.
        rewards = reward_model(response_tensors)
        # 3. PPO update; keep the statistics for the caller.
        history.append(ppo_trainer.step([query], response_tensors, [rewards]))
    return history
奖励模型设计
数据格式
# Preference data format: one prompt paired with a preferred ("chosen") and a
# less preferred ("rejected") response.
preference_data = {
"prompt": "解释量子计算",
"chosen": "量子计算利用量子力学原理...",
"rejected": "量子计算就是很快的计算机..."
}
# Batch form: a list of records, each with the same three keys as a single
# preference example ("prompt", "chosen", "rejected").
dataset = [
{
"prompt": "什么是机器学习?",
"chosen": "机器学习是人工智能的子领域,通过算法让计算机从数据中学习模式...",
"rejected": "机器学习就是让电脑学习..."
},
# ... more records
]
评估指标
def evaluate_reward_model(model, test_data):
    """Return the fraction of pairs where the chosen response scores higher.

    Each item of ``test_data`` must provide ``"chosen"`` and ``"rejected"``
    texts. Both are tokenized with the module-level ``tokenizer`` and scored
    by ``model(input_ids, attention_mask)``.

    NOTE(review): only the response text is scored; the item's prompt is not
    part of the model input -- confirm this matches how the reward model was
    trained.

    Args:
        model: Reward model called as ``model(input_ids, attention_mask)``.
        test_data: Iterable of preference records.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when ``test_data`` is empty (the
        original divided by zero in that case).
    """
    correct = 0
    total = 0

    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for item in test_data:
            chosen = tokenizer(item["chosen"], return_tensors="pt")
            rejected = tokenizer(item["rejected"], return_tensors="pt")

            # Pass the attention mask explicitly: the reward model's forward
            # in this file takes (input_ids, attention_mask).
            chosen_reward = model(chosen.input_ids, chosen.attention_mask)
            rejected_reward = model(rejected.input_ids, rejected.attention_mask)

            if chosen_reward > rejected_reward:
                correct += 1
            total += 1

    if total == 0:
        return 0.0
    return correct / total
KL散度约束
def compute_kl_penalty(log_probs, ref_log_probs):
    """Return the mean of ``log_probs - ref_log_probs``.

    Args:
        log_probs: Log-probabilities under the current policy.
        ref_log_probs: Log-probabilities under the reference model; must be
            subtractable from ``log_probs``.

    Returns:
        The result of calling ``.mean()`` on the element-wise difference.
    """
    return (log_probs - ref_log_probs).mean()
# Fold the KL penalty into the reward.
def compute_reward(response_log_probs, ref_log_probs, reward_model_score, kl_coef=0.2):
    """Return the reward-model score minus a scaled KL penalty.

    Args:
        response_log_probs: Log-probabilities under the current policy.
        ref_log_probs: Log-probabilities under the reference model.
        reward_model_score: Score produced by the reward model.
        kl_coef: Weight applied to the KL penalty.

    Returns:
        ``reward_model_score - kl_coef * compute_kl_penalty(...)``.
    """
    weighted_penalty = kl_coef * compute_kl_penalty(response_log_probs, ref_log_probs)
    return reward_model_score - weighted_penalty
完整训练脚本
from transformers import AutoModelForCausalLM
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from datasets import load_dataset
def rlhf_training():
    """Run the PPO stage of RLHF over the prompts in ``prompts.json``.

    Loads the policy, reference model and tokenizer from "sft_model", builds
    a ``PPOTrainer``, then makes three passes over the "train" split. Each
    record is handled on its own: generate a response, score it, and take
    one PPO step. The reward and KL value are printed after every step.

    NOTE(review): ``compute_reward_score`` is not defined in this file, and
    the prompt is handed to ``generate``/``step`` as loaded from the dataset
    -- confirm both against the installed TRL version, along with the
    ``stats['kl']`` key.
    """
    # 1. Load the models and the tokenizer.
    checkpoint = "sft_model"
    policy = AutoModelForCausalLMWithValueHead.from_pretrained(checkpoint)
    reference = AutoModelForCausalLMWithValueHead.from_pretrained(checkpoint)
    sft_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # 2. PPO hyperparameters.
    hyperparams = PPOConfig(
        learning_rate=1.41e-5,
        batch_size=64,
        mini_batch_size=16,
        ppo_epochs=4,
    )

    # 3. Trainer wiring the policy to its reference model.
    ppo_trainer = PPOTrainer(
        config=hyperparams,
        model=policy,
        ref_model=reference,
        tokenizer=sft_tokenizer,
    )

    # 4. Prompt data.
    dataset = load_dataset("json", data_files="prompts.json")

    # 5. Three passes over the training prompts.
    for _ in range(3):
        for record in dataset["train"]:
            query = record["prompt"]

            # Generate a response for this prompt.
            response = ppo_trainer.generate(query)

            # Score it (in practice, with the trained reward model).
            reward = compute_reward_score(response)

            # PPO update on this single sample.
            stats = ppo_trainer.step([query], [response], [reward])
            print(f"Reward: {reward:.2f}, KL: {stats['kl']:.2f}")


if __name__ == "__main__":
    rlhf_training()
常见问题与解决方案
训练不稳定
# Each assignment below rebinds `config`, so if run as written only the last
# one stays in effect; pass the options to a single PPOConfig to combine them.
# 1. Lower the learning rate
config = PPOConfig(learning_rate=5e-6)
# 2. Raise the KL penalty coefficient
config = PPOConfig(init_kl_coef=0.5)
# 3. Use gradient clipping
config = PPOConfig(max_grad_norm=0.5)
奖励黑客(Reward Hacking)
奖励黑客指策略模型利用奖励模型的漏洞获取高分,而回答质量并未真正提升。常见的缓解手段如下:
# 1. Tighten the KL constraint
kl_coef = 0.5 # larger KL penalty
# 2. Periodically refresh the reference model: whenever `step` is a multiple
# of 100, copy the weights of `model` into `ref_model`.
# NOTE(review): `step`, `model` and `ref_model` come from the surrounding
# training loop, which is not shown in this fragment.
if step % 100 == 0:
    ref_model.load_state_dict(model.state_dict())
# 3. Average the scores of several reward models.
def ensemble_reward(responses, models=None):
    """Return the mean score that several reward models give to ``responses``.

    Args:
        responses: Input passed unchanged to every reward model.
        models: Optional iterable of callables to use as the ensemble. When
            omitted, the module-level ``reward_models`` collection is used,
            which must then be defined by the caller's module.

    Returns:
        ``sum(scores) / len(scores)`` over the ensemble members.

    Raises:
        ValueError: If the ensemble is empty.
    """
    if models is None:
        models = reward_models

    # The original body scored an undefined name `response`; the parameter
    # is `responses`.
    scores = [rm(responses) for rm in models]
    if not scores:
        raise ValueError("ensemble_reward needs at least one reward model")
    return sum(scores) / len(scores)
RLHF是实现LLM与人类价值观对齐的关键技术,通过持续优化可以构建更安全、更有用的AI助手。