RLHF数据:基于人类反馈强化学习的训练数据准备
RLHF数据:基于人类反馈强化学习的训练数据准备
RLHF 概述
RLHF(Reinforcement Learning from Human Feedback)是当前最主流的LLM对齐训练方法之一。该方法最早由OpenAI在InstructGPT中系统化应用,现已成为ChatGPT、Claude、Llama-2等主流模型训练流程的核心环节。
RLHF 包含三个关键阶段,每个阶段都需要不同类型和格式的训练数据:
- 监督微调(SFT):用高质量指令数据微调预训练模型
- 奖励模型训练:用人类偏好数据训练奖励模型
- 强化学习优化:用PPO等算法优化策略模型
第一阶段:SFT 数据准备
SFT 阶段使用的是标准的指令-响应对数据,格式为 (instruction, input, output) 三元组,其中 input 是可选的补充上下文,没有额外上下文时可以为空。
数据格式设计
{
"instruction": "请总结以下文章的主要观点",
"input": "近年来,人工智能技术取得了突破性进展...",
"output": "本文主要讨论了AI技术的三个关键突破:1) 大语言模型的涌现能力;2) 多模态融合的发展;3) AI在科学研究中的应用。这些突破标志着AI进入了新阶段。"
}
SFT 数据构建代码
import json
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class SFTExample:
    """One supervised fine-tuning (SFT) sample.

    ``instruction``, ``input`` and ``output`` must all be supplied when
    constructing an instance (``input`` may be ``None``); the remaining
    fields have defaults.
    """

    # Instruction text of the sample.
    instruction: str
    # Optional additional input; no default, so it must be passed explicitly.
    input: Optional[str]
    # Response text of the sample.
    output: str
    # Optional system prompt; defaults to None.
    system_prompt: Optional[str] = None
    # Category label; defaults to "general".
    category: str = "general"
class SFTDataBuilder:
    """Build SFT training records in Alpaca or ShareGPT layout."""

    def __init__(self):
        # Default system prompt. None of the formatters in this class insert
        # it into the records they produce.
        self.system_prompt = "你是一个有帮助的AI助手。请根据用户的指令提供准确、有帮助的回答。"

    def format_alpaca(self, instruction: str, input_text: str, output: str) -> dict:
        """Return an Alpaca-style record.

        Returns:
            A dict with the keys ``instruction``, ``input`` and ``output``.
        """
        return {
            "instruction": instruction,
            "input": input_text,
            "output": output,
        }

    def format_sharegpt(self, conversations: List[dict]) -> dict:
        """Wrap a list of messages in a ShareGPT-style (multi-turn) record."""
        return {"conversations": conversations}

    def create_multiturn(self, turns: List[dict]) -> dict:
        """Build a ShareGPT record from an ordered list of turns.

        Roles are assigned purely by position: even indexes become ``user``
        and odd indexes become ``assistant``. Only ``turn["content"]`` is read.

        Raises:
            KeyError: If a turn has no ``content`` key.
        """
        conversations = [
            {
                "from": "user" if index % 2 == 0 else "assistant",
                "value": turn["content"],
            }
            for index, turn in enumerate(turns)
        ]
        return self.format_sharegpt(conversations)

    def balance_categories(
        self,
        data: "List[SFTExample]",
        target_size: int = 10000,
        seed: Optional[int] = None,
    ) -> "List[SFTExample]":
        """Down-sample over-represented categories and shuffle the result.

        Each category gets a quota of ``target_size // number_of_categories``.
        Categories above the quota are randomly sampled down to it; smaller
        categories are kept whole, so the result may contain fewer than
        ``target_size`` items.

        Args:
            data: Examples exposing a ``category`` attribute.
            target_size: Desired upper bound on the total number of examples.
            seed: When given, sampling and shuffling use a private RNG seeded
                with this value so the result is reproducible. When ``None``
                the module-level ``random`` state is used.

        Returns:
            The selected examples (the same objects that were passed in), in
            shuffled order.
        """
        import random
        from collections import defaultdict

        # Group once instead of re-scanning ``data`` for every category.
        by_category = defaultdict(list)
        for example in data:
            by_category[example.category].append(example)
        if not by_category:
            return []

        quota = target_size // len(by_category)
        rng = random.Random(seed) if seed is not None else random

        balanced = []
        for examples in by_category.values():
            if len(examples) > quota:
                balanced.extend(rng.sample(examples, quota))
            else:
                balanced.extend(examples)
        rng.shuffle(balanced)
        return balanced
# Example usage: build two Alpaca-format records.
builder = SFTDataBuilder()
sft_data = [
    builder.format_alpaca(instruction, input_text, output)
    for instruction, input_text, output in (
        ("翻译成英文", "今天天气很好", "The weather is nice today."),
        ("写一段代码", "Python快速排序", "def quicksort(arr):..."),
    )
]
第二阶段:奖励模型数据
奖励模型训练需要成对的偏好比较数据,格式为 (prompt, chosen, rejected)。
数据特点
class RewardModelData:
    """Quality checks and formatting for (prompt, chosen, rejected) pairs."""

    def __init__(self):
        # NOTE: no method of this class reads these settings; validate_pair
        # applies its own fixed thresholds.
        self.comparison_requirements = {
            "min_length_diff": 0,  # responses should not differ too much in length
            "difficulty_range": (0.3, 0.9),  # comparisons should be moderately hard
            "diversity_score": 0.7,  # response diversity
        }

    def validate_pair(self, chosen: str, rejected: str) -> dict:
        """Run the quality checks on one comparison pair.

        Returns:
            A dict of booleans with the keys ``length_balanced`` (character
            length ratio strictly between 0.5 and 2.0), ``content_different``
            and ``quality_gap_appropriate``.
        """
        # max(..., 1) avoids dividing by zero for an empty rejected response.
        length_ratio = len(chosen) / max(len(rejected), 1)
        return {
            "length_balanced": 0.5 < length_ratio < 2.0,
            "content_different": self._check_semantic_diff(chosen, rejected),
            "quality_gap_appropriate": self._assess_quality_gap(chosen, rejected),
        }

    @staticmethod
    def _tokenize(text: str) -> set:
        """Return the set of tokens used for the overlap check.

        Whitespace-delimited runs are kept as single tokens (the same result
        as ``text.split()`` for text without CJK characters), while Chinese
        and Japanese characters each become their own token. Without this,
        unsegmented Chinese text would be one giant token and any two
        non-identical responses would have zero overlap.
        """
        import re

        cjk = "\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff"
        return set(re.findall(f"[{cjk}]|[^\\s{cjk}]+", text))

    def _check_semantic_diff(self, text1: str, text2: str) -> bool:
        """Return True when the token-set Jaccard overlap is below 0.8.

        This is a lexical approximation of "semantically different".
        """
        tokens1 = self._tokenize(text1)
        tokens2 = self._tokenize(text2)
        # max(..., 1) keeps the ratio defined when both texts are empty.
        overlap = len(tokens1 & tokens2) / max(len(tokens1 | tokens2), 1)
        return overlap < 0.8

    def _assess_quality_gap(self, chosen: str, rejected: str) -> bool:
        """Placeholder for a quality-gap heuristic; currently always True.

        Intended rule: chosen should be clearly better than rejected, but the
        gap should not be extreme.
        """
        return True

    def format_for_training(self, prompt: str, chosen: str, rejected: str) -> dict:
        """Return a training record for the pair.

        Returns:
            A dict with ``prompt``, ``chosen``, ``rejected`` and ``metadata``,
            where ``metadata`` is the result of :meth:`validate_pair`.
        """
        return {
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
            "metadata": self.validate_pair(chosen, rejected),
        }
# Batch processing
def prepare_reward_data(raw_preferences: List[dict]) -> List[dict]:
    """Filter raw preference items and format the ones that pass validation.

    Args:
        raw_preferences: Dicts that each provide ``prompt``, ``chosen`` and
            ``rejected``.

    Returns:
        Records produced by ``RewardModelData.format_for_training`` for the
        pairs whose validation checks are all true, in input order.

    Raises:
        KeyError: If an item lacks one of the three required keys.
    """
    processor = RewardModelData()
    processed = []
    for item in raw_preferences:
        # format_for_training already runs validate_pair and stores the result
        # under "metadata", so reuse it instead of validating every pair twice.
        record = processor.format_for_training(
            item["prompt"], item["chosen"], item["rejected"]
        )
        if all(record["metadata"].values()):
            processed.append(record)
    return processed
奖励模型训练示例
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, Trainer
class RewardModelDataset(Dataset):
    """Dataset that tokenizes (prompt, chosen, rejected) preference pairs.

    Args:
        data: Indexable collection of dicts with the keys ``prompt``,
            ``chosen`` and ``rejected``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``;
            its result must provide ``input_ids`` and ``attention_mask``
            tensors. Presumably a Hugging Face tokenizer; confirm with callers.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, prompt, response):
        """Tokenize ``prompt`` and ``response`` joined by a blank line."""
        encoding = self.tokenizer(
            f"{prompt}\n\n{response}",
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # squeeze(0) removes only the leading batch dimension. A bare
        # squeeze() would also collapse the sequence dimension when
        # max_length == 1 and return a 0-d tensor.
        return (
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
        )

    def __getitem__(self, idx):
        """Return the tokenized chosen and rejected sequences for one pair."""
        item = self.data[idx]
        chosen_ids, chosen_mask = self._encode(item["prompt"], item["chosen"])
        rejected_ids, rejected_mask = self._encode(item["prompt"], item["rejected"])
        return {
            "chosen_input_ids": chosen_ids,
            "chosen_attention_mask": chosen_mask,
            "rejected_input_ids": rejected_ids,
            "rejected_attention_mask": rejected_mask,
        }
# Train the reward model
def train_reward_model(model, dataset, epochs=3):
    """Train ``model`` on pairwise preference data and return it.

    ``transformers.Trainer`` does not accept ``epochs``, ``batch_size``,
    ``learning_rate`` or ``loss_fn`` keyword arguments, so the
    hyper-parameters are passed through ``TrainingArguments`` and the
    pairwise loss is implemented by overriding ``compute_loss``.

    Args:
        model: Model that accepts ``input_ids`` and ``attention_mask`` and
            returns an output with a ``.logits`` attribute holding the reward
            score. Assumes a sequence-classification head with a single
            label; confirm with callers.
        dataset: Dataset yielding dicts with ``chosen_input_ids``,
            ``chosen_attention_mask``, ``rejected_input_ids`` and
            ``rejected_attention_mask`` tensors.
        epochs: Number of training epochs.

    Returns:
        The same ``model`` object after training.
    """
    # Local import: the file-level import only brings in Trainer.
    from transformers import TrainingArguments

    class _PairwiseRewardTrainer(Trainer):
        """Trainer whose loss is ``-log(sigmoid(r_chosen - r_rejected))``."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # **kwargs absorbs extra arguments that newer Trainer versions pass.
            chosen_rewards = model(
                input_ids=inputs["chosen_input_ids"],
                attention_mask=inputs["chosen_attention_mask"],
            ).logits
            rejected_rewards = model(
                input_ids=inputs["rejected_input_ids"],
                attention_mask=inputs["rejected_attention_mask"],
            ).logits
            loss = -torch.nn.functional.logsigmoid(
                chosen_rewards - rejected_rewards
            ).mean()
            if return_outputs:
                return loss, {
                    "chosen_rewards": chosen_rewards,
                    "rejected_rewards": rejected_rewards,
                }
            return loss

    training_args = TrainingArguments(
        output_dir="reward_model_output",
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        learning_rate=1e-5,
        # The batch keys (chosen_*/rejected_*) are not arguments of
        # model.forward, so they must not be stripped before compute_loss.
        remove_unused_columns=False,
    )
    trainer = _PairwiseRewardTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()
    return model
第三阶段:PPO 训练数据
PPO 阶段只需要一个初始 prompt 集合(无需标注好的响应):策略模型针对每个 prompt 生成响应,再由奖励模型对响应打分,得到用于优化的奖励信号。
class PPODataGenerator:
    """Produce (prompt, response, reward) trajectories for PPO training."""

    def __init__(self, prompt_dataset):
        # Must support slicing; generate_trajectories takes prompts[:n_samples].
        self.prompts = prompt_dataset
        # Keyword arguments forwarded to policy_model.generate.
        self.generation_config = dict(
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
        )

    def generate_trajectories(self, policy_model, reward_model, n_samples=1000):
        """Roll out the policy on the first ``n_samples`` prompts.

        For each prompt, ``policy_model.generate(prompt, **generation_config)``
        produces a response which ``reward_model.score(prompt, response)``
        then scores.

        Returns:
            A list of dicts with the keys ``prompt``, ``response`` and
            ``reward``, in prompt order.
        """

        def rollout(prompt):
            # Generate first, then score, one prompt at a time.
            response = policy_model.generate(prompt, **self.generation_config)
            return {
                "prompt": prompt,
                "response": response,
                "reward": reward_model.score(prompt, response),
            }

        return [rollout(prompt) for prompt in self.prompts[:n_samples]]

    def filter_high_reward(self, trajectories, threshold=0.7):
        """Return the trajectories whose reward is strictly above ``threshold``."""
        return list(filter(lambda t: t["reward"] > threshold, trajectories))
# PPO training configuration
ppo_config = dict(
    batch_size=64,
    ppo_epochs=4,
    mini_batch_size=16,
    kl_coeff=0.2,
    clip_range=0.2,
    gamma=1.0,
    lam=0.95,
    value_loss_coef=0.5,
    entropy_coef=0.01,
)
数据质量保障策略
1. 数据去重
from hashlib import md5
def deduplicate_by_content(data, fields=("prompt",)):
    """Drop items whose selected fields exactly match an earlier item.

    By default only ``prompt`` is compared, so two preference pairs that
    share a prompt but differ in their responses count as duplicates. Pass
    e.g. ``fields=("prompt", "chosen", "rejected")`` to keep such pairs.

    Args:
        data: Iterable of dicts whose ``fields`` entries are strings.
        fields: Key, or sequence of keys, that define an item's identity.

    Returns:
        A list with the first occurrence of each distinct item, in input order.

    Raises:
        KeyError: If an item lacks one of ``fields``.
    """
    if isinstance(fields, str):
        fields = (fields,)

    seen = set()
    unique_data = []
    for item in data:
        hasher = md5()
        for name in fields:
            chunk = item[name].encode()
            # Length prefix keeps ("ab", "c") distinct from ("a", "bc").
            hasher.update(len(chunk).to_bytes(8, "big"))
            hasher.update(chunk)
        content_hash = hasher.hexdigest()
        if content_hash in seen:
            continue
        seen.add(content_hash)
        unique_data.append(item)
    return unique_data
def deduplicate_by_semantic(data, model, threshold=0.95):
    """Greedily drop items whose prompt is a near-duplicate of a kept one.

    Prompts are embedded and compared by cosine similarity. Items are visited
    in order, and one is kept only if its similarity to every previously kept
    item is below ``threshold``.

    Args:
        data: Sequence of dicts with a ``prompt`` string.
        model: A ``SentenceTransformer`` instance to embed with. Any other
            value (for example ``None``) falls back to loading
            ``all-MiniLM-L6-v2``.
        threshold: Cosine similarity at or above which two prompts are
            treated as duplicates.

    Returns:
        The kept items, in input order.
    """
    # Nothing to embed; also avoids loading a model for empty input.
    if len(data) == 0:
        return []

    from sentence_transformers import SentenceTransformer
    import numpy as np

    if isinstance(model, SentenceTransformer):
        embedder = model
    else:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = np.asarray(
        embedder.encode([item["prompt"] for item in data]), dtype=float
    )

    # L2-normalize so the dot product below is a true cosine similarity,
    # whether or not the embedder already normalizes its output.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings = embeddings / np.maximum(norms, 1e-12)
    similarity = embeddings @ embeddings.T

    # Greedy pass: keep an item only if it is dissimilar to everything kept.
    to_keep = []
    for i in range(len(data)):
        if not to_keep or similarity[i, to_keep].max() < threshold:
            to_keep.append(i)
    return [data[i] for i in to_keep]
2. 数据增强
class RLHFDataAugmenter:
"""RLHF数据增强"""
def augment_prompt(self, original_prompt: str) -> List[str]:
"""增强prompt多样性"""
augmented = [original_prompt]
# 添加变体
augmented.append(f"请帮我回答:{original_prompt}")
augmented.append(f"关于{original_prompt},你能告诉我什么?")
augmented.append(f"我需要了解{original_prompt}的信息")
return augmented
def augment_with_negatives(self, chosen_response: str) -> List[str]:
"""生成负样本变体"""
negatives = []
# 添加不完整回答
negatives.append(chosen_response[:len(chosen_response)//2] + "...")
# 添加低质量变体
negatives.append(chosen_response.replace(",", ",嗯,"))
return negatives
数据规模建议
| 阶段 | 推荐数据量 | 质量要求 |
|---|---|---|
| SFT | 10K-100K 条 | 高质量、多样化 |
| 奖励模型 | 50K-200K 比较对 | 标注者间一致性 > 75% |
| PPO | 1K-10K prompts | 覆盖核心场景 |
最佳实践
- 渐进式构建:先构建小规模高质量数据,验证效果后再扩展
- 迭代优化:根据训练结果反馈调整数据分布
- 质量优先:宁可数据量小,也要保证质量
- 多样性保障:覆盖不同难度、不同领域、不同风格
- 自动化流水线:建立数据处理的自动化pipeline减少人工错误
- 版本控制:使用数据版本管理工具追踪变更
RLHF数据的准备工作看似繁琐,但却是决定最终模型质量的关键因素。投入足够的精力构建高质量数据集,将为后续训练带来事半功倍的效果。