LLM微调实战:从数据准备到模型部署
LLM微调实战:从数据准备到模型部署
为什么要微调?
预训练的LLM是通用模型,通过微调可以:
- 适应特定领域(医疗、法律、金融)
- 学习特定任务格式
- 提高特定场景的性能
- 降低推理成本(用微调后的小模型替代更大的通用模型)
微调方法对比
| 方法 | 显存需求 | 训练速度 | 效果 | 适用场景 |
|---|---|---|---|---|
| 全量微调 | 高 | 慢 | 最好 | 数据充足、资源充足 |
| LoRA | 低 | 快 | 接近全量 | 资源受限 |
| QLoRA | 最低 | 中 | 良好 | 显存极度受限 |
| Prompt Tuning | 最低 | 最快 | 一般 | 简单任务 |
数据准备
数据格式
{
"instruction": "请将以下英文翻译成中文",
"input": "Hello, how are you?",
"output": "你好,你怎么样?"
}
数据清洗
import json
import re
def clean_data(data, min_length=10, max_length=2000):
    """Filter and normalise raw instruction-tuning samples.

    Every sample is a dict with ``instruction`` and ``output`` fields and an
    optional ``input`` field. Each field is passed through ``clean_text``;
    samples that are missing a required field, that end up with an empty
    required field after cleaning, or whose combined length falls outside
    ``[min_length, max_length]`` are dropped.

    Args:
        data: Iterable of sample dicts.
        min_length: Smallest accepted combined character count of the three
            cleaned fields.
        max_length: Largest accepted combined character count of the three
            cleaned fields.

    Returns:
        A list of new dicts, each with exactly the keys ``instruction``,
        ``input`` and ``output``.
    """
    cleaned = []
    for item in data:
        # Drop samples whose required fields are missing or empty.
        if not item.get("instruction") or not item.get("output"):
            continue

        instruction = clean_text(item["instruction"])
        # "input" is optional and may also be present as an explicit null;
        # both cases are treated as an empty string.
        input_text = clean_text(item.get("input") or "")
        output = clean_text(item["output"])

        # Cleaning can reduce a field to nothing (e.g. symbol-only text), so
        # the emptiness check has to be repeated on the cleaned values.
        if not instruction or not output:
            continue

        # Filter out samples that are too short or too long.
        total_length = len(instruction) + len(input_text) + len(output)
        if total_length < min_length or total_length > max_length:
            continue

        cleaned.append({
            "instruction": instruction,
            "input": input_text,
            "output": output,
        })
    return cleaned
# Runs of any whitespace (spaces, tabs, newlines).
_WHITESPACE_RE = re.compile(r'\s+')
# Everything that is NOT a word character, whitespace, a CJK ideograph, basic
# ASCII punctuation, or its full-width counterpart (U+FF0C comma, U+3002 full
# stop, U+FF01 exclamation mark, U+FF1F question mark, U+FF1B semicolon,
# U+FF1A colon). The full-width code points are written as escapes so they
# cannot be silently folded to their ASCII look-alikes again.
_DISALLOWED_RE = re.compile(
    r'[^\w\s\u4e00-\u9fff.,!?;:\uff0c\u3002\uff01\uff1f\uff1b\uff1a]'
)


def clean_text(text):
    """Collapse whitespace and strip characters outside the allowed set.

    Args:
        text: The string to clean. ``None`` is accepted and treated as an
            empty string.

    Returns:
        The cleaned string with whitespace runs collapsed to a single space,
        disallowed characters removed, and leading/trailing whitespace
        stripped.
    """
    if text is None:
        return ""
    # Collapse redundant whitespace.
    text = _WHITESPACE_RE.sub(' ', text)
    # Remove special characters.
    text = _DISALLOWED_RE.sub('', text)
    return text.strip()
数据增强
class DataAugmentor:
    """Grow an instruction dataset by having an LLM rewrite the instructions.

    The wrapped ``llm`` only needs a ``generate(prompt)`` method; whatever it
    returns is stored as the new ``instruction`` value (presumably a string;
    confirm against the LLM client in use).
    """

    def __init__(self, llm):
        self.llm = llm

    @staticmethod
    def _with_instruction(item, instruction):
        """Return a copy of ``item`` carrying a replacement instruction."""
        return {
            "instruction": instruction,
            # "input" is optional in the sample format, so do not require it.
            "input": item.get("input", ""),
            "output": item["output"],
        }

    def augment_with_rephrasing(self, data, num_augments=2):
        """Keep every sample and add ``num_augments`` rephrased variants.

        Args:
            data: Iterable of sample dicts with ``instruction`` and
                ``output`` keys and an optional ``input`` key.
            num_augments: Number of rephrased copies generated per sample.

        Returns:
            A list holding each original sample followed by its variants.
        """
        augmented = []
        for item in data:
            augmented.append(item)
            # The prompt does not change between variants, so build it once.
            prompt = (
                "\n"
                "请用不同的方式重新表达以下指令,保持含义不变:\n"
                f"原始指令:{item['instruction']}\n"
                "重新表达的指令:\n"
            )
            for _ in range(num_augments):
                new_instruction = self.llm.generate(prompt)
                augmented.append(self._with_instruction(item, new_instruction))
        return augmented

    def augment_with_back_translation(self, data):
        """Create one back-translated variant per sample.

        Unlike ``augment_with_rephrasing``, the original samples are not
        included in the result.

        Args:
            data: Iterable of sample dicts with ``instruction`` and
                ``output`` keys and an optional ``input`` key.

        Returns:
            A list with one new sample per input sample.
        """
        augmented = []
        for item in data:
            # Translate into another language and back again.
            prompt = (
                "\n"
                "请将以下中文翻译成英文,然后再翻译回中文:\n"
                f"原文:{item['instruction']}\n"
                "回译结果:\n"
            )
            back_translated = self.llm.generate(prompt)
            augmented.append(self._with_instruction(item, back_translated))
        return augmented
LoRA微调
原理
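LoRA(Low-Rank Adaptation)冻结预训练权重 $W$,只训练两个低秩矩阵 $A \in \mathbb{R}^{d_{in} \times r}$ 和 $B \in \mathbb{R}^{r \times d_{out}}$(其中 $r \ll d_{in}, d_{out}$),并以 $W' = W + \frac{\alpha}{r} AB$ 作为微调后的权重,从而大幅减少可训练参数。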
import torch
import torch.nn as nn
import math
class LoRALayer(nn.Module):
    """Wrap a linear-style layer with a trainable low-rank (LoRA) update.

    The wrapped layer's parameters are frozen. The output is
    ``original_layer(x) + (x @ lora_A @ lora_B) * (alpha / rank)``, where
    ``lora_A`` has shape ``(in_features, rank)`` and ``lora_B`` has shape
    ``(rank, out_features)``. ``lora_B`` starts at zero, so a freshly wrapped
    layer produces the same output as the original one.

    Args:
        original_layer: Module exposing ``in_features`` and ``out_features``
            (e.g. ``nn.Linear``).
        rank: Rank of the low-rank update.
        alpha: Scaling numerator; the update is multiplied by ``alpha / rank``.
    """

    def __init__(self, original_layer, rank=8, alpha=16):
        super().__init__()
        self.original_layer = original_layer
        self.rank = rank
        self.alpha = alpha

        # Freeze the original layer.
        for param in self.original_layer.parameters():
            param.requires_grad = False

        in_features = original_layer.in_features
        out_features = original_layer.out_features

        # Create the LoRA matrices on the wrapped layer's device and in its
        # floating-point dtype; otherwise forward() fails with a device/dtype
        # mismatch as soon as the base layer is not CPU float32. Non-float
        # weights (e.g. packed quantised storage) keep the default dtype.
        factory_kwargs = {}
        weight = getattr(original_layer, "weight", None)
        if weight is not None:
            factory_kwargs["device"] = weight.device
            if weight.is_floating_point():
                factory_kwargs["dtype"] = weight.dtype

        # LoRA matrices.
        self.lora_A = nn.Parameter(
            torch.randn(in_features, rank, **factory_kwargs) / math.sqrt(rank)
        )
        self.lora_B = nn.Parameter(
            torch.zeros(rank, out_features, **factory_kwargs)
        )
        self.scaling = alpha / rank

    def forward(self, x):
        """Return the frozen layer's output plus the scaled LoRA update."""
        original_output = self.original_layer(x)
        lora_output = (x @ self.lora_A @ self.lora_B) * self.scaling
        return original_output + lora_output
使用PEFT库
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

# LoRA configuration.
lora_config = LoraConfig(
    r=8,  # rank
    lora_alpha=32,  # scaling factor
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # layers that LoRA is applied to
    task_type=TaskType.CAUSAL_LM,
)

# Load the base model and wrap it with LoRA in one step.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf"),
    lora_config,
)

# Print the number of trainable parameters.
model.print_trainable_parameters()
# Output: trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622
QLoRA微调
QLoRA在LoRA基础上加入量化,进一步降低显存需求。
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# 4-bit quantisation configuration.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the base model with the quantisation config applied.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    quantization_config=bnb_config,
)

# Prepare the quantised model for training, then apply LoRA on top of it.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
训练配置
训练参数
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Schedule and batch sizing.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimiser and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    # Precision.
    fp16=True,
    # Logging and checkpointing.
    output_dir="./output",
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    report_to="tensorboard",
)
数据加载
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Tokenizer for the base model. It has to exist before preprocessing, the
# data collator, evaluation and export, all of which reference `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
if tokenizer.pad_token is None:
    # The collator pads every batch, which needs a pad token; fall back to
    # the EOS token when the tokenizer does not define one.
    tokenizer.pad_token = tokenizer.eos_token

# Load the dataset.
dataset = load_dataset("json", data_files="train.json")


# Data preprocessing.
def preprocess_function(examples):
    """Render a batch of samples into prompt text and tokenize it.

    Args:
        examples: Batch mapping with parallel ``instruction``, ``input`` and
            ``output`` columns (as passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for the rendered texts, truncated to 512 tokens.
    """
    texts = []
    for instruction, input_text, output in zip(
        examples["instruction"],
        examples["input"],
        examples["output"],
    ):
        # The "### Input:" section is only emitted when the sample has one.
        if input_text:
            text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
        else:
            text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
        texts.append(text)
    return tokenizer(texts, truncation=True, max_length=512)


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Data collator (mlm=False selects causal language modelling).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
开始训练
from transformers import Trainer

# Wire the model, tokenized training split and collator into a Trainer,
# then run the training loop.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
trainer.train()
评估方法
自动评估
def evaluate_model(model, tokenizer, test_dataset):
    """Generate a response for every test sample and score it with BLEU.

    Args:
        model: Causal language model exposing ``eval()``, ``generate()`` and
            ``device``.
        tokenizer: Tokenizer matching ``model``.
        test_dataset: Iterable of dicts with ``instruction`` and ``output``
            keys and an optional ``input`` key.

    Returns:
        A list of dicts with the keys ``instruction``, ``expected``,
        ``generated`` and ``bleu``, one per test sample.
    """
    model.eval()
    results = []
    for item in test_dataset:
        # Build the prompt with the same template that preprocess_function
        # uses for training, including the optional "### Input:" section;
        # a prompt that differs from the training format skews the scores.
        input_text = item.get("input")
        if input_text:
            prompt = f"### Instruction:\n{item['instruction']}\n\n### Input:\n{input_text}\n\n### Response:\n"
        else:
            prompt = f"### Instruction:\n{item['instruction']}\n\n### Response:\n"

        # The tokenizer returns CPU tensors; move them to the model's device
        # so generate() does not fail when the model lives on a GPU.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
            )

        # Decode and keep only the text after the response marker.
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated = generated.split("### Response:\n")[-1].strip()

        # Compute the BLEU score against the reference answer.
        bleu = compute_bleu(item["output"], generated)
        results.append({
            "instruction": item["instruction"],
            "expected": item["output"],
            "generated": generated,
            "bleu": bleu,
        })
    return results
人工评估
def human_evaluation(results, num_samples=50, scorer=None):
    """Sample evaluation results and build a rating prompt for each one.

    Rating dimensions, each on a 0-5 scale:
    - relevance
    - fluency
    - accuracy

    Args:
        results: Sequence of dicts with ``instruction``, ``generated`` and
            ``expected`` keys.
        num_samples: Upper bound on how many results are sampled; fewer are
            used when ``results`` is shorter.
        scorer: Optional callable that receives the filled-in rating prompt
            for one sample (e.g. a wrapper around an LLM judge or a
            human-review queue) and returns its score. When omitted nothing
            is scored.

    Returns:
        A list with one ``scorer`` result per sampled item, or an empty list
        when no ``scorer`` is given.
    """
    # Imported here because the module does not import `random` at the top.
    import random

    samples = random.sample(results, min(num_samples, len(results)))
    evaluation_template = (
        "\n"
        "请评估以下AI回答的质量:\n"
        "问题:{instruction}\n"
        "AI回答:{generated}\n"
        "标准答案:{expected}\n"
        "请评分(0-5):\n"
        "1. 相关性:回答是否与问题相关\n"
        "2. 流畅性:回答是否通顺自然\n"
        "3. 准确性:回答是否准确\n"
    )
    scores = []
    if scorer is None:
        return scores
    for sample in samples:
        prompt = evaluation_template.format(
            instruction=sample["instruction"],
            generated=sample["generated"],
            expected=sample["expected"],
        )
        # Delegate the actual rating to the caller-supplied judge.
        scores.append(scorer(prompt))
    return scores
模型合并和导出
合并LoRA权重
def merge_lora_weights(model, lora_path):
    """Attach the LoRA adapter stored at ``lora_path`` to ``model`` and merge it.

    Args:
        model: Base model the adapter is loaded onto.
        lora_path: Location of the saved LoRA weights.

    Returns:
        The result of ``merge_and_unload()`` on the adapter-wrapped model.
    """
    from peft import PeftModel

    # Load the adapter, then fold it into the base model in one chain.
    return PeftModel.from_pretrained(model, lora_path).merge_and_unload()
# Save the merged model and the tokenizer into the same directory.
# NOTE(review): `merged_model` is only bound inside merge_lora_weights(); at
# module level it must first be assigned from that function's return value,
# e.g. merged_model = merge_lora_weights(<base model>, <adapter directory>).
# The right base model and adapter path are not shown here; confirm them.
merged_model.save_pretrained("merged_model")
tokenizer.save_pretrained("merged_model")
转换为GGUF格式
# Convert the merged_model directory with llama.cpp's converter script (f16 output type)
python convert_hf_to_gguf.py merged_model --outtype f16
总结
LLM微调是将通用模型适应特定任务的关键技术。通过合理选择微调方法、准备高质量数据、配置训练参数,可以高效地微调出性能优异的定制模型。