模型合并:组合多个模型的能力
---
title: "模型合并:组合多个模型的能力"
description: "掌握模型合并的各种技术,包括权重平均、任务算术和DARE等方法"
tags: ["模型合并", "权重平均", "模型组合", "能力融合"]
category: "llm"
icon: "🧠"
---
模型合并:组合多个模型的能力
模型合并简介
模型合并(Model Merging)是将多个微调后的模型组合成一个模型的技术。通过合并,可以将不同模型的专长融合到一个模型中,无需重新训练或额外数据。模型合并已成为开源LLM社区的重要实践。
模型合并的核心价值:
- 能力融合:将不同领域的专业知识整合
- 成本节省:避免从头训练多任务模型
- 快速迭代:快速创建具有新能力的模型
- 知识保留:合并后的模型可同时保留各微调模型学到的知识(注意:合并要求各模型架构相同,它并不能像知识蒸馏那样把大模型的能力迁移到小模型)
合并方法
权重平均(Weight Averaging)
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM
def average_merge(model_paths, weights=None):
    """Merge several checkpoints into one by a weighted average of their parameters.

    Every floating-point tensor of the result is ``sum(weights[i] * model_i[key])``.
    Models are loaded one at a time; the first one is reused as the returned model.
    Non-floating-point tensors (e.g. integer buffers) and keys that are missing
    from any of the other models keep the first model's values.

    Args:
        model_paths: Paths or hub ids accepted by
            ``AutoModelForCausalLM.from_pretrained``. Must not be empty.
        weights: One coefficient per model. Defaults to a uniform average.

    Returns:
        The model loaded from ``model_paths[0]`` with the merged weights loaded.

    Raises:
        ValueError: If ``model_paths`` is empty or ``weights`` has a different
            length than ``model_paths``.
    """
    if not model_paths:
        raise ValueError("model_paths must contain at least one model")
    if weights is None:
        weights = [1.0 / len(model_paths)] * len(model_paths)
    elif len(weights) != len(model_paths):
        raise ValueError("weights must have one entry per model path")

    # The first model doubles as the container for the merged weights.
    merged_model = AutoModelForCausalLM.from_pretrained(model_paths[0])
    merged_state_dict = merged_model.state_dict()

    # Contributions of models 2..N are accumulated separately so that
    # weights[0] is applied to the first model exactly once (re-scaling the
    # running result on every iteration would shrink earlier models).
    contributions = {}
    incomplete_keys = set()
    for model_path, weight in zip(model_paths[1:], weights[1:]):
        state_dict = AutoModelForCausalLM.from_pretrained(model_path).state_dict()
        for key, reference in merged_state_dict.items():
            if not torch.is_floating_point(reference):
                continue
            if key not in state_dict:
                incomplete_keys.add(key)
                continue
            term = state_dict[key] * weight
            if key in contributions:
                contributions[key] = contributions[key] + term
            else:
                contributions[key] = term

    for key in list(merged_state_dict):
        reference = merged_state_dict[key]
        if key in incomplete_keys or not torch.is_floating_point(reference):
            continue
        merged = reference * weights[0]
        if key in contributions:
            merged = merged + contributions[key]
        merged_state_dict[key] = merged

    merged_model.load_state_dict(merged_state_dict)
    return merged_model
# Usage: merge three checkpoints with explicit per-model weights.
# NOTE(review): the paths look like placeholders -- replace with real checkpoints.
model_paths = ["model_task_a", "model_task_b", "model_task_c"]
weights = [0.4, 0.3, 0.3]
merged_model = average_merge(model_paths, weights)
任务算术(Task Arithmetic)
def task_arithmetic_merge(base_model_path, task_model_paths,
                          task_strengths, alpha=1.0):
    """Merge fine-tuned models into a base model with task arithmetic.

    For every floating-point tensor the result is
    ``base + alpha * sum(strength_i * (task_i - base))``. Each task vector is
    taken against the ORIGINAL base weights, so the order of the task models
    does not affect the result.

    Args:
        base_model_path: Checkpoint the task models were fine-tuned from.
        task_model_paths: Fine-tuned checkpoints to fold into the base model.
        task_strengths: One scaling factor per task model.
        alpha: Global scaling applied on top of every task strength.

    Returns:
        The base model with the merged weights loaded into it.

    Raises:
        ValueError: If ``task_model_paths`` and ``task_strengths`` differ in length.
    """
    if len(task_model_paths) != len(task_strengths):
        raise ValueError("task_strengths must have one entry per task model")

    base_model = AutoModelForCausalLM.from_pretrained(base_model_path)
    base_state_dict = base_model.state_dict()
    # Shallow copy: entries are rebound to new tensors below, so the tensors in
    # base_state_dict stay pristine for computing later task vectors.
    merged_state_dict = dict(base_state_dict)

    for task_path, strength in zip(task_model_paths, task_strengths):
        task_state_dict = AutoModelForCausalLM.from_pretrained(task_path).state_dict()
        for key, base_tensor in base_state_dict.items():
            # Integer buffers carry no task vector; missing keys are left as in the base.
            if key not in task_state_dict or not torch.is_floating_point(base_tensor):
                continue
            # Task vector = fine-tuned weights minus the original base weights.
            task_vector = task_state_dict[key] - base_tensor
            merged_state_dict[key] = merged_state_dict[key] + alpha * strength * task_vector

    base_model.load_state_dict(merged_state_dict)
    return base_model
# Usage: add a math and a code task vector to the base model.
# Each task vector is scaled by alpha * strength, i.e. 0.5 and 0.4 here.
merged_model = task_arithmetic_merge(
base_model_path="base_llama",
task_model_paths=["math_model", "code_model"],
task_strengths=[1.0, 0.8],
alpha=0.5
)
TIES合并
def _trim_top_k(task_vectors, top_k):
    """Zero all but the ``top_k`` fraction of largest-magnitude entries of each task vector."""
    flat = task_vectors.reshape(task_vectors.shape[0], -1)
    per_model = flat.shape[1]
    if per_model == 0 or top_k >= 1:
        return task_vectors
    if top_k <= 0:
        return torch.zeros_like(task_vectors)
    # Keep at least one entry so tiny tensors (e.g. scalars) are not wiped out.
    keep = max(1, int(per_model * top_k))
    indices = torch.topk(flat.abs(), keep, dim=1).indices
    mask = torch.zeros_like(flat, dtype=torch.bool)
    mask.scatter_(1, indices, True)
    return (flat * mask).reshape(task_vectors.shape)


def ties_merge(models, base_model, top_k=0.2, density=0.5):
    """Merge fine-tuned models into ``base_model`` with TIES (trim, elect sign, merge).

    For every floating-point parameter:

    1. Trim: per model, keep only the ``top_k`` fraction of the task vector
       (``model - base``) with the largest magnitude.
    2. Elect sign: per entry, take the sign of the summed trimmed task vectors.
    3. Disjoint merge: average the trimmed values whose sign agrees with the
       elected sign and add the result to the base weights.

    Args:
        models: Fine-tuned modules sharing ``base_model``'s parameter names.
        base_model: Module the task vectors are measured against. It is
            modified in place.
        top_k: Fraction of entries kept per task vector in the trim step.
        density: Unused; accepted only for backward compatibility.

    Returns:
        ``base_model`` with the merged weights loaded into it.
    """
    # Build each state dict once; rebuilding them per key is needlessly slow.
    base_state_dict = base_model.state_dict()
    model_state_dicts = [model.state_dict() for model in models]
    merged_state_dict = dict(base_state_dict)

    for key, base_tensor in base_state_dict.items():
        if not torch.is_floating_point(base_tensor):
            continue
        task_vectors = [sd[key] - base_tensor for sd in model_state_dicts if key in sd]
        if not task_vectors:
            continue

        trimmed = _trim_top_k(torch.stack(task_vectors), top_k)
        elected_sign = trimmed.sum(dim=0).sign()
        # Trimmed-away zeros must not count as agreeing with an elected sign of 0.
        agrees = (trimmed.sign() == elected_sign) & (trimmed != 0)
        contributors = agrees.sum(dim=0).clamp(min=1)
        # The kept values already carry the elected sign, so no extra sign
        # multiplication is needed (it would turn negative updates positive).
        merged_delta = (trimmed * agrees).sum(dim=0) / contributors
        merged_state_dict[key] = base_tensor + merged_delta

    base_model.load_state_dict(merged_state_dict)
    return base_model
DARE(Drop And REscale)
def dare_merge(models, base_model, target_p=0.5, density=0.3):
    """Merge fine-tuned models into ``base_model`` with DARE (Drop And REscale).

    For each model and each floating-point parameter, the delta
    ``model - base`` is randomly sparsified: every entry is dropped with
    probability ``target_p`` and the survivors are divided by
    ``1 - target_p``. The rescaled deltas of all models are summed onto the
    base weights.

    Args:
        models: Fine-tuned modules sharing ``base_model``'s parameter names.
        base_model: Module the deltas are measured against. It is modified in
            place, but only once all deltas have been computed.
        target_p: Drop probability, ``0 <= target_p < 1``.
        density: Unused; accepted only for backward compatibility.

    Returns:
        ``base_model`` with the merged weights loaded into it.

    Raises:
        ValueError: If ``target_p`` is outside ``[0, 1)``; at 1 the rescale
            would divide by zero.
    """
    if not 0.0 <= target_p < 1.0:
        raise ValueError("target_p must satisfy 0 <= target_p < 1")
    keep_prob = 1.0 - target_p

    base_state_dict = base_model.state_dict()
    # state_dict() tensors share storage with the live parameters, so the
    # merged values are built out of place. An in-place += would silently move
    # the base weights and corrupt the deltas of every later model.
    merged_state_dict = dict(base_state_dict)

    for model in models:
        state_dict = model.state_dict()
        for key, base_tensor in base_state_dict.items():
            if key not in state_dict or not torch.is_floating_point(base_tensor):
                continue
            delta = state_dict[key] - base_tensor
            # Random drop: 1 keeps an entry, 0 drops it.
            keep_mask = torch.bernoulli(torch.full_like(delta, keep_prob))
            # Rescale survivors by 1 / keep_prob.
            delta = delta * keep_mask / keep_prob
            merged_state_dict[key] = merged_state_dict[key] + delta

    base_model.load_state_dict(merged_state_dict)
    return base_model
使用mergekit工具
# mergekit配置文件示例 (config.yaml)
"""
models:
  - model: model_a
    parameters:
      weight: 0.6
  - model: model_b
    parameters:
      weight: 0.4
merge_method: linear
dtype: float16
"""
# 使用mergekit
# mergekit-yaml config.yaml output_model
mergekit任务算术
# 任务算术配置 (task_arithmetic.yaml)
"""
models:
  - model: base_model
    parameters:
      weight: 1.0
  - model: math_model
    parameters:
      weight: 0.5
  - model: code_model
    parameters:
      weight: 0.3
merge_method: task_arithmetic
base_model: base_model
dtype: float16
"""
合并策略选择
def select_merge_strategy(models, task_requirements):
    """Pick a merge strategy description based on the caller's priorities.

    Flags are checked in this order; the first truthy one wins:
    ``quality_first`` -> DARE-TIES, ``speed_first`` -> SLERP,
    ``capability_first`` -> task arithmetic. With no flag set, a uniform
    linear merge is returned.

    Args:
        models: The models to be merged; only their count is used, to size
            the per-model parameter lists.
        task_requirements: Mapping of requirement flags (see above).

    Returns:
        A dict with the keys ``"method"``, ``"适用场景"`` and ``"参数"``.
    """
    num_models = len(models)
    # Guard the division so an empty model list yields empty weights instead
    # of raising ZeroDivisionError before any strategy is even selected.
    uniform_weights = [1 / num_models] * num_models if num_models else []

    strategies = {
        "简单融合": {
            "method": "linear",
            "适用场景": "模型能力互补,无冲突",
            "参数": {"weights": uniform_weights},
        },
        "能力增强": {
            "method": "task_arithmetic",
            "适用场景": "需要在基础模型上增强特定能力",
            "参数": {"alpha": 0.5, "strengths": [0.8] * num_models},
        },
        "高质量合并": {
            "method": "dare_ties",
            "适用场景": "追求最佳质量,有计算资源",
            "参数": {"density": 0.3, "weight": 0.5},
        },
        "快速原型": {
            "method": "slerp",
            "适用场景": "快速验证合并效果",
            "参数": {"t": 0.5},
        },
    }

    if task_requirements.get("quality_first"):
        return strategies["高质量合并"]
    if task_requirements.get("speed_first"):
        return strategies["快速原型"]
    # Checked after the pre-existing flags so existing callers see no change;
    # without this branch the task-arithmetic entry could never be selected.
    if task_requirements.get("capability_first"):
        return strategies["能力增强"]
    return strategies["简单融合"]
评估合并效果
def evaluate_merged_model(merged_model, test_datasets, tokenizer):
    """Score a merged model on several test sets.

    For every sample the prompt is tokenized, up to 256 new tokens are
    generated, and the decoded continuation is scored against
    ``sample["expected"]`` with ``evaluate_response`` (defined outside this
    block).

    Args:
        merged_model: Model exposing ``generate``.
        test_datasets: Mapping of dataset name to an iterable of samples,
            each with the keys ``"prompt"`` and ``"expected"``.
        tokenizer: Tokenizer matching ``merged_model``.

    Returns:
        Mapping of dataset name to ``{"mean_score": ..., "std_score": ...}``.
        Both values are NaN for a dataset without samples.
    """
    # Local import: numpy is not imported at module level in this file.
    import numpy as np

    results = {}
    for dataset_name, dataset in test_datasets.items():
        scores = []
        for sample in dataset:
            inputs = tokenizer(sample["prompt"], return_tensors="pt")
            outputs = merged_model.generate(**inputs, max_new_tokens=256)
            # A causal LM returns prompt + continuation from generate(); drop
            # the prompt tokens so only the model's answer is scored.
            prompt_length = inputs["input_ids"].shape[1]
            response = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            )
            # Scoring is delegated (LLM judge or automatic metric).
            scores.append(evaluate_response(response, sample["expected"]))

        # Avoid numpy's empty-slice warning for datasets without samples.
        results[dataset_name] = {
            "mean_score": np.mean(scores) if scores else float("nan"),
            "std_score": np.std(scores) if scores else float("nan"),
        }
    return results
# Usage: evaluate the merged model on one test set per capability.
# NOTE(review): math_test_data, code_test_data, commonsense_test_data and
# tokenizer are not defined in this file -- presumably loaded elsewhere.
test_datasets = {
"数学": math_test_data,
"编程": code_test_data,
"常识": commonsense_test_data
}
results = evaluate_merged_model(merged_model, test_datasets, tokenizer)
print(results)
最佳实践
- 备份原始模型:合并前保存原始模型权重
- 渐进合并:先合并两个模型,验证效果后再继续
- 权重调优:通过网格搜索找到最佳权重组合
- 全面评估:在多个维度评估合并效果
- 社区共享:将成功的合并配置分享给社区
模型合并为LLM能力扩展提供了灵活高效的途径,是开源社区的重要实践。