随机搜索在LLM超参数调优中的应用
---
title: "随机搜索在LLM超参数调优中的应用"
description: "介绍随机搜索算法在大型语言模型超参数调优中的原理、实现和最佳实践。"
tags: ["随机搜索", "超参数调优", "llm", "机器学习", "优化算法"]
category: "llm"
icon: "🧠"
---
随机搜索在LLM超参数调优中的应用
什么是随机搜索?
随机搜索是一种超参数优化方法,它从指定的参数分布中随机采样参数组合进行评估。
随机搜索原理
1. 基本实现
import random
class RandomSearch:
    """Plain random search over a dictionary of parameter distributions.

    ``param_distributions`` maps each parameter name to a spec dict whose
    ``"type"`` key selects how the value is drawn:

    * ``"uniform"``     - float drawn uniformly between ``low`` and ``high``.
    * ``"log_uniform"`` - ``10 ** u`` where ``u`` is uniform between ``low``
      and ``high`` (the bounds are base-10 exponents).
    * ``"int_uniform"`` - integer drawn uniformly from ``low``..``high``,
      both ends inclusive.
    * ``"choice"``      - one element of ``values``.
    """

    def __init__(self, param_distributions, n_iter=50):
        """Store the search space and the number of trials to run."""
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # One {"params": ..., "result": ...} entry per evaluated trial.
        self.results = []

    def sample_params(self):
        """Draw one value for every parameter in the search space.

        Returns:
            dict mapping parameter name to the sampled value.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                # Sample the exponent uniformly, then map back to linear scale.
                exponent = random.uniform(
                    distribution["low"], distribution["high"]
                )
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                # Failing here is clearer than a KeyError later inside the
                # objective when the parameter turns out to be missing.
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params

    def run(self, objective_function):
        """Evaluate ``n_iter`` random configurations and return the best one.

        Args:
            objective_function: callable taking the sampled parameter dict and
                returning a score; larger scores are treated as better.

        Returns:
            ``(best_params, best_result)`` over everything in ``self.results``.

        Raises:
            ValueError: if no trial has been evaluated (e.g. ``n_iter`` is 0).
        """
        for i in range(self.n_iter):
            params = self.sample_params()
            print(f"迭代 {i+1}/{self.n_iter}: {params}")
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no trials were evaluated; n_iter must be >= 1")

        best_result = max(self.results, key=lambda entry: entry["result"])
        return best_result["params"], best_result["result"]
2. 支持条件参数
class ConditionalRandomSearch:
    """Random search in which some parameters depend on other sampled ones.

    A spec may carry ``"depends_on": <other parameter name>``. Such parameters
    are sampled in a second pass, after all independent parameters, from a
    distribution adjusted to the value drawn for the dependency.

    A dependent spec may also carry ``"conditions"``: a mapping from a value of
    the dependency to the complete replacement spec to sample from. When
    ``"conditions"`` is absent, only the built-in ``model_type`` example
    presets for ``"choice"`` parameters are applied.
    """

    def __init__(self, param_distributions, n_iter=50):
        """Store the search space and the number of trials to run."""
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # One {"params": ..., "result": ...} entry per evaluated trial.
        self.results = []

    def sample_params(self):
        """Sample every parameter, resolving ``depends_on`` relationships.

        Returns:
            dict mapping parameter name to the sampled value. A dependent
            parameter whose dependency has not been sampled yet receives the
            spec's ``"default"`` (``None`` when no default is given).
        """
        params = {}

        # Pass 1: parameters that do not depend on anything.
        for param_name, distribution in self.param_distributions.items():
            if "depends_on" not in distribution:
                params[param_name] = self._sample_from_distribution(distribution)

        # Pass 2: parameters that depend on an already sampled value.
        for param_name, distribution in self.param_distributions.items():
            if "depends_on" not in distribution:
                continue
            dependency = distribution["depends_on"]
            if dependency in params:
                adjusted_distribution = self._adjust_distribution(
                    distribution, params[dependency]
                )
                params[param_name] = self._sample_from_distribution(
                    adjusted_distribution
                )
            else:
                params[param_name] = distribution.get("default", None)
        return params

    def _sample_from_distribution(self, distribution):
        """Draw a single value from one spec dict.

        Raises:
            ValueError: if the spec uses an unsupported ``"type"``.
        """
        kind = distribution["type"]
        if kind == "uniform":
            return random.uniform(distribution["low"], distribution["high"])
        if kind == "log_uniform":
            # Bounds are base-10 exponents.
            return 10 ** random.uniform(distribution["low"], distribution["high"])
        if kind == "choice":
            return random.choice(distribution["values"])
        if kind == "int_uniform":
            return random.randint(distribution["low"], distribution["high"])
        raise ValueError(f"unsupported distribution type {kind!r}")

    def _adjust_distribution(self, distribution, dependency_value):
        """Return the spec to sample from, given the dependency's value.

        An explicit ``"conditions"`` mapping on the spec wins. Otherwise the
        built-in ``model_type`` presets are used, but only for ``"choice"``
        parameters: the presets are layer-width candidates and would be
        meaningless for e.g. an ``int_uniform`` layer count.
        """
        conditions = distribution.get("conditions")
        if conditions is not None:
            return conditions.get(dependency_value, distribution)

        if (
            distribution["depends_on"] == "model_type"
            and distribution["type"] == "choice"
        ):
            if dependency_value == "transformer":
                return {"type": "choice", "values": [256, 512, 1024, 2048]}
            if dependency_value == "rnn":
                return {"type": "choice", "values": [64, 128, 256, 512]}
        return distribution

    def run(self, objective_function):
        """Evaluate ``n_iter`` sampled configurations and return the best one.

        Args:
            objective_function: callable taking the sampled parameter dict and
                returning a score; larger scores are treated as better.

        Returns:
            ``(best_params, best_result)`` over everything in ``self.results``.

        Raises:
            ValueError: if no trial has been evaluated (e.g. ``n_iter`` is 0).
        """
        for _ in range(self.n_iter):
            params = self.sample_params()
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no trials were evaluated; n_iter must be >= 1")

        best_result = max(self.results, key=lambda entry: entry["result"])
        return best_result["params"], best_result["result"]
LLM随机搜索实践
1. 学习率随机搜索
# Search space: a single learning rate, sampled log-uniformly.
lr_param_distributions = {
    "learning_rate": {
        "type": "log_uniform",
        "low": -6,  # 10^-6
        "high": -3,  # 10^-3
    }
}


def learning_rate_objective(params):
    """Train with the sampled learning rate and return the evaluation score.

    ``train_model`` and ``evaluate_model`` are not defined in this snippet;
    they are expected to be supplied by the surrounding project.
    """
    lr = params["learning_rate"]
    model = train_model(learning_rate=lr)
    accuracy = evaluate_model(model)
    return accuracy


# Run the search.
random_search = RandomSearch(lr_param_distributions, n_iter=30)
# run() hands back the whole winning parameter dict, so pull the learning
# rate out of it instead of printing the dict under the "learning rate" label.
best_params, best_accuracy = random_search.run(learning_rate_objective)
best_lr = best_params["learning_rate"]
print(f"最佳学习率: {best_lr}")
print(f"最佳准确率: {best_accuracy}")
2. 多参数随机搜索
# Search space covering several training hyperparameters at once.
# NOTE(review): "num_layers" uses the "int_uniform" type, so the search class
# used below has to know how to sample that type - confirm before running.
param_distributions = dict(
    learning_rate=dict(type="log_uniform", low=-6, high=-3),
    batch_size=dict(type="choice", values=[8, 16, 32, 64, 128]),
    dropout=dict(type="uniform", low=0.1, high=0.5),
    num_layers=dict(type="int_uniform", low=2, high=8),
    hidden_size=dict(type="choice", values=[128, 256, 512, 1024]),
)


def multi_param_objective(params):
    """Train with every sampled hyperparameter and return the evaluation score.

    ``train_model`` and ``evaluate_model`` are not defined in this snippet;
    they are expected to be supplied by the surrounding project.
    """
    forwarded = (
        "learning_rate",
        "batch_size",
        "dropout",
        "num_layers",
        "hidden_size",
    )
    training_kwargs = {name: params[name] for name in forwarded}
    trained = train_model(**training_kwargs)
    return evaluate_model(trained)


# Run the search.
random_search = RandomSearch(param_distributions, n_iter=50)
best_params, best_accuracy = random_search.run(multi_param_objective)
3. 条件参数随机搜索
# Search space in which hidden_size and num_layers are tied to model_type.
conditional_param_distributions = {
    "model_type": {
        "type": "choice",
        "values": ["transformer", "rnn", "lstm"],
    },
    "hidden_size": {
        "type": "choice",
        "values": [128, 256, 512, 1024],
        "depends_on": "model_type",
    },
    "num_layers": {
        "type": "int_uniform",
        "low": 1,
        "high": 6,
        "depends_on": "model_type",
    },
    "learning_rate": {
        "type": "log_uniform",
        "low": -5,
        "high": -3,
    },
}


def conditional_objective(params):
    """Build the architecture named by ``model_type``, train it, and score it.

    ``create_transformer``, ``create_rnn``, ``create_lstm``, ``train_model``
    and ``evaluate_model`` are not defined in this snippet; they are expected
    to be supplied by the surrounding project.

    Raises:
        ValueError: if ``params["model_type"]`` is not one of the three
            supported architectures.
    """
    model_type = params["model_type"]
    if model_type == "transformer":
        model = create_transformer(
            hidden_size=params["hidden_size"],
            num_layers=params["num_layers"],
        )
    elif model_type == "rnn":
        model = create_rnn(
            hidden_size=params["hidden_size"],
            num_layers=params["num_layers"],
        )
    elif model_type == "lstm":
        model = create_lstm(
            hidden_size=params["hidden_size"],
            num_layers=params["num_layers"],
        )
    else:
        # Without this branch `model` would be unbound below and the failure
        # would surface as an UnboundLocalError with no hint at the cause.
        raise ValueError(f"unsupported model_type: {model_type!r}")

    # Train and evaluate.
    train_model(model, learning_rate=params["learning_rate"])
    accuracy = evaluate_model(model)
    return accuracy


# Run the search.
random_search = ConditionalRandomSearch(conditional_param_distributions, n_iter=40)
best_params, best_accuracy = random_search.run(conditional_objective)
高级随机搜索技术
1. 分层随机搜索
class StratifiedRandomSearch:
    """Random search that samples ``uniform``/``choice`` parameters by stratum.

    For each draw a stratum is picked at random and the value is sampled
    inside it. ``uniform`` ranges are cut into ``n_strata`` equal-width
    intervals; ``choice`` lists are cut into contiguous groups. Other
    distribution types are sampled directly.
    """

    def __init__(self, param_distributions, n_iter=50, n_strata=5):
        """Store the search space, trial count and number of strata."""
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.n_strata = n_strata
        # One {"params": ..., "result": ...} entry per evaluated trial.
        self.results = []

    def stratified_sample(self):
        """Draw one configuration, choosing a random stratum per parameter.

        Returns:
            dict mapping parameter name to the sampled value.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                low, high = distribution["low"], distribution["high"]
                stratum_size = (high - low) / self.n_strata
                stratum_idx = random.randrange(self.n_strata)
                stratum_low = low + stratum_idx * stratum_size
                params[param_name] = random.uniform(
                    stratum_low, stratum_low + stratum_size
                )
            elif kind == "choice":
                values = distribution["values"]
                # Never use more groups than there are values: otherwise some
                # groups would be empty and random.choice would raise.
                n_groups = max(1, min(self.n_strata, len(values)))
                stratum_idx = random.randrange(n_groups)
                start_idx = stratum_idx * len(values) // n_groups
                end_idx = (stratum_idx + 1) * len(values) // n_groups
                params[param_name] = random.choice(values[start_idx:end_idx])
            else:
                # Remaining types are sampled without stratification.
                params[param_name] = self._sample_from_distribution(distribution)
        return params

    def _sample_from_distribution(self, distribution):
        """Draw a single value from a spec that is not stratified.

        Raises:
            ValueError: if the spec uses an unsupported ``"type"``.
        """
        kind = distribution["type"]
        if kind == "uniform":
            return random.uniform(distribution["low"], distribution["high"])
        if kind == "log_uniform":
            # Bounds are base-10 exponents.
            return 10 ** random.uniform(distribution["low"], distribution["high"])
        if kind == "int_uniform":
            return random.randint(distribution["low"], distribution["high"])
        if kind == "choice":
            return random.choice(distribution["values"])
        raise ValueError(f"unsupported distribution type {kind!r}")

    def run(self, objective_function):
        """Evaluate ``n_iter`` stratified samples and return the best one.

        Args:
            objective_function: callable taking the sampled parameter dict and
                returning a score; larger scores are treated as better.

        Returns:
            ``(best_params, best_result)`` over everything in ``self.results``.

        Raises:
            ValueError: if no trial has been evaluated (e.g. ``n_iter`` is 0).
        """
        for _ in range(self.n_iter):
            params = self.stratified_sample()
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no trials were evaluated; n_iter must be >= 1")

        best_result = max(self.results, key=lambda entry: entry["result"])
        return best_result["params"], best_result["result"]
2. 自适应随机搜索
class AdaptiveRandomSearch:
    """Random search that periodically narrows continuous ranges.

    Every 10 iterations the ``uniform`` and ``log_uniform`` ranges are shrunk
    to the region spanned by the top 20% of results so far (plus a 20%
    margin), always clipped to the original bounds. Other distribution types
    are left unchanged.
    """

    def __init__(self, param_distributions, n_iter=50):
        """Store the search space and the number of trials to run."""
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # Every evaluated trial, as {"params": ..., "result": ...}.
        self.results = []
        # Trials that improved on the previously recorded best, in order.
        self.best_results = []

    def adapt_distributions(self):
        """Return distributions narrowed around the best results so far.

        With fewer than 5 recorded results the original distributions are
        returned unchanged.
        """
        # Local import: keeps the snippet self-contained and avoids the
        # undefined ``np`` name without requiring numpy.
        import math

        if len(self.results) < 5:
            return self.param_distributions

        # Keep the top 20% (at least one) of the results.
        ranked = sorted(self.results, key=lambda entry: entry["result"], reverse=True)
        top_results = ranked[:max(1, len(ranked) // 5)]

        adapted_distributions = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind not in ("uniform", "log_uniform"):
                adapted_distributions[param_name] = distribution
                continue

            top_values = [entry["params"][param_name] for entry in top_results]
            if kind == "log_uniform":
                # The bounds live in exponent space, so compare exponents.
                top_values = [math.log10(value) for value in top_values]

            lowest, highest = min(top_values), max(top_values)
            margin = (highest - lowest) * 0.2
            adapted_distributions[param_name] = {
                "type": kind,
                "low": max(distribution["low"], lowest - margin),
                "high": min(distribution["high"], highest + margin),
            }
        return adapted_distributions

    def sample_params(self, distributions):
        """Draw one value per parameter from the given distributions.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                exponent = random.uniform(distribution["low"], distribution["high"])
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params

    def run(self, objective_function):
        """Run the adaptive search and return the best configuration found.

        Args:
            objective_function: callable taking the sampled parameter dict and
                returning a score; larger scores are treated as better.

        Returns:
            ``(best_params, best_result)`` over everything in ``self.results``.

        Raises:
            ValueError: if no trial has been evaluated (e.g. ``n_iter`` is 0).
        """
        current_distributions = self.param_distributions.copy()
        for i in range(self.n_iter):
            # Re-derive the distributions every 10 iterations.
            if i % 10 == 0 and i > 0:
                current_distributions = self.adapt_distributions()

            params = self.sample_params(current_distributions)
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

            # Track each new best as it appears.
            if not self.best_results or result > self.best_results[-1]["result"]:
                self.best_results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no trials were evaluated; n_iter must be >= 1")

        best_result = max(self.results, key=lambda entry: entry["result"])
        return best_result["params"], best_result["result"]
3. 多保真度随机搜索
class MultiFidelityRandomSearch:
    """Three-stage random search that spends full training only on survivors.

    Stage 1 scores ``n_iter // 2`` random configurations at the lowest
    fidelity and keeps the top third; stage 2 re-scores the survivors at the
    second fidelity; stage 3 runs the full evaluation on them and returns
    the best.

    ``train_model_quick``, ``evaluate_model_quick``, ``train_model_full`` and
    ``evaluate_model_full`` are not defined in this snippet; they are expected
    to be supplied by the surrounding project.
    """

    def __init__(self, param_distributions, n_iter=50, fidelities=None):
        """Store the search space, trial budget and fidelity levels.

        Args:
            param_distributions: mapping of parameter name to spec dict.
            n_iter: overall budget; half of it is spent in stage 1.
            fidelities: step counts used by the cheap stages. Defaults to
                ``[100, 500, 1000]``.
        """
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # A fresh list per instance: a list literal as the default argument
        # would be shared (and mutable) across every instance.
        self.fidelities = [100, 500, 1000] if fidelities is None else fidelities
        self.results = []

    def low_fidelity_evaluate(self, params, fidelity):
        """Score ``params`` after a short training run of ``fidelity`` steps."""
        model = train_model_quick(params, steps=fidelity)
        return evaluate_model_quick(model)

    def high_fidelity_evaluate(self, params):
        """Score ``params`` after a full training run."""
        model = train_model_full(params)
        return evaluate_model_full(model)

    def run(self):
        """Run the three stages and return ``(best_params, best_score)``.

        Returns ``(None, -inf)`` when stage 1 evaluates no configuration
        (``n_iter < 2``).
        """
        # Stage 1: cheap screening.
        print("阶段1:低保真度筛选")
        promising_params = []
        for _ in range(self.n_iter // 2):
            params = self._sample_params()
            score = self.low_fidelity_evaluate(params, self.fidelities[0])
            promising_params.append((params, score))

        # Keep the top third, but never drop every candidate when there are
        # fewer than three of them.
        promising_params.sort(key=lambda pair: pair[1], reverse=True)
        if promising_params:
            keep = max(1, len(promising_params) // 3)
            promising_params = promising_params[:keep]

        # Stage 2: medium-fidelity re-scoring. The scores are recorded, but
        # every survivor still proceeds to stage 3.
        print("阶段2:中保真度评估")
        for i, (params, _) in enumerate(promising_params):
            score = self.low_fidelity_evaluate(params, self.fidelities[1])
            promising_params[i] = (params, score)

        # Stage 3: full evaluation of the survivors.
        print("阶段3:高保真度评估")
        best_params, best_score = None, -float('inf')
        for params, _ in promising_params:
            score = self.high_fidelity_evaluate(params)
            if score > best_score:
                best_score = score
                best_params = params
        return best_params, best_score

    def _sample_params(self):
        """Draw one value per parameter from the search space.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                exponent = random.uniform(distribution["low"], distribution["high"])
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params
实际应用案例
案例:LLM训练超参数优化
# Search space for the LLM training run.
llm_param_distributions = dict(
    learning_rate=dict(type="log_uniform", low=-6, high=-3),
    batch_size=dict(type="choice", values=[8, 16, 32, 64, 128]),
    warmup_steps=dict(type="int_uniform", low=100, high=1000),
    weight_decay=dict(type="uniform", low=0.0, high=0.1),
    dropout=dict(type="uniform", low=0.1, high=0.5),
    gradient_accumulation_steps=dict(type="choice", values=[1, 2, 4, 8]),
)


def llm_training_objective(params):
    """Score a configuration by accuracy, discounted by estimated training time.

    ``train_llm``, ``evaluate_llm`` and ``estimate_training_time`` are not
    defined in this snippet; they are expected to be supplied by the
    surrounding project.
    """
    forwarded = (
        "learning_rate",
        "batch_size",
        "warmup_steps",
        "weight_decay",
        "dropout",
        "gradient_accumulation_steps",
    )
    model = train_llm(**{name: params[name] for name in forwarded})
    accuracy = evaluate_llm(model)

    # Simplified time penalty: the factor falls linearly with the estimate
    # and reaches zero at 7200.
    training_time = estimate_training_time(params)
    time_factor = 1 - training_time / 7200
    return accuracy * time_factor


# Run the search and report the winner.
random_search = RandomSearch(llm_param_distributions, n_iter=100)
best_params, best_score = random_search.run(llm_training_objective)
print(f"最佳参数: {best_params}")
print(f"最佳分数: {best_score}")
案例:提示优化随机搜索
# Search space over text-generation (decoding) settings.
prompt_param_distributions = dict(
    temperature=dict(type="uniform", low=0.1, high=1.0),
    top_p=dict(type="uniform", low=0.1, high=1.0),
    max_tokens=dict(type="int_uniform", low=50, high=1000),
    frequency_penalty=dict(type="uniform", low=0.0, high=2.0),
    presence_penalty=dict(type="uniform", low=0.0, high=2.0),
)


def prompt_optimization_objective(params):
    """Generate responses with the sampled settings and return their quality.

    ``generate_responses``, ``test_prompts`` and ``evaluate_response_quality``
    are not defined in this snippet; they are expected to be supplied by the
    surrounding project.
    """
    forwarded = (
        "temperature",
        "top_p",
        "max_tokens",
        "frequency_penalty",
        "presence_penalty",
    )
    generation_kwargs = {name: params[name] for name in forwarded}
    responses = generate_responses(test_prompts, **generation_kwargs)
    return evaluate_response_quality(responses)


# Run the search.
prompt_random_search = RandomSearch(prompt_param_distributions, n_iter=50)
best_prompt_params, best_prompt_score = prompt_random_search.run(prompt_optimization_objective)
最佳实践
1. 参数分布设计
# Preset search spaces per model family.
def design_param_distributions(model_type):
    """Return the preset search space for ``model_type``.

    Presets exist for ``"transformer"`` and ``"rnn"``. Any other value yields
    ``None``.
    """

    def spec(kind, **fields):
        # Small builder so each preset row stays on one line.
        return {"type": kind, **fields}

    presets = {
        "transformer": {
            "learning_rate": spec("log_uniform", low=-6, high=-3),
            "batch_size": spec("choice", values=[8, 16, 32, 64]),
            "hidden_size": spec("choice", values=[256, 512, 1024, 2048]),
            "num_layers": spec("int_uniform", low=2, high=8),
            "dropout": spec("uniform", low=0.1, high=0.5),
        },
        "rnn": {
            "learning_rate": spec("log_uniform", low=-4, high=-2),
            "batch_size": spec("choice", values=[16, 32, 64, 128]),
            "hidden_size": spec("choice", values=[64, 128, 256, 512]),
            "num_layers": spec("int_uniform", low=1, high=4),
            "dropout": spec("uniform", low=0.2, high=0.5),
        },
    }
    return presets.get(model_type)
2. 搜索次数确定
# Derive the number of search iterations from the size of the search space.
def determine_search_iterations(param_distributions, confidence_level=0.95):
    """Pick an iteration count from a rough size estimate of the search space.

    Each continuous parameter (``uniform`` or ``log_uniform``) is counted as
    10 effective values, each ``choice`` as its number of values and each
    ``int_uniform`` as the number of integers in its range. The product is
    scaled by ``ln(1 / (1 - confidence_level))`` and clamped to ``[20, 200]``.

    Args:
        param_distributions: mapping of parameter name to spec dict.
        confidence_level: value in ``[0, 1)``.

    Returns:
        The number of iterations, an int between 20 and 200.

    Raises:
        ValueError: if ``confidence_level`` is outside ``[0, 1)``.
    """
    import math

    if not 0 <= confidence_level < 1:
        # confidence_level == 1 would divide by zero below.
        raise ValueError("confidence_level must satisfy 0 <= confidence_level < 1")

    complexity = 1
    for distribution in param_distributions.values():
        kind = distribution["type"]
        if kind in ("uniform", "log_uniform"):
            # Continuous parameter: assume 10 effective values.
            complexity *= 10
        elif kind == "choice":
            complexity *= len(distribution["values"])
        elif kind == "int_uniform":
            complexity *= distribution["high"] - distribution["low"] + 1

    n_iter = int(math.ceil(complexity * math.log(1 / (1 - confidence_level))))
    # Keep the budget within practical bounds.
    return max(20, min(n_iter, 200))
3. 资源管理
# Random search that respects an estimated time budget.
class ResourceAwareRandomSearch:
    """Random search that skips trials which would exceed a time budget.

    Before each trial the training time is estimated from the sampled
    parameters. A trial is evaluated only if the running total of estimates
    plus its own estimate stays within ``max_time``; otherwise it is skipped
    and the next sample is tried.
    """

    def __init__(self, param_distributions, n_iter=50, max_time=3600):
        """Store the search space, the trial count and the time budget."""
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.max_time = max_time
        # One {"params", "result", "estimated_time"} entry per evaluated trial.
        self.results = []

    def estimate_training_time(self, params):
        """Return a simplified training-time estimate for ``params``.

        Starts from a base of 100 (seconds, per the original author's
        comment) and scales it by ``16 / batch_size``,
        ``(hidden_size / 512) ** 2`` and ``num_layers / 4`` for whichever of
        those parameters are present.
        """
        base_time = 100
        if "batch_size" in params:
            base_time *= (16 / params["batch_size"])
        if "hidden_size" in params:
            base_time *= (params["hidden_size"] / 512) ** 2
        if "num_layers" in params:
            base_time *= params["num_layers"] / 4
        return base_time

    def run(self, objective_function):
        """Run the search within the time budget.

        Args:
            objective_function: callable taking the sampled parameter dict and
                returning a score; larger scores are treated as better.

        Returns:
            ``(best_params, best_result)``, or ``(None, None)`` when every
            sampled configuration was skipped.
        """
        total_time = 0
        for _ in range(self.n_iter):
            params = self._sample_params()
            estimated_time = self.estimate_training_time(params)

            # Skip rather than stop: a cheaper configuration may still fit.
            if total_time + estimated_time > self.max_time:
                print(f"跳过参数 {params}: 时间限制")
                continue

            result = objective_function(params)
            self.results.append({
                "params": params,
                "result": result,
                "estimated_time": estimated_time,
            })
            total_time += estimated_time

        if not self.results:
            return None, None
        best_result = max(self.results, key=lambda entry: entry["result"])
        return best_result["params"], best_result["result"]

    def _sample_params(self):
        """Draw one value per parameter from the search space.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                exponent = random.uniform(distribution["low"], distribution["high"])
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                # Needed so integer parameters such as num_layers, which the
                # time estimate reads, can actually be sampled.
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params
总结
随机搜索是LLM超参数调优的高效方法:
- 简单易用 - 实现简单,易于理解
- 高效采样 - 比网格搜索更高效地探索参数空间
- 并行化 - 容易并行化加速
- 自适应 - 可以根据历史结果调整搜索策略
- 灵活性 - 支持各种参数分布类型
通过合理设计参数分布、确定搜索次数、管理计算资源,随机搜索可以在有限的计算预算内找到高质量的超参数组合。在LLM开发中,随机搜索常用于初始超参数探索和快速原型开发。