← 返回首页
🧠

随机搜索在LLM超参数调优中的应用

📂 llm ⏱ 8 min 1555 words

---
title: "随机搜索在LLM超参数调优中的应用"
description: "介绍随机搜索算法在大型语言模型超参数调优中的原理、实现和最佳实践。"
tags: ["随机搜索", "超参数调优", "llm", "机器学习", "优化算法"]
category: "llm"
icon: "🧠"
---

随机搜索在LLM超参数调优中的应用

什么是随机搜索?

随机搜索是一种超参数优化方法,它从指定的参数分布中随机采样参数组合进行评估。

随机搜索原理

1. 基本实现

import random

class RandomSearch:
    """Plain random search over a dictionary of parameter distributions.

    ``param_distributions`` maps each parameter name to a spec dict whose
    ``"type"`` key selects how the value is drawn:

    * ``"uniform"``     -- float drawn uniformly between ``low`` and ``high``.
    * ``"log_uniform"`` -- ``10 ** u`` where ``u`` is uniform between ``low``
      and ``high`` (the bounds are base-10 exponents).
    * ``"int_uniform"`` -- integer between ``low`` and ``high``, inclusive.
    * ``"choice"``      -- one element picked from ``values``.
    """

    def __init__(self, param_distributions, n_iter=50):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # One {"params": ..., "result": ...} record per evaluated sample.
        self.results = []

    def sample_params(self):
        """Draw one value for every configured parameter.

        Returns:
            dict mapping parameter name to the sampled value.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``. Silently
                omitting the parameter would only surface later as a
                ``KeyError`` inside the objective function.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                # Sample the exponent uniformly, then map back to linear scale.
                exponent = random.uniform(
                    distribution["low"], distribution["high"]
                )
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                # randint includes both endpoints.
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params

    def run(self, objective_function):
        """Evaluate ``n_iter`` random samples and return the best one.

        Args:
            objective_function: callable taking the sampled parameter dict
                and returning a score; higher is treated as better.

        Returns:
            ``(best_params, best_result)`` over everything stored in
            ``self.results`` (which also keeps records from earlier calls).

        Raises:
            ValueError: if no sample has been evaluated (``n_iter`` < 1 on
                a fresh instance).
        """
        for i in range(self.n_iter):
            params = self.sample_params()
            print(f"迭代 {i+1}/{self.n_iter}: {params}")

            result = objective_function(params)
            self.results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no samples were evaluated; n_iter must be at least 1")

        best_result = max(self.results, key=lambda record: record["result"])
        return best_result["params"], best_result["result"]

2. 支持条件参数

class ConditionalRandomSearch:
    """Random search whose parameters may depend on other sampled parameters.

    A spec dict may carry a ``"depends_on"`` key naming another parameter.
    Independent parameters are sampled first; each dependent parameter is
    then sampled from a distribution adjusted by the value its dependency
    received.
    """

    def __init__(self, param_distributions, n_iter=50):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # One {"params": ..., "result": ...} record per evaluated sample.
        self.results = []

    def sample_params(self):
        """Sample one full parameter set, honouring ``depends_on`` links.

        Returns:
            dict mapping parameter name to the sampled value. A dependent
            parameter whose dependency has not been sampled gets the spec's
            ``"default"`` entry (``None`` when absent).
        """
        params = {}

        # Pass 1: parameters without dependencies.
        for param_name, distribution in self.param_distributions.items():
            if "depends_on" not in distribution:
                params[param_name] = self._sample_from_distribution(distribution)

        # Pass 2: parameters that depend on an already-sampled value.
        for param_name, distribution in self.param_distributions.items():
            if "depends_on" not in distribution:
                continue
            dependency = distribution["depends_on"]
            if dependency in params:
                adjusted = self._adjust_distribution(
                    distribution, params[dependency]
                )
                params[param_name] = self._sample_from_distribution(adjusted)
            else:
                params[param_name] = distribution.get("default", None)

        return params

    def _sample_from_distribution(self, distribution):
        """Draw a single value from one spec dict.

        Raises:
            ValueError: if the spec's ``"type"`` is not supported (the
                alternative would be an implicit ``None`` value).
        """
        kind = distribution["type"]
        if kind == "uniform":
            return random.uniform(distribution["low"], distribution["high"])
        if kind == "log_uniform":
            # Bounds are base-10 exponents.
            return 10 ** random.uniform(distribution["low"], distribution["high"])
        if kind == "choice":
            return random.choice(distribution["values"])
        if kind == "int_uniform":
            return random.randint(distribution["low"], distribution["high"])
        raise ValueError(f"unsupported distribution type {kind!r}")

    def _adjust_distribution(self, distribution, dependency_value):
        """Return the distribution to use given the dependency's value.

        Example rule: categorical parameters that depend on ``"model_type"``
        get a model-specific candidate list. The override is restricted to
        ``"choice"`` specs so that numeric parameters with the same
        dependency (for example an ``int_uniform`` layer count) keep their
        own range instead of being replaced by the size candidates.
        """
        if (
            distribution["depends_on"] == "model_type"
            and distribution["type"] == "choice"
        ):
            if dependency_value == "transformer":
                return {"type": "choice", "values": [256, 512, 1024, 2048]}
            if dependency_value == "rnn":
                return {"type": "choice", "values": [64, 128, 256, 512]}
        return distribution

    def run(self, objective_function):
        """Evaluate ``n_iter`` random samples and return the best one.

        Args:
            objective_function: callable taking the sampled parameter dict
                and returning a score; higher is treated as better.

        Returns:
            ``(best_params, best_result)`` over ``self.results``.

        Raises:
            ValueError: if no sample has been evaluated.
        """
        for _ in range(self.n_iter):
            params = self.sample_params()
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no samples were evaluated; n_iter must be at least 1")

        best_result = max(self.results, key=lambda record: record["result"])
        return best_result["params"], best_result["result"]

LLM随机搜索实践

1. 学习率随机搜索

# Search space for the learning rate; the bounds are base-10 exponents.
lr_param_distributions = dict(
    learning_rate=dict(
        type="log_uniform",
        low=-6,   # 10^-6
        high=-3,  # 10^-3
    ),
)

# Objective function
def learning_rate_objective(params):
    """Score one sampled learning rate.

    Reads ``params["learning_rate"]``, passes it to ``train_model`` and
    returns whatever ``evaluate_model`` reports for the resulting model.
    Both helpers are expected to be provided by the surrounding project.
    """
    trained = train_model(learning_rate=params["learning_rate"])
    return evaluate_model(trained)

# Run the random search
random_search = RandomSearch(lr_param_distributions, n_iter=30)
# run() returns the whole best-parameter dict, so pull the learning rate
# out of it before reporting it.
best_params, best_accuracy = random_search.run(learning_rate_objective)
best_lr = best_params["learning_rate"]
print(f"最佳学习率: {best_lr}")
print(f"最佳准确率: {best_accuracy}")

2. 多参数随机搜索

# Search space covering several hyperparameters at once
param_distributions = dict(
    learning_rate=dict(type="log_uniform", low=-6, high=-3),
    batch_size=dict(type="choice", values=[8, 16, 32, 64, 128]),
    dropout=dict(type="uniform", low=0.1, high=0.5),
    num_layers=dict(type="int_uniform", low=2, high=8),
    hidden_size=dict(type="choice", values=[128, 256, 512, 1024]),
)

# Objective function
def multi_param_objective(params):
    """Score one sampled combination of training hyperparameters.

    Forwards the ``learning_rate``, ``batch_size``, ``dropout``,
    ``num_layers`` and ``hidden_size`` entries of ``params`` to
    ``train_model`` as keyword arguments and returns the value that
    ``evaluate_model`` gives for the trained model.
    """
    forwarded = (
        "learning_rate",
        "batch_size",
        "dropout",
        "num_layers",
        "hidden_size",
    )
    trained = train_model(**{name: params[name] for name in forwarded})
    return evaluate_model(trained)

# Run the random search over the multi-parameter space
random_search = RandomSearch(n_iter=50, param_distributions=param_distributions)
best_params, best_accuracy = random_search.run(
    objective_function=multi_param_objective
)

3. 条件参数随机搜索

# Search space with conditional parameters: hidden_size and num_layers are
# declared as depending on the sampled model_type.
conditional_param_distributions = dict(
    model_type=dict(type="choice", values=["transformer", "rnn", "lstm"]),
    hidden_size=dict(
        type="choice",
        values=[128, 256, 512, 1024],
        depends_on="model_type",
    ),
    num_layers=dict(
        type="int_uniform",
        low=1,
        high=6,
        depends_on="model_type",
    ),
    learning_rate=dict(type="log_uniform", low=-5, high=-3),
)

# Objective function
def conditional_objective(params):
    """Build, train and score a model whose architecture depends on ``params``.

    Args:
        params: dict with ``"model_type"`` (``"transformer"``, ``"rnn"`` or
            ``"lstm"``), ``"hidden_size"``, ``"num_layers"`` and
            ``"learning_rate"``.

    Returns:
        The value returned by ``evaluate_model`` for the trained model.

    Raises:
        ValueError: if ``"model_type"`` is not one of the supported
            architectures. Without this check the function would continue
            with no model bound and fail with an unrelated name error.
    """
    model_type = params["model_type"]

    # Pick the constructor matching the sampled architecture.
    if model_type == "transformer":
        model = create_transformer(
            hidden_size=params["hidden_size"],
            num_layers=params["num_layers"]
        )
    elif model_type == "rnn":
        model = create_rnn(
            hidden_size=params["hidden_size"],
            num_layers=params["num_layers"]
        )
    elif model_type == "lstm":
        model = create_lstm(
            hidden_size=params["hidden_size"],
            num_layers=params["num_layers"]
        )
    else:
        raise ValueError(f"unsupported model_type: {model_type!r}")

    # Train, then evaluate the same model object.
    train_model(model, learning_rate=params["learning_rate"])
    accuracy = evaluate_model(model)

    return accuracy

# Run the conditional random search
random_search = ConditionalRandomSearch(
    n_iter=40,
    param_distributions=conditional_param_distributions,
)
best_params, best_accuracy = random_search.run(conditional_objective)

高级随机搜索技术

1. 分层随机搜索

class StratifiedRandomSearch:
    """Random search that spreads samples evenly across strata.

    ``uniform`` ranges are cut into ``n_strata`` equal-width intervals and
    ``choice`` lists into contiguous groups. For each parameter the strata
    are visited in a shuffled order without repetition, so every run of
    ``n_strata`` consecutive samples touches each stratum once. (Picking a
    stratum at random for every sample would be indistinguishable from
    plain uniform sampling.) Other distribution types are sampled directly.
    """

    def __init__(self, param_distributions, n_iter=50, n_strata=5):
        if n_strata < 1:
            raise ValueError("n_strata must be at least 1")
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.n_strata = n_strata
        # One {"params": ..., "result": ...} record per evaluated sample.
        self.results = []
        # Per-parameter list of strata not yet used in the current cycle.
        self._pending_strata = {}

    def _next_stratum(self, param_name, n_strata):
        """Return the next stratum index for ``param_name``.

        Indices ``0 .. n_strata - 1`` are handed out in shuffled order; the
        queue is refilled and reshuffled once it is exhausted.
        """
        pending = self._pending_strata.get(param_name)
        if not pending:
            pending = list(range(n_strata))
            random.shuffle(pending)
            self._pending_strata[param_name] = pending
        return pending.pop()

    def _sample_from_distribution(self, distribution):
        """Draw a single non-stratified value from one spec dict.

        Raises:
            ValueError: if the spec's ``"type"`` is not supported.
        """
        kind = distribution["type"]
        if kind == "uniform":
            return random.uniform(distribution["low"], distribution["high"])
        if kind == "log_uniform":
            # Bounds are base-10 exponents.
            return 10 ** random.uniform(distribution["low"], distribution["high"])
        if kind == "int_uniform":
            return random.randint(distribution["low"], distribution["high"])
        if kind == "choice":
            return random.choice(distribution["values"])
        raise ValueError(f"unsupported distribution type {kind!r}")

    def stratified_sample(self):
        """Sample one parameter set using per-parameter stratification.

        Returns:
            dict mapping parameter name to the sampled value.

        Raises:
            IndexError: if a ``choice`` spec has an empty ``values`` list.
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                low, high = distribution["low"], distribution["high"]
                width = (high - low) / self.n_strata
                stratum_idx = self._next_stratum(param_name, self.n_strata)
                stratum_low = low + stratum_idx * width
                params[param_name] = random.uniform(
                    stratum_low, stratum_low + width
                )
            elif kind == "choice":
                values = distribution["values"]
                if not values:
                    raise IndexError(
                        f"parameter {param_name!r} has no values to choose from"
                    )
                # Never use more strata than there are values; otherwise
                # some strata would be empty.
                n_groups = min(self.n_strata, len(values))
                stratum_idx = self._next_stratum(param_name, n_groups)
                # Integer arithmetic keeps the boundaries exact.
                start_idx = stratum_idx * len(values) // n_groups
                end_idx = (stratum_idx + 1) * len(values) // n_groups
                params[param_name] = random.choice(values[start_idx:end_idx])
            else:
                params[param_name] = self._sample_from_distribution(distribution)

        return params

    def run(self, objective_function):
        """Evaluate ``n_iter`` stratified samples and return the best one.

        Args:
            objective_function: callable taking the sampled parameter dict
                and returning a score; higher is treated as better.

        Returns:
            ``(best_params, best_result)`` over ``self.results``.

        Raises:
            ValueError: if no sample has been evaluated.
        """
        for _ in range(self.n_iter):
            params = self.stratified_sample()
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no samples were evaluated; n_iter must be at least 1")

        best_result = max(self.results, key=lambda record: record["result"])
        return best_result["params"], best_result["result"]

2. 自适应随机搜索

class AdaptiveRandomSearch:
    """Random search that periodically narrows continuous ranges.

    Every 10 iterations the ``uniform`` and ``log_uniform`` ranges are
    shrunk towards the values seen in the top fifth of the results so far,
    never extending beyond the originally configured bounds. All other
    distribution types are left unchanged.
    """

    def __init__(self, param_distributions, n_iter=50):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # Every evaluated sample as {"params": ..., "result": ...}.
        self.results = []
        # Running record of each new best sample, in discovery order.
        self.best_results = []

    def adapt_distributions(self):
        """Return distributions narrowed around the best results so far.

        With fewer than five results the original distributions are
        returned untouched. Otherwise, for every ``uniform`` /
        ``log_uniform`` parameter, the new range spans the values found in
        the top 20% of results (at least one), widened by 20% of that span
        on each side and clipped to the original bounds. ``log_uniform``
        ranges are computed on base-10 exponents.
        """
        import math  # function-scope import keeps this snippet self-contained

        if len(self.results) < 5:
            return self.param_distributions

        ranked = sorted(self.results, key=lambda x: x["result"], reverse=True)
        top_results = ranked[:max(1, len(ranked) // 5)]

        adapted_distributions = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind not in ("uniform", "log_uniform"):
                adapted_distributions[param_name] = distribution
                continue

            top_values = [r["params"][param_name] for r in top_results]
            if kind == "log_uniform":
                # Sampled values are 10 ** exponent; go back to exponents.
                top_values = [math.log10(v) for v in top_values]

            lowest, highest = min(top_values), max(top_values)
            margin = (highest - lowest) * 0.2
            adapted_distributions[param_name] = {
                "type": kind,
                "low": max(distribution["low"], lowest - margin),
                "high": min(distribution["high"], highest + margin),
            }

        return adapted_distributions

    def sample_params(self, distributions):
        """Draw one value per parameter from ``distributions``.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                exponent = random.uniform(
                    distribution["low"], distribution["high"]
                )
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params

    def run(self, objective_function):
        """Run the adaptive search and return the best sample.

        Args:
            objective_function: callable taking the sampled parameter dict
                and returning a score; higher is treated as better.

        Returns:
            ``(best_params, best_result)`` over ``self.results``.

        Raises:
            ValueError: if no sample has been evaluated.
        """
        current_distributions = self.param_distributions.copy()

        for i in range(self.n_iter):
            # Re-fit the ranges every 10 iterations (not before the first).
            if i % 10 == 0 and i > 0:
                current_distributions = self.adapt_distributions()

            params = self.sample_params(current_distributions)
            result = objective_function(params)
            self.results.append({"params": params, "result": result})

            # Track each improvement over the previous best.
            if not self.best_results or result > self.best_results[-1]["result"]:
                self.best_results.append({"params": params, "result": result})

        if not self.results:
            raise ValueError("no samples were evaluated; n_iter must be at least 1")

        best_result = max(self.results, key=lambda x: x["result"])
        return best_result["params"], best_result["result"]

3. 多保真度随机搜索

class MultiFidelityRandomSearch:
    """Random search that screens candidates cheaply before full evaluation.

    Candidates are first scored at ``fidelities[0]``, the top third is
    re-scored at ``fidelities[1]``, and the survivors are finally scored by
    ``high_fidelity_evaluate``. ``fidelities`` therefore needs at least two
    entries; further entries are stored but not read by this class.
    """

    def __init__(self, param_distributions, n_iter=50, fidelities=None):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        # Build the default per instance so instances never share one list.
        self.fidelities = [100, 500, 1000] if fidelities is None else fidelities
        self.results = []

    def low_fidelity_evaluate(self, params, fidelity):
        """Score ``params`` with a shortened training run of ``fidelity`` steps."""
        model = train_model_quick(params, steps=fidelity)
        return evaluate_model_quick(model)

    def high_fidelity_evaluate(self, params):
        """Score ``params`` with the full training and evaluation helpers."""
        model = train_model_full(params)
        return evaluate_model_full(model)

    def run(self):
        """Run the three-stage search.

        Returns:
            ``(best_params, best_score)`` from the final stage, or
            ``(None, -inf)`` when ``n_iter // 2`` is zero and no candidate
            is sampled at all.
        """
        # Stage 1: low-fidelity screening of n_iter // 2 random candidates.
        print("阶段1:低保真度筛选")
        promising_params = []

        for _ in range(self.n_iter // 2):
            params = self._sample_params()
            score = self.low_fidelity_evaluate(params, self.fidelities[0])
            promising_params.append((params, score))

        # Keep the top third, but never drop every candidate when some exist.
        promising_params.sort(key=lambda x: x[1], reverse=True)
        if promising_params:
            keep = max(1, len(promising_params) // 3)
            promising_params = promising_params[:keep]

        # Stage 2: re-score the survivors at the second fidelity level.
        print("阶段2:中保真度评估")
        for i, (params, _) in enumerate(promising_params):
            score = self.low_fidelity_evaluate(params, self.fidelities[1])
            promising_params[i] = (params, score)

        # Stage 3: full evaluation; the best full score wins.
        print("阶段3:高保真度评估")
        best_params, best_score = None, -float('inf')
        for params, _ in promising_params:
            score = self.high_fidelity_evaluate(params)
            if score > best_score:
                best_score = score
                best_params = params

        return best_params, best_score

    def _sample_params(self):
        """Draw one value for every configured parameter.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                # Bounds are base-10 exponents.
                exponent = random.uniform(
                    distribution["low"], distribution["high"]
                )
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params

实际应用案例

案例:LLM训练超参数优化

# Search space for LLM training hyperparameters
llm_param_distributions = dict(
    learning_rate=dict(type="log_uniform", low=-6, high=-3),
    batch_size=dict(type="choice", values=[8, 16, 32, 64, 128]),
    warmup_steps=dict(type="int_uniform", low=100, high=1000),
    weight_decay=dict(type="uniform", low=0.0, high=0.1),
    dropout=dict(type="uniform", low=0.1, high=0.5),
    gradient_accumulation_steps=dict(type="choice", values=[1, 2, 4, 8]),
)

# Objective function
def llm_training_objective(params):
    """Return an accuracy score discounted by the estimated training time.

    The six tuned hyperparameters are forwarded to ``train_llm`` as keyword
    arguments, the model is scored with ``evaluate_llm``, and the result is
    multiplied by ``1 - estimate_training_time(params) / 7200``.
    """
    tuned = (
        "learning_rate",
        "batch_size",
        "warmup_steps",
        "weight_decay",
        "dropout",
        "gradient_accumulation_steps",
    )
    model = train_llm(**{name: params[name] for name in tuned})
    accuracy = evaluate_llm(model)

    # Simplified time penalty.
    # NOTE(review): 7200 is presumably a two-hour budget in seconds -- confirm.
    time_fraction = estimate_training_time(params) / 7200
    return accuracy * (1 - time_fraction)

# Run the random search over the LLM training space and report the winner
random_search = RandomSearch(
    n_iter=100,
    param_distributions=llm_param_distributions,
)
best_params, best_score = random_search.run(
    objective_function=llm_training_objective
)

print(f"最佳参数: {best_params}")
print(f"最佳分数: {best_score}")

案例:提示优化随机搜索

# Search space for generation (decoding) parameters
prompt_param_distributions = dict(
    temperature=dict(type="uniform", low=0.1, high=1.0),
    top_p=dict(type="uniform", low=0.1, high=1.0),
    max_tokens=dict(type="int_uniform", low=50, high=1000),
    frequency_penalty=dict(type="uniform", low=0.0, high=2.0),
    presence_penalty=dict(type="uniform", low=0.0, high=2.0),
)

# Objective function
def prompt_optimization_objective(params):
    """Score one set of generation parameters.

    Calls ``generate_responses`` on the module-level ``test_prompts`` with
    the five sampled settings as keyword arguments, then returns the value
    of ``evaluate_response_quality`` for the generated responses.
    """
    sampling_keys = (
        "temperature",
        "top_p",
        "max_tokens",
        "frequency_penalty",
        "presence_penalty",
    )
    responses = generate_responses(
        test_prompts,
        **{key: params[key] for key in sampling_keys}
    )
    return evaluate_response_quality(responses)

# Run the random search over the generation-parameter space
prompt_random_search = RandomSearch(
    n_iter=50,
    param_distributions=prompt_param_distributions,
)
best_prompt_params, best_prompt_score = prompt_random_search.run(
    objective_function=prompt_optimization_objective
)

最佳实践

1. 参数分布设计

# Sensible per-architecture search spaces
def design_param_distributions(model_type):
    """Return the search space for ``model_type``.

    ``"transformer"`` and ``"rnn"`` each get their own learning-rate
    exponent range, batch sizes, hidden sizes, layer-count range and
    dropout range. Any other value yields ``None``.
    """
    if model_type == "transformer":
        lr_exponents = (-6, -3)
        batch_sizes = [8, 16, 32, 64]
        hidden_sizes = [256, 512, 1024, 2048]
        layer_range = (2, 8)
        dropout_range = (0.1, 0.5)
    elif model_type == "rnn":
        lr_exponents = (-4, -2)
        batch_sizes = [16, 32, 64, 128]
        hidden_sizes = [64, 128, 256, 512]
        layer_range = (1, 4)
        dropout_range = (0.2, 0.5)
    else:
        return None

    # Both architectures share the same parameter names and spec shapes;
    # only the bounds and candidate lists differ.
    space = {}
    space["learning_rate"] = {
        "type": "log_uniform",
        "low": lr_exponents[0],
        "high": lr_exponents[1],
    }
    space["batch_size"] = {"type": "choice", "values": batch_sizes}
    space["hidden_size"] = {"type": "choice", "values": hidden_sizes}
    space["num_layers"] = {
        "type": "int_uniform",
        "low": layer_range[0],
        "high": layer_range[1],
    }
    space["dropout"] = {
        "type": "uniform",
        "low": dropout_range[0],
        "high": dropout_range[1],
    }
    return space

2. 搜索次数确定

# Choose the number of iterations from the size of the search space
def determine_search_iterations(param_distributions, confidence_level=0.95):
    """Estimate how many random-search iterations a search space deserves.

    The space "complexity" is the product of a per-parameter count:
    10 for each continuous parameter (``uniform`` or ``log_uniform``), the
    number of candidates for ``choice`` and the number of integers in
    range for ``int_uniform``. The iteration count is
    ``ceil(complexity * ln(1 / (1 - confidence_level)))`` clamped to the
    range 20..200.

    Args:
        param_distributions: mapping of parameter name to spec dict.
        confidence_level: value in ``[0, 1)``.

    Returns:
        int between 20 and 200 inclusive.

    Raises:
        ValueError: if ``confidence_level`` is outside ``[0, 1)``.
    """
    import math

    if not 0 <= confidence_level < 1:
        raise ValueError("confidence_level must satisfy 0 <= confidence_level < 1")

    complexity = 1
    for distribution in param_distributions.values():
        kind = distribution["type"]
        if kind in ("uniform", "log_uniform"):
            # Continuous parameter: assume roughly 10 meaningfully
            # different values, whatever the scale.
            complexity *= 10
        elif kind == "choice":
            complexity *= len(distribution["values"])
        elif kind == "int_uniform":
            complexity *= (distribution["high"] - distribution["low"] + 1)

    n_iter = int(math.ceil(complexity * math.log(1 / (1 - confidence_level))))

    # Keep the budget within a practical window.
    return max(20, min(n_iter, 200))

3. 资源管理

# Resource-aware random search
class ResourceAwareRandomSearch:
    """Random search that skips samples once an estimated time budget is hit.

    Each sampled configuration gets a rough training-time estimate; it is
    only evaluated if the running total of estimates stays within
    ``max_time``.
    """

    def __init__(self, param_distributions, n_iter=50, max_time=3600):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.max_time = max_time
        # Evaluated samples as {"params", "result", "estimated_time"} dicts.
        self.results = []

    def estimate_training_time(self, params):
        """Return a rough training-time estimate (seconds) for ``params``.

        Starts from a base of 100 and scales it inversely with
        ``batch_size`` (relative to 16), quadratically with ``hidden_size``
        (relative to 512) and linearly with ``num_layers`` (relative to 4).
        Parameters missing from ``params`` leave the estimate unscaled.
        """
        base_time = 100  # base time in seconds

        if "batch_size" in params:
            base_time *= (16 / params["batch_size"])
        if "hidden_size" in params:
            base_time *= (params["hidden_size"] / 512) ** 2
        if "num_layers" in params:
            base_time *= params["num_layers"] / 4

        return base_time

    def run(self, objective_function):
        """Run the search within the time budget.

        Args:
            objective_function: callable taking the sampled parameter dict
                and returning a score; higher is treated as better.

        Returns:
            ``(best_params, best_result)``, or ``(None, None)`` when no
            sample was evaluated.
        """
        total_time = 0

        for _ in range(self.n_iter):
            params = self._sample_params()
            estimated_time = self.estimate_training_time(params)

            # A skipped sample still consumes one iteration; a later,
            # cheaper sample may still fit into the remaining budget.
            if total_time + estimated_time > self.max_time:
                print(f"跳过参数 {params}: 时间限制")
                continue

            result = objective_function(params)
            self.results.append({
                "params": params,
                "result": result,
                "estimated_time": estimated_time
            })
            total_time += estimated_time

        if not self.results:
            return None, None

        best_result = max(self.results, key=lambda x: x["result"])
        return best_result["params"], best_result["result"]

    def _sample_params(self):
        """Draw one value for every configured parameter.

        Raises:
            ValueError: if a spec uses an unsupported ``"type"``.
        """
        params = {}
        for param_name, distribution in self.param_distributions.items():
            kind = distribution["type"]
            if kind == "uniform":
                params[param_name] = random.uniform(
                    distribution["low"], distribution["high"]
                )
            elif kind == "log_uniform":
                # Bounds are base-10 exponents.
                exponent = random.uniform(
                    distribution["low"], distribution["high"]
                )
                params[param_name] = 10 ** exponent
            elif kind == "int_uniform":
                params[param_name] = random.randint(
                    distribution["low"], distribution["high"]
                )
            elif kind == "choice":
                params[param_name] = random.choice(distribution["values"])
            else:
                raise ValueError(
                    f"unsupported distribution type {kind!r} "
                    f"for parameter {param_name!r}"
                )
        return params

总结

随机搜索是LLM超参数调优的高效方法:

  1. 简单易用 - 实现简单,易于理解
  2. 高效采样 - 比网格搜索更高效地探索参数空间
  3. 并行化 - 容易并行化加速
  4. 自适应 - 可以根据历史结果调整搜索策略
  5. 灵活性 - 支持各种参数分布类型

通过合理设计参数分布、确定搜索次数、管理计算资源,随机搜索可以在有限的计算预算内找到高质量的超参数组合。在LLM开发中,随机搜索常用于初始超参数探索和快速原型开发。