推理增强
---
title: "推理增强"
description: "大语言模型推理增强技术,包括过程奖励模型和推理时间计算"
tags: ["推理增强", "过程奖励模型", "推理时间计算", "思维链"]
category: "llm"
icon: "🧠"
---
推理增强
推理增强(Reasoning Enhancement)是提升大语言模型在复杂推理任务上表现的关键技术。它通过训练模型逐步推理并验证中间步骤,显著提升模型在数学、逻辑和代码等领域的表现。
过程奖励模型(PRM)
传统奖励模型只对最终结果评分,而过程奖励模型对推理的每一步进行评估:
import torch
import torch.nn as nn
class ProcessRewardModel(nn.Module):
    """Process reward model (PRM): one score per reasoning step.

    A linear head maps the backbone's hidden state at the last token of each
    reasoning step to a single unnormalized score (logit).

    Args:
        base_model: Backbone invoked as
            ``base_model(input_ids, attention_mask=...)``. It must expose
            ``config.hidden_size`` and return an object with a
            ``last_hidden_state`` attribute.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.step_scorer = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, step_positions):
        """Score the hidden state found at each step-end position.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.
            step_positions: Sequence indices of the token that ends each
                reasoning step. Either positions shared by every sample
                (an int, a flat list or a 1-D tensor), or per-sample
                positions as a 2-D ``(batch, num_steps)`` tensor / nested
                list.

        Returns:
            Step logits. For 1-D or 2-D positions the result is
            ``(batch, num_steps)``.
            NOTE(review): assumes ``last_hidden_state`` is
            ``(batch, seq, hidden)`` -- confirm against the backbone in use.
        """
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        positions = step_positions
        # A nested list means "one row of positions per sample".
        if (
            isinstance(positions, (list, tuple))
            and positions
            and isinstance(positions[0], (list, tuple))
        ):
            positions = torch.as_tensor(positions)

        if torch.is_tensor(positions) and positions.dim() == 2:
            # Per-sample positions: pair row b of `positions` with sample b.
            # Plain `hidden_states[:, positions, :]` would instead cross every
            # sample with every row and return (batch, batch, num_steps, hidden).
            positions = positions.to(hidden_states.device)
            batch_index = torch.arange(
                hidden_states.size(0), device=hidden_states.device
            ).unsqueeze(1)
            step_representations = hidden_states[batch_index, positions]
        else:
            # Shared positions: same indices taken from every sample.
            step_representations = hidden_states[:, step_positions, :]

        # Drop the trailing size-1 dimension produced by the linear head.
        return self.step_scorer(step_representations).squeeze(-1)
def prm_loss(step_scores, step_labels):
    """Process-reward-model loss: each step is scored independently.

    Computes binary cross-entropy (with logits) per sample over that sample's
    steps, then averages the per-sample losses.

    Args:
        step_scores: Sequence of per-sample logit tensors (or a 2-D tensor
            whose rows are samples).
        step_labels: Matching sequence of per-sample step labels (1 = correct
            step, 0 = incorrect). Integer or boolean labels are accepted and
            cast to the dtype of the scores.

    Returns:
        Scalar tensor: mean of the per-sample BCE losses.

    Raises:
        ValueError: If there are no samples, or the number of score and label
            entries differ.
    """
    num_samples = len(step_scores)
    if num_samples == 0:
        raise ValueError("prm_loss requires at least one sample")
    if len(step_labels) != num_samples:
        # zip() would silently drop the extras while we still divide by
        # len(step_scores), skewing the average.
        raise ValueError(
            f"step_scores has {num_samples} samples but step_labels has "
            f"{len(step_labels)}"
        )

    total = 0
    for scores, labels in zip(step_scores, step_labels):
        # BCE-with-logits needs floating-point targets on the same device.
        targets = torch.as_tensor(labels, dtype=scores.dtype, device=scores.device)
        total = total + nn.functional.binary_cross_entropy_with_logits(scores, targets)
    return total / num_samples
推理时间计算(Test-Time Compute)
在推理时通过增加计算量来提升答案质量:
class ReasoningWithSearch:
    """Test-time compute: sample several reasoning paths and keep the best.

    Args:
        model: Object exposing a Hugging Face style ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        num_paths: How many reasoning paths to sample per problem.
    """

    def __init__(self, model, tokenizer, num_paths=5):
        self.model = model
        self.tokenizer = tokenizer
        self.num_paths = num_paths

    def generate_reasoning_paths(self, problem, temperature=0.8):
        """Sample ``num_paths`` reasoning paths for ``problem``.

        Args:
            problem: Problem statement used as the prompt.
            temperature: Sampling temperature passed to ``generate``.

        Returns:
            List of decoded generations, one per sampled path. Each entry is
            the decode of ``outputs[0]`` exactly as returned by the model.
        """
        # The prompt never changes between samples, so tokenize it once.
        inputs = self.tokenizer(problem, return_tensors="pt")
        paths = []
        for _ in range(self.num_paths):
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=temperature,
                do_sample=True,
                top_p=0.95,
            )
            paths.append(self.tokenizer.decode(outputs[0], skip_special_tokens=True))
        return paths

    def select_best_path(self, paths, verifier):
        """Pick the path whose extracted answer the verifier scores highest.

        Args:
            paths: Non-empty list of reasoning texts.
            verifier: Object with ``verify(answer)`` returning a comparable
                score.

        Returns:
            Tuple ``(best_path, best_score)``. Ties go to the earliest path.

        Raises:
            ValueError: If ``paths`` is empty.
        """
        if not paths:
            raise ValueError("paths must contain at least one reasoning path")
        scores = [verifier.verify(self._extract_answer(path)) for path in paths]
        best_idx = max(range(len(scores)), key=scores.__getitem__)
        return paths[best_idx], scores[best_idx]

    def _extract_answer(self, text):
        """Extract the final answer from a reasoning chain.

        Returns the text after the last occurrence of the first marker (in
        list order) that is present; otherwise the last non-empty line.
        """
        # "最终答案:" is subsumed by "答案:" (substring), kept for readability.
        markers = ["答案:", "最终答案:", "Answer:", "因此"]
        for marker in markers:
            if marker in text:
                return text.rpartition(marker)[2].strip()
        # Strip first so a trailing newline does not yield an empty answer.
        return text.strip().split("\n")[-1].strip()
自我反思机制
模型通过自我反思识别并纠正推理错误:
class SelfRefinement:
    """Solve a problem, then iteratively self-critique and refine the answer.

    Args:
        model: Object exposing a Hugging Face style ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        max_refinements: Upper bound on critique/refine rounds.
    """

    def __init__(self, model, tokenizer, max_refinements=3):
        self.model = model
        self.tokenizer = tokenizer
        self.max_refinements = max_refinements

    def solve_with_reflection(self, problem):
        """Solve ``problem`` with self-reflection.

        Returns:
            Dict with keys ``"solution"`` (final text), ``"refinements"``
            (number of refinement passes actually applied) and
            ``"final_correct"`` (verdict on the final solution).
        """
        current_solution = self._initial_solve(problem)
        refinements = 0
        accepted = False
        for _ in range(self.max_refinements):
            critique = self._self_critique(problem, current_solution)
            if critique["is_correct"]:
                accepted = True
                break
            current_solution = self._refine(
                problem, current_solution, critique["feedback"]
            )
            refinements += 1

        # `_verify` is not defined on this class; honor it when a subclass or
        # mixin supplies one, otherwise fall back to the critique verdict.
        verify = getattr(self, "_verify", None)
        if callable(verify):
            final_correct = verify(problem, current_solution)
        elif accepted:
            final_correct = True
        else:
            # The last refinement (or the initial solution when
            # max_refinements is 0) has not been critiqued yet.
            final_correct = self._self_critique(problem, current_solution)["is_correct"]

        return {
            "solution": current_solution,
            "refinements": refinements,
            "final_correct": final_correct,
        }

    def _generate_text(self, prompt, max_new_tokens):
        """Generate a completion for ``prompt`` and return only the new text.

        When the generated ids start with the prompt ids (as decoder-only
        models return them), that prefix is removed before decoding so the
        prompt wording cannot leak into the returned text.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated = outputs[0]
        prompt_ids = inputs["input_ids"][0]
        prompt_len = len(prompt_ids)
        if (
            len(generated) >= prompt_len
            and generated[:prompt_len].tolist() == prompt_ids.tolist()
        ):
            generated = generated[prompt_len:]
        return self.tokenizer.decode(generated, skip_special_tokens=True)

    def _self_critique(self, problem, solution):
        """Ask the model to critique ``solution``.

        Returns:
            Dict with ``"feedback"`` (critique text) and ``"is_correct"``
            (keyword heuristic over that text).
        """
        prompt = (
            "请仔细检查以下解答是否正确。\n"
            f"问题:{problem}\n"
            f"解答:{solution}\n"
            '请指出解答中的任何错误或遗漏,并给出改进建议。如果解答正确,请回复"正确"。'
        )
        critique = self._generate_text(prompt, 512)
        # The prompt itself contains both keywords, which is why the echoed
        # prompt must be excluded. "不正确" contains "正确", so rule it out.
        is_correct = (
            "正确" in critique
            and "不正确" not in critique
            and "错误" not in critique
        )
        return {
            "feedback": critique,
            "is_correct": is_correct,
        }

    def _initial_solve(self, problem):
        """Produce a first step-by-step solution for ``problem``."""
        prompt = f"请逐步解决以下问题:\n{problem}"
        return self._generate_text(prompt, 1024)

    def _refine(self, problem, solution, feedback):
        """Produce an improved solution from the previous one and its critique."""
        prompt = (
            f"问题:{problem}\n"
            f"之前的解答:{solution}\n"
            f"反馈:{feedback}\n"
            "请根据反馈改进解答:"
        )
        return self._generate_text(prompt, 1024)
多步推理框架
将复杂问题分解为若干子问题,依次求解后再综合得出最终答案:
class MultiStepReasoner:
    """Multi-step reasoning: decompose, solve each part, then synthesize.

    NOTE(review): ``solve`` calls ``self._solve_subproblem(sub, context)`` and
    ``self._synthesize(problem, solutions)``, neither of which is defined in
    the visible class body; they must be supplied (e.g. by a subclass) before
    ``solve`` can run.

    Args:
        model: Object exposing a Hugging Face style ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def solve(self, problem, decomposition_strategy="sequential"):
        """Run the decompose / solve / synthesize pipeline.

        Args:
            problem: Problem statement.
            decomposition_strategy: Forwarded to ``_decompose``.

        Returns:
            Dict with ``"subproblems"``, ``"solutions"`` (one per
            sub-problem, in order) and ``"final_answer"``.
        """
        # Step 1: break the problem into sub-problems.
        subproblems = self._decompose(problem, decomposition_strategy)

        # Step 2: solve them in order; each solution is appended to the
        # context handed to the next sub-problem.
        solutions = []
        context = problem
        for sub in subproblems:
            solution = self._solve_subproblem(sub, context)
            solutions.append(solution)
            context += f"\n{solution}"

        # Step 3: combine the partial solutions into one answer.
        final_answer = self._synthesize(problem, solutions)
        return {
            "subproblems": subproblems,
            "solutions": solutions,
            "final_answer": final_answer,
        }

    def _decompose(self, problem, strategy):
        """Ask the model to split ``problem`` into sub-problems.

        Args:
            problem: Problem statement.
            strategy: Currently unused; the prompt does not depend on it.

        Returns:
            List of non-empty, whitespace-stripped lines of the model's
            reply, one sub-problem per line.
        """
        prompt = f"请将以下问题分解为可逐步解决的子问题:\n{problem}"
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.model.generate(**inputs, max_new_tokens=512)

        generated = outputs[0]
        prompt_ids = inputs["input_ids"][0]
        prompt_len = len(prompt_ids)
        # Drop the echoed prompt (when the output starts with it) so the
        # instruction and the problem text are not returned as sub-problems.
        if (
            len(generated) >= prompt_len
            and generated[:prompt_len].tolist() == prompt_ids.tolist()
        ):
            generated = generated[prompt_len:]

        decomposition = self.tokenizer.decode(generated, skip_special_tokens=True)
        # Blank lines are formatting, not sub-problems.
        return [line.strip() for line in decomposition.split("\n") if line.strip()]
推理增强技术通过增加推理时的计算投入,让模型在复杂任务上获得质的提升,是当前LLM发展的重要方向。