LLM推理优化技术
---
title: "LLM推理优化技术"
description: "掌握大语言模型推理优化的各种技术,包括量化、缓存、批处理和并行推理"
tags: ["推理优化", "量化推理", "推理加速", "性能优化"]
category: "llm"
icon: "🧠"
---
LLM推理优化技术
推理优化概述
大语言模型的推理面临巨大挑战:模型参数量大、生成序列长、延迟要求高。推理优化是将LLM部署到生产环境的关键。本文介绍主要的推理优化技术。
推理优化的核心目标:
- 降低延迟:缩短首次响应时间,提高生成速度
- 提高吞吐量:增加每秒处理的请求数
- 降低内存:减少GPU显存占用
- 降低成本:减少计算资源消耗
量化推理
INT8量化
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# INT8 quantization settings (bitsandbytes).
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,      # outlier threshold; 6.0 is the library default
    llm_int8_skip_modules=None,  # no extra modules are excluded from 8-bit conversion
)

# The tokenizer must come from the same checkpoint as the model; the inference
# step below cannot run without it.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Load the model with its weights quantized to 8-bit.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto",
)

# Inference: with device_map="auto" the weights usually live on a GPU, so the
# input tensors are moved to the model's device before generating.
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
INT4量化
# 4-bit quantization settings: NF4 data type, float16 compute, and double
# quantization of the quantization constants.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the checkpoint with the 4-bit configuration applied.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    quantization_config=bnb_config,
)
GGUF量化
# Load a GGUF model through the llama.cpp Python bindings.
from llama_cpp import Llama

# Open the quantized model file.
llm = Llama(
    n_threads=4,  # number of CPU threads
    n_ctx=2048,
    model_path="./models/llama-2-7b-q4_k_m.gguf",
)

# Run one chat turn and print the assistant's reply.
output = llm.create_chat_completion(
    max_tokens=256,
    messages=[
        {"role": "user", "content": "Hello!"},
    ],
)
print(output["choices"][0]["message"]["content"])
KV Cache优化
启用KV Cache
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Tokenizer and model are loaded from the same checkpoint; the two loads are
# independent of each other.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
)
# 使用KV Cache进行自回归生成
def generate_with_cache(model, tokenizer, prompt, max_new_tokens=100):
    """Greedily generate a continuation of ``prompt`` while reusing the KV cache.

    The prompt is run through the model once (prefill). Every later step feeds
    only the newest token together with the cached keys/values returned by the
    previous step.

    Args:
        model: Causal LM. Calling it must accept ``input_ids``,
            ``past_key_values`` and ``use_cache`` and return an object with
            ``logits`` and ``past_key_values``. It must expose ``device``.
        tokenizer: Tokenizer matching ``model``. Calling it must return a
            mapping with a ``.to(device)`` method; it must also provide
            ``decode`` and ``eos_token_id``.
        prompt: A single prompt string. Only one sequence is supported because
            the chosen token is read with ``.item()``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded generated tokens (the prompt is not included). Generation
        stops early when the end-of-sequence token is produced; that token is
        not part of the result. Returns ``""`` when ``max_new_tokens <= 0``.
    """
    if max_new_tokens <= 0:
        return ""

    # Keep the inputs on the same device as the model weights.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    eos_token_id = tokenizer.eos_token_id
    generated_tokens = []

    with torch.no_grad():
        # Prefill: one forward pass over the whole prompt.
        outputs = model(**inputs, use_cache=True)

        while True:
            # Greedy choice; shape (1, 1) so it can be fed back as input_ids.
            next_token = outputs.logits[:, -1:].argmax(dim=-1)
            token_id = next_token.item()

            if eos_token_id is not None and token_id == eos_token_id:
                break
            generated_tokens.append(token_id)

            # Check the budget before the forward pass so no step is wasted.
            if len(generated_tokens) >= max_new_tokens:
                break

            # Decode step: only the new token is processed.
            outputs = model(
                input_ids=next_token,
                past_key_values=outputs.past_key_values,
                use_cache=True,
            )

    return tokenizer.decode(generated_tokens)
PagedAttention(vLLM)
from vllm import LLM, SamplingParams
# Create the vLLM engine.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    max_model_len=2048,
    block_size=16,  # PagedAttention block size
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1,  # number of GPUs
)

# Batched inference: all prompts go through a single generate() call.
sampling_params = SamplingParams(max_tokens=256, temperature=0.7)
prompts = [
    "Hello!",
    "How are you?",
    "Tell me a joke",
]
outputs = llm.generate(prompts, sampling_params)

# Print the first completion of every prompt.
for output in outputs:
    print(output.outputs[0].text)
批处理优化
动态批处理
import asyncio
from collections import deque
class DynamicBatcher:
    """Collect concurrent prompts and run them through one ``generate`` call.

    A batch is dispatched as soon as ``max_batch_size`` requests are queued.
    Otherwise a timer flushes whatever is queued ``max_wait_time`` seconds
    after it was armed, so a request never waits for a batch that does not
    fill up.

    Inference runs synchronously inside the event loop; no other coroutine
    makes progress while a batch is being generated.
    """

    def __init__(self, model, tokenizer, max_batch_size=32, max_wait_time=0.1):
        """Store the collaborators and create an empty request queue.

        Args:
            model: Object exposing ``generate(**inputs, max_new_tokens=...)``
                and ``device``.
            tokenizer: Callable tokenizer whose result has ``.to(device)``;
                must also provide ``decode``.
            max_batch_size: Queue length that triggers an immediate batch.
            max_wait_time: Seconds before a partially filled batch is flushed.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_batch_size = max_batch_size
        self.max_wait_time = max_wait_time
        self.queue = deque()
        # Pending timer task that flushes a partially filled batch.
        self._flush_task = None

    async def add_request(self, prompt):
        """Queue ``prompt`` and wait for its generated text.

        Returns:
            The decoded response for this prompt.

        Raises:
            Exception: whatever tokenization or inference raised for the batch
                this request was part of.
        """
        future = asyncio.get_running_loop().create_future()
        self.queue.append((prompt, future))

        if len(self.queue) >= self.max_batch_size:
            await self.process_batch()
        elif self._flush_task is None or self._flush_task.done():
            # Arm the timer so this request is served even if the batch never
            # reaches max_batch_size.
            self._flush_task = asyncio.create_task(self._flush_after_wait())

        return await future

    async def _flush_after_wait(self):
        """Flush the queue once ``max_wait_time`` seconds have elapsed."""
        await asyncio.sleep(self.max_wait_time)
        await self.process_batch()

    async def process_batch(self):
        """Run up to ``max_batch_size`` queued requests and resolve their futures.

        Errors are delivered through the affected futures instead of being
        raised here, so every waiting caller is released.
        """
        if not self.queue:
            return

        # Take the requests for this batch off the queue.
        batch = []
        while self.queue and len(batch) < self.max_batch_size:
            batch.append(self.queue.popleft())

        prompts = [prompt for prompt, _ in batch]
        futures = [future for _, future in batch]

        try:
            # Batched inference; inputs are placed on the model's device.
            inputs = self.tokenizer(
                prompts, return_tensors="pt", padding=True
            ).to(self.model.device)
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=256)
            responses = [
                self.tokenizer.decode(output, skip_special_tokens=True)
                for output in outputs
            ]
        except Exception as exc:
            # Fail every caller in the batch rather than leaving them pending.
            for future in futures:
                if not future.done():
                    future.set_exception(exc)
            return

        # Hand each caller its result; skip futures cancelled in the meantime.
        for future, response in zip(futures, responses):
            if not future.done():
                future.set_result(response)
连续批处理
class ContinuousBatcher:
    """Token-by-token generator that lets concurrent requests interleave.

    Each call to ``generate`` decodes its own sequence. Control is handed back
    to the event loop between decode steps so other coroutines can advance;
    requests are not merged into a shared forward pass.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer and create empty request tables."""
        self.model = model
        self.tokenizer = tokenizer
        self.active_requests = {}
        self.completed_requests = {}

    async def generate(self, request_id, prompt, max_tokens=256):
        """Greedily generate up to ``max_tokens`` tokens for ``prompt``.

        Args:
            request_id: Caller-chosen identifier; not used by this method.
            prompt: A single prompt string.
            max_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded generated tokens (the prompt is not included).
            Generation stops at the end-of-sequence token, which is not part
            of the result.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        eos_token_id = self.tokenizer.eos_token_id

        # Prefill: its logits already predict the first new token.
        with torch.no_grad():
            outputs = self.model(**inputs, use_cache=True)

        generated_tokens = []
        while len(generated_tokens) < max_tokens:
            next_token = outputs.logits[:, -1:].argmax(dim=-1)
            token_id = next_token.item()

            # Stop once the model signals the end of the sequence.
            if eos_token_id is not None and token_id == eos_token_id:
                break
            generated_tokens.append(token_id)

            if len(generated_tokens) >= max_tokens:
                break

            # Let other requests run before the next decode step.
            await asyncio.sleep(0)

            # Feed the token that was just generated (not the prompt's last
            # token) together with the cache from the previous step.
            # no_grad is entered per step so it is never held across an await.
            with torch.no_grad():
                outputs = self.model(
                    input_ids=next_token,
                    past_key_values=outputs.past_key_values,
                    use_cache=True,
                )

        return self.tokenizer.decode(generated_tokens)
张量并行
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
def setup_tensor_parallel(model, world_size):
    """Initialise ``torch.distributed`` and wrap ``model`` for multi-GPU use.

    NOTE: despite the function name, ``DistributedDataParallel`` keeps a full
    replica of the model on every rank (data parallelism). It does not shard
    weights across GPUs. For real tensor parallelism use an engine option such
    as vLLM's ``tensor_parallel_size``.

    Args:
        model: The ``torch.nn.Module`` to wrap.
        world_size: Kept for interface compatibility; not used here. The
            process group is initialised without explicit arguments, so its
            configuration comes from the launcher's environment.

    Returns:
        The model moved to ``cuda:LOCAL_RANK`` and wrapped in
        ``DistributedDataParallel``.

    Raises:
        KeyError: if the ``LOCAL_RANK`` environment variable is not set.
    """
    import os  # local import: this snippet does not import os at file level

    # Do not initialise twice when the caller already set up the group.
    if not dist.is_initialized():
        dist.init_process_group("nccl")

    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # DDP with a single entry in device_ids expects the module to already
    # reside on that device.
    model = model.to(torch.device("cuda", local_rank))
    return DDP(model, device_ids=[local_rank])
# 使用vLLM的张量并行
from vllm import LLM
# Engine sharded across GPUs via vLLM's built-in tensor parallelism.
llm = LLM(
    max_model_len=4096,
    tensor_parallel_size=4,  # use 4 GPUs
    model="meta-llama/Llama-2-70b-hf",
)
推理框架对比
# Framework selection guide: for each serving framework, its main trait
# ("特点"), the scenario it fits ("适用"), and how to install it ("安装").
frameworks = {
    "vLLM": {
        "特点": "PagedAttention,高吞吐量",
        "适用": "高并发服务",
        "安装": "pip install vllm",
    },
    "TensorRT-LLM": {
        "特点": "NVIDIA优化,极致性能",
        "适用": "NVIDIA GPU部署",
        "安装": "pip install tensorrt-llm",
    },
    "llama.cpp": {
        "特点": "CPU推理,轻量级",
        "适用": "边缘设备,CPU部署",
        "安装": "从源码编译",
    },
    "TGI": {
        "特点": "Hugging Face官方,易用",
        "适用": "快速部署",
        # The TGI server is distributed as a Docker image (or built from
        # source), so the install hint points at the official image instead
        # of a pip package.
        "安装": "docker pull ghcr.io/huggingface/text-generation-inference:latest",
    },
}
性能监控
import time
import numpy as np
class InferenceProfiler:
    """Collect wall-clock latencies of inference calls and summarise them."""

    # Keys of the dictionary returned by get_statistics().
    _STAT_KEYS = (
        "avg_latency",
        "p50_latency",
        "p95_latency",
        "p99_latency",
        "throughput",
    )

    def __init__(self):
        """Start with no recorded samples."""
        self.latencies = []
        self.token_counts = []

    def measure_latency(self, func, *args, **kwargs):
        """Call ``func(*args, **kwargs)``, record its duration, return its result.

        Nothing is recorded when ``func`` raises; the exception propagates.
        """
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        result = func(*args, **kwargs)
        self.latencies.append(time.perf_counter() - start)
        return result

    def measure_throughput(self, func, inputs, num_requests=100):
        """Call ``func(inputs)`` ``num_requests`` times and return calls/second.

        Returns ``0.0`` when ``num_requests`` is not positive and
        ``float("inf")`` if the clock registers no elapsed time.
        """
        if num_requests <= 0:
            return 0.0

        start = time.perf_counter()
        for _ in range(num_requests):
            func(inputs)
        total_time = time.perf_counter() - start

        if total_time <= 0:
            return float("inf")
        return num_requests / total_time

    def get_statistics(self):
        """Summarise the latencies recorded by ``measure_latency``.

        Returns:
            A dict with ``avg_latency``, ``p50_latency``, ``p95_latency``,
            ``p99_latency`` (seconds) and ``throughput`` (recorded calls per
            second of recorded latency). Every value is ``0.0`` when nothing
            has been recorded yet.
        """
        if not self.latencies:
            return dict.fromkeys(self._STAT_KEYS, 0.0)

        samples = np.asarray(self.latencies, dtype=float)
        total_latency = float(samples.sum())
        p50, p95, p99 = (float(v) for v in np.percentile(samples, [50, 95, 99]))

        return {
            "avg_latency": float(samples.mean()),
            "p50_latency": p50,
            "p95_latency": p95,
            "p99_latency": p99,
            # Guard against a zero total so this never divides by zero.
            "throughput": (
                len(samples) / total_latency if total_latency > 0 else float("inf")
            ),
        }
优化策略总结
# Optimization strategies in priority order, as (label, description) pairs.
optimization_strategies = [
    (
        "1. 量化",
        "INT8/INT4量化减少内存和计算",
    ),
    (
        "2. KV Cache",
        "避免重复计算,加速自回归生成",
    ),
    (
        "3. 批处理",
        "提高GPU利用率",
    ),
    (
        "4. 并行推理",
        "使用多GPU处理大模型",
    ),
    (
        "5. 框架优化",
        "使用vLLM、TensorRT-LLM等专用框架",
    ),
]
# Selection advice
def select_optimization(model_size, latency_requirement, throughput_requirement):
    """Recommend a combination of optimizations for the given requirements.

    A ``"low"`` latency requirement takes precedence over a ``"high"``
    throughput requirement; any other combination gets the baseline
    recommendation. ``model_size`` is accepted but does not affect the result.
    """
    # Latency-sensitive workloads are handled first.
    if latency_requirement == "low":
        return "量化 + KV Cache + 专用框架"

    # Otherwise optimise for throughput when it is explicitly requested.
    if throughput_requirement == "high":
        return "批处理 + 并行推理 + PagedAttention"

    # Baseline recommendation.
    return "量化 + KV Cache"
推理优化是LLM部署成功的关键,需要根据具体场景选择合适的优化策略。