← 返回首页
🧠

LLM推理优化技术

📂 llm ⏱ 4 min 623 words

---
title: "LLM推理优化技术"
description: "掌握大语言模型推理优化的各种技术,包括量化、缓存、批处理和并行推理"
tags: ["推理优化", "量化推理", "推理加速", "性能优化"]
category: "llm"
icon: "🧠"
---

LLM推理优化技术

推理优化概述

大语言模型的推理面临巨大挑战:模型参数量大、生成序列长、延迟要求高。推理优化是将LLM部署到生产环境的关键。本文介绍主要的推理优化技术。

推理优化的核心目标:降低单次请求的延迟、提高整体吞吐量、减少显存占用。

量化推理

INT8量化

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 8-bit quantization settings handed to the model loader.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None
)

# Load the checkpoint with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)

# The tokenizer has to be loaded explicitly; loading the model does not
# create one.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Inference: move the encoded prompt to the model's device so generate()
# does not receive CPU tensors while the weights live on a GPU.
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)

INT4量化

# 4-bit quantization settings: NF4 quant type, float16 compute dtype and
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the checkpoint with the 4-bit config applied.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    quantization_config=bnb_config,
)

GGUF量化

# 使用llama.cpp加载GGUF模型
from llama_cpp import Llama

# Load the quantized GGUF model file.
llm = Llama(
    n_threads=4,  # number of CPU threads
    n_ctx=2048,
    model_path="./models/llama-2-7b-q4_k_m.gguf",
)

# Run one chat-style completion and print the text of the first choice.
output = llm.create_chat_completion(
    max_tokens=256,
    messages=[{"role": "user", "content": "Hello!"}],
)
print(output["choices"][0]["message"]["content"])

KV Cache优化

启用KV Cache

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the causal-LM checkpoint and its tokenizer (no quantization config is
# passed here). The model is loaded first, then the tokenizer.
model, tokenizer = (
    AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf"),
    AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf"),
)

# 使用KV Cache进行自回归生成
def generate_with_cache(model, tokenizer, prompt, max_new_tokens=100):
    """Greedy-decode a continuation of ``prompt`` while reusing the KV cache.

    The prompt is run through the model once (prefill). Every later step
    feeds only the newly chosen token together with the cached
    ``past_key_values``, so earlier positions are not recomputed.

    Args:
        model: Callable causal LM whose output exposes ``logits`` and
            ``past_key_values``.
        tokenizer: Callable returning a mapping of tensors for ``prompt`` and
            providing ``decode``; its ``eos_token_id`` (if any) ends
            generation early.
        prompt: A single prompt. ``.item()`` is used on the chosen token, so
            only batch size 1 is supported.
        max_new_tokens: Upper bound on generated tokens. Values <= 0 produce
            an empty generation.

    Returns:
        ``tokenizer.decode`` of the generated token ids (the prompt is not
        included; an EOS token, if produced, is the last id).
    """
    if max_new_tokens <= 0:
        return tokenizer.decode([])

    inputs = tokenizer(prompt, return_tensors="pt")

    # Keep the inputs on the same device as the model when it advertises one.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    generated_tokens = []

    with torch.no_grad():
        # Prefill: process the whole prompt and populate the cache.
        outputs = model(**inputs, use_cache=True)

        for step in range(max_new_tokens):
            next_token = outputs.logits[:, -1:].argmax(dim=-1)
            token_id = next_token.item()
            generated_tokens.append(token_id)

            # Stop at EOS, and skip the forward pass whose output would
            # never be used once the token budget is exhausted.
            if token_id == eos_token_id or step == max_new_tokens - 1:
                break

            # Decode step: only the new token is processed.
            outputs = model(
                input_ids=next_token,
                past_key_values=outputs.past_key_values,
                use_cache=True,
            )

    return tokenizer.decode(generated_tokens)

PagedAttention(vLLM)

from vllm import LLM, SamplingParams

# Create the vLLM engine.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    max_model_len=2048,
    block_size=16,  # PagedAttention block size
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1,  # number of GPUs
)

# Batched inference: all prompts go to the engine in a single call.
sampling_params = SamplingParams(max_tokens=256, temperature=0.7)
prompts = ["Hello!", "How are you?", "Tell me a joke"]

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    # Print the text of the first completion for each prompt.
    print(output.outputs[0].text)

批处理优化

动态批处理

import asyncio
from collections import deque

class DynamicBatcher:
    """Collect concurrent prompts and run them through ``generate`` as a batch.

    A batch is dispatched as soon as ``max_batch_size`` requests are queued.
    A partial batch is dispatched at most ``max_wait_time`` seconds after a
    request found no flush timer pending, so callers never wait forever for
    the queue to fill up.

    Note: ``process_batch`` contains no ``await`` points, so the model call
    runs on (and blocks) the event-loop thread.
    """

    def __init__(self, model, tokenizer, max_batch_size=32, max_wait_time=0.1):
        """Store the collaborators and batching limits.

        Args:
            model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
            tokenizer: Callable that encodes a list of prompts and provides
                ``decode``.
            max_batch_size: Number of queued requests that triggers an
                immediate dispatch.
            max_wait_time: Seconds a partial batch may wait before it is
                flushed.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_batch_size = max_batch_size
        self.max_wait_time = max_wait_time
        self.queue = deque()
        # Timer task that flushes a partial batch; None until first needed.
        self._flush_task = None

    async def add_request(self, prompt):
        """Queue ``prompt`` and return its decoded response once available.

        Raises:
            Exception: Whatever tokenization, generation or decoding raised
                for the batch that contained this request.
        """
        future = asyncio.get_running_loop().create_future()
        self.queue.append((prompt, future))

        if len(self.queue) >= self.max_batch_size:
            await self.process_batch()
        elif self._flush_task is None or self._flush_task.done():
            # Arm the timer so an under-filled batch is still processed.
            self._flush_task = asyncio.ensure_future(self._flush_after_wait())

        return await future

    async def _flush_after_wait(self):
        """Dispatch whatever is queued once ``max_wait_time`` has elapsed."""
        await asyncio.sleep(self.max_wait_time)
        await self.process_batch()

    async def process_batch(self):
        """Run up to ``max_batch_size`` queued requests as one batch.

        Each request's future receives its decoded output. If any step of the
        batch fails, the exception is delivered to every future in the batch
        instead of leaving the callers suspended.
        """
        if not self.queue:
            return

        # Drain up to one batch worth of requests.
        batch = []
        while self.queue and len(batch) < self.max_batch_size:
            batch.append(self.queue.popleft())

        prompts = [prompt for prompt, _ in batch]
        futures = [future for _, future in batch]

        try:
            inputs = self.tokenizer(prompts, return_tensors="pt", padding=True)
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=256)
            responses = [
                self.tokenizer.decode(output, skip_special_tokens=True)
                for output in outputs
            ]
        except Exception as exc:
            for future in futures:
                if not future.done():
                    future.set_exception(exc)
            return

        # A future may already be done if its caller was cancelled.
        for future, response in zip(futures, responses):
            if not future.done():
                future.set_result(response)

连续批处理

class ContinuousBatcher:
    """Token-by-token greedy generation with a KV cache.

    Note: as written, each ``generate`` call handles one request from start
    to finish without any ``await`` point, so concurrent requests are not
    interleaved. ``active_requests`` and ``completed_requests`` are created
    but not populated by this class.
    """

    def __init__(self, model, tokenizer):
        """Store the model/tokenizer pair and create the request registries."""
        self.model = model
        self.tokenizer = tokenizer
        self.active_requests = {}
        self.completed_requests = {}

    async def generate(self, request_id, prompt, max_tokens=256):
        """Generate up to ``max_tokens`` tokens for ``prompt``.

        Args:
            request_id: Caller-supplied identifier; currently unused.
            prompt: A single prompt. ``.item()`` is used on the chosen token,
                so only batch size 1 is supported.
            max_tokens: Upper bound on generated tokens.

        Returns:
            ``tokenizer.decode`` of the generated ids. Generation stops after
            the tokenizer's EOS id is produced (that id is included).
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        eos_token_id = self.tokenizer.eos_token_id
        generated_tokens = []

        with torch.no_grad():
            # Prefill: the prompt is processed once and fills the cache.
            outputs = self.model(**inputs, use_cache=True)

            for step in range(max_tokens):
                # The first token comes from the prefill logits; later ones
                # come from the previous decode step.
                next_token = outputs.logits[:, -1:].argmax(dim=-1)
                token_id = next_token.item()
                generated_tokens.append(token_id)

                # Finished: EOS reached or the token budget is used up.
                if token_id == eos_token_id or step == max_tokens - 1:
                    break

                # Feed the newly generated token, not the last prompt token,
                # so the sequence actually advances.
                outputs = self.model(
                    input_ids=next_token,
                    past_key_values=outputs.past_key_values,
                    use_cache=True,
                )

        return self.tokenizer.decode(generated_tokens)

张量并行

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_tensor_parallel(model, world_size):
    """Initialise the NCCL process group and wrap ``model`` for multi-GPU use.

    NOTE(review): despite the name, ``DistributedDataParallel`` keeps a full
    copy of the model on every rank (data parallelism); it does not split
    layers or weight matrices across GPUs. For real tensor parallelism use an
    engine that implements it, such as vLLM's ``tensor_parallel_size``.

    Args:
        model: The ``torch.nn.Module`` to wrap.
        world_size: Kept for interface compatibility; currently unused here.

    Returns:
        The model moved to this rank's GPU and wrapped in
        ``DistributedDataParallel``.

    Raises:
        KeyError: If the ``LOCAL_RANK`` environment variable is not set.
    """
    import os  # local import: ``os`` is not imported alongside this snippet

    # Avoid re-initialising if the caller already set up the process group.
    if not dist.is_initialized():
        dist.init_process_group("nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # With ``device_ids`` set, DDP expects the module's parameters to already
    # be on that device.
    model = model.to(torch.device("cuda", local_rank))
    model = DDP(model, device_ids=[local_rank])
    return model

# 使用vLLM的张量并行
from vllm import LLM

# Create a vLLM engine for the 70B checkpoint, split across GPUs.
llm = LLM(
    max_model_len=4096,
    tensor_parallel_size=4,  # use 4 GPUs
    model="meta-llama/Llama-2-70b-hf",
)

推理框架对比

# Framework selection guide: for each inference framework, its key traits
# ("特点"), the scenario it suits ("适用") and how to install it ("安装").
frameworks = {
    "vLLM": {
        "特点": "PagedAttention,高吞吐量",
        "适用": "高并发服务",
        "安装": "pip install vllm"
    },
    "TensorRT-LLM": {
        "特点": "NVIDIA优化,极致性能",
        "适用": "NVIDIA GPU部署",
        "安装": "pip install tensorrt-llm"
    },
    "llama.cpp": {
        "特点": "CPU推理,轻量级",
        "适用": "边缘设备,CPU部署",
        "安装": "从源码编译"
    },
    "TGI": {
        "特点": "Hugging Face官方,易用",
        "适用": "快速部署",
        # TGI is distributed as a container image rather than as a pip
        # package, so point readers at the official Docker image.
        "安装": "docker pull ghcr.io/huggingface/text-generation-inference"
    }
}

性能监控

import time
import numpy as np

class InferenceProfiler:
    """Collect per-call latencies and derive summary statistics from them."""

    _STAT_KEYS = (
        "avg_latency",
        "p50_latency",
        "p95_latency",
        "p99_latency",
        "throughput",
    )

    def __init__(self):
        # Seconds taken by each call made through measure_latency().
        self.latencies = []
        # Not written by any method of this class.
        self.token_counts = []

    def measure_latency(self, func, *args, **kwargs):
        """Call ``func(*args, **kwargs)``, record its duration, return its result.

        Nothing is recorded when ``func`` raises; the exception propagates.
        """
        # perf_counter is monotonic, so system clock adjustments cannot
        # produce negative or skewed durations.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        self.latencies.append(time.perf_counter() - start)
        return result

    def measure_throughput(self, func, inputs, num_requests=100):
        """Call ``func(inputs)`` ``num_requests`` times and return calls/second.

        Returns ``float("inf")`` if the clock registered no elapsed time for a
        non-zero number of requests, and ``0.0`` when no requests were made.
        """
        start = time.perf_counter()
        for _ in range(num_requests):
            func(inputs)
        total_time = time.perf_counter() - start
        if total_time <= 0:
            return float("inf") if num_requests > 0 else 0.0
        return num_requests / total_time

    def get_statistics(self):
        """Summarise the recorded latencies.

        Returns:
            Dict with mean, p50, p95 and p99 latency in seconds, plus
            ``throughput`` computed as recorded calls divided by the sum of
            their latencies. All values are ``0.0`` when nothing has been
            recorded yet.
        """
        if not self.latencies:
            return dict.fromkeys(self._STAT_KEYS, 0.0)

        samples = np.asarray(self.latencies, dtype=float)
        p50, p95, p99 = np.percentile(samples, [50, 95, 99])
        busy_time = float(samples.sum())
        return {
            "avg_latency": float(samples.mean()),
            "p50_latency": float(p50),
            "p95_latency": float(p95),
            "p99_latency": float(p99),
            "throughput": (
                len(samples) / busy_time if busy_time > 0 else float("inf")
            ),
        }

优化策略总结

# Optimization strategies in priority order, as (label, description) pairs.
optimization_strategies = list(zip(
    (
        "1. 量化",
        "2. KV Cache",
        "3. 批处理",
        "4. 并行推理",
        "5. 框架优化",
    ),
    (
        "INT8/INT4量化减少内存和计算",
        "避免重复计算,加速自回归生成",
        "提高GPU利用率",
        "使用多GPU处理大模型",
        "使用vLLM、TensorRT-LLM等专用框架",
    ),
))

# 选择建议
def select_optimization(model_size, latency_requirement, throughput_requirement):
    """Pick a recommended combination of optimizations.

    A ``"low"`` latency requirement takes precedence over a ``"high"``
    throughput requirement; anything else gets the baseline recommendation.
    ``model_size`` is accepted but does not influence the result.

    Returns:
        A string naming the recommended techniques.
    """
    # Latency-sensitive workloads are checked first.
    if latency_requirement == "low":
        return "量化 + KV Cache + 专用框架"

    # Otherwise optimise for throughput when that is requested.
    if throughput_requirement == "high":
        return "批处理 + 并行推理 + PagedAttention"

    # Baseline recommendation.
    return "量化 + KV Cache"

推理优化是LLM部署成功的关键,需要根据具体场景选择合适的优化策略。