← 返回首页
🧠

llama.cpp:高效CPU/GPU推理

📂 llm ⏱ 3 min 517 words

---
title: "llama.cpp:高效CPU/GPU推理"
description: "介绍llama.cpp的使用方法,实现高效的本地模型推理"
tags: ["llama.cpp", "GGUF", "CPU推理", "量化"]
category: "llm"
icon: "🧠"
---

llama.cpp:高效CPU/GPU推理

llama.cpp简介

llama.cpp是一个纯C/C++实现的LLaMA推理库,支持在CPU和GPU上高效运行量化后的大语言模型,是本地部署的重要工具。

安装与编译

源码编译

# Clone the repository
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp

# Linux/macOS build
cmake -B build
cmake --build build --config Release

# Windows build (requires Visual Studio or MinGW)
# GGML_CUDA enables the CUDA backend; it replaces the deprecated LLAMA_CUDA option.
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release

使用预编译版本

# 从GitHub Releases下载对应平台的二进制文件
# Windows: llama-*-win-x64.zip
# Mac: llama-*-apple-arm64.zip

模型转换

HuggingFace转GGUF

# Download the model
git lfs install
git clone https://huggingface.co/meta-llama/Llama-2-7b-hf

# Convert the checkpoint to GGUF format
python convert_hf_to_gguf.py Llama-2-7b-hf --outfile llama-2-7b-f16.gguf

# Quantize the GGUF file (arguments: input file, output file, quantization type)
./llama-quantize llama-2-7b-f16.gguf llama-2-7b-q4_k_m.gguf Q4_K_M

量化方法对比

# Quantization type name -> short description, listed from smallest to largest.
quantization_methods = dict(
    Q2_K="2-bit,体积最小,质量损失大",
    Q3_K_S="3-bit small,体积小",
    Q4_K_M="4-bit medium,推荐的平衡选择",
    Q5_K_M="5-bit medium,质量更好",
    Q6_K="6-bit,接近原始精度",
    Q8_0="8-bit,几乎无质量损失",
    F16="16-bit浮点,原始精度",
)

# Print one "name: description" line per quantization type.
print("\n".join(f"{name}: {summary}" for name, summary in quantization_methods.items()))

命令行使用

基础推理

# Text generation: -m model file, -p prompt, -n number of tokens to generate
./llama-cli -m model.gguf -p "Hello, how are you?" -n 500

# Interactive chat
./llama-cli -m model.gguf --interactive

# Use a chat template
./llama-cli -m model.gguf \
  --chat-template chatml \
  -p "What is machine learning?" \
  -n 1000

参数说明

# Key parameters:
#   -m            model path
#   -p            prompt
#   -n            number of tokens to generate
#   -t            number of CPU threads
#   --temp        sampling temperature
#   --top-p       nucleus (top-p) sampling threshold
#   --ctx-size    context length
#   --gpu-layers  number of layers offloaded to the GPU
# The descriptions are kept above the command: a comment placed after a
# trailing backslash breaks the line continuation and splits the command.
./llama-cli \
  -m model.gguf \
  -p "prompt" \
  -n 512 \
  -t 4 \
  --temp 0.7 \
  --top-p 0.9 \
  --ctx-size 4096 \
  --gpu-layers 35

服务器模式

# Start the API server
# NOTE(review): --host 0.0.0.0 listens on every network interface; confirm
# that exposure is intended, otherwise bind to 127.0.0.1.
./llama-server -m model.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -t 4

# Call the chat-completions endpoint of the server started above
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "model",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'

Python绑定

安装

pip install llama-cpp-python

基础使用

from llama_cpp import Llama

# Load the model
load_options = {
    "model_path": "./models/llama-2-7b-q4_k_m.gguf",
    "n_ctx": 2048,
    "n_threads": 4,
    "verbose": True,
}
llm = Llama(**load_options)

# Text generation: stop at the first blank line or after 100 tokens
output = llm("The capital of France is", max_tokens=100, temperature=0.7, stop=["\n\n"])

first_choice = output["choices"][0]
print(first_choice["text"])

聊天模式

# Use the chat template: pass the conversation as role/content messages
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain quantum computing"},
]
output = llm.create_chat_completion(messages=conversation, max_tokens=500, temperature=0.7)

reply = output["choices"][0]["message"]
print(reply["content"])

流式输出

def stream_generate(prompt, max_tokens=200):
    """Stream a completion to stdout and return the full generated text.

    Args:
        prompt: Text prompt passed to the module-level ``llm``.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The concatenation of every streamed text fragment.
    """
    stream = llm(prompt, max_tokens=max_tokens, stream=True)

    pieces = []
    for chunk in stream:
        choice = chunk["choices"][0]
        # Plain completion chunks carry their text under "text"; only chat
        # completion chunks use a "delta" object, so indexing "delta"
        # unconditionally fails here. The fallback keeps chat-style chunks
        # working as well.
        piece = choice.get("text") or choice.get("delta", {}).get("content")
        if piece:
            print(piece, end="", flush=True)
            pieces.append(piece)

    print()
    return "".join(pieces)

# 使用
# stream_generate("Write a short poem about AI:")

批量推理

from concurrent.futures import ThreadPoolExecutor

def batch_inference(prompts, max_workers=4):
    """Generate a completion for every prompt, preserving input order.

    Args:
        prompts: Iterable of prompt strings.
        max_workers: Size of the worker thread pool; must be positive.

    Returns:
        A list with the generated text for each prompt, in input order.
    """
    import threading

    # The worker threads all share the single module-level ``llm``. Running
    # several generations on one model instance at the same time is not safe,
    # so each call is serialized; the pool therefore gives no parallel speed-up.
    llm_lock = threading.Lock()

    def generate_single(prompt):
        with llm_lock:
            output = llm(prompt, max_tokens=100)
        return output["choices"][0]["text"]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(generate_single, prompts))

# Run three sample prompts and print the first 50 characters of each answer.
prompts = ["What is AI?", "What is ML?", "What is DL?"]
results = batch_inference(prompts)
for question, answer in zip(prompts, results):
    preview = answer[:50]
    print(f"{question} -> {preview}...")

GPU加速

CUDA加速

# Enable the CUDA backend at build time
# (GGML_CUDA replaces the deprecated LLAMA_CUDA option)
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release

# Choose how many layers to offload to the GPU at run time
./llama-cli -m model.gguf -p "Hello" --gpu-layers 35

Metal加速(Mac)

# Enable the Metal backend at build time
# (GGML_METAL replaces the deprecated LLAMA_METAL option)
cmake -B build -DGGML_METAL=ON
cmake --build build --config Release

性能优化

# Tuned loading options
tuning = {
    "n_ctx": 4096,
    "n_threads": 8,      # match the number of CPU cores
    "n_batch": 512,      # batch size
    "use_mmap": True,    # memory-map the model file
    "use_mlock": True,   # lock the model in memory
    "verbose": False,
}
llm = Llama(model_path="model.gguf", **tuning)

性能测试

import time

def benchmark(prompt, n_tokens=100, n_runs=5):
    """Measure generation throughput of the module-level ``llm``.

    Each run times one full call, so prompt processing is included.

    Args:
        prompt: Prompt used for every run.
        n_tokens: Maximum number of tokens to generate per run.
        n_runs: Number of timed runs; must be at least 1.

    Returns:
        Generated tokens per second over all runs.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    total_time = 0.0
    total_tokens = 0
    for _ in range(n_runs):
        # perf_counter is monotonic, unlike the wall clock from time.time().
        start = time.perf_counter()
        output = llm(prompt, max_tokens=n_tokens)
        total_time += time.perf_counter() - start
        # Count what was really generated: the model may stop before
        # reaching n_tokens, which would inflate the reported speed.
        total_tokens += output["usage"]["completion_tokens"]

    avg_time = total_time / n_runs
    tokens_per_sec = total_tokens / total_time if total_time > 0 else 0.0

    print(f"平均时间: {avg_time:.2f}s")
    print(f"速度: {tokens_per_sec:.1f} tokens/s")

    return tokens_per_sec

# benchmark("What is deep learning?")

实际应用示例

from llama_cpp import Llama

class LocalLLM:
    """Small chat wrapper around a ``Llama`` model.

    Attributes:
        llm: The loaded ``Llama`` instance used for every request.
    """

    def __init__(self, model_path, n_ctx=2048, n_threads=4):
        """Load the model.

        Args:
            model_path: Path to the GGUF model file.
            n_ctx: Context length passed to ``Llama``.
            n_threads: Number of CPU threads passed to ``Llama``.
        """
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_threads=n_threads,
        )

    def chat(self, user_input, system_prompt=None, max_tokens=500, temperature=0.7):
        """Send one user message and return the assistant's reply text.

        Args:
            user_input: Content of the user message.
            system_prompt: Optional system message; skipped when empty or None.
            max_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature.

        Returns:
            The content of the first choice's message.
        """
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": user_input})

        output = self.llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        return output["choices"][0]["message"]["content"]

# Usage: load the local model and ask it a coding question.
# NOTE(review): this rebinds `llm`, which the earlier snippets use as a Llama
# instance; run it separately from them.
model_file = "./models/llama-2-7b-q4_k_m.gguf"
llm = LocalLLM(model_file)

question = "写一个Python函数计算斐波那契数列"
persona = "你是一个Python编程助手"
response = llm.chat(question, system_prompt=persona)
print(response)

常见问题

# 显存不足
# 解决方案:减少n_gpu_layers或使用CPU推理

# 生成质量差
# 解决方案:使用更高精度的量化版本(Q5/Q6/Q8)

# 速度慢
# 解决方案:
# 1. 增加n_threads
# 2. 启用GPU加速
# 3. 减小n_ctx

总结

llama.cpp是本地运行LLM的高效选择,支持多种量化格式,可在CPU和GPU上运行。通过Python绑定,可以方便地集成到各种应用中。