← 返回首页
🧠

本地LLM部署指南

📂 llm ⏱ 3 min 469 words

---
title: "本地LLM部署指南"
description: "全面介绍如何在本地部署和运行大语言模型,包括硬件要求和优化技巧"
tags: ["本地部署", "LLM", "Ollama", "推理"]
category: "llm"
icon: "🧠"
---

本地LLM部署指南

为什么选择本地部署

本地部署让数据始终留在自己的机器上,既能保护隐私,也支持离线使用,并且没有按调用量计费的API成本;代价是需要自行准备满足要求的硬件。

硬件要求

# Hardware needed to run models of different parameter counts.
# Each row holds: model size label, then one value per entry of _SPEC_FIELDS
# (GPU memory, system memory, recommended GPUs).
_SPEC_FIELDS = ("显存", "内存", "推荐显卡")
_SPEC_ROWS = (
    ("1-3B参数", "4-6GB", "8GB", "RTX 3060, RTX 4060"),
    ("7-8B参数", "8-12GB", "16GB", "RTX 3080, RTX 4070"),
    ("13-14B参数", "16-24GB", "32GB", "RTX 3090, RTX 4090"),
    ("70B参数", "48GB+ (或多GPU)", "64GB", "A100, 多卡配置"),
)

# Mapping of size label -> {field name: requirement}, in table order.
hardware_requirements = {
    size_label: dict(zip(_SPEC_FIELDS, values))
    for size_label, *values in _SPEC_ROWS
}

# Print one section per model size: a blank line, the size label followed by
# a colon, then every requirement indented by two spaces.
for size_label, specs in hardware_requirements.items():
    section = [f"\n{size_label}:"]
    section += [f"  {field}: {value}" for field, value in specs.items()]
    print("\n".join(section))

使用Ollama部署

安装Ollama

# Windows/Mac: download the installer from
# https://ollama.com/download

# Linux: fetch the install script from ollama.com and pipe it to sh
curl -fsSL https://ollama.com/install.sh | sh

# Start the Ollama server (the Python examples in this guide call it at
# http://localhost:11434)
ollama serve

下载和运行模型

# Download and run a model
ollama run llama3:8b

# Download a specific size (the tag after the colon, e.g. 7b)
ollama pull qwen2:7b
ollama pull mistral:7b

# List the models that have been downloaded
ollama list

# Remove a model (replace model_name with the model to delete)
ollama rm model_name

Python调用

import requests
import json

def ollama_chat(prompt, model="llama3:8b", timeout=None):
    """Send one prompt to the local Ollama server and return the answer.

    Posts to ``http://localhost:11434/api/generate`` with streaming
    disabled, so the complete answer arrives in a single JSON body.

    Args:
        prompt: Text sent as the ``prompt`` field of the request.
        model: Ollama model tag to run.
        timeout: Optional timeout forwarded to ``requests.post`` (seconds,
            or a ``(connect, read)`` tuple). ``None`` keeps the previous
            behaviour of waiting indefinitely; pass a value to avoid
            hanging forever on an unresponsive server.

    Returns:
        The value of the ``response`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems or timeouts.
    """
    url = "http://localhost:11434/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }

    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP errors directly; otherwise an error body that lacks a
    # "response" key would fail below with an unrelated KeyError.
    response.raise_for_status()
    return response.json()["response"]

# Usage: ask the default model to write a quicksort in Python and print the reply
response = ollama_chat("用Python写一个快速排序算法")
print(response)

流式输出

def ollama_stream(prompt, model="llama3:8b", timeout=None):
    """Stream an answer from the local Ollama server, echoing it as it arrives.

    Posts to ``http://localhost:11434/api/generate`` with streaming enabled.
    Every non-empty line of the reply is parsed as JSON; when it contains a
    ``response`` field, that fragment is printed immediately (no newline,
    flushed) and collected. A final newline is printed once the stream ends.

    Args:
        prompt: Text sent as the ``prompt`` field of the request.
        model: Ollama model tag to run.
        timeout: Optional timeout forwarded to ``requests.post`` (seconds,
            or a ``(connect, read)`` tuple). ``None`` keeps the previous
            behaviour of waiting indefinitely.

    Returns:
        All streamed fragments concatenated into one string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems or timeouts.
    """
    url = "http://localhost:11434/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True,
    }

    fragments = []
    # The context manager releases the streamed connection even when parsing
    # fails or the caller interrupts the output midway.
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        # Fail early on HTTP errors instead of silently printing nothing.
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            data = json.loads(line)
            if "response" in data:
                # Flush on every fragment so the text appears in real time.
                print(data["response"], end="", flush=True)
                fragments.append(data["response"])

    print()
    return "".join(fragments)

# 使用
# ollama_stream("写一个简短的Python函数")

使用llama.cpp

安装和编译

# Clone the repository
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp

# Build (CMake is required on Windows)
cmake -B build
cmake --build build --config Release

# Alternatively, use a prebuilt release:
# download it from GitHub Releases

转换模型格式

# Convert a HuggingFace model to GGUF format
python convert_hf_to_gguf.py model_path --outfile model.gguf

# Quantize the model to reduce the file size (Q4_K_M here).
# A CMake build places its executables under build/bin/ rather than the
# repository root (build\bin\Release\ with multi-config generators such as
# Visual Studio).
./build/bin/llama-quantize model.gguf model-q4_k_m.gguf Q4_K_M

运行推理

# Interactive chat: start from the given prompt (-p), limited to 500 tokens (-n).
# A CMake build places its executables under build/bin/ rather than the
# repository root (build\bin\Release\ with multi-config generators such as
# Visual Studio).
./build/bin/llama-cli -m model.gguf -p "你好,请介绍一下自己" -n 500

# Use a chat template
./build/bin/llama-cli -m model.gguf \
  --chat-template chatml \
  -p "What is machine learning?"

Python绑定

# 使用llama-cpp-python
# pip install llama-cpp-python

from llama_cpp import Llama

# Load the model
llm = Llama(
    model_path="./models/model.gguf",
    n_ctx=2048,      # context length
    n_threads=4       # number of CPU threads
)

# Generate text: at most 500 tokens, stopping early at the first blank line
# because "\n\n" is passed as a stop sequence
output = llm(
    "什么是深度学习?",
    max_tokens=500,
    temperature=0.7,
    stop=["\n\n"]
)

# The generated text is read from the first entry of "choices"
print(output["choices"][0]["text"])

使用vLLM部署

安装和启动

# Install
pip install vllm

# Start the OpenAI-compatible server on port 8000.
# NOTE(review): --host 0.0.0.0 listens on every network interface; use
# 127.0.0.1 if the API should only be reachable from this machine.
python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Llama-2-7b-hf \
  --host 0.0.0.0 \
  --port 8000

Python调用

from openai import OpenAI

# vLLM provides an OpenAI-compatible API, so the OpenAI client can talk to it
# by pointing base_url at the local server
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

# The model name is the same one the server was started with.
# NOTE(review): chat completions rely on the model's chat template, and
# Llama-2-7b-hf is a base (non-chat) model — confirm it serves chat requests,
# or switch both the server and this client to a chat-tuned model.
response = client.chat.completions.create(
    model="meta-llama/Llama-2-7b-hf",
    messages=[
        {"role": "user", "content": "解释什么是Transformer"}
    ]
)

# Print the text of the first returned choice
print(response.choices[0].message.content)

使用Transformers库

直接加载模型

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_model(model_name, device="auto"):
    """Load a causal language model and its tokenizer by name.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.
        device: Value forwarded as ``device_map`` when loading the model.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is loaded with
        ``torch.float16`` as its ``torch_dtype``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Half-precision weights, placed according to the requested device map.
    load_options = {"torch_dtype": torch.float16, "device_map": device}
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

    return model, tokenizer

def generate_text(model, tokenizer, prompt, max_length=500):
    """Generate text for ``prompt`` by sampling from ``model``.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable that encodes the prompt and offers ``decode``.
        prompt: Input text to encode.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (in Hugging Face Transformers this limit counts the prompt
            tokens as well as the newly generated ones).

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Encode the prompt and move the tensors to wherever the model lives.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sampling is enabled with a fixed temperature of 0.7.
    sampling = {"max_length": max_length, "temperature": 0.7, "do_sample": True}

    # Gradients are not needed for inference.
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# 使用
# model, tokenizer = load_model("Qwen/Qwen2-7B")
# response = generate_text(model, tokenizer, "你好!")

量化加载

from transformers import BitsAndBytesConfig

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,         # enable double (nested) quantization
    bnb_4bit_quant_type="nf4",              # use the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16    # run computations in float16
)

# Load the model with the quantization config; device_map="auto" lets the
# library decide where to place the weights
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)

性能优化技巧

# 1. Use Flash Attention
# NOTE: model_name is not defined in this snippet; set it to the model to load.
# NOTE(review): "flash_attention_2" presumably needs the flash-attn package
# and a supported GPU — confirm before relying on it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16
)

# 2. 使用KV Cache
# vLLM默认启用

# 3. 批量推理
def batch_inference(prompts, batch_size=8, process_batch=None):
    """Run prompts through ``process_batch`` in fixed-size batches.

    Args:
        prompts: Sequence of prompts; it must support ``len()`` and slicing.
        batch_size: Maximum number of prompts per batch; must be at least 1.
        process_batch: Callable that receives one batch (a slice of
            ``prompts``) and returns an iterable with one result per prompt,
            e.g. a wrapper around a batched ``model.generate`` call. When
            omitted, nothing is processed and an empty list is returned,
            which is what the original placeholder did.

    Returns:
        A flat list with the results of all batches, in prompt order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    results = []
    if process_batch is None:
        # No processing step supplied: keep the placeholder behaviour.
        return results

    for start in range(0, len(prompts), batch_size):
        # The last batch may be shorter than batch_size.
        results.extend(process_batch(prompts[start:start + batch_size]))
    return results

常见问题解决

# Running out of GPU memory? Options:
# 1. Use quantization
# 2. Reduce the context length
# 3. Use hybrid CPU+GPU inference
# 4. Use multiple GPUs

# Multi-GPU configuration
# NOTE: model_name is not defined in this snippet; set it to the model to load.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # automatically distribute across multiple GPUs
    torch_dtype=torch.float16
)

总结

本地部署LLM有多种选择,从简单的Ollama到高性能的vLLM。根据硬件条件和需求选择合适的方案,可以实现高效、隐私的本地AI推理。