Docker部署LLM:使用容器化技术部署大语言模型
Docker部署LLM:使用容器化技术部署大语言模型
Docker部署优势
使用Docker部署大语言模型可以确保环境一致性、简化依赖管理、提高可移植性,并支持快速扩展。容器化技术使得LLM服务可以在任何支持Docker的环境中稳定运行。
Docker镜像构建
基础镜像选择
# Dockerfile.base
# Base image: CUDA 11.8 runtime on Ubuntu 22.04 with Python 3.10 and the
# packages listed in requirements-base.txt preinstalled.
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
# Install Python 3.10, pip and the Python development headers. The apt package
# lists are removed in the same RUN step so they do not end up in the layer.
RUN apt-get update && apt-get install -y \
python3.10 \
python3-pip \
python3.10-dev \
&& rm -rf /var/lib/apt/lists/*
# Set the working directory; the relative COPY below resolves against it
WORKDIR /app
# Install the base dependencies (--no-cache-dir keeps pip's cache out of the image)
COPY requirements-base.txt .
RUN pip3 install --no-cache-dir -r requirements-base.txt
# Set environment variables
# PYTHONUNBUFFERED=1 turns off Python's stdout/stderr buffering.
ENV PYTHONUNBUFFERED=1
# CUDA_VISIBLE_DEVICES=0 limits CUDA to the device with index 0.
ENV CUDA_VISIBLE_DEVICES=0
完整的应用镜像
# Dockerfile
# Application image: CUDA runtime + Python virtualenv + FastAPI service.
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

# Install system dependencies.
# curl is required by the HEALTHCHECK below and is not guaranteed to be
# present in the base image, so it is installed explicitly.
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3.10-venv \
    curl \
    libgl1-mesa-glx \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Create the virtual environment and put it first on PATH so that the
# `pip` and `uvicorn` commands below resolve inside it
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Create the non-root user before copying files so COPY --chown can be used
RUN useradd -m -u 1000 appuser

# WORKDIR creates /app; without it the relative COPY targets would land in /
# and the ownership change on /app further down would fail
WORKDIR /app

# Install Python dependencies before copying the code so this layer is
# reused when only the application sources change
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code, already owned by appuser. A later `chown -R` would
# rewrite every file (including the model weights) into an extra layer.
COPY --chown=appuser:appuser src/ ./src/
COPY --chown=appuser:appuser models/ ./models/
COPY --chown=appuser:appuser configs/ ./configs/

# Hand the directory itself to the unprivileged user and drop root
RUN chown appuser:appuser /app
USER appuser

# Expose the service port
EXPOSE 8000

# Health check; the start period matches the 40s used in docker-compose.yml
# so that model loading does not count as failed checks
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Start command
CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
多阶段构建
# Dockerfile.multistage
# Two-stage build: Python packages are installed in a throwaway builder image
# and only the installed packages are copied into the CUDA runtime image.
# Stage 1: build stage
FROM python:3.10-slim as builder
WORKDIR /build
# Install build dependencies (compiler toolchain for packages that build from source)
RUN apt-get update && apt-get install -y \
build-essential \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
# --user installs into the home directory of the build user, which is what the
# runtime stage copies from /root/.local below
RUN pip install --user --no-cache-dir -r requirements.txt
# Stage 2: runtime stage
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
# Copy the installed dependencies from the build stage
# NOTE(review): the builder is the python:3.10-slim image while this stage uses
# Ubuntu's python3.10 package - confirm that packages with compiled extensions
# load correctly under the runtime interpreter.
COPY --from=builder /root/.local /root/.local
ENV PATH=/root/.local/bin:$PATH
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
python3.10 \
curl \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy the application
COPY src/ ./src/
COPY models/ ./models/
# Set up the environment
ENV PYTHONUNBUFFERED=1
# PYTHONPATH=/app puts /app on the module search path for `src.main`
ENV PYTHONPATH=/app
EXPOSE 8000
# uvicorn is started as a module through the python3 interpreter
CMD ["python3", "-m", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
应用代码结构
FastAPI应用
# src/main.py
import logging
import os
import threading
from typing import List, Optional

import torch
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer
# Configure logging. LOG_LEVEL is honoured when set (the production compose
# file sets it); unknown names fall back to INFO instead of failing at import.
_log_level = logging.getLevelName(os.getenv("LOG_LEVEL", "INFO").upper())
logging.basicConfig(
    level=_log_level if isinstance(_log_level, int) else logging.INFO
)
logger = logging.getLogger(__name__)

app = FastAPI(title="LLM Docker Service")

# CORS configuration.
# CORS_ALLOW_ORIGINS is a comma-separated list of allowed origins; it defaults
# to "*" (any origin). Credentials are only allowed when explicit origins are
# configured: a wildcard origin combined with credentials would let any site
# issue credentialed requests to this API.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
class GenerateRequest(BaseModel):
    """Request body for ``POST /generate``.

    The field constraints reject values that would otherwise only fail deep
    inside ``model.generate()`` and come back to the client as an HTTP 500;
    with them the client receives a validation error instead.
    """

    # Prompt text; an empty prompt gives the model nothing to continue.
    prompt: str = Field(..., min_length=1)
    # Forwarded to generate() as ``max_length``.
    max_length: int = Field(100, ge=1)
    # Sampling temperature; must be strictly positive because the endpoint
    # always samples (do_sample=True).
    temperature: float = Field(0.7, gt=0.0)
    # Top-k cutoff for sampling; negative values are meaningless.
    top_k: int = Field(50, ge=0)
    # Nucleus-sampling probability mass, in (0, 1].
    top_p: float = Field(0.95, gt=0.0, le=1.0)
class GenerateResponse(BaseModel):
    """Response body returned by ``POST /generate``."""

    # Decoded text of the first generated sequence.
    generated_text: str
    # Output sequence length minus the prompt's token count.
    tokens_used: int
class LLMModel:
    """Owns a causal language model plus its tokenizer and runs generation."""

    def __init__(self):
        # All three stay None until load() has completed.
        self.model = None
        self.tokenizer = None
        self.device = None

    def load(self, model_path: str):
        """Load the tokenizer and model stored at ``model_path``.

        With CUDA available the model is loaded in float16 and placed with
        ``device_map="auto"``; otherwise it is loaded in float32 on the CPU.

        Args:
            model_path: Local directory or model identifier understood by
                ``from_pretrained``.
        """
        logger.info(f"Loading model from {model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        if torch.cuda.is_available():
            # device_map="auto" already places the weights, so the model must
            # not be moved again with .to() afterwards.
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                device_map="auto",
            )
        else:
            # float16 is a GPU-oriented dtype; use full precision on the CPU.
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float32,
            )

        self.model.eval()
        # Tokenized inputs are moved to this device in generate().
        self.device = self.model.device
        logger.info(f"Model loaded on {self.device}")

    def generate(self, prompt: str, **kwargs) -> tuple:
        """Generate a continuation for ``prompt``.

        Args:
            prompt: Text to continue.
            **kwargs: Forwarded unchanged to ``model.generate()``.

        Returns:
            ``(generated_text, tokens_used)`` where ``generated_text`` is the
            decoded first output sequence and ``tokens_used`` is the output
            length minus the prompt length, in tokens.

        Raises:
            RuntimeError: If load() has not been called successfully.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model is not loaded; call load() first")

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(**inputs, **kwargs)

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        tokens_used = outputs.shape[1] - inputs["input_ids"].shape[1]
        return generated_text, tokens_used
# Module-level model instance used by the request handlers
llm_model = LLMModel()


@app.on_event("startup")
async def startup_event():
    """Load the model when the application starts.

    The location comes from the MODEL_PATH environment variable and falls
    back to ``./models/default``.
    """
    llm_model.load(os.getenv("MODEL_PATH", "./models/default"))
@app.get("/health")
async def health_check():
    """Report service health.

    Returns HTTP 200 with the status payload once the model is loaded.
    While no model is loaded the endpoint answers HTTP 503, because
    ``curl -f`` and HTTP probes judge health by status code only and would
    otherwise treat a model-less service as healthy.

    Raises:
        HTTPException: 503 when the model has not been loaded.
    """
    model_loaded = llm_model.model is not None
    payload = {
        "status": "healthy" if model_loaded else "unhealthy",
        "model_loaded": model_loaded,
        "device": str(llm_model.device),
    }
    if not model_loaded:
        raise HTTPException(status_code=503, detail=payload)
    return payload
# Generation runs on worker threads (see generate_text). This lock keeps the
# one-request-at-a-time behaviour that the blocking handler used to have, so
# concurrent requests do not compete for GPU memory.
_generation_lock = threading.Lock()


def _generate_serialized(**generation_kwargs):
    """Call llm_model.generate() while holding the generation lock."""
    with _generation_lock:
        return llm_model.generate(**generation_kwargs)


@app.post("/generate", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
    """Generate text for the given request.

    Inference is blocking, so it is executed in the thread pool; running it
    directly in this coroutine would stall the event loop and make /health
    unresponsive for the duration of every generation.

    Raises:
        HTTPException: 503 if no model is loaded, 500 if generation fails.
    """
    if llm_model.model is None:
        raise HTTPException(status_code=503, detail="Model is not loaded")

    try:
        generated_text, tokens_used = await run_in_threadpool(
            _generate_serialized,
            prompt=request.prompt,
            max_length=request.max_length,
            temperature=request.temperature,
            top_k=request.top_k,
            top_p=request.top_p,
            do_sample=True,
        )
    except Exception as e:
        # logger.exception records the traceback in addition to the message.
        logger.exception(f"Generation error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    return GenerateResponse(
        generated_text=generated_text,
        tokens_used=tokens_used,
    )
@app.get("/info")
async def model_info():
    """Return the configured model path, the device and the model class name.

    ``model_type`` is None while no model is loaded.
    """
    loaded_model = llm_model.model
    if loaded_model:
        model_type = type(loaded_model).__name__
    else:
        model_type = None

    info = {
        "model_path": os.getenv("MODEL_PATH", "./models/default"),
        "device": str(llm_model.device),
        "model_type": model_type,
    }
    return info
Docker Compose配置
单服务配置
# docker-compose.yml
# Single-service setup: builds the API image and runs it with one NVIDIA GPU.
version: '3.8'

services:
  llm-api:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "8000:8000"
    volumes:
      # Host directories mounted over the image's /app/models and /app/configs
      - ./models:/app/models
      - ./configs:/app/configs
    environment:
      MODEL_PATH: /app/models/your-model
      CUDA_VISIBLE_DEVICES: "0"
      PYTHONUNBUFFERED: "1"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    restart: unless-stopped
完整的生产配置
# docker-compose.prod.yml
# Production stack: LLM API behind nginx, plus Redis, Prometheus and Grafana.
version: '3.8'

services:
  # LLM API service
  llm-api:
    build:
      context: .
      dockerfile: Dockerfile
    # Not published on the host: with replicas > 1 a fixed host port can be
    # bound only once. nginx reaches the replicas over llm-network.
    expose:
      - "8000"
    volumes:
      - model-storage:/app/models
    environment:
      - MODEL_PATH=/app/models/production-model
      - LOG_LEVEL=INFO
      - MAX_WORKERS=4
    deploy:
      replicas: 2
      resources:
        limits:
          cpus: '4'
          memory: 16G
        # GPU devices are declared under reservations; `devices` is not a
        # valid key under limits. Each replica reserves one GPU.
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Give the model time to load before failures count
      start_period: 40s
    restart: always

  # Nginx reverse proxy
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      # Configuration and certificates are mounted read-only
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    depends_on:
      - llm-api
    networks:
      - llm-network
    restart: always

  # Redis cache
  redis:
    image: redis:7-alpine
    # Bound to loopback only: this Redis has no authentication configured,
    # so it must not be reachable from outside the host. Other services use
    # redis:6379 over llm-network.
    ports:
      - "127.0.0.1:6379:6379"
    volumes:
      - redis-data:/data
    command: redis-server --appendonly yes
    networks:
      - llm-network
    restart: always

  # Prometheus monitoring
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    networks:
      - llm-network
    restart: always

  # Grafana dashboards
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      # Taken from the environment / .env file; startup fails with this
      # message if it is unset, instead of falling back to a known password.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}
    volumes:
      - grafana-data:/var/lib/grafana
    networks:
      - llm-network
    restart: always

volumes:
  model-storage:
  redis-data:
  grafana-data:

networks:
  llm-network:
    driver: bridge
Kubernetes部署
部署配置
# k8s-deployment.yaml
# Deployment of the LLM API: one GPU per pod, model files from a PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-deployment
  labels:
    app: llm-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-service
  template:
    metadata:
      labels:
        app: llm-service
    spec:
      containers:
        - name: llm-api
          image: your-registry/llm-service:latest
          ports:
            - containerPort: 8000
          resources:
            requests:
              memory: "8Gi"
              cpu: "2"
              nvidia.com/gpu: 1
            limits:
              memory: "16Gi"
              cpu: "4"
              nvidia.com/gpu: 1
          env:
            - name: MODEL_PATH
              value: "/app/models/production-model"
            - name: CUDA_VISIBLE_DEVICES
              value: "0"
          volumeMounts:
            - name: model-storage
              mountPath: /app/models
          # The service only answers /health after the model has been loaded
          # at startup. The startup probe allows up to 30 x 10s = 300s for
          # that; liveness and readiness checks begin only once it succeeds,
          # so a slow model load no longer gets the pod restarted.
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            failureThreshold: 30
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            # The default probe timeout is 1s; allow more for a busy server
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 5
            timeoutSeconds: 5
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
---
# Service: exposes the pods labelled app=llm-service through a load balancer,
# forwarding port 80 to container port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  type: LoadBalancer
  selector:
    app: llm-service
  ports:
    - port: 80
      targetPort: 8000
---
# HorizontalPodAutoscaler: scales llm-deployment between 2 and 10 replicas
# on average CPU (70%) and memory (80%) utilization.
# NOTE(review): memory use of a model server may be dominated by the loaded
# weights rather than by load - confirm the memory metric is a useful signal.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-hpa
spec:
  minReplicas: 2
  maxReplicas: 10
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-deployment
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
模型管理
模型下载脚本
# scripts/download_model.py
import os
from huggingface_hub import snapshot_download
def download_model(model_id: str, output_dir: str, ignore_patterns=("*.md",)):
    """Download a model repository from the Hugging Face Hub to a local directory.

    Args:
        model_id: Repository id on the Hub, e.g. ``"gpt2"``.
        output_dir: Target directory; created if it does not exist.
        ignore_patterns: Glob patterns of files to skip. Only Markdown files
            are skipped by default. ``*.txt`` is deliberately not excluded,
            because tokenizer files such as ``merges.txt`` and ``vocab.txt``
            match it and the downloaded model would be missing them.
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"Downloading model: {model_id}")
    snapshot_download(
        repo_id=model_id,
        local_dir=output_dir,
        ignore_patterns=list(ignore_patterns),
    )
    print(f"Model downloaded to: {output_dir}")
if __name__ == "__main__":
    import sys

    # Usage: download_model.py [model_id] [output_dir]
    cli_args = sys.argv[1:]
    model_id = cli_args[0] if cli_args else "gpt2"
    output_dir = cli_args[1] if len(cli_args) > 1 else "./models/default"
    download_model(model_id, output_dir)
性能优化
推理优化(TorchScript)
# scripts/optimize_gpu.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def optimize_for_inference(model_path: str, output_path: str):
    """Export the model at ``model_path`` as an optimized TorchScript module.

    ``torch.jit.optimize_for_inference`` and ``torch.jit.save`` operate on
    TorchScript modules, not on a plain ``nn.Module``, so the model is first
    traced with an example input and the traced module is then optimized
    and saved.

    The saved file contains the traced forward pass only. It is loaded with
    ``torch.jit.load`` and does not provide ``generate()``.

    Args:
        model_path: Directory or model id to load with ``from_pretrained``.
        output_path: Destination file for the TorchScript module.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Half precision only on the GPU; the CPU path stays in float32.
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    # Load the model. torchscript=True prepares the model for tracing;
    # use_cache=False keeps the traced output to plain tensors.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=dtype,
        torchscript=True,
        use_cache=False,
    ).to(device)
    model.eval()

    # Tracing records the operations executed for one example input.
    example_ids = tokenizer("Hello, world", return_tensors="pt")["input_ids"].to(device)

    # Trace, then optimize the resulting TorchScript module
    with torch.no_grad():
        traced = torch.jit.trace(model, example_ids)
        optimized = torch.jit.optimize_for_inference(traced)

    # Save the optimized module
    torch.jit.save(optimized, output_path)
    print(f"Optimized model saved to: {output_path}")
if __name__ == "__main__":
    import sys

    # Both arguments are required; exit with a usage message instead of an
    # IndexError traceback when they are missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <model_path> <output_path>")

    model_path = sys.argv[1]
    output_path = sys.argv[2]
    optimize_for_inference(model_path, output_path)
监控与日志
Docker日志配置
# Logging configuration: json-file driver with log rotation
# (at most 3 files of 10 MB each) and a tag built from image name,
# container name and container ID.
logging:
  driver: json-file
  options:
    max-size: "10m"
    max-file: "3"
    tag: "{{.ImageName}}/{{.Name}}/{{.ID}}"
健康检查脚本
#!/bin/bash
# scripts/health_check.sh
#
# Poll the service health endpoint until it answers successfully or the
# retry budget is used up. Exits 0 when the service is healthy, 1 otherwise.
# HEALTH_URL, MAX_RETRIES and RETRY_INTERVAL can be overridden through the
# environment; the defaults are the values below.

HEALTH_URL="${HEALTH_URL:-http://localhost:8000/health}"
MAX_RETRIES="${MAX_RETRIES:-30}"
RETRY_INTERVAL="${RETRY_INTERVAL:-2}"

for i in $(seq 1 "$MAX_RETRIES"); do
    # --max-time bounds each attempt so that a connection which is accepted
    # but never answered cannot stall the loop indefinitely.
    if curl -sf --max-time 5 "$HEALTH_URL" > /dev/null 2>&1; then
        echo "Service is healthy"
        exit 0
    fi
    echo "Waiting for service... ($i/$MAX_RETRIES)"
    sleep "$RETRY_INTERVAL"
done

echo "Service failed to start"
exit 1
Docker容器化技术为LLM部署提供了可靠、可扩展的解决方案,通过合理的镜像构建、编排配置和监控设置,可以确保大语言模型服务在生产环境中稳定高效地运行。