LLM蓝绿部署
---
title: "LLM蓝绿部署"
description: "LLM模型的蓝绿部署策略,实现零停机的模型版本切换和快速回滚能力"
tags: ["蓝绿部署", "零停机", "版本切换"]
category: "llm"
icon: "🧠"
---
LLM蓝绿部署
概述
蓝绿部署(Blue-Green Deployment)维护两套完全相同的生产环境,分别称为"蓝色"和"绿色"。同一时刻只有一个环境对外提供服务,通过切换流量实现零停机部署。对LLM项目而言,这种策略提供了极快的回滚能力和完整的环境隔离。
架构设计
用户流量 → 负载均衡器
               ↓
      ┌─────────────────┐
      │    蓝色环境     │ ← 当前活跃
      │  (旧模型v1.0)   │
      └─────────────────┘
               ↑
      ┌─────────────────┐
      │    绿色环境     │ ← 预备/部署中
      │  (新模型v2.0)   │
      └─────────────────┘
               ↓
      测试流量 → 验证通过后切换
实现方案
环境管理
# blue_green/environment.py
from enum import Enum
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import json
class EnvironmentColor(Enum):
    """Identifier of one of the two deployment environments.

    Each member's value is the lowercase color name, so a member can be
    looked up from its string form, e.g. ``EnvironmentColor("blue")``.
    """

    BLUE = "blue"
    GREEN = "green"
@dataclass
class EnvironmentConfig:
    """Settings and current state of a single deployment environment."""

    # Which of the two environments this record describes.
    color: EnvironmentColor
    # Name and version of the model served by this environment.
    model_name: str
    model_version: str
    # URL at which this environment is reached.
    endpoint: str
    # Number of replicas requested for this environment.
    replicas: int
    # Free-form resource description (keys and units are not validated here).
    resources: dict
    # Lifecycle state: "active", "standby", "deploying" or "testing".
    status: str
class BlueGreenManager:
    """Track both environments and which one is active, persisted as JSON.

    The file at ``config_path`` has the shape::

        {
          "active_color": "blue" | "green",
          "environments": {"<color>": {<EnvironmentConfig fields>}, ...}
        }
    """

    def __init__(self, config_path: str):
        """Initialise with defaults, then overlay any state found on disk.

        Args:
            config_path: Path of the JSON state file. It does not have to
                exist yet; it is created on the first ``_save_config`` call.
        """
        self.config_path = config_path
        self.environments = {
            EnvironmentColor.BLUE: None,
            EnvironmentColor.GREEN: None,
        }
        self.active_color = EnvironmentColor.BLUE
        self._load_config()

    def _load_config(self):
        """Load persisted state from ``config_path`` if the file exists.

        A missing file is not an error: the defaults set in ``__init__``
        (blue active, no environments) are kept.

        Raises:
            KeyError: If the file lacks ``active_color`` or ``environments``.
            ValueError: If a stored color is not a valid ``EnvironmentColor``.
        """
        try:
            # UTF-8 is explicit because _save_config writes non-ASCII text
            # verbatim (ensure_ascii=False); the locale default may differ.
            with open(self.config_path, encoding="utf-8") as f:
                config = json.load(f)
        except FileNotFoundError:
            # First run: nothing persisted yet, keep the defaults.
            return

        self.active_color = EnvironmentColor(config["active_color"])
        for color_str, env_data in config["environments"].items():
            color = EnvironmentColor(color_str)
            fields = dict(env_data)
            # The file stores the color as a plain string. Convert it back to
            # the enum, otherwise a later _save_config fails on `.color.value`.
            fields["color"] = EnvironmentColor(fields.get("color", color_str))
            self.environments[color] = EnvironmentConfig(**fields)

    def _save_config(self):
        """Write the active color and every configured environment to disk."""
        config = {
            "active_color": self.active_color.value,
            "environments": {},
        }
        for color, env in self.environments.items():
            if env is None:
                # Environments that were never deployed are not persisted.
                continue
            config["environments"][color.value] = {
                "color": env.color.value,
                "model_name": env.model_name,
                "model_version": env.model_version,
                "endpoint": env.endpoint,
                "replicas": env.replicas,
                "resources": env.resources,
                "status": env.status,
            }
        with open(self.config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2, ensure_ascii=False)

    def get_active_endpoint(self) -> str:
        """Return the endpoint of the currently active environment.

        Raises:
            ValueError: If no environment is configured for the active color.
        """
        env = self.environments[self.active_color]
        if env is None:
            raise ValueError("No active environment")
        return env.endpoint

    def get_standby_color(self) -> EnvironmentColor:
        """Return the color that is NOT currently active."""
        if self.active_color == EnvironmentColor.BLUE:
            return EnvironmentColor.GREEN
        return EnvironmentColor.BLUE
部署流程
# blue_green/deployer.py
from kubernetes import client, config
import time
class BlueGreenDeployer:
    """Deploy a model version to the standby environment and switch traffic.

    Deployments are named ``llm-<color>`` and labelled
    ``environment=<color>``. Production traffic is directed by rewriting the
    selector of the ``llm-production`` Service. Everything lives in the
    ``llm-inference`` namespace.
    """

    _NAMESPACE = "llm-inference"
    _PRODUCTION_SERVICE = "llm-production"

    def __init__(self, manager: BlueGreenManager):
        """Create API clients using the in-cluster Kubernetes configuration.

        Args:
            manager: State holder that records environments and active color.
        """
        self.manager = manager
        config.load_incluster_config()
        self.k8s = client.AppsV1Api()
        # Services belong to the core API group; AppsV1Api has no
        # *_namespaced_service methods.
        self.core_api = client.CoreV1Api()

    def deploy_to_standby(self, model_version: str, image: str):
        """Create or update the standby Deployment and record it.

        Args:
            model_version: Version label stored in the environment record.
            image: Container image to run in the standby Deployment.
        """
        standby_color = self.manager.get_standby_color()
        env = EnvironmentConfig(
            color=standby_color,
            model_name="llm-model",
            model_version=model_version,
            endpoint=f"http://llm-{standby_color.value}.svc:8000",
            replicas=2,
            resources={"gpu": 1, "memory": "16Gi"},
            status="deploying",
        )
        self._create_deployment(standby_color, image, env)
        # Persist only after the API call succeeded, so the saved state never
        # describes a Deployment that was not submitted.
        self.manager.environments[standby_color] = env
        self.manager._save_config()
        print(f"Deployed v{model_version} to {standby_color.value} environment")

    def _create_deployment(self, color: EnvironmentColor, image: str, env: EnvironmentConfig):
        """Patch the ``llm-<color>`` Deployment, creating it if it is absent.

        Raises:
            kubernetes.client.exceptions.ApiException: For any API failure
                other than "Deployment not found" on the initial read.
        """
        name = f"llm-{color.value}"
        labels = {"environment": color.value}
        deployment = client.V1Deployment(
            metadata=client.V1ObjectMeta(name=name),
            spec=client.V1DeploymentSpec(
                replicas=env.replicas,
                # The Python client uses snake_case keyword arguments;
                # `matchLabels` is rejected by V1LabelSelector.
                selector=client.V1LabelSelector(match_labels=labels),
                template=client.V1PodTemplateSpec(
                    metadata=client.V1ObjectMeta(labels=labels),
                    spec=client.V1PodSpec(
                        containers=[
                            client.V1Container(
                                name="llm",
                                image=image,
                                ports=[client.V1ContainerPort(container_port=8000)],
                                resources=client.V1ResourceRequirements(
                                    limits={"nvidia.com/gpu": "1"}
                                ),
                            )
                        ]
                    ),
                ),
            ),
        )
        try:
            self.k8s.read_namespaced_deployment(name, self._NAMESPACE)
        except client.exceptions.ApiException as exc:
            # Only "not found" means we should create; auth or server errors
            # must surface instead of being masked by a create attempt.
            if exc.status != 404:
                raise
            self.k8s.create_namespaced_deployment(self._NAMESPACE, deployment)
        else:
            self.k8s.patch_namespaced_deployment(name, self._NAMESPACE, deployment)

    def switch_traffic(self):
        """Point production traffic at the standby color and swap the roles.

        The Service is updated first; the recorded state changes only if that
        call succeeds.
        """
        old_color = self.manager.active_color
        new_color = self.manager.get_standby_color()
        # NOTE(review): nothing here checks that the standby environment was
        # ever deployed; switching to an undeployed color would select no pods.
        self._update_loadbalancer(new_color)
        # Update the recorded state to mirror the new routing.
        if self.manager.environments[old_color]:
            self.manager.environments[old_color].status = "standby"
        if self.manager.environments[new_color]:
            self.manager.environments[new_color].status = "active"
        self.manager.active_color = new_color
        self.manager._save_config()
        print(f"Traffic switched: {old_color.value} → {new_color.value}")

    def _update_loadbalancer(self, target_color: EnvironmentColor):
        """Repoint the production Service selector at ``target_color``.

        Raises:
            kubernetes.client.exceptions.ApiException: For any API failure
                other than "Service not found" on the patch.
        """
        service = client.V1Service(
            metadata=client.V1ObjectMeta(name=self._PRODUCTION_SERVICE),
            spec=client.V1ServiceSpec(
                selector={"environment": target_color.value},
                ports=[client.V1ServicePort(port=8000)],
            ),
        )
        try:
            self.core_api.patch_namespaced_service(
                self._PRODUCTION_SERVICE, self._NAMESPACE, service
            )
        except client.exceptions.ApiException as exc:
            # Create the Service only when it does not exist yet.
            if exc.status != 404:
                raise
            self.core_api.create_namespaced_service(self._NAMESPACE, service)
自动化蓝绿切换
# blue_green/automated_switch.py
class BlueGreenAutomation:
    """Run the end-to-end flow: deploy, wait, smoke test, switch, verify."""

    def __init__(self, deployer: BlueGreenDeployer,
                 smoke_test_func):
        """Store the deployer and the smoke-test callable.

        Args:
            deployer: Performs the deployment and the traffic switch.
            smoke_test_func: Called with the standby color name (a string);
                a truthy result lets the deployment proceed.
        """
        self.deployer = deployer
        self.smoke_test = smoke_test_func

    def run_full_deployment(self, model_version: str, image: str):
        """Deploy to standby, validate it, then switch traffic to it.

        Returns:
            True when every step succeeds. False when the smoke tests fail
            (traffic is not switched) or when the post-switch health check
            fails (traffic has already been switched and is NOT rolled back).
        """
        print("1. Deploying to standby environment...")
        self.deployer.deploy_to_standby(model_version, image)

        print("2. Waiting for pods to be ready...")
        self._wait_for_ready()

        print("3. Running smoke tests...")
        standby_name = self.deployer.manager.get_standby_color().value
        smoke_passed = self.smoke_test(standby_name)
        if not smoke_passed:
            print("❌ Smoke tests failed, aborting deployment")
            return False

        print("4. Switching traffic...")
        self.deployer.switch_traffic()

        print("5. Running post-switch health check...")
        # Fixed pause before probing the newly active environment.
        time.sleep(10)
        healthy = self._post_switch_health_check()
        if healthy:
            print("✅ Deployment completed successfully!")
            return True

        print("⚠️ Health check failed, consider rollback")
        return False

    def rollback(self):
        """Switch traffic back by swapping active and standby once more."""
        print("Rolling back to previous version...")
        self.deployer.switch_traffic()
        print("✅ Rollback completed")

    def _wait_for_ready(self, timeout: int = 300):
        """Placeholder for waiting until the new environment's pods are ready.

        Currently does nothing; ``timeout`` is unused.
        """
        return None

    def _post_switch_health_check(self) -> bool:
        """Placeholder for the post-switch health check; always returns True."""
        return True
蓝绿部署 vs 金丝雀发布
| 特性 | 蓝绿部署 | 金丝雀发布 |
|---|---|---|
| 流量切换 | 一次性全量切换 | 渐进式切换 |
| 回滚速度 | 立即(切换流量) | 需要逐步缩小流量 |
| 资源开销 | 需要双倍资源 | 额外资源较少 |
| 风险控制 | 切换前充分测试 | 实时监控风险 |
| 适用场景 | 关键版本发布 | 常规迭代更新 |
最佳实践
- 切换前验证:务必在备用环境完成充分的冒烟测试
- 监控切换:切换后密切监控新环境的各项指标
- 保留旧环境:切换后保留旧环境至少30分钟,以便快速回滚
- 自动化优先:尽可能自动化整个流程,减少人为错误