← 返回首页
🧠

LLM蓝绿部署

📂 llm ⏱ 3 min 503 words

---
title: "LLM蓝绿部署"
description: "LLM模型的蓝绿部署策略,实现零停机的模型版本切换和快速回滚能力"
tags: ["蓝绿部署", "零停机", "版本切换"]
category: "llm"
icon: "🧠"
---

LLM蓝绿部署

概述

蓝绿部署(Blue-Green Deployment)维护两套完全相同的生产环境,分别称为"蓝色"和"绿色"。同一时刻只有一个环境对外提供服务,通过切换流量实现零停机部署。对LLM项目而言,这种策略提供了极快的回滚能力和完整的环境隔离。

架构设计

用户流量 → 负载均衡器
              ↓
    ┌─────────────────┐
    │   蓝色环境       │ ← 当前活跃
    │  (旧模型v1.0)   │
    └─────────────────┘
              ↑
    ┌─────────────────┐
    │   绿色环境       │ ← 预备/部署中
    │  (新模型v2.0)   │
    └─────────────────┘
              ↓
        测试流量 → 验证通过后切换

实现方案

环境管理

# blue_green/environment.py
from enum import Enum
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import json

class EnvironmentColor(Enum):
    """Names the two deployment environments.

    The string values are what gets written to, and parsed back from, the
    JSON state file.
    """
    BLUE = "blue"
    GREEN = "green"

@dataclass
class EnvironmentConfig:
    """Record describing one environment and the model deployed to it."""
    color: EnvironmentColor  # which of the two environments this record describes
    model_name: str
    model_version: str
    endpoint: str  # address the environment is reached at (stored as-is, never parsed here)
    replicas: int  # used as the replica count of the environment's Deployment
    resources: dict  # free-form resource description, e.g. {"gpu": 1, "memory": "16Gi"}
    status: str  # one of "active", "standby", "deploying", "testing"

class BlueGreenManager:
    """Track which environment is live and persist that state as JSON.

    State is read from ``config_path`` on construction and written back by
    ``_save_config``. A missing file is not an error: the manager then starts
    with no environments recorded and blue marked as active.
    """

    def __init__(self, config_path: str):
        """Initialise the manager and load any state stored at ``config_path``.

        Args:
            config_path: Path of the JSON state file.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
            KeyError, ValueError, TypeError: If the JSON does not have the
                expected keys, colors, or environment fields.
        """
        self.config_path = config_path
        self.environments = {
            EnvironmentColor.BLUE: None,
            EnvironmentColor.GREEN: None,
        }
        self.active_color = EnvironmentColor.BLUE
        self._load_config()

    def _load_config(self):
        """Populate ``active_color`` and ``environments`` from the state file.

        Does nothing when the file does not exist.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(self.config_path, encoding="utf-8") as f:
                config = json.load(f)
        except FileNotFoundError:
            return

        self.active_color = EnvironmentColor(config["active_color"])
        for color_str, env_data in config["environments"].items():
            color = EnvironmentColor(color_str)
            # The file stores ``color`` as a plain string. Replace it with the
            # enum member (taken from the dictionary key the record is filed
            # under) so that ``_save_config`` can call ``env.color.value``.
            fields = {**env_data, "color": color}
            self.environments[color] = EnvironmentConfig(**fields)

    def _save_config(self):
        """Write the active color and every recorded environment to disk."""
        config = {
            "active_color": self.active_color.value,
            "environments": {},
        }
        for color, env in self.environments.items():
            if env is None:
                continue
            config["environments"][color.value] = {
                "color": env.color.value,
                "model_name": env.model_name,
                "model_version": env.model_version,
                "endpoint": env.endpoint,
                "replicas": env.replicas,
                "resources": env.resources,
                "status": env.status,
            }

        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be pinned to match what _load_config reads.
        with open(self.config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2, ensure_ascii=False)

    def get_active_endpoint(self) -> str:
        """Return the endpoint of the currently active environment.

        Raises:
            ValueError: If no environment is recorded for the active color.
        """
        env = self.environments[self.active_color]
        if env is None:
            raise ValueError("No active environment")
        return env.endpoint

    def get_standby_color(self) -> EnvironmentColor:
        """Return the color that is not currently active."""
        if self.active_color == EnvironmentColor.BLUE:
            return EnvironmentColor.GREEN
        return EnvironmentColor.BLUE

部署流程

# blue_green/deployer.py
from kubernetes import client, config
import time

class BlueGreenDeployer:
    """Deploy model images to the standby environment and flip production traffic.

    Each environment is a Kubernetes Deployment named ``llm-<color>`` whose
    pods carry the label ``environment=<color>``. Production traffic goes
    through the ``llm-production`` Service; switching traffic means pointing
    that Service's selector at the other color.
    """

    def __init__(self, manager: BlueGreenManager, namespace: str = "llm-inference"):
        """Create API clients using the in-cluster configuration.

        Args:
            manager: State tracker that records environments and the active color.
            namespace: Namespace holding the Deployments and the Service.
        """
        self.manager = manager
        self.namespace = namespace
        config.load_incluster_config()
        self.k8s = client.AppsV1Api()
        # Services belong to the core API group; AppsV1Api has no
        # *_namespaced_service methods.
        self.core = client.CoreV1Api()

    def deploy_to_standby(self, model_version: str, image: str):
        """Roll ``image`` out to the standby environment and record it.

        Args:
            model_version: Version label stored in the environment record.
            image: Container image to run.

        Raises:
            kubernetes.client.exceptions.ApiException: If the cluster rejects
                the Deployment.
        """
        standby_color = self.manager.get_standby_color()

        env = EnvironmentConfig(
            color=standby_color,
            model_name="llm-model",
            model_version=model_version,
            endpoint=f"http://llm-{standby_color.value}.svc:8000",
            replicas=2,
            resources={"gpu": 1, "memory": "16Gi"},
            status="deploying"
        )

        # Record the environment only after the cluster accepted the Deployment.
        self._create_deployment(standby_color, image, env)
        self.manager.environments[standby_color] = env
        self.manager._save_config()

        print(f"Deployed v{model_version} to {standby_color.value} environment")

    def _create_deployment(self, color: EnvironmentColor, image: str, env: EnvironmentConfig):
        """Create the ``llm-<color>`` Deployment, or patch it if it already exists."""
        name = f"llm-{color.value}"
        pod_labels = {"environment": color.value}

        deployment = client.V1Deployment(
            metadata=client.V1ObjectMeta(name=name),
            spec=client.V1DeploymentSpec(
                replicas=env.replicas,
                # The Python client takes snake_case keyword arguments.
                selector=client.V1LabelSelector(match_labels=pod_labels),
                template=client.V1PodTemplateSpec(
                    metadata=client.V1ObjectMeta(labels=pod_labels),
                    spec=client.V1PodSpec(
                        containers=[
                            client.V1Container(
                                name="llm",
                                image=image,
                                ports=[client.V1ContainerPort(container_port=8000)],
                                resources=client.V1ResourceRequirements(
                                    # Honour the GPU count recorded for the environment.
                                    limits={"nvidia.com/gpu": str(env.resources.get("gpu", 1))}
                                )
                            )
                        ]
                    )
                )
            )
        )

        try:
            self.k8s.read_namespaced_deployment(name, self.namespace)
        except client.exceptions.ApiException as exc:
            # Only "not found" means the Deployment must be created; any other
            # API failure (auth, server error, ...) is propagated.
            if exc.status != 404:
                raise
            self.k8s.create_namespaced_deployment(self.namespace, deployment)
        else:
            # Patch outside the try so a failed patch is not mistaken for a
            # missing Deployment.
            self.k8s.patch_namespaced_deployment(name, self.namespace, deployment)

    def switch_traffic(self):
        """Point production traffic at the standby environment and swap roles.

        Raises:
            ValueError: If nothing has been deployed to the standby environment.
            kubernetes.client.exceptions.ApiException: If the Service update fails.
        """
        old_color = self.manager.active_color
        new_color = self.manager.get_standby_color()

        new_env = self.manager.environments[new_color]
        if new_env is None:
            # Switching now would route production to a selector with no pods.
            raise ValueError(
                f"No {new_color.value} environment deployed; refusing to switch traffic"
            )

        # Update the load balancer first; state is only changed if that succeeds.
        self._update_loadbalancer(new_color)

        old_env = self.manager.environments[old_color]
        if old_env is not None:
            old_env.status = "standby"
        new_env.status = "active"

        self.manager.active_color = new_color
        self.manager._save_config()

        print(f"Traffic switched: {old_color.value} → {new_color.value}")

    def _update_loadbalancer(self, target_color: EnvironmentColor):
        """Make the ``llm-production`` Service select pods of ``target_color``."""
        service = client.V1Service(
            metadata=client.V1ObjectMeta(name="llm-production"),
            spec=client.V1ServiceSpec(
                selector={"environment": target_color.value},
                ports=[client.V1ServicePort(port=8000)]
            )
        )
        try:
            self.core.patch_namespaced_service("llm-production", self.namespace, service)
        except client.exceptions.ApiException as exc:
            # Create the Service only when it does not exist yet.
            if exc.status != 404:
                raise
            self.core.create_namespaced_service(self.namespace, service)

自动化蓝绿切换

# blue_green/automated_switch.py
class BlueGreenAutomation:
    """Run the deploy -> smoke test -> switch -> health check sequence."""

    def __init__(self, deployer: BlueGreenDeployer,
                 smoke_test_func, settle_seconds: float = 10):
        """Store the collaborators used by the deployment sequence.

        Args:
            deployer: Performs the actual deployment and traffic switch.
            smoke_test_func: Callable invoked with the standby color's string
                value; a falsy result aborts the deployment.
            settle_seconds: Pause between switching traffic and running the
                post-switch health check.
        """
        self.deployer = deployer
        self.smoke_test = smoke_test_func
        self.settle_seconds = settle_seconds

    def run_full_deployment(self, model_version: str, image: str):
        """Deploy to standby, verify it, and switch traffic to it.

        Args:
            model_version: Version label passed to the deployer.
            image: Container image passed to the deployer.

        Returns:
            True when every step succeeded. False when the smoke test failed
            (traffic is left untouched) or when the post-switch health check
            failed (traffic stays on the new environment; call ``rollback``
            to revert).
        """
        print("1. Deploying to standby environment...")
        self.deployer.deploy_to_standby(model_version, image)

        print("2. Waiting for pods to be ready...")
        self._wait_for_ready()

        print("3. Running smoke tests...")
        # Resolved before the switch, so this is the freshly deployed environment.
        candidate = self.deployer.manager.get_standby_color().value
        if not self.smoke_test(candidate):
            print("❌ Smoke tests failed, aborting deployment")
            return False

        print("4. Switching traffic...")
        self.deployer.switch_traffic()

        print("5. Running post-switch health check...")
        # Give the new environment time to take traffic before judging it.
        time.sleep(self.settle_seconds)
        if not self._post_switch_health_check():
            print("⚠️ Health check failed, consider rollback")
            return False

        print("✅ Deployment completed successfully!")
        return True

    def rollback(self):
        """Switch traffic back to the other environment.

        This simply flips to whichever color is currently standby, so it only
        reverts a deployment when called after a switch has taken place.
        """
        print("Rolling back to previous version...")
        self.deployer.switch_traffic()
        print("✅ Rollback completed")

    def _wait_for_ready(self, timeout: int = 300):
        """Hook for waiting until all pods of the new environment are ready.

        Placeholder: currently returns immediately and ignores ``timeout``,
        so the smoke test may run before the pods are up.
        """
        pass

    def _post_switch_health_check(self) -> bool:
        """Hook for the health check performed after traffic is switched.

        Placeholder: currently always reports healthy.
        """
        return True

蓝绿部署 vs 金丝雀发布

| 特性 | 蓝绿部署 | 金丝雀发布 |
| --- | --- | --- |
| 流量切换 | 一次性全量切换 | 渐进式切换 |
| 回滚速度 | 立即(切换流量) | 需要逐步缩小流量 |
| 资源开销 | 需要双倍资源 | 额外资源较少 |
| 风险控制 | 切换前充分测试 | 实时监控风险 |
| 适用场景 | 关键版本发布 | 常规迭代更新 |

最佳实践

  1. 切换前验证:务必在备用环境完成充分的冒烟测试
  2. 监控切换:切换后密切监控新环境的各项指标
  3. 保留旧环境:切换后保留旧环境至少30分钟,以便快速回滚
  4. 自动化优先:尽可能自动化整个流程,减少人为错误