🔧

高可用架构：系统可靠性设计

📂 devops ⏱ 3 min 599 words

高可用架构：系统可靠性设计

高可用核心概念

高可用设计原则:
  ├── 冗余设计: 消除单点故障
  ├── 故障检测: 快速发现故障
  ├── 自动切换: 快速恢复服务
  ├── 数据一致性: 保证数据可靠
  └── 容错设计: 容忍部分故障

可用性等级

# Availability tiers ("number of nines"). Each tier lists the availability
# target, the downtime it permits per year, and a typical use case.
# Downtime budget = (1 - availability) x 365 days.
availability_levels:
  two_nines:
    availability: "99%"
    downtime_per_year: "3.65天"      # 1% of 365 days = 3.65 days
    use_case: "内部系统"
    
  three_nines:
    availability: "99.9%"
    downtime_per_year: "8.76小时"    # 0.1% of 8760 hours = 8.76 hours
    use_case: "一般业务系统"
    
  four_nines:
    availability: "99.99%"
    downtime_per_year: "52.6分钟"    # 0.01% of 525600 minutes ~= 52.6 minutes
    use_case: "核心业务系统"
    
  five_nines:
    availability: "99.999%"
    downtime_per_year: "5.26分钟"    # 0.001% of 525600 minutes ~= 5.26 minutes
    use_case: "金融、医疗关键系统"

冗余设计

数据库高可用

# PostgreSQL high-availability configuration.
# One primary plus two standbys, with Patroni and PgBouncer enabled.
# NOTE(review): this is not a native PostgreSQL/Patroni/PgBouncer file format;
# presumably it is consumed by a deployment tool or template — confirm.
postgresql_ha:
  # Primary node.
  primary:
    host: "db-primary"
    port: 5432
    
  # Standby nodes: one marked synchronous, one asynchronous.
  standbys:
    - host: "db-standby-1"
      port: 5432
      sync_mode: "synchronous"
      
    - host: "db-standby-2"
      port: 5432
      sync_mode: "asynchronous"
      
  # Patroni, pointed at a Consul endpoint (presumably its coordination store —
  # confirm against the consumer of this file).
  patroni:
    enabled: true
    consul_url: "http://consul:8500"
    
  # PgBouncer connection pooling.
  pgbouncer:
    enabled: true
    max_connections: 200
    # In PgBouncer, "transaction" pooling returns the server connection to the
    # pool after each transaction.
    pool_mode: transaction

Redis高可用

# Redis Sentinel configuration.
# Three Sentinel processes monitor one master and two replicas.
redis_sentinel:
  # Sentinel endpoints, all on port 26379.
  sentinels:
    - host: "sentinel-1"
      port: 26379
    - host: "sentinel-2"
      port: 26379
    - host: "sentinel-3"
      port: 26379
      
  # Monitored master group. In Redis Sentinel, quorum is the number of
  # Sentinels that must agree the master is unreachable; 2 of the 3 listed
  # above is a majority.
  master:
    name: "mymaster"
    quorum: 2
    
  # Replica instances on port 6379.
  slaves:
    - host: "redis-slave-1"
      port: 6379
    - host: "redis-slave-2"
      port: 6379

应用服务高可用

# Kubernetes high-availability Deployment for the API server.
# Runs 3 replicas, rolls updates without dropping capacity, and prefers to
# spread pods across different nodes.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-server
spec:
  replicas: 3
  # apps/v1 requires an explicit selector, and it must match the pod template
  # labels below; without it the API server rejects the manifest.
  selector:
    matchLabels:
      app: api-server
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # Start one extra pod first and never remove a ready pod before its
      # replacement is available, so capacity never drops below `replicas`.
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      # This label is what the Deployment selector, the anti-affinity rule
      # below and the api-pdb PodDisruptionBudget all match on.
      labels:
        app: api-server
    spec:
      affinity:
        podAntiAffinity:
          # Soft rule: prefer not to co-locate two api-server pods on the same
          # node (topologyKey kubernetes.io/hostname), but still schedule if
          # no other node is available.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - api-server
                topologyKey: kubernetes.io/hostname
      containers:
        - name: api
          # NOTE(review): a mutable ":latest" tag makes rollouts and rollbacks
          # non-reproducible; consider pinning a version or digest.
          image: api-server:latest
          # Readiness gates traffic: the pod receives requests only while
          # GET /health on port 8080 succeeds.
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 5
          # Liveness restarts the container when GET /health keeps failing.
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10

负载均衡

Nginx负载均衡

# nginx.conf
# Upstream pool for the backend service.
#
# Each backend is declared exactly once and exactly one balancing method is
# active. Declaring the same servers several times skews the distribution, and
# enabling both ip_hash and least_conn makes nginx report
# "load balancing method redefined" with only one of them taking effect.
upstream backend {
    # Active strategy: weighted round-robin with passive health checking.
    #   weight=N        relative share of requests
    #   max_fails=3     failed attempts within fail_timeout before the server
    #                   is considered unavailable
    #   fail_timeout=30s  both the counting window and how long the server is
    #                   then skipped
    server backend1.example.com weight=3 max_fails=3 fail_timeout=30s;
    server backend2.example.com weight=2 max_fails=3 fail_timeout=30s;
    server backend3.example.com weight=1 max_fails=3 fail_timeout=30s;

    # Alternative: plain round-robin (the nginx default).
    # Remove the weight= parameters above.

    # Alternative: session persistence keyed on the client IP.
    # Uncomment this line and leave least_conn commented out.
    # ip_hash;

    # Alternative: send each request to the server with the fewest active
    # connections. Uncomment this line and leave ip_hash commented out.
    # least_conn;
}

# Virtual server on port 80 that proxies every request to the "backend"
# upstream group.
server {
    listen 80;
    
    location / {
        proxy_pass http://backend;
        # Timeouts for connecting to, reading from and sending to the
        # upstream server.
        proxy_connect_timeout 5s;
        proxy_read_timeout 60s;
        proxy_send_timeout 60s;
        
        # Request failover rather than an active health check: on a connection
        # error, a timeout, or a 502/503 response, pass the request to the
        # next upstream server, limited to 3 tries.
        proxy_next_upstream error timeout http_502 http_503;
        proxy_next_upstream_tries 3;
    }
}

HAProxy配置

# haproxy.cfg
# HTTP load balancer in front of three application servers, with active
# health checks and a statistics page.

# Process-wide settings.
# NOTE(review): no "log" target is configured here, so "option httplog" below
# has nowhere to send its output — confirm logging is set up elsewhere.
global
    maxconn 50000
    
# Defaults inherited by the frontend, backend and listen sections below.
defaults
    mode http
    timeout connect 5s
    timeout client 60s
    timeout server 60s
    option httplog
    option dontlognull

# Entry point: accept HTTP on port 80 on all addresses and hand every request
# to the "servers" backend.
frontend http-in
    bind *:80
    default_backend servers

# Application pool: round-robin balancing, health-checked with GET /health.
backend servers
    balance roundrobin
    option httpchk GET /health
    
    # check inter 5s fall 3 rise 2: probe every 5 seconds, mark the server
    # down after 3 consecutive failed checks and up again after 2 consecutive
    # successful ones.
    server server1 10.0.0.1:8080 check inter 5s fall 3 rise 2
    server server2 10.0.0.2:8080 check inter 5s fall 3 rise 2
    server server3 10.0.0.3:8080 check inter 5s fall 3 rise 2

# Statistics page served at /stats on port 8404.
# NOTE(review): the credential below is hard-coded and trivially guessable,
# and the page is bound on all interfaces — replace the password and restrict
# access before using this outside a demo.
listen stats
    bind *:8404
    stats enable
    stats uri /stats
    stats auth admin:password

故障检测

健康检查脚本

#!/bin/bash
# health-check.sh

# Health endpoints probed by this script, in the order they are checked.
SERVICES=()
SERVICES+=("http://api:8080/health")
SERVICES+=("http://web:80/health")
# NOTE(review): 5432 is the default PostgreSQL port, which does not speak
# HTTP; confirm an HTTP health endpoint really listens there, otherwise this
# entry always fails.
SERVICES+=("http://db:5432/health")

#######################################
# Probe a single HTTP health endpoint.
# Arguments: $1 - URL to request
# Outputs:   "OK: <url>" or "FAIL: <url> (HTTP <code>)" on stdout
# Returns:   0 when the endpoint answers HTTP 200, 1 otherwise
#######################################
check_service() {
    local url=$1
    local status

    # Declared separately from the assignment so `local` does not mask curl's
    # exit status. The URL is quoted so it reaches curl as a single argument.
    # If curl fails without printing a code (e.g. curl is not installed), fall
    # back to 000, the value curl itself reports when no response was received.
    status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url") \
        || status=${status:-000}

    if [[ "$status" == "200" ]]; then
        echo "OK: $url"
        return 0
    else
        echo "FAIL: $url (HTTP $status)"
        return 1
    fi
}

# Probe every configured endpoint, count the failures, and exit non-zero if
# any endpoint is unhealthy.
FAILED=0
for service in "${SERVICES[@]}"; do
    # Quoted so each URL is passed to check_service as a single argument.
    if ! check_service "$service"; then
        # Plain arithmetic assignment instead of ((FAILED++)): the post-increment
        # form returns a non-zero status when the old value is 0, which would
        # abort the script if `set -e` were ever enabled.
        FAILED=$((FAILED + 1))
    fi
done

if (( FAILED > 0 )); then
    echo "健康检查失败: $FAILED 个服务异常"
    exit 1
fi

echo "所有服务健康"
exit 0

Prometheus监控

# prometheus-rules.yaml
# Alerting rules for availability monitoring.
groups:
  - name: high-availability
    rules:
      # Fires when a scrape target has reported up == 0 for 1 minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务宕机"
          description: "{{ $labels.instance }} 已经宕机超过1分钟"

      # Fires when more than 1% of requests returned a 5xx status over the
      # last 5 minutes, sustained for 5 minutes.
      # Both sides are aggregated per job before dividing. Without the
      # aggregation, binary-operator label matching pairs each 5xx series only
      # with itself, so the ratio is always 1 and the alert fires whenever any
      # 5xx traffic exists.
      - alert: HighErrorRate
        expr: |
          sum by (job) (rate(http_requests_total{code=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "错误率过高"
          description: "错误率超过1%，当前值 {{ $value | humanizePercentage }}"

自动故障转移

Kubernetes故障转移

# pod-disruption-budget.yaml
# Limits voluntary disruptions (e.g. node drains) for pods labelled
# app: api-server: at least 2 such pods must stay available at all times.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: api-pdb
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: api-server

---
# Horizontal autoscaling for the api-server Deployment.
# Keeps between 3 and 10 replicas, scaling on an average CPU utilization
# target of 70%.
# NOTE(review): Utilization targets are computed against the containers' CPU
# requests; the api-server Deployment shown in this document declares none —
# confirm requests are set, otherwise this metric cannot be evaluated.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api-server
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

故障转移脚本

#!/bin/bash
# failover.sh

# Hostnames of the current primary and of the standby to promote.
# Either can be overridden from the environment; the defaults are the original
# hard-coded values.
PRIMARY_DB="${PRIMARY_DB:-db-primary}"
STANDBY_DB="${STANDBY_DB:-db-standby-1}"

#######################################
# Check whether the primary database accepts connections.
# Globals:  PRIMARY_DB (read)
# Returns:  the exit status of pg_isready (0 when the server accepts
#           connections)
#######################################
check_primary() {
    # Quoted so the host name is always passed as a single argument.
    pg_isready -h "$PRIMARY_DB" -p 5432 -U postgres
}

#######################################
# Promote the standby database to primary.
# Globals:  STANDBY_DB (read)
# Outputs:  progress and result messages on stdout
# Returns:  0 on success; exits the script with status 1 when the promotion
#           command fails or the promoted node does not accept connections
#######################################
failover() {
    echo "执行数据库故障转移..."

    # 1. Stop the old primary (simulated failure).
    # NOTE(review): the old primary is not fenced here; if it is still running
    # it can keep accepting writes — confirm fencing is handled elsewhere.
    # ssh "$PRIMARY_DB" "systemctl stop postgresql"

    # 2. Promote the standby. The exit status is checked explicitly: the
    # standby may already accept connections before promotion, so the
    # readiness probe below cannot detect a failed promote on its own.
    if ! ssh "$STANDBY_DB" "pg_ctl promote -D /var/lib/postgresql/data"; then
        echo "故障转移失败"
        exit 1
    fi

    # 3. Update the application configuration so the connection string points
    #    at the new primary (not implemented in this script).

    # 4. Verify that the promoted node accepts connections.
    if pg_isready -h "$STANDBY_DB" -p 5432 -U postgres; then
        echo "故障转移成功"
    else
        echo "故障转移失败"
        exit 1
    fi
}

# Main monitoring loop.
# Probes the primary every CHECK_INTERVAL seconds and triggers failover only
# after MAX_FAILURES consecutive failed probes, so that a single transient
# network error does not cause an irreversible promotion. Both values can be
# overridden from the environment.
MAX_FAILURES="${MAX_FAILURES:-3}"
CHECK_INTERVAL="${CHECK_INTERVAL:-5}"
consecutive_failures=0

while true; do
    if check_primary; then
        # Any successful probe resets the counter.
        consecutive_failures=0
    else
        consecutive_failures=$((consecutive_failures + 1))
        echo "主数据库检查失败 ($consecutive_failures/$MAX_FAILURES)"
        if (( consecutive_failures >= MAX_FAILURES )); then
            echo "检测到主数据库故障"
            failover
            break
        fi
    fi
    sleep "$CHECK_INTERVAL"
done

数据一致性

分布式一致性

# etcd cluster configuration: a three-member cluster (node1..node3).
# NOTE(review): peer URLs use plain http on port 2380 — confirm TLS is not
# required in the target environment.
etcd_cluster:
  # Initial membership as name=peer-URL pairs, bootstrapping a new cluster.
  initial-cluster: "node1=http://node1:2380,node2=http://node2:2380,node3=http://node3:2380"
  initial-cluster-state: new
  initial-cluster-token: etcd-cluster
  
  # Raft consensus tuning.
  # heartbeat-interval and election-timeout are presumably in milliseconds,
  # as in etcd's own configuration flags — confirm for your version.
  snapshot-count: 10000
  heartbeat-interval: 100
  election-timeout: 1000

最佳实践

消除单点故障: 所有组件都应该有冗余
自动化运维: 故障检测和恢复自动化
监控告警: 完善的监控和告警体系
定期演练: 定期进行故障转移演练
容量规划: 预留足够的冗余容量
文档更新: 保持高可用架构文档更新