← 返回首页
📊

Docker监控与健康检查

📂 devops ⏱ 2 min 249 words

Docker监控与健康检查

容器监控工具

Docker内置命令

# View container resource usage
docker stats

# View the processes running in the named container
docker top container_name

# View the detailed information of the named container
docker inspect container_name

# View what has changed in the named container's filesystem
docker diff container_name

cAdvisor

# docker-compose.yml
# Runs cAdvisor and publishes its web UI / metrics endpoint on host port 8080.
# The top-level `version` key is intentionally omitted: the Compose
# Specification marks it obsolete and current Docker Compose only warns on it.

services:
  cadvisor:
    # Pinned release instead of `latest` so that redeploys are reproducible.
    # NOTE(review): check the cAdvisor releases page for the current version.
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    ports:
      - "8080:8080"
    volumes:
      # Read-only host mounts that cAdvisor reads its data from.
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      # Part of the upstream cAdvisor run command; it was missing here.
      - /dev/disk/:/dev/disk:ro
    # privileged mode and /dev/kmsg follow the upstream run command.
    privileged: true
    devices:
      - /dev/kmsg

健康检查

Dockerfile中定义

FROM nginx:alpine

# Probe the local web server with curl; a failing request makes the check
# exit with status 1.
# Timing: 5s start period, one probe every 30s, 3s timeout, 3 retries.
HEALTHCHECK --start-period=5s --interval=30s --timeout=3s --retries=3 \
    CMD curl -f http://localhost/ || exit 1

docker-compose中定义

# Health check for the `web` service, declared in the Compose file.
services:
  web:
    image: nginx
    healthcheck:
      start_period: 5s
      interval: 30s
      timeout: 3s
      retries: 3
      # Exec form ("CMD"): runs curl with -f against http://localhost.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost"

健康检查参数

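| 参数 | 说明 |
| --- | --- |
| interval | 检查间隔(两次健康检查之间的时间) |
| timeout | 超时时间(单次检查超过该时间即视为失败) |
| retries | 重试次数(连续失败达到该次数后,容器被标记为 unhealthy) |
| start_period | 启动等待时间(容器启动后的宽限期,期间的失败不计入重试次数) |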

查看健康状态

# View the container's health state (prints .State.Health as JSON)
docker inspect --format='{{json .State.Health}}' container_name

# Compact view: a table of container names and their status
docker ps --format "table {{.Names}}\t{{.Status}}"

Prometheus监控

# Prometheus + Grafana together with both exporters that the mounted
# prometheus.yml scrapes (`cadvisor:8080` and `node-exporter:9100`).
# The scrape targets are Compose service names, so every exporter has to be
# defined in this same project for Prometheus to resolve it.
# The top-level `version` key is intentionally omitted: the Compose
# Specification marks it obsolete and current Docker Compose only warns on it.

services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"

  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    volumes:
      # Host /proc and /sys, mounted read-only under /host.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
    command:
      # Point the exporter at the mounted host paths.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'

  # Added: prometheus.yml scrapes `cadvisor:8080`, but no such service was
  # defined in this stack, so that target could never be resolved.
  cadvisor:
    # NOTE(review): check the cAdvisor releases page for the current version.
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    # privileged mode and /dev/kmsg follow the upstream cAdvisor run command.
    privileged: true
    devices:
      - /dev/kmsg

Prometheus配置

# prometheus.yml
# All targets below are Compose service names, so Prometheus has to run in
# the same Compose project (network) as the services it refers to.

# Send alerts to the `alertmanager` service of the monitoring stack. Without
# this section Prometheus has nowhere to deliver alerts, so a deployed
# Alertmanager would never receive anything.
# NOTE(review): alert rules (`rule_files`) must still be added before any
# alert can fire.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  # Metrics served by the cadvisor service.
  - job_name: 'docker'
    static_configs:
      - targets: ['cadvisor:8080']

  # Metrics served by the node-exporter service.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

资源监控

CPU监控

# View CPU usage: a table of container names and CPU percentage
docker stats --format "table {{.Name}}\t{{.CPUPerc}}"

# Limit CPU for the container (here --cpus=1.5)
docker run --cpus=1.5 myapp

内存监控

# View memory usage: a table of container names and memory usage
docker stats --format "table {{.Name}}\t{{.MemUsage}}"

# Limit memory for the container (here --memory=512m)
docker run --memory=512m myapp

网络监控

# View network I/O: a table of container names and network I/O
docker stats --format "table {{.Name}}\t{{.NetIO}}"

# View the container's network details (prints .NetworkSettings as JSON)
docker inspect --format='{{json .NetworkSettings}}' container_name

实践:监控系统搭建

# Complete monitoring stack: Prometheus, Grafana, cAdvisor, node-exporter and
# Alertmanager in one Compose project, so that the service names used in
# prometheus.yml (`cadvisor:8080`, `node-exporter:9100`) resolve.
# The top-level `version` key is intentionally omitted: the Compose
# Specification marks it obsolete and current Docker Compose only warns on it.

services:
  prometheus:
    image: prom/prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana
    environment:
      # Taken from GRAFANA_ADMIN_PASSWORD (shell environment or .env file);
      # falls back to the previous hard-coded value `admin` when unset.
      # Set a real password before exposing port 3000.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
    ports:
      - "3000:3000"
    volumes:
      # Named volume declared at the bottom of this file.
      - grafana_data:/var/lib/grafana

  cadvisor:
    # NOTE(review): check the cAdvisor releases page for the current version.
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    # Same privileged / kmsg settings as the standalone cAdvisor example
    # (they follow the upstream cAdvisor run command).
    privileged: true
    devices:
      - /dev/kmsg

  # Added: prometheus.yml scrapes `node-exporter:9100`, but this stack did
  # not define the service, so that job's target could never be resolved.
  node-exporter:
    image: prom/node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'

  alertmanager:
    image: prom/alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml

volumes:
  grafana_data:

告警配置

# alertmanager.yml
# One route that hands every alert to a single webhook receiver.
global:
  resolve_timeout: 5m

receivers:
  - name: "web.hook"
    webhook_configs:
      - url: "http://webhook:5001/"

route:
  # Must match the `name` of an entry under `receivers`.
  receiver: "web.hook"
  group_by:
    - "alertname"
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h

总结

Docker监控是确保容器化应用稳定运行的重要手段。通过健康检查、Prometheus监控和告警配置,可以及时发现和解决问题。