Docker监控与健康检查
容器监控工具
Docker内置命令
# Stream live resource usage (CPU, memory, network and block I/O) of running containers
docker stats
# List the processes running inside the container
docker top container_name
# Print the container's low-level configuration and state as JSON
docker inspect container_name
# List files and directories added, changed or deleted in the container's filesystem
docker diff container_name
cAdvisor
# docker-compose.yml
# Runs cAdvisor and publishes its web UI / metrics endpoint on host port 8080.
# NOTE: current Docker Compose releases treat the top-level `version` key as
# obsolete and ignore it; it is kept for older docker-compose binaries.
version: '3.8'
services:
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    ports:
      - "8080:8080"
    # Read-only host mounts that cAdvisor reads to collect container metrics.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    # Elevated privileges plus access to the kernel message device.
    privileged: true
    devices:
      - /dev/kmsg
健康检查
Dockerfile中定义
# Base image: Alpine-based nginx.
FROM nginx:alpine
# Probe every 30s with a 3s timeout; allow a 5s start-up grace period and mark
# the container unhealthy after 3 consecutive failures.
# `curl -f` fails on HTTP error responses; `|| exit 1` maps any failure to
# exit code 1, which Docker interprets as "unhealthy".
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD curl -f http://localhost/ || exit 1
docker-compose中定义
services:
  web:
    image: nginx
    # Probe the container's own HTTP endpoint. `curl -f` exits non-zero on an
    # HTTP error status, which counts as a failed check.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost"]
      interval: 30s
      timeout: 3s
      retries: 3
      start_period: 5s
健康检查参数
| 参数 | 说明 |
|---|---|
| interval | 两次检查之间的间隔 |
| timeout | 单次检查的超时时间,超时视为本次检查失败 |
| retries | 连续失败多少次后将容器标记为 unhealthy |
| start_period | 容器启动的宽限期,期间的检查失败不计入 retries |
查看健康状态
# Show the container's health state as JSON (status, failing streak, recent probe log)
docker inspect --format='{{json .State.Health}}' container_name
# Compact view: a table of container names and status; the status text includes the health state when a health check is defined
docker ps --format "table {{.Names}}\t{{.Status}}"
Prometheus监控
# Prometheus + Grafana + node-exporter stack.
# NOTE: current Docker Compose releases treat the top-level `version` key as
# obsolete and ignore it; it is kept for older docker-compose binaries.
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    # Supplies the scrape configuration from the project directory.
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    # Expose the host's /proc and /sys read-only inside the container ...
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
    # ... and point the exporter at those mount points instead of the
    # container's own /proc and /sys.
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
Prometheus配置
# prometheus.yml
# Targets are addressed by Compose service name, so Prometheus must share a
# network with the `cadvisor` and `node-exporter` services.
scrape_configs:
  # Container metrics exposed by cAdvisor.
  - job_name: 'docker'
    static_configs:
      - targets: ['cadvisor:8080']
  # Host metrics exposed by node-exporter.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
资源监控
CPU监控
# Show CPU usage percentage per container
docker stats --format "table {{.Name}}\t{{.CPUPerc}}"
# Limit the container to at most 1.5 CPUs
docker run --cpus=1.5 myapp
内存监控
# Show memory usage and limit per container
docker stats --format "table {{.Name}}\t{{.MemUsage}}"
# Limit the container to 512 MB of memory
docker run --memory=512m myapp
网络监控
# Show network I/O (received / sent) per container
docker stats --format "table {{.Name}}\t{{.NetIO}}"
# Print the container's network settings as JSON
docker inspect --format='{{json .NetworkSettings}}' container_name
实践:监控系统搭建
# Complete monitoring stack: Prometheus, Grafana, cAdvisor, node-exporter and
# Alertmanager.
# NOTE: current Docker Compose releases treat the top-level `version` key as
# obsolete and ignore it; it is kept for older docker-compose binaries.
version: '3.8'
services:
  prometheus:
    image: prom/prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"
  grafana:
    image: grafana/grafana
    environment:
      # Overridable from the shell or an .env file; falls back to the original
      # value "admin" when unset. Set a real password before exposing Grafana.
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
    ports:
      - "3000:3000"
    # Named volume so dashboards and settings survive container re-creation.
    volumes:
      - grafana_data:/var/lib/grafana
  cadvisor:
    image: gcr.io/cadvisor/cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
  # Serves the `node-exporter:9100` target scraped by the 'node' job in
  # prometheus.yml; without this service that target is permanently down.
  node-exporter:
    image: prom/node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
  alertmanager:
    image: prom/alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
volumes:
  grafana_data: {}
告警配置
# alertmanager.yml
global:
  resolve_timeout: 5m
# Single route: alerts are grouped by alert name and all go to one receiver.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
  # Must match the `receiver` name used in the route above.
  - name: 'web.hook'
    webhook_configs:
      # NOTE(review): no `webhook` service is defined in the Compose examples;
      # point this at a real receiver endpoint.
      - url: 'http://webhook:5001/'
总结
Docker监控是确保容器化应用稳定运行的重要手段。通过健康检查、Prometheus监控和告警配置,可以及时发现和解决问题。