📊

Prometheus高级监控

📂 devops ⏱ 2 min 301 words

Prometheus高级监控

高级配置

服务发现

scrape_configs:
  # File-based service discovery: targets are read from the JSON files that
  # match the glob, and the files are re-read every 5 minutes.
  - job_name: 'file-sd'
    file_sd_configs:
      - files:
          - '/etc/prometheus/targets/*.json'
        refresh_interval: 5m

  # Kubernetes service discovery: every pod is a candidate target.
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods annotated with prometheus.io/scrape: "true".
      # Quoted so YAML treats the regex as a string, not a boolean.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: 'true'
      # Rewrite the scrape address to <host>:<annotated port>.
      # The source label values are joined with ';', so the regex captures
      # the host part of __address__ (dropping any discovered port) as $1
      # and the port from the prometheus.io/port annotation as $2.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '$1:$2'

目标重标签（relabel_configs）

relabel_configs:
  # Copy the discovered Kubernetes namespace into a `namespace` label.
  - action: replace
    source_labels:
      - __meta_kubernetes_namespace
    target_label: namespace

  # Copy the value of the pod's `app` label into an `app` label.
  - action: replace
    source_labels:
      - __meta_kubernetes_pod_label_app
    regex: (.*)
    target_label: app

  # Drop every label whose name matches the regex.
  - action: labeldrop
    regex: __meta_kubernetes_pod_label_.*

告警规则

高级告警

groups:
  - name: advanced-alerts
    rules:
      # Predictive alert: fit a linear trend over the last 6h of free space
      # on "/" and fire when the value predicted 24h ahead is below zero.
      # The expression is a literal block scalar so that every line,
      # including closing parentheses, is unambiguously part of the value.
      - alert: DiskSpaceWillFill
        expr: |
          predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 24*3600) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间将在24小时内耗尽"

      # Error-ratio alert: share of 5xx responses per service over 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
            /
            sum(rate(http_requests_total[5m])) by (service)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.service }} 错误率超过5%"

      # Trend alert: current p99 latency compared with the p99 one hour ago.
      - alert: RequestLatencyIncreasing
        expr: |
          histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
          > histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m] offset 1h))
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "P99请求延迟持续高于1小时前的水平"

自定义指标

应用指标

# Python示例
from prometheus_client import Counter, Histogram, Gauge
import time

# Metric definitions

# Counter labelled by HTTP method, endpoint and response status.
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)

# Histogram of request latency labelled by HTTP method and endpoint.
REQUEST_LATENCY = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request latency',
    labelnames=['method', 'endpoint'],
)

# Unlabelled gauge for the number of active connections.
ACTIVE_CONNECTIONS = Gauge(
    name='active_connections',
    documentation='Number of active connections',
)

# Using the metrics
# REQUEST_LATENCY is declared with the labels ('method', 'endpoint'), so the
# label values must be bound before .time() can be used as a decorator.
@REQUEST_LATENCY.labels('GET', '/api').time()
def handle_request():
    """Handle one simulated GET /api request and record its metrics.

    The decorator observes the call duration in REQUEST_LATENCY.
    ACTIVE_CONNECTIONS is raised while the request is being handled, and
    REQUEST_COUNT is incremented once handling has finished.
    """
    # track_inprogress() increments on entry and decrements on exit, so the
    # gauge is restored even when the handling code raises.
    with ACTIVE_CONNECTIONS.track_inprogress():
        # Handle the request (simulated work).
        time.sleep(0.1)
    # Count the request only after it completed, since the status is '200'.
    REQUEST_COUNT.labels('GET', '/api', '200').inc()

中间件指标

// Go example
//
// Collectors used by PrometheusMiddleware.
// NOTE(review): registering these collectors with a Prometheus registry is
// not shown in this snippet; confirm it is done where the server is set up.
var (
    // httpRequestsTotal counts requests by method, path and status code.
    httpRequestsTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total HTTP requests",
        },
        []string{"method", "path", "status"},
    )

    // requestDuration observes request latency in seconds by method and path.
    // PrometheusMiddleware calls it with exactly these two label values.
    requestDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request latency",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "path"},
    )
)

// statusCapturingWriter wraps an http.ResponseWriter and remembers the status
// code written by the downstream handler.
// NOTE(review): wrapping hides optional interfaces of the underlying writer
// (e.g. http.Flusher); confirm no downstream handler relies on them.
type statusCapturingWriter struct {
    http.ResponseWriter
    status int
}

// WriteHeader records the status code before delegating to the wrapped writer.
func (w *statusCapturingWriter) WriteHeader(code int) {
    w.status = code
    w.ResponseWriter.WriteHeader(code)
}

// PrometheusMiddleware wraps next and records, for every request, one
// increment of httpRequestsTotal (method, path, status) and one observation
// of requestDuration (method, path) with the elapsed time in seconds.
//
// NOTE(review): r.URL.Path is used verbatim as a label value; paths that embed
// IDs create one time series per distinct path. Consider a route template.
func PrometheusMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()

        // Wrap the writer here instead of type-asserting the incoming one:
        // the writer supplied by the server is not a status recorder, so an
        // assertion on it would panic. The status defaults to 200 because
        // net/http sends 200 when the handler never calls WriteHeader.
        rec := &statusCapturingWriter{ResponseWriter: w, status: http.StatusOK}
        next.ServeHTTP(rec, r)
        duration := time.Since(start)

        httpRequestsTotal.WithLabelValues(
            r.Method,
            r.URL.Path,
            strconv.Itoa(rec.status),
        ).Inc()

        requestDuration.WithLabelValues(
            r.Method,
            r.URL.Path,
        ).Observe(duration.Seconds())
    })
}

Grafana仪表板

自定义仪表板

{
  "dashboard": {
    "title": "Application Dashboard",
    "panels": [
      {
        "title": "Request Rate",
        "type": "timeseries",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (method)",
            "legendFormat": "{{method}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent"
          }
        }
      }
    ]
  }
}

实践：完整监控系统

# docker-compose.yml
# Monitoring stack: Prometheus, Grafana, Alertmanager and Pushgateway.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus
    volumes:
      # Configuration and rule files are only read by the container, so they
      # are mounted read-only.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./rules:/etc/prometheus/rules:ro
      # Named volume for the TSDB so metric data is kept in an explicit,
      # inspectable volume, like Grafana's data below.
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana
    volumes:
      - grafana_data:/var/lib/grafana
      # NOTE(review): Grafana loads dashboard files from a directory only
      # when a dashboard provisioning provider points at it; no provisioning
      # config is mounted here, so confirm one is supplied.
      - ./dashboards:/var/lib/grafana/dashboards
    ports:
      - "3000:3000"

  alertmanager:
    image: prom/alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    ports:
      - "9093:9093"

  pushgateway:
    image: prom/pushgateway
    ports:
      - "9091:9091"

volumes:
  grafana_data:
  prometheus_data:

总结

Prometheus高级监控包括服务发现、自定义指标、预测性告警等功能。通过这些高级特性，可以构建更完善的监控系统。