← 返回首页
🔧

OpenTelemetry:统一的可观测性标准

📂 devops ⏱ 3 min 434 words

OpenTelemetry:统一的可观测性标准

什么是OpenTelemetry

OpenTelemetry(简称OTel)是一个开源的可观测性框架,由OpenTracing和OpenCensus合并而来。它提供了统一的API、SDK和工具集,用于采集Traces(追踪)、Metrics(指标)和Logs(日志)。

核心概念

三大支柱

可观测性三大支柱:
  ├── Traces: 分布式追踪,记录请求链路
  ├── Metrics: 指标数据,量化系统状态
  └── Logs: 日志数据,记录事件详情

关键组件

OpenTelemetry组件:
  ├── API: 定义接口规范
  ├── SDK: API的具体实现
  ├── Collector: 接收、处理和导出遥测数据
  └── Exporters: 将数据发送到后端系统

安装OpenTelemetry

Collector安装

# Run the OpenTelemetry Collector (contrib distribution) with Docker.
#
# - The config file is bind-mounted read-only via an absolute host path.
#   A bare "otel-config.yaml" source (no "./" or absolute path) would be
#   treated by Docker as a *named volume*, not as the file in the current
#   directory, and the collector would never see the config.
# - --config is passed explicitly because the mount target is not the
#   contrib image's default config location.
# - Published ports: 4317 (OTLP gRPC), 4318 (OTLP HTTP),
#   8888 (collector self-metrics), 8889 (Prometheus exporter endpoint).
docker run -d \
  --name otel-collector \
  -p 4317:4317 \
  -p 4318:4318 \
  -p 8888:8888 \
  -p 8889:8889 \
  -v "$(pwd)/otel-config.yaml:/etc/otelcol/config.yaml:ro" \
  otel/opentelemetry-collector-contrib:latest \
  --config=/etc/otelcol/config.yaml

Collector配置

# otel-config.yaml
# Receivers: the entry points through which telemetry enters the collector.
receivers:
  # OTLP receiver, listening on all interfaces for both transports.
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
  
  # Prometheus receiver: scrapes localhost:8888 every 10 seconds.
  # NOTE(review): 8888 is presumably the collector's own internal metrics
  # endpoint (self-monitoring) - confirm against service.telemetry settings.
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 10s
          static_configs:
            - targets: ['localhost:8888']

# Processors: transformations applied between receivers and exporters.
# A processor only takes effect when it is listed in a pipeline under
# service.pipelines.
processors:
  # Buffers telemetry and flushes a batch after 10s or 1024 items.
  batch:
    timeout: 10s
    send_batch_size: 1024

  # Protects the collector from running out of memory.
  # check_interval is mandatory: it must be greater than zero or the
  # collector rejects the configuration at startup.
  memory_limiter:
    check_interval: 1s
    limit_mib: 200
    spike_limit_mib: 50

  # Inserts or overwrites the attribute environment=production.
  attributes:
    actions:
      - key: environment
        value: production
        action: upsert

# Exporters: destinations the collector sends telemetry to.
exporters:
  # OTLP/gRPC export to the Jaeger backend; TLS is disabled (plaintext).
  otlp:
    endpoint: jaeger:4317
    tls:
      insecure: true

  # Exposes metrics for Prometheus to scrape on port 8889; every metric
  # name is prefixed with "otel_".
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: "otel"

  # Prints telemetry to the collector's own log output. This replaces the
  # former "logging" exporter, which has been removed from current
  # collector releases and would fail to load with the :latest image.
  debug:
    verbosity: detailed

# Pipelines wire receivers -> processors -> exporters per signal type.
# memory_limiter runs first so load is shed before any other work is done;
# batch runs last so already-processed data is what gets batched.
service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, attributes, batch]
      exporters: [otlp, debug]

    metrics:
      receivers: [otlp, prometheus]
      processors: [memory_limiter, attributes, batch]
      exporters: [prometheus, debug]

    logs:
      receivers: [otlp]
      processors: [memory_limiter, attributes, batch]
      exporters: [debug]

应用集成

Go应用集成

package main

import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/sdk/resource"
    traceSDK "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)

func initTracer() (*traceSDK.TracerProvider, error) {
    exporter, err := otlptracegrpc.New(
        context.Background(),
        otlptracegrpc.WithEndpoint("otel-collector:4317"),
        otlptracegrpc.WithInsecure(),
    )
    if err != nil {
        return nil, err
    }
    
    resource := resource.NewWithAttributes(
        semconv.SchemaURL,
        semconv.ServiceName("my-service"),
        semconv.ServiceVersion("1.0.0"),
        semconv.DeploymentEnvironment("production"),
    )
    
    tp := traceSDK.NewTracerProvider(
        traceSDK.WithBatcher(exporter),
        traceSDK.WithResource(resource),
        traceSDK.WithSampler(traceSDK.AlwaysSample()),
    )
    
    otel.SetTracerProvider(tp)
    return tp, nil
}

func main() {
    tp, _ := initTracer()
    defer tp.Shutdown(context.Background())
    
    tracer := otel.Tracer("my-service")
    ctx, span := tracer.Start(context.Background(), "main-operation")
    defer span.End()
    
    // 业务逻辑...
}

Python应用集成

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource

def init_tracer():
    """Build a tracer provider that exports over OTLP/gRPC and install it globally.

    The provider is tagged with the service name and version, and ships
    spans through a batch span processor to ``otel-collector:4317`` over an
    insecure (non-TLS) channel.

    Returns:
        The ``TracerProvider`` that was registered as the global provider.
    """
    provider = TracerProvider(
        resource=Resource.create({
            "service.name": "my-python-service",
            "service.version": "1.0.0",
        }),
    )

    otlp_exporter = OTLPSpanExporter(
        endpoint="otel-collector:4317",
        insecure=True,
    )
    provider.add_span_processor(BatchSpanProcessor(otlp_exporter))

    trace.set_tracer_provider(provider)
    return provider

# Module-level tracer, named after this module and obtained from the
# global trace API.
tracer = trace.get_tracer(__name__)

def process_request():
    """Handle one request inside a span named ``process-request``.

    The span is made current for the duration of the ``with`` block and is
    annotated with ``request.type = "http"``.
    """
    with tracer.start_as_current_span("process-request") as active_span:
        active_span.set_attribute("request.type", "http")
        # Business logic goes here.

Java应用集成

import io.opentelemetry.api.GlobalOpenTelemetry;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;

public class MyApp {
    private static final Tracer tracer = GlobalOpenTelemetry.getTracer("my-java-service");
    
    public void processRequest() {
        Span span = tracer.spanBuilder("process-request")
            .setAttribute("request.type", "http")
            .startSpan();
        
        try (Scope scope = span.makeCurrent()) {
            // 业务逻辑
            span.setStatus(StatusCode.OK);
        } catch (Exception e) {
            span.setStatus(StatusCode.ERROR, e.getMessage());
            span.recordException(e);
        } finally {
            span.end();
        }
    }
}

Metrics配置

# Metrics collection configuration
# This fragment only declares components; no service.pipelines section is
# shown here, so they still have to be referenced from a metrics pipeline.
receivers:
  # Host-level metrics, collected every 30 seconds by the scrapers below.
  hostmetrics:
    collection_interval: 30s
    scrapers:
      cpu:
      memory:
      disk:
      filesystem:
      network:
      process:

  # Container metrics read from the local Docker daemon socket every 30s.
  # NOTE(review): the collector presumably needs access to
  # /var/run/docker.sock (e.g. mounted into its container) - confirm.
  docker_stats:
    endpoint: unix:///var/run/docker.sock
    collection_interval: 30s

exporters:
  # Prometheus scrape endpoint on port 8889.
  prometheus:
    endpoint: "0.0.0.0:8889"
    # NOTE(review): presumably converts resource attributes into metric
    # labels - confirm against the Prometheus exporter documentation.
    resource_to_telemetry_conversion:
      enabled: true

Kubernetes部署

# otel-daemonset.yaml
# Deploys the collector as a DaemonSet named otel-collector.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-collector
spec:
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      containers:
        - name: collector
          image: otel/opentelemetry-collector-contrib:latest
          # Point the collector explicitly at the config file mounted below.
          args: ["--config=/etc/otelcol/config.yaml"]
          ports:
            - containerPort: 4317
              name: otlp
            - containerPort: 4318
              name: otlp-http
            - containerPort: 8889
              name: metrics
          volumeMounts:
            # Mounts only the config.yaml key of the ConfigMap as a single file.
            # NOTE(review): subPath mounts reportedly do not pick up ConfigMap
            # updates without a pod restart - confirm before relying on reloads.
            - name: config
              mountPath: /etc/otelcol/config.yaml
              subPath: config.yaml
      volumes:
        # The otel-collector-config ConfigMap is not defined in this snippet;
        # it must exist and contain a config.yaml key.
        - name: config
          configMap:
            name: otel-collector-config

查询和分析

# Query an OTel metric with PromQL through the Prometheus HTTP API.
# The duration metric is a histogram, so only its _bucket, _sum and _count
# series exist; querying the bare metric name would return no data.
curl -G 'http://prometheus:9090/api/v1/query' \
  --data-urlencode 'query=otel_http_server_duration_milliseconds_count'

# Find slow requests: 95th-percentile request duration (ms) over the last
# 5 minutes. A raw rate() over the _bucket series only yields per-bucket
# counts per second; histogram_quantile turns them into a latency value.
histogram_quantile(0.95, sum by (le) (rate(otel_http_server_duration_milliseconds_bucket[5m])))

# Request rate, grouped by method and status.
# NOTE(review): assumes the application emits a requests counter with
# "method" and "status" labels - verify the metric and label names.
sum(rate(otel_http_server_requests_total[5m])) by (method, status)

最佳实践

  1. 统一标准:使用OpenTelemetry替代专有代理
  2. 采样策略:根据流量和成本选择合适的采样率
  3. 资源标注:添加服务名、环境、版本等元数据
  4. 上下文传播:确保跨服务的追踪上下文正确传播
  5. 性能监控:监控Collector自身的资源使用情况