← 返回首页
🧠

OpenTelemetry与LLM

📂 llm ⏱ 3 min 401 words

---
title: "OpenTelemetry与LLM"
description: "介绍如何使用OpenTelemetry标准化LLM应用的可观测性,实现指标、日志和追踪的统一管理。"
tags: ["OpenTelemetry", "LLM", "可观测性"]
category: "llm"
icon: "🧠"
---

OpenTelemetry与LLM

OpenTelemetry简介

OpenTelemetry(简称OTel)是一个开源的可观测性框架,提供了统一的API、SDK和工具来收集和导出遥测数据。它支持三大信号:追踪(Traces)、指标(Metrics)和日志(Logs)。

对于LLM应用,OpenTelemetry的优势在于:用一套厂商中立的标准统一采集追踪、指标和日志,让模型调用的延迟、token用量和错误可以在同一条链路中关联分析。

安装与配置

基础安装

pip install opentelemetry-api opentelemetry-sdk
pip install opentelemetry-exporter-otlp
pip install opentelemetry-instrumentation-openai  # OpenAI auto-instrumentation

初始化追踪器

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

def init_telemetry(service_name: str):
    """Register a global tracer provider that exports spans over OTLP/gRPC.

    Builds a ``TracerProvider`` carrying the service's resource attributes,
    attaches a ``BatchSpanProcessor`` wrapping an ``OTLPSpanExporter``,
    installs the provider with ``trace.set_tracer_provider`` and returns a
    tracer.

    Args:
        service_name: Value of the ``service.name`` resource attribute; also
            the name passed to ``trace.get_tracer``.

    Returns:
        The tracer returned by ``trace.get_tracer(service_name)``.
    """
    resource_attributes = {
        "service.name": service_name,
        "service.version": "1.0.0",
        "deployment.environment": "production",
    }
    tracer_provider = TracerProvider(
        resource=Resource.create(resource_attributes)
    )

    # OTLP exporter pointed at localhost:4317 with ``insecure=True``.
    otlp_exporter = OTLPSpanExporter(
        endpoint="http://localhost:4317",
        insecure=True,
    )
    tracer_provider.add_span_processor(BatchSpanProcessor(otlp_exporter))

    trace.set_tracer_provider(tracer_provider)
    return trace.get_tracer(service_name)

LLM Semantic Conventions

OpenTelemetry为LLM应用定义了标准的Semantic Conventions:

from opentelemetry import trace

# Span attribute names used for LLM calls, each mapped to a short description.
# NOTE(review): the current OpenTelemetry GenAI semantic conventions appear to
# use the ``gen_ai.*`` namespace rather than ``llm.*`` -- confirm before
# relying on these keys for interoperability.
LLM_ATTRIBUTES = dict(
    (
        # Model identity
        ("llm.model", "模型名称"),
        # Request parameters
        ("llm.request.model", "请求的模型"),
        ("llm.request.max_tokens", "最大token数"),
        ("llm.request.temperature", "温度参数"),
        # Response metadata
        ("llm.response.model", "响应的模型"),
        # Token accounting
        ("llm.usage.input_tokens", "输入token数"),
        ("llm.usage.output_tokens", "输出token数"),
        ("llm.usage.total_tokens", "总token数"),
        # Why generation stopped
        ("llm.finish_reason", "完成原因"),
    )
)

自定义Span封装

为LLM调用创建标准化的追踪封装:

from contextlib import contextmanager
from typing import Optional, Dict, Any

class LLMTracer:
    """Wrap LLM calls in spans that carry request and response attributes.

    Use ``trace_completion`` as a context manager around the call, then pass
    the yielded span and the response to ``record_response``.
    """

    def __init__(self, tracer: "trace.Tracer"):
        """Store the tracer on which ``trace_completion`` starts its spans.

        Args:
            tracer: Tracer used to create the ``llm.chat.completion`` spans.
        """
        self.tracer = tracer

    @staticmethod
    def _drop_none(attributes: Dict[str, Any]) -> Dict[str, Any]:
        """Return *attributes* without the entries whose value is ``None``."""
        return {
            key: value for key, value in attributes.items() if value is not None
        }

    @contextmanager
    def trace_completion(self, model: str, messages: list, **kwargs):
        """Run the enclosed block inside an ``llm.chat.completion`` span.

        Args:
            model: Model name, recorded as ``llm.request.model``.
            messages: Chat messages; their ``str()`` form, truncated to 500
                characters, is recorded as ``llm.request.messages``.
            **kwargs: Optional request settings. ``max_tokens``,
                ``temperature`` and ``top_p`` are recorded when provided.

        Yields:
            The started span, so the caller can add response attributes.

        Raises:
            Exception: Any exception raised inside the block is recorded on
                the span, the span status is set to ERROR, and the exception
                is re-raised unchanged.
        """
        # ``None`` is not a valid span attribute value, so options the caller
        # did not supply are omitted instead of being sent as ``None``.
        request_attributes = self._drop_none({
            "llm.request.model": model,
            "llm.request.messages": str(messages)[:500],
            "llm.request.max_tokens": kwargs.get("max_tokens"),
            "llm.request.temperature": kwargs.get("temperature"),
            "llm.request.top_p": kwargs.get("top_p"),
        })

        # The error status and exception event are set explicitly below, so
        # the span's own automatic handling is switched off; otherwise the
        # same exception would be recorded twice.
        with self.tracer.start_as_current_span(
            "llm.chat.completion",
            attributes=request_attributes,
            record_exception=False,
            set_status_on_exception=False,
        ) as span:
            try:
                yield span
            except Exception as e:
                span.set_status(trace.StatusCode.ERROR, str(e))
                span.record_exception(e)
                raise
            else:
                span.set_status(trace.StatusCode.OK)

    def record_response(self, span: "trace.Span", response: dict):
        """Copy response metadata and token usage onto *span*.

        Args:
            span: Span to annotate (typically the one yielded by
                ``trace_completion``).
            response: Response as a dict. The keys ``id``, ``model``,
                ``choices`` and ``usage`` are read; any of them may be
                missing, ``None`` or (for ``choices``) empty.
        """
        # Fall back to empty containers so a missing, None or empty
        # ``choices``/``usage`` does not raise IndexError/AttributeError.
        choices = response.get("choices") or [{}]
        usage = response.get("usage") or {}

        span.set_attributes(self._drop_none({
            "llm.response.id": response.get("id"),
            "llm.response.model": response.get("model"),
            "llm.response.finish_reason": choices[0].get("finish_reason"),
            "llm.usage.input_tokens": usage.get("prompt_tokens"),
            "llm.usage.output_tokens": usage.get("completion_tokens"),
            "llm.usage.total_tokens": usage.get("total_tokens"),
        }))

自动插桩(Auto-Instrumentation)

OpenAI集成

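使用现成的instrumentation库自动追踪OpenAI调用(上文安装的 opentelemetry-instrumentation-openai 由社区项目 OpenLLMetry 维护;OpenTelemetry 官方提供的对应包是 opentelemetry-instrumentation-openai-v2):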

from opentelemetry.instrumentation.openai import OpenAIInstrumentor

# Auto-instrument OpenAI calls
OpenAIInstrumentor().instrument()

# OpenAI calls made after this point create spans automatically
import openai

client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello!"}]
)

自定义插桩

为自定义LLM服务添加追踪:

from opentelemetry.instrumentation.instrumentor import BaseInstrumentor

class CustomLLMInstrumentor(BaseInstrumentor):
    """Skeleton instrumentor for a custom LLM client library.

    ``_instrument`` and ``_uninstrument`` are placeholders: fill them in with
    the code that patches and restores the target module.
    """

    def instrumentation_dependencies(self):
        """Return the requirement specifiers of the instrumented library."""
        return ("custom-llm>=1.0.0",)

    def _instrument(self, **kwargs):
        """Inject tracing code into the target module (placeholder).

        Args:
            **kwargs: Options passed through from ``instrument()``.
        """
        pass

    def _uninstrument(self, **kwargs):
        """Remove the injected tracing code (placeholder).

        Args:
            **kwargs: Options passed through from ``uninstrument()``. They
                must be accepted here because the base class forwards its
                keyword arguments to this hook; without ``**kwargs`` a call
                such as ``uninstrument(option=...)`` raises ``TypeError``.
        """
        pass

指标收集

创建自定义指标

from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter

def setup_metrics(service_name: str):
    """Create, register and return the metrics pipeline for LLM requests.

    Builds a ``MeterProvider`` whose ``PeriodicExportingMetricReader`` sends
    to an ``OTLPMetricExporter`` with ``export_interval_millis=5000``,
    registers it as the global meter provider, and creates the request
    counter and latency histogram.

    Args:
        service_name: Name of the meter obtained from the provider.

    Returns:
        A ``(meter, request_counter, latency_histogram)`` tuple.
    """
    exporter = OTLPMetricExporter(endpoint="http://localhost:4317")
    reader = PeriodicExportingMetricReader(exporter, export_interval_millis=5000)
    provider = MeterProvider(metric_readers=[reader])

    # Register the provider globally, mirroring ``trace.set_tracer_provider``
    # in the tracing setup. Without this, ``metrics.get_meter(...)`` calls
    # elsewhere are not backed by this provider and their measurements never
    # reach the exporter.
    metrics.set_meter_provider(provider)

    meter = provider.get_meter(service_name)

    # Instruments
    request_counter = meter.create_counter(
        name="llm.requests.total",
        description="Total number of LLM requests",
        unit="1"
    )

    latency_histogram = meter.create_histogram(
        name="llm.request.duration",
        description="LLM request duration",
        unit="ms"
    )

    return meter, request_counter, latency_histogram

记录指标

from opentelemetry import metrics

# Module-level meter named "llm-service".
meter = metrics.get_meter("llm-service")

# Instruments used by call_llm: a request counter and a token-usage histogram.
request_counter = meter.create_counter(name="llm.requests")
token_usage = meter.create_histogram(name="llm.tokens.usage")

def call_llm(model: str, prompt: str):
    """Send *prompt* to *model* and record request and token metrics.

    Args:
        model: Model name passed to the client and used as the ``model``
            metric attribute.
        prompt: Text sent as a single user message.

    Returns:
        The response returned by ``client.chat.completions.create``.

    Raises:
        Exception: Whatever the client raises is re-raised after the request
            has been counted with ``status="error"``.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception:
        # Count the failed request, then let the caller handle the error.
        request_counter.add(1, {"model": model, "status": "error"})
        raise

    # Only count a success once the call has actually returned.
    request_counter.add(1, {"model": model, "status": "success"})

    # Skip the token metric when the response carries no usage data.
    if response.usage is not None:
        token_usage.record(response.usage.total_tokens, {"model": model})

    return response

日志集成

结构化日志与追踪关联

import logging
from opentelemetry import trace

class TracedLogger:
    """Logger wrapper that attaches the current trace context to INFO records."""

    def __init__(self, name: str):
        """Bind the wrapper to the standard-library logger called *name*."""
        self.logger = logging.getLogger(name)

    def info(self, message: str, **kwargs):
        """Log *message* at INFO level with trace correlation fields.

        The current span's ``trace_id`` and ``span_id`` are rendered as
        zero-padded lowercase hex (32 and 16 digits) and passed to the logger
        through ``extra``.

        Args:
            message: Log message.
            **kwargs: Additional fields merged into ``extra``; a key named
                ``trace_id`` or ``span_id`` replaces the computed value.
        """
        span_context = trace.get_current_span().get_span_context()

        fields = {
            "trace_id": f"{span_context.trace_id:032x}",
            "span_id": f"{span_context.span_id:016x}",
        }
        fields.update(kwargs)

        self.logger.info(message, extra=fields)

最佳实践

  1. 渐进式采用:从关键路径开始,逐步扩展覆盖范围
  2. 语义一致性:遵循OpenTelemetry Semantic Conventions
  3. 采样策略:配置合理的采样率,平衡数据完整性与性能
  4. 隐私保护:对敏感数据进行脱敏处理
  5. 持续监控:监控OTel自身的性能开销

OpenTelemetry为LLM应用提供了标准化的可观测性解决方案,帮助你构建可维护、可扩展的AI系统。