OpenTelemetry与LLM
---
title: "OpenTelemetry与LLM"
description: "介绍如何使用OpenTelemetry标准化LLM应用的可观测性,实现指标、日志和追踪的统一管理。"
tags: ["OpenTelemetry", "LLM", "可观测性"]
category: "llm"
icon: "🧠"
---
OpenTelemetry与LLM
OpenTelemetry简介
OpenTelemetry(简称OTel)是一个开源的可观测性框架,提供了统一的API、SDK和工具来收集和导出遥测数据。它支持三大信号:追踪(Traces)、指标(Metrics)和日志(Logs)。
对于LLM应用,OpenTelemetry的优势在于:
- 标准化:统一的API降低厂商锁定风险
- 生态丰富:支持多种后端和导出器
- 自动插桩(Auto-Instrumentation):无需修改业务代码即可自动创建Span
- 可扩展:自定义Semantic Conventions扩展LLM特定语义
安装与配置
基础安装
pip install opentelemetry-api opentelemetry-sdk
pip install opentelemetry-exporter-otlp
pip install opentelemetry-instrumentation-openai # OpenAI自动注入
初始化追踪器
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
def init_telemetry(
    service_name: str,
    *,
    service_version: str = "1.0.0",
    environment: str = "production",
    endpoint: str = "http://localhost:4317",
    insecure: bool = True,
):
    """Install a global tracer provider that exports over OTLP/gRPC.

    Args:
        service_name: Value for the ``service.name`` resource attribute; also
            used as the name of the returned tracer.
        service_version: Value for the ``service.version`` resource attribute.
        environment: Value for the ``deployment.environment`` resource
            attribute.
        endpoint: Address of the OTLP collector.
        insecure: Passed through to ``OTLPSpanExporter``; ``True`` matches the
            plaintext local-collector default.

    Returns:
        The tracer obtained from ``trace.get_tracer(service_name)``.
    """
    resource = Resource.create(
        {
            "service.name": service_name,
            "service.version": service_version,
            "deployment.environment": environment,
        }
    )
    provider = TracerProvider(resource=resource)

    # Spans go through a batching processor to the OTLP exporter.
    exporter = OTLPSpanExporter(endpoint=endpoint, insecure=insecure)
    provider.add_span_processor(BatchSpanProcessor(exporter))

    # Register globally so trace.get_tracer() resolves to this provider.
    trace.set_tracer_provider(provider)
    return trace.get_tracer(service_name)
LLM Semantic Conventions
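OpenTelemetry为生成式AI(GenAI)应用定义了标准的Semantic Conventions,官方属性使用 `gen_ai.*` 命名空间(例如 `gen_ai.request.model`、`gen_ai.usage.input_tokens`)。下面的示例使用自定义的 `llm.*` 前缀来说明常见的Span属性: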
from opentelemetry import trace
# Span attribute keys used for LLM calls in this guide, each mapped to a short
# human-readable description (the descriptions are Chinese runtime strings).
# NOTE(review): the current OpenTelemetry GenAI semantic conventions use the
# `gen_ai.*` namespace (e.g. `gen_ai.request.model`) — confirm whether these
# `llm.*` keys are what the target backend expects.
# NOTE(review): `llm.finish_reason` below differs from the
# `llm.response.finish_reason` key that LLMTracer.record_response sets.
LLM_ATTRIBUTES = {
    "llm.model": "模型名称",
    "llm.request.model": "请求的模型",
    "llm.request.max_tokens": "最大token数",
    "llm.request.temperature": "温度参数",
    "llm.response.model": "响应的模型",
    "llm.usage.input_tokens": "输入token数",
    "llm.usage.output_tokens": "输出token数",
    "llm.usage.total_tokens": "总token数",
    "llm.finish_reason": "完成原因"
}
自定义Span封装
为LLM调用创建标准化的追踪封装:
from contextlib import contextmanager
from typing import Optional, Dict, Any
class LLMTracer:
    """Wrap LLM calls in spans that carry request and response metadata."""

    def __init__(self, tracer: "trace.Tracer"):
        self.tracer = tracer

    @staticmethod
    def _without_none(attributes: dict) -> dict:
        """Return *attributes* minus entries whose value is ``None``.

        ``None`` is not a valid span attribute value, so optional fields that
        were not supplied are omitted instead of being passed through.
        """
        return {key: value for key, value in attributes.items() if value is not None}

    @contextmanager
    def trace_completion(self, model: str, messages: list, **kwargs):
        """Open a ``llm.chat.completion`` span around one completion call.

        Args:
            model: Requested model name.
            messages: Chat messages; only the first 500 characters of their
                string form are stored on the span.
            **kwargs: Optional request parameters. ``max_tokens``,
                ``temperature`` and ``top_p`` are recorded when present.

        Yields:
            The active span, so the caller can attach response data.

        Raises:
            Exception: Whatever the wrapped code raises, after the span has
                been marked as failed and the exception recorded on it.
        """
        attributes = self._without_none(
            {
                "llm.request.model": model,
                "llm.request.messages": str(messages)[:500],
                "llm.request.max_tokens": kwargs.get("max_tokens"),
                "llm.request.temperature": kwargs.get("temperature"),
                "llm.request.top_p": kwargs.get("top_p"),
            }
        )
        # Exception recording and status are handled explicitly below, so the
        # built-in handling is switched off to avoid duplicate exception events.
        with self.tracer.start_as_current_span(
            "llm.chat.completion",
            attributes=attributes,
            record_exception=False,
            set_status_on_exception=False,
        ) as span:
            try:
                yield span
            except Exception as e:
                span.set_status(trace.StatusCode.ERROR, str(e))
                span.record_exception(e)
                raise
            else:
                span.set_status(trace.StatusCode.OK)

    def record_response(self, span: "trace.Span", response: dict):
        """Copy id, model, finish reason and token usage onto *span*.

        Missing fields, an empty ``choices`` list, or a ``None`` ``usage``
        entry are tolerated; the corresponding attributes are simply not set.

        Args:
            span: Span to annotate.
            response: Completion response in dict form.
        """
        # `or` also covers keys that are present but empty/None.
        choices = response.get("choices") or [{}]
        usage = response.get("usage") or {}
        span.set_attributes(
            self._without_none(
                {
                    "llm.response.id": response.get("id"),
                    "llm.response.model": response.get("model"),
                    "llm.response.finish_reason": choices[0].get("finish_reason"),
                    "llm.usage.input_tokens": usage.get("prompt_tokens"),
                    "llm.usage.output_tokens": usage.get("completion_tokens"),
                    "llm.usage.total_tokens": usage.get("total_tokens"),
                }
            )
        )
自动插桩(Auto-Instrumentation)
OpenAI集成
使用 `opentelemetry-instrumentation-openai` 插桩库自动追踪OpenAI调用:
from opentelemetry.instrumentation.openai import OpenAIInstrumentor
# Enable automatic instrumentation of OpenAI calls.
OpenAIInstrumentor().instrument()

# OpenAI calls made after this point create spans automatically.
import openai

client = openai.OpenAI()
response = client.chat.completions.create(
    messages=[dict(role="user", content="Hello!")],
    model="gpt-4",
)
自定义插桩
为自定义LLM服务添加追踪:
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
class CustomLLMInstrumentor(BaseInstrumentor):
    """Instrumentor skeleton for a custom LLM client library."""

    def instrumentation_dependencies(self):
        """Return the package requirement this instrumentor applies to."""
        return ("custom-llm>=1.0.0",)

    def _instrument(self, **kwargs):
        """Inject tracing code into the target module (not implemented yet)."""
        pass

    def _uninstrument(self, **kwargs):
        """Remove the injected tracing code (not implemented yet).

        Accepts ``**kwargs`` so keyword arguments forwarded by the base class's
        ``uninstrument(**kwargs)`` do not raise ``TypeError``.
        """
        pass
指标收集
创建自定义指标
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
def setup_metrics(service_name: str):
    """Build the OTLP metrics pipeline and create the core LLM instruments.

    The provider is also registered as the global meter provider, mirroring
    what ``init_telemetry`` does for tracing, so that module-level
    ``metrics.get_meter(...)`` calls elsewhere resolve to this pipeline.

    Args:
        service_name: Name used for the meter.

    Returns:
        A ``(meter, request_counter, latency_histogram)`` tuple.
    """
    # insecure=True matches the trace exporter's plaintext configuration.
    exporter = OTLPMetricExporter(endpoint="http://localhost:4317", insecure=True)
    reader = PeriodicExportingMetricReader(exporter, export_interval_millis=5000)
    provider = MeterProvider(metric_readers=[reader])
    metrics.set_meter_provider(provider)
    meter = provider.get_meter(service_name)

    # Request volume.
    request_counter = meter.create_counter(
        name="llm.requests.total",
        description="Total number of LLM requests",
        unit="1",
    )
    # Request latency, declared in milliseconds.
    latency_histogram = meter.create_histogram(
        name="llm.request.duration",
        description="LLM request duration",
        unit="ms",
    )
    return meter, request_counter, latency_histogram
记录指标
from opentelemetry import metrics
meter = metrics.get_meter("llm-service")

# Instruments are created once at module level and reused by every call.
request_counter = meter.create_counter("llm.requests")
token_usage = meter.create_histogram("llm.tokens.usage")


def call_llm(model: str, prompt: str):
    """Send *prompt* to *model* and record request and token metrics.

    The request counter is incremented after the call finishes, tagged with
    ``status="success"`` or ``status="error"`` according to the real outcome.

    Args:
        model: Model name; also used as the ``model`` metric attribute.
        prompt: User prompt, sent as a single user message.

    Returns:
        The response object returned by the client.

    Raises:
        Exception: Whatever the client raises; re-raised after the failure
            has been counted.
    """
    # NOTE(review): `client` is expected to exist at module level (an OpenAI
    # client is created elsewhere in this guide) — confirm it is in scope here.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception:
        request_counter.add(1, {"model": model, "status": "error"})
        raise
    request_counter.add(1, {"model": model, "status": "success"})

    # Skip the token histogram when the response carries no usage data.
    usage = getattr(response, "usage", None)
    if usage is not None:
        token_usage.record(usage.total_tokens, {"model": model})
    return response
日志集成
结构化日志与追踪关联
import logging
from opentelemetry import trace
class TracedLogger:
    """Logger wrapper that adds the current trace and span IDs to each record."""

    def __init__(self, name: str):
        self.logger = logging.getLogger(name)

    def info(self, message: str, **kwargs):
        """Log *message* at INFO level with trace correlation fields.

        Args:
            message: Log message.
            **kwargs: Extra fields to attach to the log record. ``trace_id``
                and ``span_id`` are reserved: values taken from the current
                span always win, so correlation data cannot be overwritten.
        """
        context = trace.get_current_span().get_span_context()
        extra = {
            **kwargs,
            # Fixed-width lowercase hex: 32 digits for the trace, 16 for the span.
            "trace_id": format(context.trace_id, "032x"),
            "span_id": format(context.span_id, "016x"),
        }
        # stacklevel=2 attributes the record to the caller, not this wrapper.
        self.logger.info(message, extra=extra, stacklevel=2)
最佳实践
- 渐进式采用:从关键路径开始,逐步扩展覆盖范围
- 语义一致性:遵循OpenTelemetry Semantic Conventions
- 采样策略:配置合理的采样率,平衡数据完整性与性能
- 隐私保护:对敏感数据进行脱敏处理
- 持续监控:监控OTel自身的性能开销
OpenTelemetry为LLM应用提供了标准化的可观测性解决方案,帮助你构建可维护、可扩展的AI系统。