Datadog监控LLM
---
title: "Datadog监控LLM"
description: "介绍如何使用Datadog平台监控LLM应用,包括APM、日志管理和自定义仪表盘。"
tags: ["Datadog", "LLM", "APM"]
category: "llm"
icon: "🧠"
---
Datadog监控LLM
Datadog简介
Datadog是一个云规模的监控和分析平台,提供应用性能监控(APM)、日志管理、基础设施监控等功能。对于LLM应用,Datadog可以帮助团队全面监控系统状态。
Datadog的核心优势:
- 全栈可观测性:APM、日志、指标统一平台
- AI驱动分析:内置异常检测和根因分析
- 云原生支持:与Kubernetes、Docker深度集成
- 丰富的集成:支持200+技术栈
安装与配置
Agent安装
# Linux: install Agent 7 with the current official install script
# (the S3-hosted install_script.sh is the legacy installer).
DD_API_KEY=<YOUR_API_KEY> DD_SITE="datadoghq.com" bash -c "$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)"
# Docker: run the containerized Agent.
# APM (8126/tcp) and DogStatsD (8125/udp) are enabled for non-local traffic and
# published on the loopback interface, so applications running on the host can
# reach the Agent at localhost:8126 and localhost:8125.
docker run -d \
  --name datadog-agent \
  -e DD_API_KEY=<YOUR_API_KEY> \
  -e DD_SITE="datadoghq.com" \
  -e DD_APM_ENABLED=true \
  -e DD_APM_NON_LOCAL_TRAFFIC=true \
  -e DD_DOGSTATSD_NON_LOCAL_TRAFFIC=true \
  -p 127.0.0.1:8126:8126/tcp \
  -p 127.0.0.1:8125:8125/udp \
  -v /var/run/docker.sock:/var/run/docker.sock:ro \
  -v /proc/:/host/proc/:ro \
  -v /sys/fs/cgroup/:/host/sys/fs/cgroup:ro \
  datadog/agent:latest
Python库安装
pip install datadog
pip install ddtrace # APM tracing library
APM追踪配置
初始化追踪器
from ddtrace import patch, patch_all, tracer

# Enable automatic instrumentation.
# patch() only instruments the modules named in its keyword arguments, so a
# bare patch() instruments nothing; patch_all() instruments every supported
# library.
patch_all()

# Point the tracer at the Datadog Agent. hostname/port are the address of the
# Agent's trace endpoint (local Agent here), not the Datadog intake site.
tracer.configure(
    hostname="localhost",
    port=8126,
    dogstatsd_url="udp://localhost:8125",
)

# Tags applied to every span produced by this tracer.
tracer.set_tags({
    "env": "production",
    "service": "llm-service",
    "version": "1.0.0",
})
追踪LLM调用
from ddtrace import tracer
from functools import wraps
def trace_llm_call(func):
    """Decorate ``func`` so that every call runs inside an ``llm.request`` span.

    The span is tagged with the request parameters ``model``, ``max_tokens``
    and ``temperature`` (whether they were passed positionally or by keyword)
    and, on success, with the response id, token usage and finish reason read
    from the returned object.

    Args:
        func: The callable performing the LLM request.

    Returns:
        A wrapper with the same signature and return value as ``func``.

    Raises:
        Whatever ``func`` raises; the exception is recorded on the span and
        re-raised unchanged.
    """
    import inspect  # function-scope import keeps this snippet self-contained

    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        # Some callables expose no introspectable signature; only keyword
        # arguments can be inspected for them.
        signature = None

    def _request_params(args, kwargs):
        """Return model/max_tokens/temperature however the caller passed them."""
        bound = {}
        if signature is not None:
            try:
                bound = signature.bind_partial(*args, **kwargs).arguments
            except TypeError:
                # The arguments do not fit the signature. The wrapped call
                # below raises the real error, so fall back to keywords only.
                bound = {}
        return {
            name: kwargs.get(name, bound.get(name))
            for name in ("model", "max_tokens", "temperature")
        }

    @wraps(func)
    def wrapper(*args, **kwargs):
        params = _request_params(args, kwargs)
        with tracer.trace(
            "llm.request",
            service="llm-service",
            resource=params["model"] or "unknown",
        ) as span:
            # Request details; parameters the caller did not supply are
            # skipped instead of being tagged as None.
            for name, value in params.items():
                if value is not None:
                    span.set_tag(f"llm.{name}", value)
            try:
                result = func(*args, **kwargs)
                # Response details.
                span.set_tag("llm.response.id", result.id)
                # Guard optional parts of the response so that tagging can
                # never turn a successful call into a failure.
                usage = getattr(result, "usage", None)
                if usage is not None:
                    span.set_tag("llm.usage.prompt_tokens", usage.prompt_tokens)
                    span.set_tag("llm.usage.completion_tokens", usage.completion_tokens)
                    span.set_tag("llm.usage.total_tokens", usage.total_tokens)
                choices = getattr(result, "choices", None)
                if choices:
                    span.set_tag("llm.finish_reason", choices[0].finish_reason)
                return result
            except Exception as e:
                # Record the failure on the span, then let the caller see the
                # original exception.
                span.set_tag("error", str(e))
                span.set_traceback()
                raise
    return wrapper
# Apply the tracing decorator to the OpenAI call.
@trace_llm_call
def call_openai(model: str, messages: list, **kwargs):
    """Create a chat completion through ``client`` and return its response.

    ``model`` and ``messages`` are forwarded together with any extra keyword
    arguments to ``client.chat.completions.create``.
    """
    request_args = {"model": model, "messages": messages, **kwargs}
    return client.chat.completions.create(**request_args)
多步追踪
@tracer.wrap("llm.pipeline", service="llm-service")
async def llm_pipeline(query: str):
    """Run intent detection, retrieval and generation for ``query``.

    Each stage is awaited inside its own ``tracer.trace`` block, and the
    generation result is returned.
    """
    # Stage 1: intent detection. Its result is not consumed by later stages.
    with tracer.trace("llm.intent_detection"):
        detected_intent = await detect_intent(query)

    # Stage 2: fetch the documents relevant to the query.
    with tracer.trace("llm.retrieval"):
        context_docs = await retrieve_documents(query)

    # Stage 3: generate the answer from the query and the retrieved documents.
    with tracer.trace("llm.generation"):
        answer = await generate_response(query, context_docs)

    return answer
日志管理
结构化日志配置
import json
import logging

from ddtrace import tracer
class _DatadogJsonFormatter(logging.Formatter):
    """Serialize each log record as a single JSON object.

    Building the line with ``json.dumps`` keeps it valid JSON even when the
    message contains quotes, and lets fields passed through ``extra`` appear
    in the output.
    """

    # Attribute names present on every LogRecord; anything else found on a
    # record was supplied by the caller through ``extra``.
    _STANDARD_ATTRS = frozenset(
        vars(logging.LogRecord("", logging.NOTSET, "", 0, "", (), None))
    ) | {"message", "asctime"}

    def __init__(self, service_name: str):
        super().__init__()
        self.service_name = service_name

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "message": record.getMessage(),
            "service": self.service_name,
            # Records logged without correlation ids still format cleanly.
            "dd.trace_id": getattr(record, "dd.trace_id", ""),
            "dd.span_id": getattr(record, "dd.span_id", ""),
        }
        for key, value in vars(record).items():
            if key not in self._STANDARD_ATTRS and key not in payload:
                payload[key] = value
        if record.exc_info:
            payload["error.stack"] = self.formatException(record.exc_info)
        # default=str is deliberate: a non-serializable ``extra`` value must
        # degrade to its string form rather than lose the whole log line.
        return json.dumps(payload, ensure_ascii=False, default=str)


class DatadogLogger:
    """Logger that emits JSON lines carrying Datadog trace correlation ids.

    Args:
        service_name: Name of the underlying ``logging`` logger; also written
            to the ``service`` field of every line.
    """

    def __init__(self, service_name: str):
        self.logger = logging.getLogger(service_name)
        self.setup_handler()

    def setup_handler(self):
        """Attach the JSON stream handler (once) and set the level to INFO."""
        # logging.getLogger returns the same logger for the same name, so a
        # second DatadogLogger must not add a second handler (duplicate lines).
        already_configured = any(
            isinstance(existing.formatter, _DatadogJsonFormatter)
            for existing in self.logger.handlers
        )
        if not already_configured:
            handler = logging.StreamHandler()
            handler.setFormatter(_DatadogJsonFormatter(self.logger.name))
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

    def log_request(self, query: str, response: dict, latency: float):
        """Log one completed LLM request with size and latency fields.

        Args:
            query: The prompt text; only its length is logged.
            response: Mapping whose ``content`` entry holds the response text;
                only its length is logged.
            latency: Request latency; multiplied by 1000 for ``latency_ms``
                (presumably seconds -- confirm with the caller).
        """
        current_span = tracer.current_span()
        if current_span:
            dd_trace_id = format(current_span.trace_id, "032x")
            # Datadog log/trace correlation expects the span id in decimal,
            # not hexadecimal.
            dd_span_id = str(current_span.span_id)
        else:
            dd_trace_id = dd_span_id = ""
        # A present-but-None content must not break len().
        content = response.get("content") or ""
        self.logger.info(
            "LLM请求完成",
            extra={
                "dd.trace_id": dd_trace_id,
                "dd.span_id": dd_span_id,
                "query_length": len(query),
                "response_length": len(content),
                "latency_ms": latency * 1000,
            },
        )
日志索引配置
在Datadog UI中创建日志索引:
{
"filter": "service:llm-service",
"retention_days": 30,
"excluded_index_filters": [
{"query": "status:debug"}
],
"daily_limit": {
"limit": 1000000,
"reset_timezone": "America/New_York"
}
}
自定义指标
发送自定义指标
from datadog import initialize, statsd
# Initialize the datadog client library with the API and application keys.
initialize(
api_key="<YOUR_API_KEY>",
app_key="<YOUR_APP_KEY>"
)
# Send a counter: one successful gpt-4 request.
statsd.increment("llm.requests.total", tags=["model:gpt-4", "status:success"])
# Send a histogram sample.
# NOTE(review): latency_ms is not defined in this snippet; the surrounding
# code must supply the measured latency.
statsd.histogram("llm.request.duration", latency_ms, tags=["model:gpt-4"])
# Send a gauge.
# NOTE(review): token_count is likewise not defined in this snippet.
statsd.gauge("llm.tokens.usage", token_count, tags=["model:gpt-4"])
创建自定义指标
class LLMMetricsCollector:
    """Thin helper that forwards LLM metrics to ``statsd``."""

    def __init__(self):
        self.metrics = {}

    def record_metric(self, name: str, value: float, tags: dict = None,
                      metric_type: str = None):
        """Send one metric sample through ``statsd``.

        Args:
            name: Metric name.
            value: Sample value (for counters, the increment amount).
            tags: Optional mapping rendered as ``key:value`` tags.
            metric_type: ``"counter"``, ``"histogram"`` or ``"gauge"``. When
                omitted, the type is inferred from the name suffix
                (``.counter`` / ``.histogram``), defaulting to gauge.

        Raises:
            ValueError: If ``metric_type`` is not one of the supported types.
        """
        tag_list = [f"{k}:{v}" for k, v in (tags or {}).items()]
        if metric_type is None:
            if name.endswith(".counter"):
                metric_type = "counter"
            elif name.endswith(".histogram"):
                metric_type = "histogram"
            else:
                metric_type = "gauge"

        if metric_type == "counter":
            # Forward the value so increments other than 1 are honoured.
            statsd.increment(name, value, tags=tag_list)
        elif metric_type == "histogram":
            statsd.histogram(name, value, tags=tag_list)
        elif metric_type == "gauge":
            statsd.gauge(name, value, tags=tag_list)
        else:
            raise ValueError(f"unsupported metric_type: {metric_type!r}")

    def track_request(self, model: str, latency: float, success: bool):
        """Record one request: a count tagged by model/status plus its latency."""
        status = "success" if success else "error"
        tags = {"model": model, "status": status}
        # The name carries no ".counter" suffix, so the type is stated
        # explicitly; otherwise every request would be sent as a gauge of 1
        # and never add up to a request count.
        self.record_metric("llm.requests.total", 1, tags, metric_type="counter")
        # NOTE(review): still sent as a gauge (name has no recognised suffix);
        # switching to a histogram changes the metric names queried
        # downstream, so it is left unchanged here.
        self.record_metric("llm.request.duration.ms", latency, tags)
仪表盘创建
使用Python API创建仪表盘
# The dashboard resource lives in the ``datadog.api`` module; the top-level
# ``datadog`` package does not export a ``Dashboard`` name.
from datadog import initialize, api

initialize(api_key="<YOUR_API_KEY>", app_key="<YOUR_APP_KEY>")

# Create an ordered-layout dashboard with two widgets: a request-volume
# timeseries and an average-latency query value.
dashboard = api.Dashboard.create(
    title="LLM服务监控",
    description="LLM应用性能监控仪表盘",
    widgets=[
        {
            "definition": {
                "type": "timeseries",
                "requests": [
                    {
                        "q": "sum:llm.requests.total{model:gpt-4}.as_count()",
                        "display_type": "line",
                    }
                ],
                "title": "GPT-4请求量",
            }
        },
        {
            "definition": {
                "type": "query_value",
                "requests": [
                    {
                        "q": "avg:llm.request.duration.ms{model:gpt-4}",
                        "aggregator": "avg",
                        # Conditional formats belong to the request they
                        # colour, not to the widget definition.
                        "conditional_formats": [
                            {"comparator": ">", "value": 1000, "palette": "red"}
                        ],
                    }
                ],
                "title": "平均延迟",
            }
        },
    ],
    layout_type="ordered",
)
告警配置
创建监控告警
# The monitor resource lives in the ``datadog.api`` module; the top-level
# ``datadog`` package does not export a ``Monitor`` name.
from datadog import initialize, api

initialize(api_key="<YOUR_API_KEY>", app_key="<YOUR_APP_KEY>")

# Error-rate alert.
# A metric monitor query must start with a time aggregation and window
# ("sum(last_5m):"), and every metric in the formula needs a scope ("{*}").
api.Monitor.create(
    type="metric alert",
    name="LLM高错误率",
    query=(
        "sum(last_5m):"
        "sum:llm.requests.total{status:error}.as_count() / "
        "sum:llm.requests.total{*}.as_count() > 0.05"
    ),
    message="LLM服务错误率超过5%,当前值: {{value}}",
    tags=["service:llm-service", "team:ai-platform"],
    options={
        # The critical threshold must match the value in the query.
        "thresholds": {"critical": 0.05, "warning": 0.02},
        "notify_no_data": True,
        "no_data_timeframe": 10,
    },
)
最佳实践
- 统一标签:在整个服务中使用一致的标签体系
- 采样控制:合理配置APM采样率,控制成本
- 日志分级:区分调试日志和生产日志
- 告警优化:避免告警风暴,设置合理的静默规则
- 成本监控:定期审查Datadog使用量和成本
通过Datadog,你可以获得LLM应用的全栈可观测性,实现快速的问题定位和性能优化。