← 返回首页
🧠

Datadog监控LLM

📂 llm ⏱ 3 min 497 words

---
title: "Datadog监控LLM"
description: "介绍如何使用Datadog平台监控LLM应用,包括APM、日志管理和自定义仪表盘。"
tags: ["Datadog", "LLM", "APM"]
category: "llm"
icon: "🧠"
---

Datadog监控LLM

Datadog简介

Datadog是一个云规模的监控和分析平台,提供应用性能监控(APM)、日志管理、基础设施监控等功能。对于LLM应用,Datadog可以帮助团队全面监控系统状态。

Datadog的核心优势:在同一平台上提供APM追踪、日志管理、自定义指标与仪表盘,以及监控告警(各项的具体用法见下文对应章节)。

安装与配置

Agent安装

# Linux: install the Agent on the host (API key and site are passed via the environment)
DD_API_KEY=<YOUR_API_KEY> DD_SITE="datadoghq.com" bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"

# Docker: run the Agent as a container.
# Ports 8126/tcp (APM traces) and 8125/udp (DogStatsD metrics) are published and
# non-local traffic is enabled, so applications running outside the Agent
# container can reach it on localhost:8126 and localhost:8125 as the Python
# examples in this article expect.
docker run -d \
  --name datadog-agent \
  -e DD_API_KEY=<YOUR_API_KEY> \
  -e DD_SITE="datadoghq.com" \
  -e DD_APM_ENABLED=true \
  -e DD_APM_NON_LOCAL_TRAFFIC=true \
  -e DD_DOGSTATSD_NON_LOCAL_TRAFFIC=true \
  -p 8126:8126/tcp \
  -p 8125:8125/udp \
  -v /var/run/docker.sock:/var/run/docker.sock:ro \
  -v /proc/:/host/proc/:ro \
  -v /sys/fs/cgroup/:/host/sys/fs/cgroup:ro \
  datadog/agent:latest

Python库安装

pip install datadog
pip install ddtrace  # APM library

APM追踪配置

初始化追踪器

from ddtrace import tracer, patch_all

# Enable automatic instrumentation.
# patch() only instruments the integrations named in its keyword arguments, so
# a bare patch() call instruments nothing; patch_all() enables every supported
# integration.
patch_all()

# Point the tracer at the local Datadog Agent. Traces and DogStatsD metrics go
# to the Agent (trace port 8126, DogStatsD UDP port 8125), not to the Datadog
# intake site, so the hostname must be the Agent's address.
# NOTE(review): newer ddtrace major versions take the Agent address from the
# DD_AGENT_HOST / DD_TRACE_AGENT_URL environment variables instead - confirm
# against the installed version.
tracer.configure(
    hostname="localhost",
    port=8126,
    dogstatsd_url="udp://localhost:8125"
)

# Global tags
tracer.set_tags({
    "env": "production",
    "service": "llm-service",
    "version": "1.0.0"
})

追踪LLM调用

from ddtrace import tracer
from functools import wraps

def trace_llm_call(func):
    """Decorate an LLM client call so that it runs inside an ``llm.request`` span.

    The span's resource and the ``llm.model`` / ``llm.max_tokens`` /
    ``llm.temperature`` tags are taken from the call's arguments, whether they
    were passed positionally or by keyword. After a successful call the
    response id, token usage and finish reason are tagged when the response
    object exposes them.

    Args:
        func: The callable that performs the LLM request.

    Returns:
        A wrapper with the same signature as ``func`` that returns whatever
        ``func`` returns.

    Raises:
        Whatever ``func`` raises; the exception is recorded on the span and
        re-raised unchanged.
    """
    import inspect  # local import: keeps the snippet's top-level imports unchanged

    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        # Some callables (e.g. certain builtins) expose no signature.
        signature = None

    def _call_params(args, kwargs):
        """Return the call's arguments keyed by parameter name."""
        if signature is None:
            return dict(kwargs)
        try:
            bound = signature.bind_partial(*args, **kwargs)
        except TypeError:
            # Let func itself report the bad call; only keyword data is usable.
            return dict(kwargs)
        # Explicit keyword arguments win over positionally bound ones.
        return {**bound.arguments, **kwargs}

    def _tag(span, key, value):
        """Set a tag only when a value is actually available."""
        if value is not None:
            span.set_tag(key, value)

    @wraps(func)
    def wrapper(*args, **kwargs):
        params = _call_params(args, kwargs)
        with tracer.trace(
            "llm.request",
            service="llm-service",
            resource=params.get("model") or "unknown"
        ) as span:
            # Request details
            _tag(span, "llm.model", params.get("model"))
            _tag(span, "llm.max_tokens", params.get("max_tokens"))
            _tag(span, "llm.temperature", params.get("temperature"))

            # Only the wrapped call is guarded: a failure while reading the
            # response below must not be reported as a failed LLM request.
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                span.set_tag("error", str(e))
                span.set_traceback()
                raise

            # Response details; every field is optional because the response
            # may lack ``usage`` or ``choices``.
            _tag(span, "llm.response.id", getattr(result, "id", None))
            usage = getattr(result, "usage", None)
            _tag(span, "llm.usage.prompt_tokens", getattr(usage, "prompt_tokens", None))
            _tag(span, "llm.usage.completion_tokens", getattr(usage, "completion_tokens", None))
            _tag(span, "llm.usage.total_tokens", getattr(usage, "total_tokens", None))
            choices = getattr(result, "choices", None)
            if choices:
                _tag(span, "llm.finish_reason", getattr(choices[0], "finish_reason", None))

            return result

    return wrapper

# Apply the decorator
@trace_llm_call
def call_openai(model: str, messages: list, **kwargs):
    """Send a chat-completion request through the module-level ``client``.

    ``model`` and ``messages`` are forwarded by keyword together with any
    extra keyword arguments, and the client's response is returned as is.
    """
    request_options = dict(kwargs, model=model, messages=messages)
    return client.chat.completions.create(**request_options)

多步追踪

@tracer.wrap("llm.pipeline", service="llm-service")
async def llm_pipeline(query: str):
    """Run intent detection, retrieval and generation, each inside its own span.

    Returns the value produced by ``generate_response`` for ``query`` and the
    retrieved documents.
    """
    # Stage 1: intent detection (its result is not consumed by later stages)
    with tracer.trace("llm.intent_detection"):
        _intent = await detect_intent(query)

    # Stage 2: fetch the documents relevant to the query
    with tracer.trace("llm.retrieval"):
        context_docs = await retrieve_documents(query)

    # Stage 3: generate the answer from the query and the retrieved documents
    with tracer.trace("llm.generation"):
        return await generate_response(query, context_docs)

日志管理

结构化日志配置

import logging
from ddtrace import tracer

class _DatadogJsonFormatter(logging.Formatter):
    """Render each log record as a single JSON object.

    The object always carries ``message``, ``service``, ``dd.trace_id`` and
    ``dd.span_id``; every field passed through ``extra=`` is included as well.
    """

    def __init__(self, service_name: str):
        super().__init__()
        import json  # local import: keeps the snippet's top-level imports unchanged

        self._dumps = json.dumps
        self._service_name = service_name
        # Attribute names every LogRecord carries; anything else found on a
        # record was supplied by the caller via ``extra=``.
        self._standard_attrs = frozenset(
            logging.LogRecord("", logging.INFO, "", 0, "", (), None).__dict__
        ) | {"message", "asctime"}

    def format(self, record):
        payload = {
            "message": record.getMessage(),
            "service": self._service_name,
            # Defaults keep records logged outside of a trace formattable.
            "dd.trace_id": "",
            "dd.span_id": "",
        }
        payload.update(
            (key, value)
            for key, value in record.__dict__.items()
            if key not in self._standard_attrs
        )
        if record.exc_info:
            payload["error.stack"] = self.formatException(record.exc_info)
        # default=str is deliberate: logging must not fail because an extra
        # field is not JSON-serialisable.
        return self._dumps(payload, ensure_ascii=False, default=str)


class DatadogLogger:
    """Structured JSON logger that carries the active trace and span ids.

    Args:
        service_name: Name of the underlying ``logging`` logger; also written
            to the ``service`` field of every log line.
    """

    def __init__(self, service_name: str):
        self.logger = logging.getLogger(service_name)
        self.setup_handler()

    def setup_handler(self):
        """Attach the JSON stream handler (once per logger) and set level INFO."""
        # logging.getLogger returns the same object for the same name, so a
        # second DatadogLogger must not add a second handler (duplicate lines).
        already_configured = any(
            isinstance(handler.formatter, _DatadogJsonFormatter)
            for handler in self.logger.handlers
        )
        if not already_configured:
            handler = logging.StreamHandler()
            handler.setFormatter(_DatadogJsonFormatter(self.logger.name))
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

    def log_request(self, query: str, response: dict, latency: float):
        """Log one completed LLM request.

        Args:
            query: The request text; only its length is logged.
            response: Response mapping; the length of its ``content`` entry is
                logged (a missing or ``None`` entry counts as empty).
            latency: Request latency; it is multiplied by 1000 and logged as
                ``latency_ms``.
        """
        current_span = tracer.current_span()
        dd_trace_id = format(current_span.trace_id, "032x") if current_span else ""
        dd_span_id = format(current_span.span_id, "016x") if current_span else ""

        self.logger.info(
            "LLM请求完成",
            extra={
                "dd.trace_id": dd_trace_id,
                "dd.span_id": dd_span_id,
                "query_length": len(query),
                "response_length": len(response.get("content") or ""),
                "latency_ms": latency * 1000
            }
        )

日志索引配置

在Datadog UI中创建日志索引:

{
  "filter": "service:llm-service",
  "retention_days": 30,
  "excluded_index_filters": [
    {"query": "status:debug"}
  ],
  "daily_limit": {
    "limit": 1000000,
    "reset_timezone": "America/New_York"
  }
}

自定义指标

发送自定义指标

from datadog import initialize, statsd

# Initialise the client library with the account credentials
initialize(api_key="<YOUR_API_KEY>", app_key="<YOUR_APP_KEY>")

# Tag shared by every metric in this example
_model_tag = "model:gpt-4"

# Counter: one more successful request
statsd.increment("llm.requests.total", tags=[_model_tag, "status:success"])

# Histogram: request duration
statsd.histogram("llm.request.duration", latency_ms, tags=[_model_tag])

# Gauge: token usage
statsd.gauge("llm.tokens.usage", token_count, tags=[_model_tag])

创建自定义指标

class LLMMetricsCollector:
    """Send LLM metrics through a DogStatsD client.

    Args:
        client: Object exposing ``increment``, ``histogram`` and ``gauge``
            with the DogStatsD signatures. When omitted, the module-level
            ``statsd`` client is used.
    """

    _METRIC_TYPES = ("counter", "histogram", "gauge")

    def __init__(self, client=None):
        self.metrics = {}
        self._client = client

    def record_metric(self, name: str, value: float, tags: dict = None,
                      metric_type: str = None):
        """Submit one metric sample.

        Args:
            name: Metric name.
            value: Sample value; for counters, the amount to increment by.
            tags: Optional mapping turned into ``key:value`` tag strings.
            metric_type: ``"counter"``, ``"histogram"`` or ``"gauge"``. When
                omitted the type is inferred from the name suffix
                (``.counter`` / ``.histogram``), falling back to a gauge.

        Raises:
            ValueError: If ``metric_type`` is not one of the supported types.
        """
        tag_list = [f"{k}:{v}" for k, v in (tags or {}).items()]

        if metric_type is None:
            if name.endswith(".counter"):
                metric_type = "counter"
            elif name.endswith(".histogram"):
                metric_type = "histogram"
            else:
                metric_type = "gauge"
        elif metric_type not in self._METRIC_TYPES:
            raise ValueError(
                f"unsupported metric_type {metric_type!r}; "
                f"expected one of {self._METRIC_TYPES}"
            )

        injected = getattr(self, "_client", None)
        client = injected if injected is not None else statsd

        if metric_type == "counter":
            # Pass the value through so a counter can grow by more than one.
            client.increment(name, value, tags=tag_list)
        elif metric_type == "histogram":
            client.histogram(name, value, tags=tag_list)
        else:
            client.gauge(name, value, tags=tag_list)

    def track_request(self, model: str, latency: float, success: bool):
        """Record one request: a count sample and a duration sample.

        Args:
            model: Model name, sent as the ``model`` tag.
            latency: Request duration, submitted unchanged under
                ``llm.request.duration.ms``.
            success: Selects the ``status`` tag (``success`` or ``error``).
        """
        status = "success" if success else "error"
        tags = {"model": model, "status": status}

        # The types are given explicitly: these names carry no ".counter" /
        # ".histogram" suffix, so inference would submit both as gauges.
        self.record_metric("llm.requests.total", 1, tags, metric_type="counter")
        self.record_metric("llm.request.duration.ms", latency, tags,
                           metric_type="histogram")

仪表盘创建

使用JSON创建仪表盘

# Dashboard lives in the ``datadog.api`` sub-package; it is not exported by the
# top-level ``datadog`` module.
from datadog import initialize, api

initialize(api_key="<YOUR_API_KEY>", app_key="<YOUR_APP_KEY>")

dashboard = api.Dashboard.create(
    title="LLM服务监控",
    description="LLM应用性能监控仪表盘",
    widgets=[
        {
            # Time series of GPT-4 request counts
            "definition": {
                "type": "timeseries",
                "requests": [
                    {
                        "q": "sum:llm.requests.total{model:gpt-4}.as_count()",
                        "display_type": "line"
                    }
                ],
                "title": "GPT-4请求量"
            }
        },
        {
            # Single value: average GPT-4 request duration
            "definition": {
                "type": "query_value",
                "requests": [
                    {
                        "q": "avg:llm.request.duration.ms{model:gpt-4}",
                        "aggregator": "avg",
                        # Conditional formats are part of the request in a
                        # query_value widget: show values above 1000 in red.
                        "conditional_formats": [
                            {"comparator": ">", "value": 1000, "palette": "red"}
                        ]
                    }
                ],
                "title": "平均延迟"
            }
        }
    ],
    layout_type="ordered"
)

告警配置

创建监控告警

# Monitor lives in the ``datadog.api`` sub-package; it is not exported by the
# top-level ``datadog`` module.
from datadog import initialize, api

initialize(api_key="<YOUR_API_KEY>", app_key="<YOUR_APP_KEY>")

# Error-rate alert.
# A metric-alert query must begin with a time aggregation such as
# "sum(last_5m):", and every metric needs a scope, hence "{*}" on the total.
api.Monitor.create(
    type="metric alert",
    name="LLM高错误率",
    query=(
        "sum(last_5m):"
        "sum:llm.requests.total{status:error}.as_count() / "
        "sum:llm.requests.total{*}.as_count() > 0.05"
    ),
    message="LLM服务错误率超过5%,当前值: {{value}}",
    tags=["service:llm-service", "team:ai-platform"],
    options={
        # The critical threshold matches the "> 0.05" comparison in the query.
        "thresholds": {"critical": 0.05, "warning": 0.02},
        "notify_no_data": True,
        "no_data_timeframe": 10
    }
)

最佳实践

  1. 统一标签:在整个服务中使用一致的标签体系
  2. 采样控制:合理配置APM采样率,控制成本
  3. 日志分级:区分调试日志和生产日志
  4. 告警优化:避免告警风暴,设置合理的静默规则
  5. 成本监控:定期审查Datadog使用量和成本

通过Datadog,你可以获得LLM应用的全栈可观测性,实现快速的问题定位和性能优化。