错误处理:LLM应用中的错误处理策略
---
title: "错误处理:LLM应用中的错误处理策略"
description: "全面解析LLM应用中各类错误的处理方法,包括网络错误、API错误、内容过滤、超时处理等异常场景"
tags: ["错误处理", "异常处理", "LLM错误", "容错"]
category: "llm"
icon: "🧠"
---
错误处理:LLM应用中的错误处理策略
LLM API常见错误类型
LLM应用面临的错误可分为几大类,每类需要不同的处理策略:
# Error taxonomy
class LLMErrorCategory:
    """String identifiers for the broad categories of LLM API failure."""

    NETWORK = "network"  # network connectivity problems
    AUTH = "authentication"  # authentication / authorization problems
    RATE_LIMIT = "rate_limit"  # request-rate limiting
    VALIDATION = "validation"  # problems with the request parameters
    CONTENT = "content"  # content-safety filtering
    SERVER = "server"  # internal server-side errors
    TIMEOUT = "timeout"  # request timed out
网络错误处理
网络不稳定是LLM应用最常见的问题:
import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
@retry(
    retry=retry_if_exception_type((httpx.ConnectError, httpx.ReadTimeout)),
    wait=wait_exponential(multiplier=1, min=2, max=30),
    stop=stop_after_attempt(3),
)
async def call_llm_with_retry(messages: list[dict]) -> str:
    """POST a chat-completion request and return the first choice's text.

    The decorator retries the call when it raises ``httpx.ConnectError`` or
    ``httpx.ReadTimeout``, stopping after three attempts and using an
    exponential wait configured with ``min=2`` and ``max=30``.

    Args:
        messages: Chat messages sent as the ``messages`` field of the body.

    Returns:
        ``choices[0].message.content`` taken from the JSON response.

    Raises:
        Whatever ``raise_for_status()`` raises for an error status; that
        exception type is not in the retry filter above.
    """
    # NOTE(review): no Authorization header is sent here; presumably the
    # credentials are supplied elsewhere -- confirm.
    endpoint = "https://api.openai.com/v1/chat/completions"
    payload = {"model": "gpt-4o", "messages": messages}

    async with httpx.AsyncClient() as http:
        reply = await http.post(endpoint, json=payload, timeout=30.0)
        reply.raise_for_status()
        body = reply.json()
    return body["choices"][0]["message"]["content"]
API错误处理
认证错误
from openai import AuthenticationError
def handle_auth_error(error: AuthenticationError):
    """Map an authentication failure to a recovery action.

    The decision is made by substring-matching the error's string form.

    Args:
        error: The exception to classify.

    Returns:
        A dict with ``action`` and ``message`` keys when the text contains
        ``invalid_api_key`` or ``insufficient_quota``; ``None`` otherwise.
    """
    detail = str(error)

    if "invalid_api_key" in detail:
        return {
            "action": "notify_user",
            "message": "API密钥无效,请检查配置",
        }

    if "insufficient_quota" in detail:
        return {
            "action": "switch_provider",
            "message": "配额不足,切换备用提供商",
        }

    # No recognised marker: there is no specific action to recommend.
    return None
限流错误
from openai import RateLimitError
import time
async def handle_rate_limit(error: RateLimitError, attempt: int):
    """Wait out a rate limit and report whether the caller should retry.

    Args:
        error: The rate-limit exception; its ``response.headers`` mapping is
            consulted for a ``Retry-After`` value.
        attempt: Zero-based count of attempts already made.

    Returns:
        ``True`` after sleeping when ``attempt < 3`` (the caller may retry);
        ``False`` without sleeping once the retry budget is used up, which
        signals that the caller should degrade to a fallback.
    """
    # Imported locally so the snippet is self-contained: the coroutine needs
    # the non-blocking asyncio.sleep, not the ``time`` module.
    import asyncio
    import math

    if attempt >= 3:
        print("重试次数耗尽,切换到备用模型")
        return False  # not retryable; the caller has to degrade

    # Retry-After may be missing, fractional, or an HTTP-date. Anything that
    # is not a finite number falls back to the 60-second default instead of
    # crashing the error handler itself.
    raw_delay = error.response.headers.get("Retry-After", 60)
    try:
        retry_after = max(0, math.ceil(float(raw_delay)))
    except (TypeError, ValueError, OverflowError):
        retry_after = 60

    print(f"限流,等待 {retry_after} 秒后重试 (第{attempt+1}次)")
    await asyncio.sleep(retry_after)
    return True  # retryable
模型错误
from openai import (
BadRequestError,
NotFoundError,
UnprocessableEntityError,
)
def handle_model_error(error):
    """Classify a model-level API error into a recovery action.

    Args:
        error: The exception to classify.

    Returns:
        ``{"action": "fallback", ...}`` for ``NotFoundError``; for
        ``BadRequestError`` either ``{"action": "truncate", ...}`` when the
        response payload mentions ``context_length_exceeded`` or
        ``{"action": "reject", ...}`` otherwise; ``None`` for any other type.
    """
    if isinstance(error, NotFoundError):
        return {"action": "fallback", "reason": "模型不存在"}

    if not isinstance(error, BadRequestError):
        return None

    # The whole decoded payload is searched as text, so the marker is found
    # wherever it sits in the structure.
    payload_text = str(error.response.json())
    if "context_length_exceeded" in payload_text:
        return {"action": "truncate", "reason": "上下文超长"}
    return {"action": "reject", "reason": "请求参数错误"}
内容过滤处理
LLM可能因安全策略拒绝回答某些问题:
async def safe_llm_call(messages: list[dict]) -> dict:
    """Call the chat-completions API and turn content filtering into data.

    Args:
        messages: Chat messages forwarded to the API.

    Returns:
        ``{"success": True, "content": ...}`` for a normal reply, or a dict
        with ``success`` set to ``False`` plus ``error_type`` and ``message``
        when the reply was filtered or the request was rejected as sensitive.

    Raises:
        openai.BadRequestError: Re-raised when its text does not mention
            "sensitive".
    """
    try:
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
        )
    except openai.ContentFilterFinishReasonError:
        return {
            "success": False,
            "error_type": "content_filtered",
            "message": "内容被安全过滤,请修改输入后重试",
        }
    except openai.BadRequestError as e:
        if "sensitive" in str(e).lower():
            return {
                "success": False,
                "error_type": "sensitive_content",
                "message": "检测到敏感内容,已拒绝处理",
            }
        raise

    choice = response.choices[0]
    # A plain create() call reports filtering through finish_reason rather
    # than by raising, so a filtered reply must be detected here as well;
    # otherwise it would be returned as a success with missing content.
    if choice.finish_reason == "content_filter":
        return {
            "success": False,
            "error_type": "content_filtered",
            "message": "内容被安全过滤,请修改输入后重试",
        }
    return {"success": True, "content": choice.message.content}
超时处理策略
不同场景需要不同的超时策略:
from enum import Enum
class TimeoutStrategy(Enum):
    """Timeout presets for different request scenarios.

    Each member's value is the number passed as the request timeout
    (presumably seconds -- confirm against the client in use).
    """

    STRICT = 10  # real-time chat: short timeout
    NORMAL = 30  # ordinary request: standard timeout
    GENERATION = 120  # long-form generation: long timeout
    ANALYSIS = 300  # deep analysis: longest timeout
async def adaptive_timeout_call(
    messages: list[dict],
    strategy: TimeoutStrategy = TimeoutStrategy.NORMAL,
) -> str:
    """Call the chat-completions API with a strategy-dependent timeout.

    Under ``TimeoutStrategy.NORMAL`` a timed-out request is retried once with
    double the timeout; under every other strategy the timeout propagates.

    Args:
        messages: Chat messages forwarded to the API.
        strategy: Preset whose ``value`` is used as the request timeout.

    Returns:
        The text content of the first choice.

    Raises:
        openai.APITimeoutError / httpx.ReadTimeout: When the strategy is not
            ``NORMAL``, or when the single retry times out as well.
    """
    # Local import keeps the snippet self-contained. The OpenAI SDK reports
    # request timeouts as openai.APITimeoutError instead of letting
    # httpx.ReadTimeout escape, so catching only the httpx type would never
    # reach the retry below.
    # NOTE(review): assumes ``client`` is an OpenAI SDK async client -- confirm.
    import openai

    timeout = strategy.value
    try:
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            timeout=timeout,
        )
    except (openai.APITimeoutError, httpx.ReadTimeout):
        if strategy != TimeoutStrategy.NORMAL:
            raise
        # First timeout: retry once with twice the time budget.
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            timeout=timeout * 2,
        )
    return response.choices[0].message.content
统一错误处理框架
构建统一的错误处理层,简化业务代码:
class LLMErrorHandler:
    """Uniform error-handling wrapper around an async LLM call.

    ``execute`` awaits the supplied coroutine function and applies one policy
    per failure type: wait and retry on rate limits, report authentication
    failures, retry once with a longer timeout, and wrap everything else in
    ``LLMAPIError``.

    Attributes:
        config: Opaque configuration object supplied by the caller.
        error_counts: Number of logged failures, keyed by exception class name.
    """

    # Upper bound on how many times a rate-limited call is re-issued.
    max_rate_limit_retries = 3
    # Wait in seconds used when Retry-After is missing or unparsable.
    default_retry_after = 60

    def __init__(self, config):
        self.config = config
        self.error_counts = {}

    async def execute(self, func, *args, **kwargs):
        """Await ``func(*args, **kwargs)`` under the error policy.

        Returns:
            The result of ``func``; ``False`` when authentication failed; or
            the result of the longer-timeout retry after a timeout.

        Raises:
            LLMAPIError: For any other exception, or when the call is still
                rate limited after ``max_rate_limit_retries`` retries. The
                original exception is attached as ``__cause__``.
        """
        retries_left = self.max_rate_limit_retries
        while True:
            try:
                return await func(*args, **kwargs)
            except RateLimitError as e:
                if retries_left <= 0:
                    self._log_error(e)
                    raise LLMAPIError(f"LLM调用失败: {str(e)}") from e
                retries_left -= 1
                # Wait, then loop to re-issue the call; without the loop the
                # wait would be wasted and the caller would receive None.
                await self._handle_rate_limit(e)
            except AuthenticationError:
                return self._handle_auth_error()
            except httpx.TimeoutException:
                return await self._handle_timeout(func, *args, **kwargs)
            except Exception as e:
                self._log_error(e)
                raise LLMAPIError(f"LLM调用失败: {str(e)}") from e

    async def _handle_rate_limit(self, error):
        """Sleep for the server-advised delay, then return ``True``."""
        # Local imports keep the snippet self-contained.
        import asyncio
        import math

        raw_delay = error.response.headers.get(
            "Retry-After", self.default_retry_after
        )
        # Retry-After may be fractional or an HTTP-date; fall back to the
        # default rather than failing inside the error handler.
        try:
            retry_after = max(0, math.ceil(float(raw_delay)))
        except (TypeError, ValueError, OverflowError):
            retry_after = self.default_retry_after
        await asyncio.sleep(retry_after)
        return True

    def _handle_auth_error(self):
        """Log the authentication failure and return ``False``."""
        logger.error("API认证失败,检查密钥配置")
        return False

    async def _handle_timeout(self, func, *args, **kwargs):
        """Retry ``func`` once with ``timeout=60``."""
        logger.warning("请求超时,使用更长超时重试")
        # Override rather than add: the caller may already have passed
        # ``timeout``, and a duplicate keyword would raise TypeError.
        retry_kwargs = {**kwargs, "timeout": 60}
        return await func(*args, **retry_kwargs)

    def _log_error(self, error):
        """Count the failure by exception class name and log it."""
        kind = type(error).__name__
        self.error_counts[kind] = self.error_counts.get(kind, 0) + 1
        logger.error("LLM调用失败: %s", error)
错误监控与告警
记录错误指标,及时发现问题:
from dataclasses import dataclass
from datetime import datetime
@dataclass
class ErrorRecord:
    """One recorded LLM error together with its request context."""

    # When the error was recorded.
    timestamp: datetime
    # Category label used to group errors.
    error_type: str
    # Human-readable error description.
    message: str
    # Identifier of the request that failed.
    request_id: str
    # Name of the model the request targeted.
    model: str
class LLMMetrics:
    """In-memory error log that raises an alert on bursts of one error type.

    Args:
        window_seconds: Width of the look-back window used when counting
            recent errors.
        alert_threshold: Number of same-type errors inside the window at
            which ``_send_alert`` is called.
    """

    def __init__(self, window_seconds: float = 300, alert_threshold: int = 10):
        self.error_records: list[ErrorRecord] = []
        self.window_seconds = window_seconds
        self.alert_threshold = alert_threshold

    def record_error(self, error_type: str, message: str, **context):
        """Store an error and alert when its type crosses the threshold.

        Args:
            error_type: Category label used to group errors.
            message: Human-readable error description.
            **context: Remaining ``ErrorRecord`` fields, passed through as
                keyword arguments.
        """
        # One clock reading serves both the new record and the window check.
        now = datetime.now()
        record = ErrorRecord(
            timestamp=now,
            error_type=error_type,
            message=message,
            **context,
        )
        self.error_records.append(record)

        # total_seconds() covers the whole interval. The ``seconds`` attribute
        # drops the days component, so a record just over a day old would be
        # counted as if it were seconds old.
        recent_count = sum(
            1
            for r in self.error_records
            if r.error_type == error_type
            and (now - r.timestamp).total_seconds() < self.window_seconds
        )
        # Alert once the threshold is reached.
        if recent_count >= self.alert_threshold:
            self._send_alert(error_type, recent_count)

    def _send_alert(self, error_type: str, count: int) -> None:
        """Emit a warning log entry; override to route alerts elsewhere."""
        import logging

        logging.getLogger(__name__).warning(
            "LLM错误告警: %s 在最近 %s 秒内出现 %d 次",
            error_type,
            self.window_seconds,
            count,
        )
总结
完善的错误处理是LLM应用可靠性的基石。关键策略包括:区分错误类型采取不同处理、实现智能重试机制、提供优雅降级方案、建立错误监控告警体系。