LLM浸泡测试
---
title: "LLM浸泡测试"
description: "全面介绍LLM系统的浸泡测试方法,包括长时间运行测试、内存泄漏检测、资源退化监控、稳定性验证以及浸泡测试结果分析"
tags: ["浸泡测试", "稳定性测试", "内存泄漏", "长期运行"]
category: "llm"
icon: "🧠"
---
LLM浸泡测试
什么是浸泡测试
浸泡测试(Soak Testing)是让系统在正常或略高于正常的负载下长时间运行(通常数小时到数天),以发现那些在短时间测试中无法暴露的问题。对于LLM系统,浸泡测试尤为重要,因为GPU内存泄漏、缓存膨胀、连接池耗尽等问题往往只在长时间运行后才显现。
浸泡测试的核心目标
- 内存泄漏检测:发现随时间持续增长的内存消耗
- 资源退化监控:观察性能是否随时间逐渐下降
- 稳定性验证:确认系统能长期稳定运行
- 累积效应评估:评估长时间运行对输出质量的影响
浸泡测试框架
import asyncio
import time
import psutil
import json
from pathlib import Path
class SoakTestRunner:
    """Periodically sample host/process/GPU metrics for a fixed duration.

    The runner collects one metrics sample every ``interval_seconds`` until
    ``duration_hours`` have elapsed, then writes a JSON trend summary to
    ``soak_test_report.json`` in the current working directory.

    NOTE(review): ``base_url`` is stored but never used by the methods in
    this class, and ``psutil.Process()`` is created without a PID, so the
    per-process numbers presumably describe the process running this
    runner rather than the LLM server -- confirm this is intended.
    """

    def __init__(self, base_url, duration_hours=8, interval_seconds=60):
        """Store the test configuration.

        Args:
            base_url: Address of the system under test (kept for callers;
                not read by this class).
            duration_hours: Total run time in hours.
            interval_seconds: Pause between two consecutive samples.
        """
        self.base_url = base_url
        self.duration = duration_hours * 3600  # seconds
        self.interval = interval_seconds
        self.metrics_log = []
        self.start_time = None  # set when run() starts

    async def run(self):
        """Sample metrics until the configured duration elapses.

        The report is written in a ``finally`` clause so that samples
        gathered over many hours are not lost when the loop is cancelled,
        interrupted, or a single collection raises.
        """
        self.start_time = time.time()
        end_time = self.start_time + self.duration
        print(f"开始浸泡测试,持续{self.duration/3600}小时")
        try:
            while time.time() < end_time:
                metrics = await self._collect_metrics()
                self.metrics_log.append(metrics)
                elapsed = time.time() - self.start_time
                print(f"[{elapsed/3600:.1f}h] "
                      f"CPU: {metrics['cpu_percent']:.1f}% | "
                      f"Memory: {metrics['memory_percent']:.1f}% | "
                      f"GPU: {metrics['gpu_util']:.1f}%")
                await asyncio.sleep(self.interval)
        finally:
            self._generate_report()

    async def _collect_metrics(self):
        """Return one sample of CPU, memory, GPU and process statistics.

        GPU fields fall back to 0 when no GPU information is available.
        """
        process = psutil.Process()
        gpu_info = await self._get_gpu_metrics()
        now = time.time()
        # Allow standalone calls before run() has set start_time.
        started = now if self.start_time is None else self.start_time
        return {
            "timestamp": now,
            "elapsed_hours": (now - started) / 3600,
            "cpu_percent": psutil.cpu_percent(),
            "memory_percent": psutil.virtual_memory().percent,
            "process_memory_mb": process.memory_info().rss / 1024 / 1024,
            "gpu_util": gpu_info.get("utilization", 0),
            "gpu_memory_used": gpu_info.get("memory_used", 0),
            "gpu_memory_total": gpu_info.get("memory_total", 0),
            "open_files": len(process.open_files()),
            "threads": process.num_threads(),
        }

    async def _get_gpu_metrics(self):
        """Return metrics of the first GPU, or ``{}`` when unavailable.

        GPU monitoring is best-effort: a missing ``GPUtil`` package or any
        failure while querying it must not abort a long-running test.
        """
        try:
            import GPUtil
            gpus = GPUtil.getGPUs()
        except Exception:
            # Deliberate best-effort: treat any failure as "no GPU data".
            return {}
        if not gpus:
            return {}
        gpu = gpus[0]  # only the first device is reported
        return {
            "utilization": gpu.load * 100,
            "memory_used": gpu.memoryUsed,
            "memory_total": gpu.memoryTotal,
        }

    def _generate_report(self):
        """Write the trend summary to ``soak_test_report.json``."""
        report = {
            "duration_hours": self.duration / 3600,
            "total_samples": len(self.metrics_log),
            "memory_trend": self._analyze_trend("process_memory_mb"),
            "gpu_trend": self._analyze_trend("gpu_util"),
        }
        report_path = Path("soak_test_report.json")
        report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f"\n报告已保存: {report_path}")

    def _analyze_trend(self, metric_name):
        """Compare the first and second half of the samples for one metric.

        Args:
            metric_name: Key to look up in every entry of ``metrics_log``.

        Returns:
            ``{"status": "insufficient_data"}`` when fewer than two samples
            contain the metric; otherwise a dict with ``start_value``,
            ``end_value``, ``peak``, ``change_percent`` (second-half average
            relative to first-half average) and ``trend``.

            ``trend`` is ``"increasing"`` above +10%, ``"decreasing"`` at or
            below -5%, and ``"stable"`` in between. A positive change is
            never reported as ``"decreasing"``.
        """
        values = [m[metric_name] for m in self.metrics_log if metric_name in m]
        if len(values) < 2:
            return {"status": "insufficient_data"}

        midpoint = len(values) // 2
        first_half = values[:midpoint]
        second_half = values[midpoint:]
        avg_first = sum(first_half) / len(first_half)
        avg_second = sum(second_half) / len(second_half)
        # A zero (or negative) baseline has no meaningful relative change.
        if avg_first > 0:
            change_percent = (avg_second - avg_first) / avg_first * 100
        else:
            change_percent = 0

        if change_percent > 10:
            trend = "increasing"
        elif change_percent <= -5:
            trend = "decreasing"
        else:
            trend = "stable"

        return {
            "start_value": values[0],
            "end_value": values[-1],
            "peak": max(values),
            "change_percent": change_percent,
            "trend": trend,
        }
内存泄漏检测
内存泄漏是长时间运行的LLM系统最常见的问题之一:
import tracemalloc
class MemoryLeakDetector:
    """Record ``tracemalloc`` snapshots and report growth between them.

    Tracing is started when the detector is constructed. Every snapshot is
    stored together with its wall-clock time and the traced-memory counters
    returned by ``tracemalloc.get_traced_memory()`` right after it.
    """

    def __init__(self, max_snapshots=None):
        """Start tracing and prepare the snapshot store.

        Args:
            max_snapshots: Optional cap on retained snapshots. When set,
                the oldest entries are dropped once the cap is exceeded, so
                the detector's own memory does not grow over a multi-day
                run. Dropping shifts the indices of the remaining
                snapshots. ``None`` (default) keeps every snapshot.

        Raises:
            ValueError: If ``max_snapshots`` is smaller than 2, since a
                comparison needs two snapshots.
        """
        if max_snapshots is not None and max_snapshots < 2:
            raise ValueError("max_snapshots must be at least 2")
        self.max_snapshots = max_snapshots
        self.snapshots = []
        tracemalloc.start()

    def take_snapshot(self):
        """Capture a snapshot and append it to ``self.snapshots``."""
        snapshot = tracemalloc.take_snapshot()
        self.snapshots.append({
            "timestamp": time.time(),
            "snapshot": snapshot,
            # (current, peak) traced sizes in bytes
            "current": tracemalloc.get_traced_memory(),
        })
        if self.max_snapshots is not None:
            overflow = len(self.snapshots) - self.max_snapshots
            if overflow > 0:
                del self.snapshots[:overflow]

    def compare_snapshots(self, idx1, idx2):
        """Print and return the ten largest per-line differences.

        Args:
            idx1: Index of the baseline snapshot in ``self.snapshots``.
            idx2: Index of the later snapshot in ``self.snapshots``.

        Returns:
            The first ten entries of ``snap2.compare_to(snap1, "lineno")``.

        Raises:
            IndexError: If either index is out of range.
        """
        snap1 = self.snapshots[idx1]["snapshot"]
        snap2 = self.snapshots[idx2]["snapshot"]
        top_stats = snap2.compare_to(snap1, "lineno")[:10]

        print("\n内存增长前10行:")
        for stat in top_stats:
            print(f"  {stat}")

        current = self.snapshots[idx2]["current"]
        print(f"\n当前内存: {current[0]/1024/1024:.1f}MB")
        print(f"峰值内存: {current[1]/1024/1024:.1f}MB")
        return top_stats
质量退化监控
长时间运行后,LLM的输出质量可能因各种原因退化:
class QualityDegradationMonitor:
    """Re-run a fixed evaluation set at intervals and flag score drops.

    Each check averages the per-case scores, appends the result to
    ``results_log`` and prints a warning when the score falls more than
    10% below the previous check.

    NOTE: only consecutive checks are compared, so a slow drift of less
    than 10% per check does not trigger a warning.
    """

    def __init__(self, test_cases, check_interval_hours=1):
        """Store the evaluation set and the interval between checks.

        Args:
            test_cases: Sequence of mappings with ``"prompt"`` and
                ``"expected"`` keys.
            check_interval_hours: Pause between two checks, in hours.
        """
        self.test_cases = test_cases
        self.check_interval = check_interval_hours * 3600  # seconds
        self.results_log = []

    async def run_quality_checks(self, llm_client, max_checks=None):
        """Evaluate repeatedly, sleeping ``check_interval`` between checks.

        Args:
            llm_client: Object exposing an awaitable ``generate(prompt)``.
            max_checks: Optional number of checks after which to stop.
                ``None`` (default) loops until the task is cancelled.
        """
        completed = 0
        while max_checks is None or completed < max_checks:
            score = await self._run_evaluation(llm_client)
            self.results_log.append({
                "timestamp": time.time(),
                "score": score,
            })
            completed += 1

            if len(self.results_log) > 1:
                prev = self.results_log[-2]["score"]
                curr = self.results_log[-1]["score"]
                # prev > 0 keeps the percentage below from dividing by zero.
                if prev > 0 and curr < prev * 0.9:
                    print(f"警告: 质量评分下降 {(1-curr/prev)*100:.1f}%")

            if max_checks is not None and completed >= max_checks:
                break  # do not sleep after the final check
            await asyncio.sleep(self.check_interval)

    async def _run_evaluation(self, llm_client):
        """Return the mean score over all test cases.

        Raises:
            ValueError: If ``test_cases`` is empty, because no mean exists.
        """
        if not self.test_cases:
            raise ValueError("test_cases must not be empty")
        scores = []
        for case in self.test_cases:
            response = await llm_client.generate(case["prompt"])
            scores.append(self._score_response(response, case["expected"]))
        return sum(scores) / len(scores)

    def _score_response(self, response, expected):
        """Score one response against its expected value.

        This is an override hook: the scoring metric is application
        specific, so subclasses must supply it and return a number.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError(
            "Subclasses must implement _score_response(response, expected)"
        )
浸泡测试最佳实践
- 选择合适的持续时间:至少8小时,生产环境建议24-72小时
- 使用生产级负载:模拟真实的请求模式和数据分布
- 全面监控:记录CPU、内存、GPU、磁盘、网络等所有指标
- 设置告警阈值:当指标异常时及时通知
- 保存详细日志:便于事后分析问题根因
- 定期执行:将浸泡测试纳入发布流程,每次大版本更新前执行