← 返回首页
🧠

LLM浸泡测试

📂 llm ⏱ 3 min 406 words

---
title: "LLM浸泡测试"
description: "全面介绍LLM系统的浸泡测试方法,包括长时间运行测试、内存泄漏检测、资源退化监控、稳定性验证以及浸泡测试结果分析"
tags: ["浸泡测试", "稳定性测试", "内存泄漏", "长期运行"]
category: "llm"
icon: "🧠"
---

LLM浸泡测试

什么是浸泡测试

浸泡测试(Soak Testing)是让系统在正常或略高于正常的负载下长时间运行(通常数小时到数天),以发现那些在短时间测试中无法暴露的问题。对于LLM系统,浸泡测试尤为重要,因为GPU内存泄漏、缓存膨胀、连接池耗尽等问题往往只在长时间运行后才显现。

浸泡测试的核心目标

浸泡测试的核心目标是发现内存泄漏、监控资源随时间的退化,并验证系统在长时间运行下的稳定性。

浸泡测试框架

import asyncio
import time
import psutil
import json
from pathlib import Path

class SoakTestRunner:
    """Sample host, process and GPU metrics at a fixed interval for a set time.

    Every sample is appended to ``metrics_log``; when the run ends a JSON
    summary is written to ``soak_test_report.json`` in the working directory.
    ``base_url`` is stored on the instance but is not used by any method of
    this class.
    """

    def __init__(self, base_url, duration_hours=8, interval_seconds=60):
        """Configure the run.

        Args:
            base_url: Stored as ``self.base_url``; not read by this class.
            duration_hours: Total run length, converted to seconds.
            interval_seconds: Pause between two consecutive samples.
        """
        self.base_url = base_url
        self.duration = duration_hours * 3600
        self.interval = interval_seconds
        self.metrics_log = []
        self.start_time = None

    async def run(self):
        """Collect samples until the duration elapses, then write the report.

        The report is written from a ``finally`` clause so that samples
        gathered over hours are not lost if the loop is cancelled,
        interrupted, or a sampling call raises; the original exception still
        propagates afterwards.
        """
        self.start_time = time.time()
        end_time = self.start_time + self.duration

        print(f"开始浸泡测试,持续{self.duration/3600}小时")

        try:
            while time.time() < end_time:
                metrics = await self._collect_metrics()
                self.metrics_log.append(metrics)

                elapsed = time.time() - self.start_time
                print(f"[{elapsed/3600:.1f}h] "
                      f"CPU: {metrics['cpu_percent']:.1f}% | "
                      f"Memory: {metrics['memory_percent']:.1f}% | "
                      f"GPU: {metrics['gpu_util']:.1f}%")

                await asyncio.sleep(self.interval)
        finally:
            self._generate_report()

    async def _collect_metrics(self):
        """Return one sample as a flat dict of numeric metrics.

        Reads ``self.start_time``, so it must be called after ``run()`` has
        set it. GPU fields fall back to 0 when no GPU data is available.
        """
        process = psutil.Process()

        gpu_info = await self._get_gpu_metrics()

        return {
            "timestamp": time.time(),
            "elapsed_hours": (time.time() - self.start_time) / 3600,
            "cpu_percent": psutil.cpu_percent(),
            "memory_percent": psutil.virtual_memory().percent,
            "process_memory_mb": process.memory_info().rss / 1024 / 1024,
            "gpu_util": gpu_info.get("utilization", 0),
            "gpu_memory_used": gpu_info.get("memory_used", 0),
            "gpu_memory_total": gpu_info.get("memory_total", 0),
            "open_files": len(process.open_files()),
            "threads": process.num_threads(),
        }

    async def _get_gpu_metrics(self):
        """Return utilization and memory figures for the first GPU, or ``{}``.

        GPU sampling is deliberately best-effort: a missing ``GPUtil``
        package or any failure while querying yields an empty dict so the
        soak run keeps going without GPU data.
        """
        try:
            import GPUtil
            gpus = GPUtil.getGPUs()
            if gpus:
                gpu = gpus[0]
                return {
                    "utilization": gpu.load * 100,
                    "memory_used": gpu.memoryUsed,
                    "memory_total": gpu.memoryTotal,
                }
        except Exception:
            # Intentional: GPU metrics are optional, see docstring.
            pass
        return {}

    def _generate_report(self):
        """Write the trend summary to ``soak_test_report.json``."""
        report = {
            "duration_hours": self.duration / 3600,
            "total_samples": len(self.metrics_log),
            "memory_trend": self._analyze_trend("process_memory_mb"),
            "gpu_trend": self._analyze_trend("gpu_util"),
        }

        report_path = Path("soak_test_report.json")
        report_path.write_text(json.dumps(report, indent=2))
        print(f"\n报告已保存: {report_path}")

    def _analyze_trend(self, metric_name):
        """Compare the mean of the first half of the samples to the second.

        Args:
            metric_name: Key to read from each entry of ``metrics_log``;
                entries lacking the key are skipped.

        Returns:
            ``{"status": "insufficient_data"}`` when fewer than two values
            exist; otherwise a dict with ``start_value``, ``end_value``,
            ``peak``, ``change_percent`` and ``trend``. ``trend`` is
            ``"increasing"`` above +10%, ``"decreasing"`` at -5% or below,
            and ``"stable"`` in between.
        """
        values = [m[metric_name] for m in self.metrics_log if metric_name in m]
        if len(values) < 2:
            return {"status": "insufficient_data"}

        midpoint = len(values) // 2
        first_half = values[:midpoint]
        second_half = values[midpoint:]

        avg_first = sum(first_half) / len(first_half)
        avg_second = sum(second_half) / len(second_half)

        # A zero (or negative) baseline has no meaningful percentage change.
        change_percent = (avg_second - avg_first) / avg_first * 100 if avg_first > 0 else 0

        # Only a negative change may be called "decreasing"; a moderate rise
        # that stays at or below the +10% threshold is reported as stable.
        if change_percent > 10:
            trend = "increasing"
        elif change_percent <= -5:
            trend = "decreasing"
        else:
            trend = "stable"

        return {
            "start_value": values[0],
            "end_value": values[-1],
            "peak": max(values),
            "change_percent": change_percent,
            "trend": trend,
        }

内存泄漏检测

内存泄漏是长时间运行的LLM系统最常见的问题之一:

import tracemalloc

class MemoryLeakDetector:
    """Record tracemalloc snapshots and print the diff between two of them."""

    def __init__(self):
        """Start tracemalloc tracing and create an empty snapshot history."""
        tracemalloc.start()
        self.snapshots = []

    def take_snapshot(self):
        """Append a new entry to ``self.snapshots``.

        Each entry holds the wall-clock time, the tracemalloc snapshot and
        the value returned by ``tracemalloc.get_traced_memory()``.
        """
        captured = tracemalloc.take_snapshot()
        entry = {
            "timestamp": time.time(),
            "snapshot": captured,
            "current": tracemalloc.get_traced_memory(),
        }
        self.snapshots.append(entry)

    def compare_snapshots(self, idx1, idx2):
        """Print the first ten per-line differences from entry idx1 to idx2.

        Also prints the two traced-memory figures stored with entry ``idx2``,
        converted to MB. Both arguments index into ``self.snapshots``.
        """
        baseline = self.snapshots[idx1]["snapshot"]
        later_entry = self.snapshots[idx2]

        differences = later_entry["snapshot"].compare_to(baseline, "lineno")

        print("\n内存增长前10行:")
        for line_stat in differences[:10]:
            print(f"  {line_stat}")

        traced = later_entry["current"]
        current_mb = traced[0] / 1024 / 1024
        peak_mb = traced[1] / 1024 / 1024
        print(f"\n当前内存: {current_mb:.1f}MB")
        print(f"峰值内存: {peak_mb:.1f}MB")

质量退化监控

长时间运行后,LLM的输出质量可能因各种原因退化:

class QualityDegradationMonitor:
    """Score an LLM client against fixed test cases on a timer and warn on drops.

    Scoring is delegated to ``_score_response``, which subclasses must
    override; this class only averages the per-case scores and compares
    each round with the one immediately before it.
    """

    def __init__(self, test_cases, check_interval_hours=1):
        """Configure the monitor.

        Args:
            test_cases: Iterable of mappings; each is read with the keys
                ``"prompt"`` and ``"expected"``.
            check_interval_hours: Pause between evaluation rounds, converted
                to seconds.
        """
        self.test_cases = test_cases
        self.check_interval = check_interval_hours * 3600
        self.results_log = []

    async def run_quality_checks(self, llm_client):
        """Evaluate forever, logging each score and printing a warning on drops.

        A warning is printed when the newest score is below 90% of the
        previous round's score. The loop has no exit condition of its own;
        it ends only when the surrounding task is cancelled or an
        evaluation raises.
        """
        while True:
            score = await self._run_evaluation(llm_client)
            self.results_log.append({
                "timestamp": time.time(),
                "score": score,
            })

            if len(self.results_log) > 1:
                prev = self.results_log[-2]["score"]
                curr = self.results_log[-1]["score"]
                # Compared round-to-round only, not against the first score.
                if curr < prev * 0.9:
                    print(f"警告: 质量评分下降 {(1-curr/prev)*100:.1f}%")

            await asyncio.sleep(self.check_interval)

    async def _run_evaluation(self, llm_client):
        """Return the mean score over all test cases.

        Args:
            llm_client: Object exposing an awaitable ``generate(prompt)``.

        Raises:
            ValueError: If there are no test cases, since a mean over zero
                scores is undefined.
        """
        scores = []
        for case in self.test_cases:
            response = await llm_client.generate(case["prompt"])
            score = self._score_response(response, case["expected"])
            scores.append(score)
        if not scores:
            raise ValueError("test_cases is empty; cannot compute a quality score")
        return sum(scores) / len(scores)

    def _score_response(self, response, expected):
        """Return a numeric score for ``response`` against ``expected``.

        The metric is task-specific, so no default is provided; subclasses
        must override this hook.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError(
            "QualityDegradationMonitor._score_response must be overridden "
            "with a task-specific scoring function"
        )

浸泡测试最佳实践

  1. 选择合适的持续时间:至少8小时,生产环境建议24-72小时
  2. 使用生产级负载:模拟真实的请求模式和数据分布
  3. 全面监控:记录CPU、内存、GPU、磁盘、网络等所有指标
  4. 设置告警阈值:当指标异常时及时通知
  5. 保存详细日志:便于事后分析问题根因
  6. 定期执行:将浸泡测试纳入发布流程,每次大版本更新前执行