← 返回首页
🧠

鲁棒性测试:验证模型稳定性

📂 llm ⏱ 5 min 847 words

---
title: "鲁棒性测试:验证模型稳定性"
description: "系统化的LLM鲁棒性测试方法,确保模型在各种条件下稳定运行"
tags: ["鲁棒性", "稳定性测试", "压力测试", "LLM", "质量保证"]
category: "llm"
icon: "🏋️"
---

鲁棒性测试:验证模型稳定性

鲁棒性测试概述

鲁棒性测试是验证模型在各种扰动和边界条件下保持稳定性能的系统化方法。

测试框架

1. 鲁棒性测试器

import numpy as np
from typing import List, Dict, Callable, Any
from dataclasses import dataclass
from enum import Enum

class PerturbationType(Enum):
    """Kinds of input perturbation a robustness test can be labelled with.

    The string values are used to build test names and the keys of the
    per-perturbation summary, so they should stay stable.
    """
    NOISE = "noise"          # random character replacement
    TYPOS = "typos"          # keyboard-neighbour substitutions
    SYNONYM = "synonym"      # presumably word-level synonym replacement
    DELETION = "deletion"    # random character removal
    INSERTION = "insertion"  # presumably random character insertion
    REORDER = "reorder"      # random word swaps

@dataclass
class RobustnessTestResult:
    """Outcome of scoring one text under one perturbation type."""
    test_name: str  # "test_<perturbation value>", e.g. "test_noise"
    perturbation_type: PerturbationType  # which perturbation was applied
    original_score: float  # model score on the unmodified text
    perturbed_scores: List[float]  # one model score per perturbed sample
    mean_score: float  # mean of perturbed_scores
    std_score: float  # standard deviation of perturbed_scores
    degradation: float  # relative drop of mean_score versus original_score
    passed: bool  # True when degradation stayed below the tester's threshold

class RobustnessTester:
    """Measure how much a model's score drops when its input text is perturbed.

    ``model_predict`` is called with a single string and must return a numeric
    score; scores are treated as higher-is-better, so a positive degradation
    means the perturbed inputs scored worse than the original. Every result
    produced by ``test_perturbation`` is also appended to ``self.results``.

    All randomness comes from NumPy's global ``np.random`` state; seed it
    beforehand if reproducible runs are needed.
    """

    # QWERTY neighbours used to simulate plausible typing mistakes. Kept at
    # class level so the table is built once instead of once per typo.
    _ADJACENT_KEYS = {
        'a': 'sqwz', 'b': 'vghn', 'c': 'xdfv',
        'd': 'sfcer', 'e': 'wrd', 'f': 'dgcv',
        'g': 'fhtb', 'h': 'gjyn', 'i': 'uok',
        'j': 'hknm', 'k': 'jloi', 'l': 'kop',
        'm': 'njk', 'n': 'bmhj', 'o': 'iplk',
        'p': 'ol', 'q': 'wa', 'r': 'edt',
        's': 'awedx', 't': 'rfy', 'u': 'yihj',
        'v': 'cfgb', 'w': 'qase', 'x': 'zsdc',
        'y': 'tuhg', 'z': 'asx',
    }

    def __init__(self, model_predict: Callable, threshold: float = 0.1):
        """Store the scoring callable and the maximum tolerated degradation."""
        self.model_predict = model_predict
        self.threshold = threshold
        self.results = []

    def add_noise(self, text: str, noise_level: float = 0.1) -> str:
        """Overwrite roughly ``noise_level`` of the characters with random
        printable ASCII characters (positions may repeat)."""
        chars = list(text)
        n_noise = int(len(chars) * noise_level)

        for _ in range(n_noise):
            pos = np.random.randint(0, len(chars))
            chars[pos] = chr(np.random.randint(32, 127))

        return "".join(chars)

    def introduce_typos(self, text: str, typo_rate: float = 0.1) -> str:
        """Replace letters with a neighbouring key, preserving letter case.

        ``int(len(text) * typo_rate)`` positions are drawn at random;
        positions holding a character without an entry in the neighbour table
        (digits, punctuation, non-Latin text) are left untouched.
        """
        chars = list(text)
        n_typos = int(len(chars) * typo_rate)

        for _ in range(n_typos):
            pos = np.random.randint(0, len(chars))
            original = chars[pos]
            neighbours = self._ADJACENT_KEYS.get(original.lower())
            if neighbours is None:
                continue
            replacement = str(np.random.choice(list(neighbours)))
            # Keep the original's case so "A" becomes "S", not "s".
            chars[pos] = replacement.upper() if original.isupper() else replacement

        return "".join(chars)

    def delete_characters(self, text: str, delete_rate: float = 0.1) -> str:
        """Remove ``int(len(text) * delete_rate)`` distinct random characters."""
        chars = list(text)
        n_delete = int(len(chars) * delete_rate)
        if n_delete == 0:
            # Nothing to drop; also avoids sampling from an empty population.
            return text

        # A set gives O(1) membership tests instead of scanning an array.
        dropped = set(np.random.choice(len(chars), n_delete, replace=False).tolist())
        return "".join(c for i, c in enumerate(chars) if i not in dropped)

    def insert_characters(self, text: str, insert_rate: float = 0.1) -> str:
        """Insert ``int(len(text) * insert_rate)`` random printable ASCII
        characters at random positions."""
        chars = list(text)
        n_insert = int(len(chars) * insert_rate)

        for _ in range(n_insert):
            pos = np.random.randint(0, len(chars) + 1)
            chars.insert(pos, chr(np.random.randint(32, 127)))

        return "".join(chars)

    def reorder_words(self, text: str, reorder_rate: float = 0.1) -> str:
        """Swap random pairs of whitespace-separated words.

        The result is re-joined with single spaces. Texts with fewer than two
        words have nothing to swap and are returned with only that whitespace
        normalisation applied.
        """
        words = text.split()
        n_reorder = int(len(words) * reorder_rate)

        if len(words) >= 2:
            for _ in range(n_reorder):
                i, j = np.random.choice(len(words), 2, replace=False)
                words[i], words[j] = words[j], words[i]

        return " ".join(words)

    def _perturbers(self) -> Dict:
        """Map each perturbation type that has an implementation to its method."""
        return {
            PerturbationType.NOISE: self.add_noise,
            PerturbationType.TYPOS: self.introduce_typos,
            PerturbationType.DELETION: self.delete_characters,
            PerturbationType.INSERTION: self.insert_characters,
            PerturbationType.REORDER: self.reorder_words,
        }

    @staticmethod
    def _relative_degradation(original_score: float, mean_score: float) -> float:
        """Return the score drop relative to the magnitude of the baseline.

        A zero baseline cannot be used as a divisor, so the absolute drop is
        returned in that case. Dividing by the magnitude keeps "worse" positive
        even when the scores themselves are negative.
        """
        drop = original_score - mean_score
        if original_score == 0:
            return float(drop)
        return float(drop / abs(original_score))

    @staticmethod
    def _mean_degradation(results) -> float:
        """Average degradation over ``results``; 0.0 when there are none."""
        if not results:
            return 0.0
        return float(np.mean([r.degradation for r in results]))

    def test_perturbation(self, text: str, perturbation_type: "PerturbationType",
                         n_samples: int = 10) -> "RobustnessTestResult":
        """Score ``text`` and ``n_samples`` perturbed variants of it.

        Args:
            text: Input passed to ``model_predict``.
            perturbation_type: Which perturbation to apply. A type without an
                implementation (currently SYNONYM) triggers a warning and the
                unmodified text is scored instead.
            n_samples: Number of perturbed variants to score; must be >= 1.

        Returns:
            The result, which is also appended to ``self.results``.

        Raises:
            ValueError: If ``n_samples`` is smaller than 1.
        """
        if n_samples < 1:
            raise ValueError("n_samples must be at least 1")

        perturb = self._perturbers().get(perturbation_type)
        if perturb is None:
            import warnings

            warnings.warn(
                f"No perturbation implemented for {perturbation_type!r}; "
                "scoring the unmodified text.",
                stacklevel=2,
            )

        original_score = self.model_predict(text)

        perturbed_scores = []
        for _ in range(n_samples):
            perturbed = perturb(text) if perturb is not None else text
            perturbed_scores.append(self.model_predict(perturbed))

        # Cast NumPy scalars so the result holds plain Python float/bool.
        mean_score = float(np.mean(perturbed_scores))
        std_score = float(np.std(perturbed_scores))
        degradation = self._relative_degradation(original_score, mean_score)

        result = RobustnessTestResult(
            test_name=f"test_{perturbation_type.value}",
            perturbation_type=perturbation_type,
            original_score=original_score,
            perturbed_scores=perturbed_scores,
            mean_score=mean_score,
            std_score=std_score,
            degradation=degradation,
            passed=bool(degradation < self.threshold),
        )

        self.results.append(result)
        return result

    def run_all_tests(self, texts: List[str]) -> Dict:
        """Run every implemented perturbation on every text and summarise.

        Perturbation types without an implementation are skipped: scoring the
        unmodified text would always "pass" and inflate the pass rate.

        Returns:
            A dict with ``total_tests``, ``passed_tests``, ``failed_tests``,
            ``mean_degradation`` and ``by_perturbation`` (keyed by the
            perturbation's string value, each holding ``count``, ``passed``
            and ``mean_degradation``). With no texts all counts are 0 and
            ``mean_degradation`` is 0.0.
        """
        supported = self._perturbers()

        all_results = []
        for text in texts:
            for perturbation_type in PerturbationType:
                if perturbation_type not in supported:
                    continue
                all_results.append(self.test_perturbation(text, perturbation_type))

        passed_total = sum(1 for r in all_results if r.passed)
        summary = {
            "total_tests": len(all_results),
            "passed_tests": passed_total,
            "failed_tests": len(all_results) - passed_total,
            "mean_degradation": self._mean_degradation(all_results),
            "by_perturbation": {},
        }

        for perturbation_type in supported:
            type_results = [r for r in all_results if r.perturbation_type == perturbation_type]
            if type_results:
                summary["by_perturbation"][perturbation_type.value] = {
                    "count": len(type_results),
                    "passed": sum(1 for r in type_results if r.passed),
                    "mean_degradation": self._mean_degradation(type_results),
                }

        return summary

2. 边界测试

class BoundaryTester:
    """Probe a model callable with edge-case inputs and report whether it raises.

    Each test catches any ``Exception`` raised by ``model_predict`` and
    reports it in the returned dict instead of propagating it.
    """

    # Sentence repeated to build long inputs (7 characters long).
    _FILLER = "这是一个测试。"

    def __init__(self, model_predict: Callable):
        """Store the callable under test; it is invoked with a single string."""
        self.model_predict = model_predict

    def _probe_each(self, texts: List[str], label: Callable) -> Dict:
        """Call the model on every text and collect per-input success flags.

        ``label`` turns each text into the value stored under ``"input"``.
        """
        results = []
        for text in texts:
            try:
                self.model_predict(text)
            except Exception as e:
                results.append({"input": label(text), "success": False, "error": str(e)})
            else:
                results.append({"input": label(text), "success": True})

        return {"results": results, "all_passed": all(r["success"] for r in results)}

    def test_empty_input(self) -> Dict:
        """Call the model with an empty string.

        Returns ``success``, the raw ``output`` (``None`` on failure) and
        ``issue`` (the exception message, or ``None`` on success).
        """
        try:
            result = self.model_predict("")
            return {"success": True, "output": result, "issue": None}
        except Exception as e:
            return {"success": False, "output": None, "issue": str(e)}

    def test_long_input(self, max_length: int = 10000) -> Dict:
        """Call the model with an input of exactly ``max_length`` characters.

        Returns ``success``, ``output_length`` (length of ``str(output)``,
        ``None`` on failure) and ``issue``. The failure dict also keeps the
        ``output`` key for callers that read it.
        """
        # Repeat one time more than needed, then trim, so the length is exact
        # regardless of how long the filler sentence is.
        repeats = max_length // len(self._FILLER) + 1
        long_text = (self._FILLER * repeats)[:max_length]

        try:
            result = self.model_predict(long_text)
            return {"success": True, "output_length": len(str(result)), "issue": None}
        except Exception as e:
            return {"success": False, "output": None, "output_length": None, "issue": str(e)}

    def test_special_characters(self) -> Dict:
        """Call the model with NUL, newline, tab and quote characters.

        Inputs are reported via ``repr`` so control characters stay visible.
        """
        special_texts = [
            "测试\0字符",
            "测试\n换行符",
            "测试\t制表符",
            "测试\"引号",
            "测试'单引号"
        ]
        return self._probe_each(special_texts, repr)

    def test_unicode(self) -> Dict:
        """Call the model with Chinese, Japanese, Korean, Cyrillic and emoji text."""
        unicode_texts = [
            "测试中文",
            "テスト日本語",
            "테스트 한국어",
            "Тест русский",
            "🎉 Emoji测试 🚀"
        ]
        return self._probe_each(unicode_texts, str)

压力测试

class StressTester:
    """Load and memory checks for a model callable."""

    def __init__(self, model_predict: Callable):
        """Store the callable under test; it is invoked with a single string."""
        self.model_predict = model_predict

    def test_concurrent_requests(self, n_concurrent: int = 10, duration: int = 10) -> Dict:
        """Fire ``duration * n_concurrent`` requests through a thread pool.

        Args:
            n_concurrent: Number of worker threads.
            duration: Multiplier for the total request count. Despite the
                name it is not a time limit; the call returns once every
                request has finished.

        Returns:
            A dict with ``success`` and ``failure`` counts, the ``latencies``
            (seconds) of the successful requests, and their ``mean_latency``
            and ``p95_latency`` (both 0 when no request succeeded).
        """
        import concurrent.futures
        import time

        def single_request():
            # Workers only return a value; all bookkeeping happens in the
            # calling thread so no shared state is mutated concurrently.
            start = time.perf_counter()
            try:
                self.model_predict("测试输入")
            except Exception:
                return None
            return time.perf_counter() - start

        latencies = []
        failures = 0

        with concurrent.futures.ThreadPoolExecutor(max_workers=n_concurrent) as executor:
            futures = [
                executor.submit(single_request)
                for _ in range(duration * n_concurrent)
            ]

            for future in concurrent.futures.as_completed(futures):
                latency = future.result()
                if latency is None:
                    failures += 1
                else:
                    latencies.append(latency)

        results = {"success": len(latencies), "failure": failures, "latencies": latencies}
        results["mean_latency"] = float(np.mean(latencies)) if latencies else 0
        results["p95_latency"] = float(np.percentile(latencies, 95)) if latencies else 0

        return results

    def test_memory_usage(self, n_iterations: int = 100) -> Dict:
        """Compare process RSS before and after ``n_iterations`` model calls.

        Requires the third-party ``psutil`` package. ``memory_leak`` is a
        coarse heuristic: it is True when the final RSS exceeds the initial
        RSS by more than 10%.
        """
        import psutil
        import os

        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 / 1024  # bytes -> MB

        for _ in range(n_iterations):
            self.model_predict("测试输入")

        final_memory = process.memory_info().rss / 1024 / 1024

        return {
            "initial_memory_mb": initial_memory,
            "final_memory_mb": final_memory,
            "memory_increase_mb": final_memory - initial_memory,
            "memory_leak": final_memory > initial_memory * 1.1
        }

结果分析

import matplotlib.pyplot as plt

def plot_robustness_results(results: Dict):
    """Show a two-panel figure for a robustness summary.

    ``results`` must provide ``by_perturbation`` (name -> dict with a
    ``mean_degradation`` entry), ``passed_tests`` and ``failed_tests``.
    The left panel is a bar chart of mean degradation per perturbation;
    the right panel is a pie chart of passed versus failed tests.
    """
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: mean degradation for each perturbation type.
    by_type = results["by_perturbation"]
    names = list(by_type.keys())
    heights = [by_type[name]["mean_degradation"] for name in names]

    bar_ax.bar(names, heights)
    bar_ax.set_xlabel("Perturbation Type")
    bar_ax.set_ylabel("Mean Degradation")
    bar_ax.set_title("Robustness by Perturbation Type")
    bar_ax.tick_params(axis="x", rotation=45)

    # Right panel: overall pass/fail split.
    passed = results["passed_tests"]
    failed = results["failed_tests"]
    total = passed + failed
    pie_ax.pie([passed, failed], labels=["Passed", "Failed"], autopct="%1.1f%%")
    pie_ax.set_title(f"Overall Pass Rate ({passed}/{total})")

    plt.tight_layout()
    plt.show()

最佳实践

  1. 全面覆盖:测试多种扰动类型和边界条件
  2. 量化评估:使用量化指标评估鲁棒性
  3. 持续测试:将鲁棒性测试集成到CI/CD流程
  4. 结果可视化:直观展示测试结果

总结

鲁棒性测试是确保LLM稳定可靠的关键环节。通过系统化的测试方法,可以发现和修复模型的弱点。