鲁棒性测试:验证模型稳定性
---
title: "鲁棒性测试:验证模型稳定性"
description: "系统化的LLM鲁棒性测试方法,确保模型在各种条件下稳定运行"
tags: ["鲁棒性", "稳定性测试", "压力测试", "LLM", "质量保证"]
category: "llm"
icon: "🏋️"
---
鲁棒性测试:验证模型稳定性
鲁棒性测试概述
鲁棒性测试是一种系统化的方法,用于验证模型在各种输入扰动和边界条件下能否保持稳定的性能。
测试框架
1. 鲁棒性测试器
import numpy as np
from typing import List, Dict, Callable, Any
from dataclasses import dataclass
from enum import Enum
class PerturbationType(Enum):
    """Categories of text perturbation used to label robustness tests.

    Each member's value is the short name that appears in generated test
    names (``test_<value>``) and as a key in the per-perturbation summary.
    """

    NOISE = "noise"
    TYPOS = "typos"
    SYNONYM = "synonym"
    DELETION = "deletion"
    INSERTION = "insertion"
    REORDER = "reorder"
@dataclass
class RobustnessTestResult:
    """Outcome of one robustness test: one text under one perturbation type."""

    test_name: str  # "test_" followed by the perturbation type's value
    perturbation_type: PerturbationType  # perturbation category that was tested
    original_score: float  # model score on the unperturbed text
    perturbed_scores: List[float]  # one model score per perturbed sample
    mean_score: float  # mean of perturbed_scores
    std_score: float  # standard deviation of perturbed_scores
    degradation: float  # relative drop from original_score to mean_score
    passed: bool  # True when degradation is below the tester's threshold
class RobustnessTester:
    """Measure how much a model's score degrades when its input is perturbed.

    ``model_predict`` is called with a text and must return a numeric score.
    A test passes when the relative degradation of the mean perturbed score
    is below ``threshold``.

    NOTE: ``PerturbationType.SYNONYM`` has no perturbation routine (it would
    need a thesaurus); such tests are scored on the unperturbed text.
    """

    # QWERTY neighbours used to simulate adjacent-key typos. Built once at
    # class level instead of on every loop iteration.
    _ADJACENT_KEYS = {
        'a': 'sqwz', 'b': 'vghn', 'c': 'xdfv',
        'd': 'sfcer', 'e': 'wrd', 'f': 'dgcv',
        'g': 'fhtb', 'h': 'gjyn', 'i': 'uok',
        'j': 'hknm', 'k': 'jloi', 'l': 'kop',
        'm': 'njk', 'n': 'bmhj', 'o': 'iplk',
        'p': 'ol', 'q': 'wa', 'r': 'edt',
        's': 'awedx', 't': 'rfy', 'u': 'yihj',
        'v': 'cfgb', 'w': 'qase', 'x': 'zsdc',
        'y': 'tuhg', 'z': 'asx',
    }

    def __init__(self, model_predict: Callable, threshold: float = 0.1):
        """Store the scoring callable and the pass/fail degradation threshold."""
        self.model_predict = model_predict
        self.threshold = threshold
        self.results = []

    def add_noise(self, text: str, noise_level: float = 0.1) -> str:
        """Replace about ``noise_level`` of the characters with random printable ASCII."""
        chars = list(text)
        n_noise = int(len(chars) * noise_level)
        for _ in range(n_noise):
            pos = np.random.randint(0, len(chars))
            chars[pos] = chr(np.random.randint(32, 127))
        return "".join(chars)

    def introduce_typos(self, text: str, typo_rate: float = 0.1) -> str:
        """Replace letters with a neighbouring QWERTY key, preserving case.

        Positions are drawn at random; a position holding a character with
        no entry in the neighbour table is left unchanged.
        """
        chars = list(text)
        n_typos = int(len(chars) * typo_rate)
        for _ in range(n_typos):
            pos = np.random.randint(0, len(chars))
            current = chars[pos]
            neighbours = self._ADJACENT_KEYS.get(current.lower())
            if neighbours is None:
                continue
            replacement = str(np.random.choice(list(neighbours)))
            # Keep the original letter's case so "A" becomes "S", not "s".
            chars[pos] = replacement.upper() if current.isupper() else replacement
        return "".join(chars)

    def delete_characters(self, text: str, delete_rate: float = 0.1) -> str:
        """Remove about ``delete_rate`` of the characters at random positions.

        The number of deletions is clamped to ``[0, len(text)]`` so rates
        outside ``[0, 1]`` (and empty text) do not raise.
        """
        chars = list(text)
        n_delete = min(max(int(len(chars) * delete_rate), 0), len(chars))
        if n_delete == 0:
            return text
        # A set gives O(1) membership tests; a numpy array would be O(n) each.
        doomed = set(np.random.choice(len(chars), n_delete, replace=False).tolist())
        return "".join(c for i, c in enumerate(chars) if i not in doomed)

    def insert_characters(self, text: str, insert_rate: float = 0.1) -> str:
        """Insert about ``insert_rate * len(text)`` random printable ASCII characters."""
        chars = list(text)
        n_insert = max(int(len(chars) * insert_rate), 0)
        for _ in range(n_insert):
            pos = np.random.randint(0, len(chars) + 1)
            chars.insert(pos, chr(np.random.randint(32, 127)))
        return "".join(chars)

    def reorder_words(self, text: str, reorder_rate: float = 0.1) -> str:
        """Swap random pairs of whitespace-separated words.

        The result is re-joined with single spaces. Texts with fewer than
        two words cannot be swapped and are returned without swapping.
        """
        words = text.split()
        if len(words) < 2:
            return " ".join(words)
        n_reorder = int(len(words) * reorder_rate)
        for _ in range(n_reorder):
            i, j = np.random.choice(len(words), 2, replace=False)
            words[i], words[j] = words[j], words[i]
        return " ".join(words)

    @staticmethod
    def _relative_degradation(original_score: float, mean_score: float) -> float:
        """Return the score drop relative to the magnitude of the original score.

        Dividing by ``abs(original_score)`` keeps "positive means worse" for
        negative scores. When the original score is 0 a relative value is
        undefined, so the absolute drop is returned instead.
        """
        drop = original_score - mean_score
        if original_score == 0:
            return float(drop)
        return float(drop / abs(original_score))

    def test_perturbation(self, text: str, perturbation_type: "PerturbationType",
                          n_samples: int = 10) -> "RobustnessTestResult":
        """Score ``text`` under ``n_samples`` random perturbations of one type.

        The result is appended to ``self.results`` and returned.
        """
        perturbers = {
            PerturbationType.NOISE: self.add_noise,
            PerturbationType.TYPOS: self.introduce_typos,
            PerturbationType.DELETION: self.delete_characters,
            PerturbationType.INSERTION: self.insert_characters,
            PerturbationType.REORDER: self.reorder_words,
        }
        # Types without a routine (SYNONYM) fall back to the unperturbed text.
        perturb = perturbers.get(perturbation_type)

        original_score = self.model_predict(text)

        perturbed_scores = []
        for _ in range(n_samples):
            perturbed = perturb(text) if perturb is not None else text
            perturbed_scores.append(self.model_predict(perturbed))

        if perturbed_scores:
            mean_score = float(np.mean(perturbed_scores))
            std_score = float(np.std(perturbed_scores))
        else:
            # No samples requested: report "no change" instead of NaN.
            mean_score = float(original_score)
            std_score = 0.0

        degradation = self._relative_degradation(original_score, mean_score)

        result = RobustnessTestResult(
            test_name=f"test_{perturbation_type.value}",
            perturbation_type=perturbation_type,
            original_score=original_score,
            perturbed_scores=perturbed_scores,
            mean_score=mean_score,
            std_score=std_score,
            degradation=degradation,
            passed=bool(degradation < self.threshold),
        )
        self.results.append(result)
        return result

    def run_all_tests(self, texts: List[str]) -> Dict:
        """Run every perturbation type on every text and summarise the results.

        Returns a dict with the overall counts, the mean degradation
        (0.0 when no tests ran) and per-perturbation statistics under
        ``"by_perturbation"``.
        """
        all_results = [
            self.test_perturbation(text, perturbation_type)
            for text in texts
            for perturbation_type in PerturbationType
        ]

        n_passed = sum(1 for r in all_results if r.passed)
        summary = {
            "total_tests": len(all_results),
            "passed_tests": n_passed,
            "failed_tests": len(all_results) - n_passed,
            "mean_degradation": (
                float(np.mean([r.degradation for r in all_results]))
                if all_results else 0.0
            ),
            "by_perturbation": {},
        }

        for perturbation_type in PerturbationType:
            type_results = [r for r in all_results if r.perturbation_type == perturbation_type]
            if type_results:
                summary["by_perturbation"][perturbation_type.value] = {
                    "count": len(type_results),
                    "passed": sum(1 for r in type_results if r.passed),
                    "mean_degradation": float(np.mean([r.degradation for r in type_results])),
                }
        return summary
2. 边界测试
class BoundaryTester:
    """Probe a model with boundary inputs and record whether each call succeeds.

    ``model_predict`` is called with a single text argument. Any exception
    it raises is caught and reported in the returned dict, not propagated,
    because a raised exception is itself the finding of a boundary test.
    """

    # Sentence repeated to build the long-input probe.
    _FILLER = "这是一个测试。"

    def __init__(self, model_predict: Callable):
        """Store the callable under test."""
        self.model_predict = model_predict

    def _run_cases(self, texts: List[str], label: Callable) -> Dict:
        """Call the model on each text and collect per-case success records.

        ``label`` maps a text to the value stored under ``"input"``.
        """
        results = []
        for text in texts:
            record = {"input": label(text), "success": True}
            try:
                self.model_predict(text)
            except Exception as e:
                record["success"] = False
                record["error"] = str(e)
            results.append(record)
        return {"results": results, "all_passed": all(r["success"] for r in results)}

    def test_empty_input(self) -> Dict:
        """Call the model with an empty string and report the outcome."""
        try:
            result = self.model_predict("")
        except Exception as e:
            return {"success": False, "output": None, "issue": str(e)}
        return {"success": True, "output": result, "issue": None}

    def test_long_input(self, max_length: int = 10000) -> Dict:
        """Call the model with a text of exactly ``max_length`` characters.

        On success the dict carries the length of ``str(result)`` under
        ``"output_length"``; on failure ``"output_length"`` and ``"output"``
        are ``None`` and ``"issue"`` holds the exception message.
        """
        target = max(max_length, 0)
        # Repeat one time too many, then trim, so the length is exact for
        # any target and not just multiples of the sentence length.
        repeats = target // len(self._FILLER) + 1
        long_text = (self._FILLER * repeats)[:target]
        try:
            result = self.model_predict(long_text)
        except Exception as e:
            return {"success": False, "output": None, "output_length": None, "issue": str(e)}
        return {"success": True, "output_length": len(str(result)), "issue": None}

    def test_special_characters(self) -> Dict:
        """Call the model with NUL, newline, tab and quote characters.

        Inputs are reported via ``repr`` so control characters stay visible.
        """
        special_texts = [
            "测试\0字符",
            "测试\n换行符",
            "测试\t制表符",
            "测试\"引号",
            "测试'单引号",
        ]
        return self._run_cases(special_texts, repr)

    def test_unicode(self) -> Dict:
        """Call the model with Chinese, Japanese, Korean, Russian and emoji text."""
        unicode_texts = [
            "测试中文",
            "テスト日本語",
            "테스트 한국어",
            "Тест русский",
            "🎉 Emoji测试 🚀",
        ]
        return self._run_cases(unicode_texts, lambda text: text)
压力测试
class StressTester:
    """Load and memory stress tests for a model prediction callable."""

    def __init__(self, model_predict: Callable):
        """Store the callable under test."""
        self.model_predict = model_predict

    def test_concurrent_requests(self, n_concurrent: int = 10, duration: int = 10) -> Dict:
        """Issue ``duration * n_concurrent`` requests through a thread pool.

        ``duration`` is a request multiplier, not a time limit: the test
        ends when all submitted requests have finished.

        Returns a dict with ``"success"`` and ``"failure"`` counts, the
        ``"latencies"`` (seconds) of the successful requests, and their
        ``"mean_latency"`` and ``"p95_latency"`` (0 when nothing succeeded).
        """
        import concurrent.futures
        import time

        def single_request():
            """Return the request latency in seconds, or None if it raised."""
            start = time.perf_counter()
            try:
                self.model_predict("测试输入")
            except Exception:
                return None
            return time.perf_counter() - start

        total_requests = duration * n_concurrent
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_concurrent) as executor:
            futures = [executor.submit(single_request) for _ in range(total_requests)]
            # Workers only return values; all counting happens here in the
            # calling thread, so no shared state is mutated concurrently.
            outcomes = [f.result() for f in concurrent.futures.as_completed(futures)]

        latencies = [latency for latency in outcomes if latency is not None]
        results = {
            "success": len(latencies),
            "failure": len(outcomes) - len(latencies),
            "latencies": latencies,
        }
        results["mean_latency"] = np.mean(latencies) if latencies else 0
        results["p95_latency"] = np.percentile(latencies, 95) if latencies else 0
        return results

    def test_memory_usage(self, n_iterations: int = 100) -> Dict:
        """Compare process RSS before and after ``n_iterations`` predictions.

        ``"memory_leak"`` is a heuristic flag: True when the final RSS is
        more than 10% above the initial RSS. Requires ``psutil``.
        """
        import psutil
        import os

        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 / 1024  # bytes -> MB

        for _ in range(n_iterations):
            self.model_predict("测试输入")

        final_memory = process.memory_info().rss / 1024 / 1024
        return {
            "initial_memory_mb": initial_memory,
            "final_memory_mb": final_memory,
            "memory_increase_mb": final_memory - initial_memory,
            "memory_leak": final_memory > initial_memory * 1.1,
        }
结果分析
import matplotlib.pyplot as plt
def plot_robustness_results(results: Dict):
    """Show a two-panel figure for a robustness summary.

    The left panel is a bar chart of mean degradation per perturbation
    type; the right panel is a pie chart of passed versus failed tests.
    ``results`` must provide ``"by_perturbation"``, ``"passed_tests"`` and
    ``"failed_tests"``.
    """
    _, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: mean degradation for each perturbation type.
    by_type = results["by_perturbation"]
    type_names = list(by_type)
    mean_drops = [by_type[name]["mean_degradation"] for name in type_names]
    bar_ax.bar(type_names, mean_drops)
    bar_ax.set_xlabel("Perturbation Type")
    bar_ax.set_ylabel("Mean Degradation")
    bar_ax.set_title("Robustness by Perturbation Type")
    bar_ax.tick_params(axis="x", rotation=45)

    # Right panel: overall pass rate.
    passed = results["passed_tests"]
    failed = results["failed_tests"]
    total = passed + failed
    pie_ax.pie([passed, failed], labels=["Passed", "Failed"], autopct="%1.1f%%")
    pie_ax.set_title(f"Overall Pass Rate ({passed}/{total})")

    plt.tight_layout()
    plt.show()
最佳实践
- 全面覆盖:测试多种扰动类型和边界条件
- 量化评估:使用可量化的指标(如性能下降率、测试通过率)评估鲁棒性
- 持续测试:将鲁棒性测试集成到CI/CD流程
- 结果可视化:直观展示测试结果
总结
鲁棒性测试是确保LLM稳定可靠的关键环节。通过系统化的测试方法,可以及早发现模型的弱点,并有针对性地加以修复。