LLM安全
---
title: "LLM安全"
description: "大语言模型安全原则与负责任AI实践,涵盖安全设计、风险评估和治理框架"
tags: ["安全", "负责任AI", "风险评估", "治理"]
category: "llm"
icon: "🧠"
---
LLM安全
安全原则概述
LLM安全是确保AI系统可靠、可信、可控的关键。随着大模型应用日益广泛,建立完善的安全体系变得至关重要。
安全设计原则
1. 最小权限原则
class PermissionManager:
    """Action-to-resource permission table plus a capability-flag filter.

    ``permissions`` maps an action name to the list of resources that action
    may touch. The ``admin`` entry is empty, so no resource is administrable
    by default.
    """

    # Capability flags recognised by apply_least_privilege; every one is
    # off unless explicitly requested.
    _CAPABILITY_FLAGS = (
        "file_access",
        "network_access",
        "system_commands",
        "data_export",
    )

    def __init__(self):
        self.permissions = {
            "read": ["public_data", "user_data"],
            "write": ["user_data"],
            "execute": ["safe_commands"],
            "admin": [],  # no admin permissions by default
        }

    def check_permission(self, user_role: str, action: str, resource: str) -> bool:
        """Return True when ``action`` is allowed on ``resource``.

        Args:
            user_role: Accepted for interface compatibility but not consulted;
                the permission table is keyed by action only.
            action: Key into ``self.permissions``. Unknown actions allow
                nothing.
            resource: Resource name to look up under that action.
        """
        return resource in self.permissions.get(action, [])

    def apply_least_privilege(self, model_config: dict) -> dict:
        """Build a capability config in which everything is off by default.

        Only the recognised capability flags appear in the result; any other
        key in ``model_config`` is dropped. A flag is enabled only when the
        request sets it to the literal ``True`` -- truthy non-boolean values
        (for example the string ``"false"``) are treated as "not granted"
        so that a malformed request can never widen privileges.

        Args:
            model_config: Requested settings, keyed by flag name.

        Returns:
            A new dict with exactly the recognised flags, each a bool.
        """
        return {
            flag: model_config.get(flag) is True
            for flag in self._CAPABILITY_FLAGS
        }
# Usage example
permission_mgr = PermissionManager()
requested_config = {
    "file_access": True,  # recognised capability flag, so the request is carried over
    "read": True,  # not a capability flag; dropped from the result
}
config = permission_mgr.apply_least_privilege(requested_config)
2. 安全输入验证
import re
from typing import Optional, Tuple
class InputValidator:
    """Screen raw user text before it is passed on.

    Checks run in a fixed order: length cap, block-list regexes, then
    encoding heuristics. The first failing check decides the error message.
    """

    def __init__(self):
        self.max_length = 10000
        self.blocked_patterns = [
            # XSS: a script element, with or without attributes, whose body
            # may span several lines (validate() searches with DOTALL).
            r'<script\b[^>]*>.*?</script\s*>',
            # SQL injection keywords. NOTE: this also rejects ordinary prose
            # containing words such as "update" or "delete".
            r'(\b|\s)(DROP|DELETE|UPDATE|INSERT)\b',
            r'rm\s+-rf',  # command injection
            r'\.\.[\\/]',  # path traversal, POSIX or Windows separator
        ]

    def validate(self, user_input: str) -> Tuple[bool, Optional[str]]:
        """Validate user input.

        Args:
            user_input: The text to screen.

        Returns:
            ``(True, None)`` when every check passes, otherwise
            ``(False, message)`` describing the first check that failed.
        """
        # Length check
        if len(user_input) > self.max_length:
            return False, "输入过长"

        # Block-list check. DOTALL lets the script pattern match across
        # line breaks; the other patterns contain no "." wildcard.
        flags = re.IGNORECASE | re.DOTALL
        if any(re.search(pattern, user_input, flags) for pattern in self.blocked_patterns):
            return False, "输入包含不允许的内容"

        # Encoding check
        if self.detect_encoding_attack(user_input):
            return False, "检测到编码攻击"

        return True, None

    def detect_encoding_attack(self, text: str) -> bool:
        """Return True when ``text`` looks like it smuggles encoded content.

        Two heuristics are applied: a run of 100 or more Base64-alphabet
        characters, or the presence of a zero-width space (U+200B) or
        zero-width non-joiner (U+200C).
        """
        if re.search(r'[A-Za-z0-9+/]{100,}={0,2}', text):
            return True
        return '\u200b' in text or '\u200c' in text
# Usage example
validator = InputValidator()
# Sample untrusted text; the "../" sequences trip the path-traversal rule.
user_input = "show me ../../etc/passwd"
is_valid, error = validator.validate(user_input)
if not is_valid:
    print(f"输入验证失败: {error}")
3. 安全输出过滤
class OutputFilter:
    """Redact sensitive substrings from model output and cap its length.

    ``sensitive_patterns`` maps a category name to a list of regexes; every
    match is replaced with ``[<CATEGORY>_REDACTED]``.
    """

    # Number of characters validate_response keeps before appending "...".
    MAX_RESPONSE_LENGTH = 1000

    def __init__(self):
        self.sensitive_patterns = {
            "pii": [
                r'\b\d{3}-\d{2}-\d{4}\b',  # SSN
                # 16-digit card number, contiguous or grouped in fours
                # with spaces or hyphens.
                r'\b(?:\d{4}[ -]?){3}\d{4}\b',
                # Email address. The TLD class is letters only; a "|" inside
                # a character class would be matched literally.
                r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            ],
            "credentials": [
                r'password\s*[:=]\s*\S+',
                r'api_key\s*[:=]\s*\S+',
                r'secret\s*[:=]\s*\S+',
            ],
        }

    def contains_harmful_content(self, text: str) -> bool:
        """Decide whether ``text`` must be withheld entirely.

        This is an extension point: no harmful-content policy is defined
        here, so a subclass (or the caller) has to supply one.

        Raises:
            NotImplementedError: Always, until overridden.
        """
        raise NotImplementedError(
            "OutputFilter.contains_harmful_content must be overridden"
        )

    def filter_output(self, text: str) -> str:
        """Return ``text`` with every sensitive match redacted.

        Matching is case-insensitive so that labels such as ``Password:`` or
        ``API_KEY=`` are caught as well as their lower-case forms.
        """
        redacted = text
        for category, patterns in self.sensitive_patterns.items():
            placeholder = f'[{category.upper()}_REDACTED]'
            for pattern in patterns:
                redacted = re.sub(pattern, placeholder, redacted, flags=re.IGNORECASE)
        return redacted

    def validate_response(self, response: str, context: dict) -> str:
        """Validate and filter a response.

        Args:
            response: Raw model output.
            context: Accepted for interface compatibility; not consulted.

        Returns:
            A fixed refusal message when ``contains_harmful_content`` flags
            the response; otherwise the redacted text, cut to
            ``MAX_RESPONSE_LENGTH`` characters plus ``"..."`` when longer.
        """
        # Withhold harmful content altogether
        if self.contains_harmful_content(response):
            return "抱歉,我无法提供此信息"

        # Redact sensitive information
        filtered = self.filter_output(response)

        # Length limit
        if len(filtered) > self.MAX_RESPONSE_LENGTH:
            filtered = filtered[:self.MAX_RESPONSE_LENGTH] + "..."
        return filtered
安全评估框架
class SafetyAssessment:
    """Score a deployment on a four-step risk scale (1 = low ... 4 = critical)."""

    def __init__(self):
        self.risk_levels = {
            "low": 1,
            "medium": 2,
            "high": 3,
            "critical": 4,
        }

    @staticmethod
    def _normalize(label):
        """Strip and lower-case string labels; pass other values through.

        Without this, inputs such as ``"Healthcare"`` or ``" SECRET "`` would
        miss the lookup tables and silently receive a lower risk score.
        """
        return label.strip().lower() if isinstance(label, str) else label

    def assess_risk(self, use_case: str, data_sensitivity: str) -> dict:
        """Assess the overall risk.

        The overall score is the maximum of the individual factor scores.

        Returns:
            A dict with ``risk_factors`` (per-factor scores),
            ``overall_risk`` (int), ``risk_level`` (name of that score) and
            ``recommendations`` (list of measures for that score).
        """
        risk_factors = {
            "use_case": self.evaluate_use_case_risk(use_case),
            "data": self.evaluate_data_risk(data_sensitivity),
            "user_impact": self.estimate_user_impact(use_case),
        }
        overall_risk = max(risk_factors.values())
        return {
            "risk_factors": risk_factors,
            "overall_risk": overall_risk,
            "risk_level": self.get_risk_level(overall_risk),
            "recommendations": self.get_recommendations(overall_risk),
        }

    def evaluate_use_case_risk(self, use_case: str) -> int:
        """Score the use case: high, medium, or low for anything unlisted.

        Matching ignores case and surrounding whitespace.
        """
        high_risk_cases = ("healthcare", "finance", "legal")
        medium_risk_cases = ("education", "entertainment")

        case = self._normalize(use_case)
        if case in high_risk_cases:
            return self.risk_levels["high"]
        if case in medium_risk_cases:
            return self.risk_levels["medium"]
        return self.risk_levels["low"]

    def evaluate_data_risk(self, sensitivity: str) -> int:
        """Score the data sensitivity label; unknown labels score 2.

        Matching ignores case and surrounding whitespace.
        """
        sensitivity_map = {
            "public": 1,
            "internal": 2,
            "confidential": 3,
            "secret": 4,
        }
        return sensitivity_map.get(self._normalize(sensitivity), 2)

    def estimate_user_impact(self, use_case: str) -> int:
        """Estimate user impact.

        Simplified implementation: always returns the medium score and does
        not look at ``use_case``.
        """
        return self.risk_levels["medium"]

    def get_risk_level(self, risk_score: int) -> str:
        """Return the level name for ``risk_score``, or "medium" if unknown."""
        return next(
            (name for name, score in self.risk_levels.items() if score == risk_score),
            "medium",
        )

    def get_recommendations(self, risk_level: int) -> list:
        """Return the safety measures recommended for a numeric risk score.

        Scores outside 1-4 get a single generic recommendation.
        """
        recommendations = {
            1: ["基本监控", "标准测试"],
            2: ["增强监控", "定期审计", "用户反馈机制"],
            3: ["严格监控", "人工审核", "实时告警", "数据加密"],
            4: ["最高级别监控", "专家审核", "实时阻断", "数据脱敏", "访问控制"],
        }
        return recommendations.get(risk_level, ["标准安全措施"])
负责任AI实践
class ResponsibleAI:
    """Skeleton for fairness, transparency and privacy checks.

    The three public workflows (``check_fairness``, ``ensure_transparency``,
    ``protect_privacy``) orchestrate a set of hook methods. The hooks carry
    no implementation here: each raises ``NotImplementedError`` until a
    subclass supplies the actual metric or transformation.
    """

    # A model counts as fair only when every disparity is below this value.
    DISPARITY_THRESHOLD = 0.1

    def __init__(self):
        self.principles = [
            "fairness",
            "transparency",
            "accountability",
            "privacy",
            "safety",
        ]

    # ------------------------------------------------------------------
    # Public workflows
    # ------------------------------------------------------------------

    def check_fairness(self, model, test_data: dict) -> dict:
        """Check fairness across groups.

        Args:
            model: Object exposing ``predict(samples)``.
            test_data: Maps a group key to the samples passed to
                ``model.predict`` for that group.

        Returns:
            A dict with ``group_metrics`` (per-group ``accuracy`` and
            ``bias_score``), ``disparities`` (output of
            ``calculate_disparities``) and ``is_fair``. ``is_fair`` is True
            when every disparity value is below ``DISPARITY_THRESHOLD`` --
            which is also the case when there are no disparities at all.
        """
        fairness_metrics = {}
        for group, samples in test_data.items():
            predictions = model.predict(samples)
            fairness_metrics[group] = {
                "accuracy": self.calculate_accuracy(predictions),
                "bias_score": self.detect_bias(predictions),
            }

        # Compare the groups against each other
        disparities = self.calculate_disparities(fairness_metrics)
        return {
            "group_metrics": fairness_metrics,
            "disparities": disparities,
            "is_fair": all(
                gap < self.DISPARITY_THRESHOLD for gap in disparities.values()
            ),
        }

    def ensure_transparency(self, model, input_data: dict) -> dict:
        """Produce an explanation and a decision log for one input.

        Args:
            model: Object exposing a ``version`` attribute; it is also
                handed to the explanation and confidence hooks.
            input_data: The input the decision is about.

        Returns:
            A dict with ``decision_log`` (input, model version, timestamp,
            explanation), ``explanation`` and ``confidence``.
        """
        # Generate the explanation
        explanation = self.generate_explanation(model, input_data)

        # Record the decision
        decision_log = {
            "input": input_data,
            "model_version": model.version,
            "timestamp": self.get_timestamp(),
            "explanation": explanation,
        }
        return {
            "decision_log": decision_log,
            "explanation": explanation,
            "confidence": self.calculate_confidence(model, input_data),
        }

    def protect_privacy(self, data: dict) -> dict:
        """Anonymize ``data`` and then apply differential privacy to it.

        Returns:
            A dict with ``original_data``, ``protected_data`` and
            ``privacy_budget``.

        NOTE(review): the result still carries the untouched input under
        ``original_data``; confirm callers do not log or forward the whole
        dict, otherwise the protection is bypassed.
        """
        # Anonymize first, then add differential privacy
        anonymized_data = self.anonymize_data(data)
        private_data = self.apply_differential_privacy(anonymized_data)
        return {
            "original_data": data,
            "protected_data": private_data,
            "privacy_budget": self.calculate_privacy_budget(),
        }

    # ------------------------------------------------------------------
    # Hooks -- must be provided by a subclass
    # ------------------------------------------------------------------

    def calculate_accuracy(self, predictions):
        """Hook: score the accuracy of one group's predictions."""
        raise NotImplementedError("ResponsibleAI.calculate_accuracy must be overridden")

    def detect_bias(self, predictions):
        """Hook: compute a bias score for one group's predictions."""
        raise NotImplementedError("ResponsibleAI.detect_bias must be overridden")

    def calculate_disparities(self, fairness_metrics: dict) -> dict:
        """Hook: map a disparity name to a number, given per-group metrics."""
        raise NotImplementedError("ResponsibleAI.calculate_disparities must be overridden")

    def generate_explanation(self, model, input_data: dict):
        """Hook: explain the model's decision for ``input_data``."""
        raise NotImplementedError("ResponsibleAI.generate_explanation must be overridden")

    def get_timestamp(self):
        """Hook: return the timestamp stored in the decision log."""
        raise NotImplementedError("ResponsibleAI.get_timestamp must be overridden")

    def calculate_confidence(self, model, input_data: dict):
        """Hook: return the model's confidence for ``input_data``."""
        raise NotImplementedError("ResponsibleAI.calculate_confidence must be overridden")

    def anonymize_data(self, data: dict):
        """Hook: return an anonymized copy of ``data``."""
        raise NotImplementedError("ResponsibleAI.anonymize_data must be overridden")

    def apply_differential_privacy(self, anonymized_data):
        """Hook: apply differential privacy to already-anonymized data."""
        raise NotImplementedError("ResponsibleAI.apply_differential_privacy must be overridden")

    def calculate_privacy_budget(self):
        """Hook: report the privacy budget."""
        raise NotImplementedError("ResponsibleAI.calculate_privacy_budget must be overridden")
安全监控系统
class SecurityMonitor:
    """Scan request logs for suspicious entries and summarise them.

    ``calculate_risk_level`` and ``contains_anomaly`` are hooks with no
    implementation here; a subclass must provide them. ``alert_thresholds``
    is populated in ``__init__`` but not read by any method of this class.
    """

    # A log entry whose "request_count" exceeds this is treated as suspicious.
    REQUEST_COUNT_LIMIT = 100

    def __init__(self):
        self.alert_thresholds = {
            "high_risk_count": 10,
            "suspicious_pattern_count": 5,
            "anomaly_score": 0.8,
        }

    def monitor_usage(self, logs: list) -> dict:
        """Summarise usage from a list of log entries.

        Args:
            logs: Log entries, each a dict as accepted by
                ``detect_suspicious_activity``.

        Returns:
            A dict with ``total_requests``, ``suspicious_count``,
            ``suspicious_activities`` (the flagged entries, in input order)
            and ``risk_level`` (output of ``calculate_risk_level``).
        """
        suspicious_activities = [
            entry for entry in logs if self.detect_suspicious_activity(entry)
        ]
        return {
            "total_requests": len(logs),
            "suspicious_count": len(suspicious_activities),
            "suspicious_activities": suspicious_activities,
            "risk_level": self.calculate_risk_level(suspicious_activities),
        }

    def detect_suspicious_activity(self, log: dict) -> bool:
        """Return True when a log entry looks suspicious.

        An entry is flagged when its ``request_count`` (default 0) exceeds
        ``REQUEST_COUNT_LIMIT``, or when ``contains_anomaly`` flags its
        ``content`` (default empty string). The frequency check runs first,
        so ``contains_anomaly`` is skipped for entries already flagged.
        """
        # Abnormal request frequency
        if log.get("request_count", 0) > self.REQUEST_COUNT_LIMIT:
            return True
        # Abnormal content
        return bool(self.contains_anomaly(log.get("content", "")))

    # ------------------------------------------------------------------
    # Hooks -- must be provided by a subclass
    # ------------------------------------------------------------------

    def calculate_risk_level(self, suspicious_activities: list):
        """Hook: derive a risk level from the flagged log entries."""
        raise NotImplementedError("SecurityMonitor.calculate_risk_level must be overridden")

    def contains_anomaly(self, content: str) -> bool:
        """Hook: decide whether the content of a log entry is anomalous."""
        raise NotImplementedError("SecurityMonitor.contains_anomaly must be overridden")
总结
LLM安全需要多层次、全方位的防护。通过遵循安全设计原则、实施安全评估框架、践行负责任AI实践,并配合持续的安全监控,可以构建安全可靠的AI系统。