NeMo Guardrails:NVIDIA的安全框架
---
title: "NeMo Guardrails:NVIDIA的安全框架"
description: "使用NeMo Guardrails构建安全、可控的LLM应用"
tags: ["NeMo Guardrails", "NVIDIA", "安全框架", "LLM", "防护"]
category: "llm"
icon: "🛡️"
---
NeMo Guardrails:NVIDIA的安全框架
NeMo概述
NeMo Guardrails是NVIDIA开发的开源工具包,用于为LLM应用添加可编程的护栏,控制AI的行为。
核心概念
1. 护栏类型
from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum
class ColangMessageType(Enum):
    """Enumerates the three message type values: user, bot and system."""

    USER = "user"
    BOT = "bot"
    SYSTEM = "system"
@dataclass
class Rail:
    """A single rail: a name, a description and its Colang source text.

    Attributes:
        name: Identifier of the rail.
        description: Human-readable summary of the rail.
        colang_content: Colang source text of the rail.
        enabled: Flag marking the rail as active; defaults to ``True``.
    """

    name: str
    description: str
    colang_content: str
    enabled: bool = True
class NeMoGuardrails:
    """In-memory registry of rails plus builders that render Colang text.

    The class only stores ``Rail`` objects and produces Colang source
    strings; it does not execute the rails.
    """

    def __init__(self):
        # Registered rails, keyed by rail name.
        self.rails: Dict[str, "Rail"] = {}
        self.config: Dict = {}

    def define_rail(self, rail: "Rail"):
        """Store ``rail`` under ``rail.name``, replacing any same-named rail."""
        self.rails[rail.name] = rail

    def create_topic_rail(self, allowed_topics: List[str]) -> str:
        """Render a Colang rail that restricts the conversation to given topics.

        Each topic becomes one quoted example utterance of the
        ``ask about topic`` user intent, and the bot reply lists the topics
        joined with "、".

        Args:
            allowed_topics: Topic names the bot may discuss.

        Returns:
            The Colang source text, starting and ending with a newline.

        Raises:
            ValueError: If ``allowed_topics`` is empty, because the user
                intent would then have no example utterances.
        """
        topics = [str(topic) for topic in allowed_topics]
        if not topics:
            raise ValueError("allowed_topics must contain at least one topic")

        # Colang is indentation-sensitive: examples and flow steps must be
        # nested under the ``define`` line they belong to.
        lines = [
            "",
            "# 主题限制Rail",
            "define user ask about topic",
            *[f'  "{topic}"' for topic in topics],
            "define bot respond to allowed topic",
            f'  "我可以讨论{"、".join(topics)}等话题。"',
            "define flow",
            "  user ask about topic",
            "  bot respond to allowed topic",
            "define flow",
            "  user ask about other topic",
            '  bot say "抱歉,我只能讨论特定主题。"',
            "",
        ]
        return "\n".join(lines)

    def create_safety_rail(self) -> str:
        """Render a fixed Colang rail that refuses harmful requests.

        Returns:
            The Colang source text, starting and ending with a newline.
        """
        lines = (
            "",
            "# 安全Rail",
            "define user harmful request",
            '  "如何伤害他人"',
            '  "如何制造危险物品"',
            '  "如何做违法的事情"',
            "define bot refuse harmful request",
            '  "抱歉,我无法协助这类请求。"',
            "define flow",
            "  user harmful request",
            "  bot refuse harmful request",
            "define flow",
            "  user ask for help with harmful topic",
            '  bot say "我无法提供这方面的帮助,但可以为您推荐合法的资源。"',
            "",
        )
        return "\n".join(lines)
2. Colang语言基础
class ColangTutorial:
    """Collection of small Colang example snippets."""

    @staticmethod
    def basic_examples() -> Dict[str, str]:
        """Return basic Colang examples keyed by example name.

        Returns:
            A dict with the keys ``"greeting"``, ``"question_answer"`` and
            ``"topic_restriction"``. Each value is Colang source text that
            starts and ends with a newline. ``{...}`` placeholders inside the
            snippets are literal text; nothing is substituted here.
        """
        # Colang is indentation-sensitive, so example utterances and flow
        # steps are indented under the ``define`` line they belong to.
        snippets = {
            "greeting": (
                "# 问候流程",
                "define user greet",
                '  "你好"',
                '  "嗨"',
                '  "早上好"',
                "define bot respond to greeting",
                '  "你好!有什么我可以帮助你的吗?"',
                "define flow",
                "  user greet",
                "  bot respond to greeting",
            ),
            "question_answer": (
                "# 问答流程",
                "define user ask question",
                '  "什么是{topic}?"',
                "define bot answer question",
                '  "关于{topic},让我为你解释..."',
                "define flow",
                "  user ask question",
                "  bot answer question",
            ),
            "topic_restriction": (
                "# 主题限制",
                "define user ask about allowed topic",
                '  "告诉我关于{allowed_topic}"',
                "define user ask about restricted topic",
                '  "如何做{restricted_action}"',
                "define flow",
                "  user ask about restricted topic",
                '  bot say "抱歉,我无法讨论这个话题。"',
                "define flow",
                "  user ask about allowed topic",
                "  bot provide information",
            ),
        }
        return {
            name: "\n" + "\n".join(lines) + "\n"
            for name, lines in snippets.items()
        }
实际应用
1. 构建安全聊天机器人
class SafeChatbot:
    """Chatbot that screens user input before calling a text-generation model.

    ``tokenizer`` must be callable as ``tokenizer(text, return_tensors="pt")``
    and provide ``decode(...)``; ``model`` must provide
    ``generate(**inputs, max_new_tokens=...)``.
    NOTE(review): this looks like the Hugging Face transformers interface;
    confirm against the caller.

    The rails registered in ``_setup_rails`` are stored only; the actual
    screening is the simplified keyword check in ``_check_input``.
    """

    # Substrings that cause an input to be rejected before reaching the model.
    _PROHIBITED_TERMS = ("暴力", "非法", "伤害")
    # Reply returned when the input is rejected.
    _REFUSAL_MESSAGE = "抱歉,我无法处理这个请求。"

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.guardrails = NeMoGuardrails()
        self._setup_rails()

    def _setup_rails(self):
        """Register the topic-restriction rail and the safety rail."""
        topic_colang = self.guardrails.create_topic_rail(
            ["技术", "教育", "娱乐", "生活"]
        )
        self.guardrails.define_rail(
            Rail(
                name="topic_restriction",
                description="限制对话主题",
                colang_content=topic_colang,
            )
        )
        self.guardrails.define_rail(
            Rail(
                name="safety",
                description="安全防护",
                colang_content=self.guardrails.create_safety_rail(),
            )
        )

    def chat(self, user_input: str, max_new_tokens: int = 100) -> str:
        """Answer ``user_input``, refusing inputs that fail the input check.

        Args:
            user_input: Raw text from the user.
            max_new_tokens: Forwarded to ``model.generate``; defaults to 100.

        Returns:
            The decoded and output-filtered model reply, or a fixed refusal
            message when ``_check_input`` rejects the input.
        """
        # Simplified: a full integration would route through the NeMo
        # Guardrails execution engine instead of the keyword check.
        if not self._check_input(user_input):
            return self._REFUSAL_MESSAGE

        # Imported here because the file has no module-level torch import;
        # this also keeps the refusal path usable without torch.
        import torch

        encoded = self.tokenizer(user_input, return_tensors="pt")
        with torch.no_grad():
            generated = self.model.generate(**encoded, max_new_tokens=max_new_tokens)
        reply = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return self._filter_output(reply)

    def _check_input(self, text: str) -> bool:
        """Return ``True`` when ``text`` contains none of the prohibited terms."""
        return not any(term in text for term in self._PROHIBITED_TERMS)

    def _filter_output(self, text: str) -> str:
        """Return ``text`` unchanged (placeholder for real output filtering)."""
        return text
2. 内容生成护栏
class ContentGenerationGuardrails:
    """Runs registered quality and safety checks over generated content.

    A check is a callable that takes the generated output and returns a
    mapping with at least a ``"passed"`` entry.
    """

    def __init__(self):
        self.quality_checks = []
        self.safety_checks = []

    def add_quality_check(self, check_func):
        """Append ``check_func`` to the quality checks."""
        self.quality_checks.append(check_func)

    def add_safety_check(self, check_func):
        """Append ``check_func`` to the safety checks."""
        self.safety_checks.append(check_func)

    def generate_content(self, prompt: str, model_func) -> Dict:
        """Generate content with ``model_func`` and gate it on every check.

        Args:
            prompt: Passed to ``model_func`` as its only argument.
            model_func: Callable producing the raw output for ``prompt``.

        Returns:
            A dict with ``"output"`` (the raw output, or a fixed rejection
            message if any check failed), ``"passed"``, ``"quality_results"``
            and ``"safety_results"``.
        """
        generated = model_func(prompt)

        # Quality checks run first, then safety checks, each in
        # registration order.
        quality_results = [check(generated) for check in self.quality_checks]
        safety_results = [check(generated) for check in self.safety_checks]

        verdicts = (outcome["passed"] for outcome in quality_results + safety_results)
        all_passed = all(verdicts)

        if all_passed:
            output = generated
        else:
            output = "内容未通过安全/质量检查"

        return {
            "output": output,
            "passed": all_passed,
            "quality_results": quality_results,
            "safety_results": safety_results,
        }
# Usage example
guardrails = ContentGenerationGuardrails()

# Quality checks: output longer than 50 characters, and at most 3
# occurrences of "重复".
guardrails.add_quality_check(
    lambda text: {"passed": len(text) > 50, "message": "内容过短"}
)
guardrails.add_quality_check(
    lambda text: {"passed": text.count("重复") <= 3, "message": "内容重复"}
)

# Safety checks: reject output containing "暴力" or "仇恨".
guardrails.add_safety_check(
    lambda text: {"passed": "暴力" not in text, "message": "包含暴力内容"}
)
guardrails.add_safety_check(
    lambda text: {"passed": "仇恨" not in text, "message": "包含仇恨内容"}
)

# Generate content with a stub model function that returns a fixed string.
result = guardrails.generate_content(
    "写一篇关于AI的文章",
    lambda prompt: "AI正在改变世界...",
)
配置管理
class NeMoConfiguration:
    """Holds a nested configuration dict with model, rails and safety sections."""

    def __init__(self):
        self.config = {
            "model": {
                "name": "gpt-4",
                "temperature": 0.7,
                "max_tokens": 1000,
            },
            "rails": {
                "enabled": True,
                "strict_mode": False,
                "log_level": "INFO",
            },
            "safety": {
                "content_filter": True,
                "topic_restriction": True,
                "output_validation": True,
            },
        }

    def update_config(self, section: str, key: str, value):
        """Set ``key`` to ``value`` inside an existing ``section``.

        Unknown sections are ignored: no section is created and no error
        is raised.
        """
        if section in self.config:
            self.config[section][key] = value

    def get_config(self) -> Dict:
        """Return an independent snapshot of the configuration.

        A deep copy is returned so that mutating a nested section of the
        snapshot cannot change the live configuration.
        """
        import copy

        return copy.deepcopy(self.config)

    def export_config(self, path: str):
        """Write the configuration to ``path`` as indented JSON (UTF-8)."""
        import json

        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.config, f, indent=2)

    @classmethod
    def from_file(cls, path: str):
        """Build an instance whose config is the JSON document at ``path``.

        The loaded document replaces the default configuration entirely;
        it is not merged with the defaults.
        """
        import json

        with open(path, encoding="utf-8") as f:
            loaded = json.load(f)
        instance = cls()
        instance.config = loaded
        return instance
监控和日志
class NeMoMonitor:
    """Keeps in-memory logs of interactions and violations and summarizes them."""

    def __init__(self):
        self.interaction_log = []
        self.violation_log = []

    @staticmethod
    def _timestamp() -> str:
        """Return the current local time as an ISO 8601 string."""
        # Imported here because the file has no module-level datetime import.
        from datetime import datetime

        return datetime.now().isoformat()

    def log_interaction(self, user_input: str, bot_output: str, rails_triggered: List[str]):
        """Append one timestamped interaction record to ``interaction_log``."""
        self.interaction_log.append({
            "timestamp": self._timestamp(),
            "user_input": user_input,
            "bot_output": bot_output,
            "rails_triggered": rails_triggered,
        })

    def log_violation(self, violation_type: str, details: Dict):
        """Append one timestamped violation record to ``violation_log``."""
        self.violation_log.append({
            "timestamp": self._timestamp(),
            "type": violation_type,
            "details": details,
        })

    def get_statistics(self) -> Dict:
        """Summarize the logs.

        Returns:
            A dict with the interaction count, the violation count, the
            violations-per-interaction ratio (``0`` when no interaction has
            been logged) and the five most common violation types.
        """
        interactions = len(self.interaction_log)
        violations = len(self.violation_log)
        return {
            "total_interactions": interactions,
            "total_violations": violations,
            # Guard against division by zero before any interaction is logged.
            "violation_rate": violations / interactions if interactions else 0,
            "most_common_violations": self._get_most_common_violations(),
        }

    def _get_most_common_violations(self) -> List[Dict]:
        """Return up to five ``{"type", "count"}`` entries, most frequent first."""
        from collections import Counter

        tally = Counter(entry["type"] for entry in self.violation_log)
        return [{"type": kind, "count": count} for kind, count in tally.most_common(5)]
最佳实践
- 分层防护:实施多层护栏防护
- 可配置性:提供灵活的配置选项
- 持续监控:实时监控护栏执行情况
- 日志记录:完整记录交互和违规
总结
NeMo Guardrails提供了强大的LLM安全防护框架。通过Colang语言定义护栏,可以灵活控制AI行为,确保AI应用安全可靠。