← 返回首页
🧠

NeMo Guardrails:NVIDIA的安全框架

📂 llm ⏱ 4 min 716 words

---
title: "NeMo Guardrails:NVIDIA的安全框架"
description: "使用NeMo Guardrails构建安全、可控的LLM应用"
tags: ["NeMo Guardrails", "NVIDIA", "安全框架", "LLM", "防护"]
category: "llm"
icon: "🛡️"
---

NeMo Guardrails:NVIDIA的安全框架

NeMo概述

NeMo Guardrails是NVIDIA开发的开源工具包,用于为LLM应用添加可编程的护栏,控制AI的行为。

核心概念

1. 护栏类型

from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum

class ColangMessageType(Enum):
    """Message-type labels: user, bot and system.

    Each member's value is the lowercase string form of its name.
    """

    USER = "user"
    BOT = "bot"
    SYSTEM = "system"

@dataclass
class Rail:
    """A named guardrail definition.

    Plain data container; it performs no validation of its fields.
    """
    # Identifier; NeMoGuardrails.define_rail uses it as the registry key.
    name: str
    # Human-readable summary of the rail.
    description: str
    # Colang source text for the rail.
    colang_content: str
    # Defaults to True.
    # NOTE(review): nothing in the visible code reads this flag — confirm
    # where (or whether) it is honoured.
    enabled: bool = True

class NeMoGuardrails:
    """Minimal in-memory registry and Colang text generator for guardrails.

    The class only stores rail objects and renders Colang source strings; it
    does not load or execute the rails itself.
    """

    def __init__(self):
        # Registered rails keyed by rail name.
        self.rails = {}
        # Free-form settings; no method of this class reads it.
        self.config = {}

    def define_rail(self, rail: "Rail"):
        """Register ``rail`` under ``rail.name``.

        A rail registered earlier under the same name is replaced.
        """
        self.rails[rail.name] = rail

    def create_topic_rail(self, allowed_topics: List[str]) -> str:
        """Render Colang source that limits the conversation to given topics.

        Args:
            allowed_topics: Topic names. Each one becomes a quoted example
                utterance of the ``user ask about topic`` intent, and all of
                them are listed in the bot's reply.

        Returns:
            The Colang source text, starting and ending with a newline.

        Raises:
            ValueError: If ``allowed_topics`` is empty, because the intent
                would then be rendered without any example utterance.
        """
        topics = [str(topic) for topic in allowed_topics]
        if not topics:
            raise ValueError("allowed_topics must contain at least one topic")

        # The loop has to run in Python: writing "{topic}" directly inside the
        # f-string looks up a Python variable named `topic` and fails.
        topic_examples = "\n".join(f'  "{topic}"' for topic in topics)
        # Join the names so the reply reads as a sentence rather than as the
        # repr of a Python list.
        topic_summary = "、".join(topics)

        colang = f"""
# 主题限制Rail
define user ask about topic
{topic_examples}

define bot respond to allowed topic
  "我可以讨论{topic_summary}等话题。"

define flow
  user ask about topic
  bot respond to allowed topic

define flow
  user ask about other topic
  bot say "抱歉,我只能讨论特定主题。"
"""
        return colang

    def create_safety_rail(self) -> str:
        """Return fixed Colang source that refuses harmful requests.

        The text defines a ``user harmful request`` intent with three example
        utterances, a refusal message, and two flows that answer harmful
        requests with a refusal.
        """
        colang = """
# 安全Rail
define user harmful request
  "如何伤害他人"
  "如何制造危险物品"
  "如何做违法的事情"

define bot refuse harmful request
  "抱歉,我无法协助这类请求。"

define flow
  user harmful request
  bot refuse harmful request

define flow
  user ask for help with harmful topic
  bot say "我无法提供这方面的帮助,但可以为您推荐合法的资源。"
"""
        return colang

2. Colang语言基础

class ColangTutorial:
    """Collection of small Colang sample snippets."""

    @staticmethod
    def basic_examples() -> Dict[str, str]:
        """Return sample Colang snippets keyed by example name.

        The keys are ``"greeting"``, ``"question_answer"`` and
        ``"topic_restriction"``, in that order. The snippets are plain
        strings, so placeholders such as ``{topic}`` are returned literally
        and are not substituted here.
        """
        # Greeting intent, bot reply and the flow connecting them.
        greeting_snippet = """
# 问候流程
define user greet
  "你好"
  "嗨"
  "早上好"

define bot respond to greeting
  "你好!有什么我可以帮助你的吗?"

define flow
  user greet
  bot respond to greeting
"""

        # Question/answer pair using a literal {topic} placeholder.
        question_answer_snippet = """
# 问答流程
define user ask question
  "什么是{topic}?"

define bot answer question
  "关于{topic},让我为你解释..."

define flow
  user ask question
  bot answer question
"""

        # Two intents: one refused outright, one answered.
        topic_restriction_snippet = """
# 主题限制
define user ask about allowed topic
  "告诉我关于{allowed_topic}"

define user ask about restricted topic
  "如何做{restricted_action}"

define flow
  user ask about restricted topic
  bot say "抱歉,我无法讨论这个话题。"

define flow
  user ask about allowed topic
  bot provide information
"""

        examples = {}
        examples["greeting"] = greeting_snippet
        examples["question_answer"] = question_answer_snippet
        examples["topic_restriction"] = topic_restriction_snippet
        return examples

实际应用

1. 构建安全聊天机器人

class SafeChatbot:
    """Chatbot wrapper that screens user input before calling a model.

    The rails registered in ``_setup_rails`` are only stored; ``chat`` does
    not execute them. Screening is a simplified keyword check
    (``_check_input``) and output filtering is a pass-through
    (``_filter_output``).
    """

    def __init__(self, model, tokenizer):
        """Store the model/tokenizer pair and register the default rails.

        Args:
            model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
            tokenizer: Object callable as ``tokenizer(text,
                return_tensors="pt")`` and exposing
                ``decode(ids, skip_special_tokens=True)``.
                NOTE(review): this looks like the Hugging Face transformers
                interface — confirm against the caller.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.guardrails = NeMoGuardrails()
        self._setup_rails()

    def _setup_rails(self):
        """Register the topic-restriction rail and the safety rail."""
        # Topic-restriction rail limited to four fixed topics.
        topic_rail = Rail(
            name="topic_restriction",
            description="限制对话主题",
            colang_content=self.guardrails.create_topic_rail(
                ["技术", "教育", "娱乐", "生活"]
            )
        )
        self.guardrails.define_rail(topic_rail)

        # Safety rail that refuses harmful requests.
        safety_rail = Rail(
            name="safety",
            description="安全防护",
            colang_content=self.guardrails.create_safety_rail()
        )
        self.guardrails.define_rail(safety_rail)

    def chat(self, user_input: str) -> str:
        """Answer ``user_input``, refusing input that fails the keyword check.

        This is a simplified implementation: a real integration would route
        the request through the NeMo Guardrails execution engine instead.

        Args:
            user_input: Raw text typed by the user.

        Returns:
            The decoded and filtered model output, or a fixed refusal message
            when the input contains a prohibited keyword.
        """
        if not self._check_input(user_input):
            return "抱歉,我无法处理这个请求。"

        # Imported here because the module never imports torch at top level;
        # this also lets the refusal path run without torch installed.
        import torch

        inputs = self.tokenizer(user_input, return_tensors="pt")
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=100)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return self._filter_output(response)

    def _check_input(self, text: str) -> bool:
        """Return True when ``text`` contains none of the prohibited keywords.

        Simplified implementation: a plain substring match against a fixed
        keyword list.
        """
        prohibited = ["暴力", "非法", "伤害"]
        return not any(word in text for word in prohibited)

    def _filter_output(self, text: str) -> str:
        """Return ``text`` unchanged (placeholder for real output filtering)."""
        return text

2. 内容生成护栏

class ContentGenerationGuardrails:
    """Runs registered quality and safety checks over generated content."""

    def __init__(self):
        # Callables taking the generated text and returning a dict that has
        # at least a "passed" key.
        self.quality_checks = []
        self.safety_checks = []

    def add_quality_check(self, check_func):
        """Append ``check_func`` to the list of quality checks."""
        self.quality_checks.append(check_func)

    def add_safety_check(self, check_func):
        """Append ``check_func`` to the list of safety checks."""
        self.safety_checks.append(check_func)

    def generate_content(self, prompt: str, model_func) -> Dict:
        """Generate text with ``model_func`` and gate it behind all checks.

        Args:
            prompt: Passed unchanged to ``model_func``.
            model_func: Callable that turns the prompt into generated text.

        Returns:
            A dict with keys ``"output"`` (the generated text, or a fixed
            rejection message when any check fails), ``"passed"``,
            ``"quality_results"`` and ``"safety_results"``.
        """
        generated = model_func(prompt)

        # Every check runs, quality checks first, each on the same raw text.
        quality_results = [run_check(generated) for run_check in self.quality_checks]
        safety_results = [run_check(generated) for run_check in self.safety_checks]

        # Passing means no result in either group reports a falsy "passed".
        any_failed = any(
            not outcome["passed"] for outcome in quality_results + safety_results
        )
        all_passed = not any_failed

        if all_passed:
            final_output = generated
        else:
            final_output = "内容未通过安全/质量检查"

        report = {}
        report["output"] = final_output
        report["passed"] = all_passed
        report["quality_results"] = quality_results
        report["safety_results"] = safety_results
        return report

# Usage example: register two quality checks and two safety checks, then run
# them against the output of a stand-in model function.
guardrails = ContentGenerationGuardrails()

# Quality checks: more than 50 characters, and at most three occurrences of
# the word "重复".
guardrails.add_quality_check(
    lambda text: {"passed": len(text) > 50, "message": "内容过短"}
)
guardrails.add_quality_check(
    lambda text: {"passed": text.count("重复") <= 3, "message": "内容重复"}
)

# Safety checks: reject text containing either flagged keyword.
guardrails.add_safety_check(
    lambda text: {"passed": "暴力" not in text, "message": "包含暴力内容"}
)
guardrails.add_safety_check(
    lambda text: {"passed": "仇恨" not in text, "message": "包含仇恨内容"}
)

# The stand-in model returns a short fixed string, so the length check fails
# and `result["passed"]` is False for this example.
result = guardrails.generate_content(
    "写一篇关于AI的文章",
    lambda prompt: "AI正在改变世界...",
)

配置管理

class NeMoConfiguration:
    """Nested-dict configuration with JSON import and export.

    ``self.config`` maps a section name (``"model"``, ``"rails"``,
    ``"safety"``) to a dict of settings for that section.
    """

    def __init__(self):
        """Initialise the configuration with the built-in defaults."""
        self.config = {
            "model": {
                "name": "gpt-4",
                "temperature": 0.7,
                "max_tokens": 1000
            },
            "rails": {
                "enabled": True,
                "strict_mode": False,
                "log_level": "INFO"
            },
            "safety": {
                "content_filter": True,
                "topic_restriction": True,
                "output_validation": True
            }
        }

    def update_config(self, section: str, key: str, value):
        """Set ``key`` to ``value`` inside an existing ``section``.

        Unknown sections are ignored on purpose: this method never creates a
        new section. Unknown keys inside a known section are added.
        """
        if section in self.config:
            self.config[section][key] = value

    def get_config(self) -> Dict:
        """Return an independent snapshot of the configuration.

        The snapshot is a deep copy. A shallow copy would share the
        per-section dicts, so editing ``snapshot["model"]`` would silently
        change the live configuration.
        """
        import copy
        return copy.deepcopy(self.config)

    def export_config(self, path: str):
        """Write the configuration to ``path`` as indented JSON (UTF-8)."""
        import json
        # Explicit encoding so the file does not depend on the platform's
        # default locale.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.config, f, indent=2)

    @classmethod
    def from_file(cls, path: str):
        """Build an instance whose configuration is loaded from a JSON file.

        The loaded data replaces the defaults entirely; it is not merged
        with them.

        Args:
            path: Path of a UTF-8 encoded JSON file.

        Returns:
            A new instance of ``cls`` holding the loaded configuration.
        """
        import json
        with open(path, encoding="utf-8") as f:
            config = json.load(f)

        instance = cls()
        instance.config = config
        return instance

监控和日志

class NeMoMonitor:
    """In-memory log of chat interactions and guardrail violations."""

    def __init__(self):
        # Both logs are append-only lists of dicts, oldest entry first.
        self.interaction_log = []
        self.violation_log = []

    @staticmethod
    def _timestamp() -> str:
        """Return the current local time as an ISO 8601 string."""
        # Imported locally because the module has no top-level datetime
        # import; other methods in this file use local imports the same way.
        from datetime import datetime
        return datetime.now().isoformat()

    def log_interaction(self, user_input: str, bot_output: str, rails_triggered: List[str]):
        """Append one interaction record, stamped with the current time.

        Args:
            user_input: Text received from the user.
            bot_output: Text returned to the user.
            rails_triggered: Names of the rails that fired for this exchange.
        """
        self.interaction_log.append({
            "timestamp": self._timestamp(),
            "user_input": user_input,
            "bot_output": bot_output,
            "rails_triggered": rails_triggered
        })

    def log_violation(self, violation_type: str, details: Dict):
        """Append one violation record, stamped with the current time.

        Args:
            violation_type: Category label, used for grouping in statistics.
            details: Arbitrary extra information stored as given.
        """
        self.violation_log.append({
            "timestamp": self._timestamp(),
            "type": violation_type,
            "details": details
        })

    def get_statistics(self) -> Dict:
        """Summarise both logs.

        Returns:
            A dict with ``"total_interactions"``, ``"total_violations"``,
            ``"violation_rate"`` (violations per interaction, or 0 when no
            interaction has been logged) and ``"most_common_violations"``.
        """
        return {
            "total_interactions": len(self.interaction_log),
            "total_violations": len(self.violation_log),
            # Guarded so an empty interaction log does not divide by zero.
            "violation_rate": len(self.violation_log) / len(self.interaction_log) if self.interaction_log else 0,
            "most_common_violations": self._get_most_common_violations()
        }

    def _get_most_common_violations(self) -> List[Dict]:
        """Return up to five ``{"type", "count"}`` dicts, most frequent first."""
        from collections import Counter
        violation_types = [v["type"] for v in self.violation_log]
        counter = Counter(violation_types)
        return [{"type": t, "count": c} for t, c in counter.most_common(5)]

最佳实践

  1. 分层防护:实施多层护栏防护
  2. 可配置性:提供灵活的配置选项
  3. 持续监控:实时监控护栏执行情况
  4. 日志记录:完整记录交互和违规

总结

NeMo Guardrails提供了强大的LLM安全防护框架。通过Colang语言定义护栏,可以灵活控制AI行为,确保安全可靠的AI应用。