← 返回首页
🤖

Prompt工程架构:模板管理、版本控制与评估

📂 architecture ⏱ 7 min 1399 words

Prompt工程架构:模板管理、版本控制与评估

Prompt管理系统概述

随着LLM应用的普及,Prompt已成为一种重要的代码资产。Prompt管理系统需要支持模板化、版本控制、A/B测试、性能评估和协作开发,确保Prompt质量和可追溯性。

# Prompt管理系统核心
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Callable
from datetime import datetime
import uuid
import json

@dataclass
class PromptTemplate:
    """Live record for one prompt template, as held in ``PromptManager.templates``.

    Field order is part of the generated constructor signature: the first
    five fields are required, the remaining ones have defaults.
    """
    # Identifier used as the key in PromptManager.templates / .versions.
    template_id: str
    # Human-readable template name.
    name: str
    # Prompt text; PromptManager.render substitutes ``{var_name}`` placeholders in it.
    template: str
    # Dotted version string; PromptManager starts at "1.0.0" and increments
    # the third component on every update.
    version: str
    # Declared placeholder names. NOTE(review): nothing in this file checks
    # them against the template text or against the values passed to render().
    variables: List[str]
    description: str = ""
    author: str = ""
    # Labels matched by PromptManager.list_templates(tags=...).
    tags: List[str] = field(default_factory=list)
    # Free-form extras; PromptVersionControl.tag() keeps a "tags" mapping of
    # tag name -> version string in here.
    metadata: Dict = field(default_factory=dict)
    # Naive local timestamp from datetime.now(); PromptManager.update_template
    # overwrites it on every update, so in practice it is "last modified".
    created_at: datetime = field(default_factory=datetime.now)
    # NOTE(review): not read anywhere in this file — presumably a soft-delete
    # flag for callers; confirm before relying on it.
    is_active: bool = True

@dataclass
class PromptVersion:
    """One entry in a template's version history (``PromptManager.versions``)."""
    # Short identifier for this history entry (PromptManager uses a uuid4 prefix).
    version_id: str
    # Key of the template this entry belongs to.
    template_id: str
    # Version string recorded for this entry.
    version: str
    # Prompt text recorded for this entry.
    template: str
    # Reason for the change, as supplied by the caller.
    changelog: str
    # Metric name -> value; surfaced by PromptVersionControl.get_history().
    # NOTE(review): nothing in this file populates it.
    metrics: Dict[str, float] = field(default_factory=dict)
    # Naive local timestamp taken when the entry object is created.
    created_at: datetime = field(default_factory=datetime.now)

class PromptManager:
    """In-memory registry of prompt templates plus their version history.

    Attributes:
        templates: template_id -> live ``PromptTemplate``.
        versions: template_id -> list of ``PromptVersion`` entries in
            publication order. Every published version, including the live
            one, has exactly one entry.
    """

    _INITIAL_VERSION = "1.0.0"

    def __init__(self):
        self.templates: Dict[str, PromptTemplate] = {}
        self.versions: Dict[str, List[PromptVersion]] = {}

    @staticmethod
    def _short_id() -> str:
        """Return the first 8 characters of a random UUID4."""
        return str(uuid.uuid4())[:8]

    @staticmethod
    def _bump_patch(version: str) -> str:
        """Return ``version`` with its third dot-separated component + 1.

        Raises:
            ValueError: if ``version`` has fewer than three components or
                the third one is not an integer.
        """
        parts = version.split(".")
        try:
            parts[2] = str(int(parts[2]) + 1)
        except (IndexError, ValueError) as exc:
            raise ValueError(f"Cannot increment version: {version!r}") from exc
        return ".".join(parts)

    def create_template(self, name: str, template: str,
                        variables: List[str], **kwargs) -> "PromptTemplate":
        """Register a new template at version 1.0.0 and start its history.

        Args:
            name: Human-readable template name.
            template: Prompt text containing ``{var}`` placeholders.
            variables: Names of the placeholders the template declares.
            **kwargs: Extra ``PromptTemplate`` fields (description, author,
                tags, metadata, ...).

        Returns:
            The newly created live template.
        """
        template_id = self._short_id()
        # An 8-character id can collide; never overwrite an existing template.
        while template_id in self.templates:
            template_id = self._short_id()

        prompt = PromptTemplate(
            template_id=template_id,
            name=name,
            template=template,
            version=self._INITIAL_VERSION,
            variables=variables,
            **kwargs
        )

        self.templates[template_id] = prompt
        self.versions[template_id] = [PromptVersion(
            version_id=self._short_id(),
            template_id=template_id,
            version=self._INITIAL_VERSION,
            template=template,
            changelog="Initial version"
        )]

        return prompt

    def update_template(self, template_id: str, new_template: str,
                        changelog: str = "") -> "PromptTemplate":
        """Publish ``new_template`` as the next patch version.

        The new version is appended to the history together with
        ``changelog``, so every version (old and live) can be looked up,
        diffed and rolled back to.

        Returns:
            The live template object, mutated in place.

        Raises:
            ValueError: if the template does not exist or its version
                string cannot be incremented.
        """
        template = self.templates.get(template_id)
        if template is None:
            raise ValueError(f"Template not found: {template_id}")

        new_version = self._bump_patch(template.version)
        history = self.versions.setdefault(template_id, [])

        # A template inserted without going through create_template() has no
        # history entry for its current text; snapshot it before replacing it
        # so the outgoing version stays retrievable.
        if not any(entry.version == template.version for entry in history):
            history.append(PromptVersion(
                version_id=self._short_id(),
                template_id=template_id,
                version=template.version,
                template=template.template,
                changelog="Snapshot before update"
            ))

        # Record the version being published. The changelog describes this
        # change, so it belongs on the new entry rather than on the old one.
        history.append(PromptVersion(
            version_id=self._short_id(),
            template_id=template_id,
            version=new_version,
            template=new_template,
            changelog=changelog or "Updated template"
        ))

        template.template = new_template
        template.version = new_version
        # Kept from the original behavior: created_at doubles as the
        # last-modified timestamp of the live template.
        template.created_at = datetime.now()

        return template

    def get_template(self, template_id: str,
                     version: Optional[str] = None) -> Optional["PromptTemplate"]:
        """Return the live template, or a copy reflecting a past version.

        Args:
            template_id: Template to look up.
            version: Version string to fetch; ``None`` (or the live version)
                returns the live object itself.

        Returns:
            The live ``PromptTemplate``; a detached ``PromptTemplate`` built
            from the matching history entry; or ``None`` when the template
            or the requested version is unknown.
        """
        template = self.templates.get(template_id)
        if template is None:
            return None

        if not version or version == template.version:
            return template

        for entry in self.versions.get(template_id, []):
            if entry.version == version:
                # Carry the descriptive fields over so a historical view is
                # not stripped of its author/tags/metadata. Containers are
                # copied so editing the view cannot alter the live template.
                return PromptTemplate(
                    template_id=template_id,
                    name=template.name,
                    template=entry.template,
                    version=entry.version,
                    variables=list(template.variables),
                    description=template.description,
                    author=template.author,
                    tags=list(template.tags),
                    metadata=dict(template.metadata),
                    created_at=entry.created_at
                )
        return None

    def render(self, template_id: str, variables: Dict[str, Any]) -> str:
        """Fill the live template's ``{name}`` placeholders from ``variables``.

        Substitution happens in a single pass over the template text, so a
        value that itself contains ``{other}`` is inserted literally instead
        of being expanded by a later variable. Placeholders without a
        matching entry in ``variables`` are left untouched.

        Raises:
            ValueError: if the template does not exist.
        """
        import re  # local import: only this method needs it

        template = self.get_template(template_id)
        if template is None:
            raise ValueError(f"Template not found: {template_id}")

        replacements: Dict[str, str] = {}
        for var_name, var_value in variables.items():
            # First key wins if two keys format to the same placeholder.
            replacements.setdefault(f"{{{var_name}}}", str(var_value))

        if not replacements:
            return template.template

        # Longest placeholder first so the alternation never prefers a
        # shorter overlapping one.
        ordered = sorted(replacements, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(p) for p in ordered))
        return pattern.sub(lambda m: replacements[m.group(0)], template.template)

    def list_templates(self, tags: Optional[List[str]] = None) -> List["PromptTemplate"]:
        """Return all live templates, optionally filtered by tag.

        Args:
            tags: When given and non-empty, keep only templates carrying at
                least one of these tags.
        """
        templates = list(self.templates.values())

        if tags:
            templates = [t for t in templates
                         if any(tag in t.tags for tag in tags)]

        return templates

Prompt版本控制

版本控制是Prompt管理的核心功能,支持版本历史、差异比较、回滚和标签管理。每次修改都应记录变更原因和性能影响。

# Prompt版本控制器
class PromptVersionControl:
    """Commit / diff / rollback / tag / history helpers over a PromptManager."""

    def __init__(self, manager: "PromptManager"):
        self.manager = manager

    def _text_of(self, template_id: str, version: str) -> Optional[str]:
        """Return the template text recorded for ``version``, or ``None``.

        History is searched first; the live template is consulted as a
        fallback so the current version resolves even when it has no
        history entry of its own.
        """
        for entry in self.manager.versions.get(template_id, []):
            if entry.version == version:
                return entry.template

        live = self.manager.templates.get(template_id)
        if live is not None and live.version == version:
            return live.template
        return None

    def commit(self, template_id: str, changes: Dict,
               message: str, author: str = "") -> str:
        """Publish a change set as a new template version.

        Args:
            template_id: Template to update.
            changes: Only the ``"template"`` key is honoured; when absent the
                current text is re-published unchanged.
            message: Stored as the changelog of the new version.
            author: Accepted for interface compatibility; not recorded.

        Returns:
            A freshly generated 8-character commit id (it is printed, not
            stored).

        Raises:
            ValueError: if the template does not exist.
        """
        template = self.manager.templates.get(template_id)
        if template is None:
            raise ValueError(f"Template not found: {template_id}")

        new_template = changes["template"] if "template" in changes else template.template

        self.manager.update_template(
            template_id,
            new_template,
            changelog=message
        )

        commit_id = str(uuid.uuid4())[:8]
        print(f"Committed {commit_id}: {message}")

        return commit_id

    def diff(self, template_id: str, version1: str,
             version2: str) -> Dict:
        """Compare two versions line by line.

        The comparison is positional: line *i* of ``version1`` is compared
        with line *i* of ``version2``, and the shorter text is padded with
        empty lines. An inserted line therefore marks every following line
        as changed.

        Returns:
            ``{"error": "Version not found"}`` when either version is
            unknown; otherwise a dict with ``version1``, ``version2``,
            ``changes`` (list of ``{"line", "old", "new"}``) and ``summary``.
        """
        old_text = self._text_of(template_id, version1)
        new_text = self._text_of(template_id, version2)

        # Compare against None explicitly: an empty template is a valid version.
        if old_text is None or new_text is None:
            return {"error": "Version not found"}

        old_lines = old_text.split("\n")
        new_lines = new_text.split("\n")
        width = max(len(old_lines), len(new_lines))
        old_lines += [""] * (width - len(old_lines))
        new_lines += [""] * (width - len(new_lines))

        changes = [
            {"line": number, "old": old, "new": new}
            for number, (old, new) in enumerate(zip(old_lines, new_lines), start=1)
            if old != new
        ]

        return {
            "version1": version1,
            "version2": version2,
            "changes": changes,
            "summary": f"{len(changes)} lines changed"
        }

    def rollback(self, template_id: str, target_version: str) -> bool:
        """Re-publish the text of ``target_version`` as a new version.

        Returns:
            ``True`` when the template now carries the target text (also
            when it already was at ``target_version``, in which case nothing
            is published); ``False`` when the template or version is unknown.
        """
        template = self.manager.templates.get(template_id)
        if template is None:
            return False

        # Already there: publishing an identical copy would only add noise.
        if template.version == target_version:
            return True

        target_text = self._text_of(template_id, target_version)
        if target_text is None:
            return False

        self.manager.update_template(
            template_id,
            target_text,
            changelog=f"Rollback to version {target_version}"
        )

        return True

    def tag(self, template_id: str, version: str, tag_name: str):
        """Point ``tag_name`` at ``version`` in the template's metadata.

        Tags live under ``metadata["tags"]`` as ``{tag_name: version}``.
        Unknown templates are ignored silently; the version is not validated.
        """
        template = self.manager.templates.get(template_id)
        if template is not None:
            template.metadata.setdefault("tags", {})[tag_name] = version

    def get_history(self, template_id: str) -> List[Dict]:
        """Return history entries as plain dicts, newest first.

        Sorting uses the datetime objects rather than their ISO strings, so
        the order does not depend on string formatting details.
        """
        entries = sorted(
            self.manager.versions.get(template_id, []),
            key=lambda entry: entry.created_at,
            reverse=True,
        )
        return [
            {
                "version": entry.version,
                "changelog": entry.changelog,
                "created_at": entry.created_at.isoformat(),
                "metrics": entry.metrics
            }
            for entry in entries
        ]

Prompt评估框架

Prompt评估包括多个维度:准确性、相关性、一致性、安全性和延迟。自动化评估框架需要支持批量测试、统计分析和性能基准比较。

# Prompt评估器
import statistics
from typing import Callable

class PromptEvaluator:
    """Scores a prompt template against test cases using an LLM client.

    Args:
        llm_client: Object exposing a synchronous ``generate(prompt) -> str``.
        manager: Object exposing ``render(template_id, variables) -> str``
            (a ``PromptManager``). Optional at construction time so existing
            ``PromptEvaluator(llm_client)`` calls keep working, but it must
            be set (constructor or ``evaluator.manager = ...``) before any
            test case is evaluated.
    """

    # Substrings whose presence marks a response as unsafe.
    _UNSAFE_PATTERNS = ("暴力", "歧视", "违法", "有害")

    def __init__(self, llm_client, manager: Optional["PromptManager"] = None):
        self.llm = llm_client
        self.manager = manager
        self.evaluation_metrics = {}

    def evaluate(self, template_id: str, test_cases: List[Dict],
                 metrics: Optional[List[str]] = None) -> Dict:
        """Run every test case and aggregate the scores.

        Args:
            template_id: Template to render for each case.
            test_cases: Dicts with ``"variables"`` (required) and optional
                ``"input"`` / ``"expected"``.
            metrics: Subset of ``"accuracy"``, ``"relevance"``, ``"safety"``;
                ``None`` or empty means all three.

        Returns:
            Dict with ``template_id``, ``test_cases`` (count), per-case
            ``results`` and the aggregated ``summary``.
        """
        results = [
            self._evaluate_single(template_id, test_case, metrics)
            for test_case in test_cases
        ]

        return {
            "template_id": template_id,
            "test_cases": len(test_cases),
            "results": results,
            "summary": self._aggregate_results(results)
        }

    def _evaluate_single(self, template_id: str, test_case: Dict,
                         metrics: Optional[List[str]] = None) -> Dict:
        """Render, call the LLM once, and score a single test case.

        Raises:
            RuntimeError: if no manager has been configured.
        """
        if self.manager is None:
            raise RuntimeError(
                "PromptEvaluator has no PromptManager; pass manager=... "
                "to the constructor"
            )

        prompt = self.manager.render(template_id, test_case["variables"])
        response = self.llm.generate(prompt)

        expected = test_case.get("expected")
        # "input" is only needed for the relevance judge and for reporting,
        # so a case without it should not abort the whole run.
        query = test_case.get("input", "")

        def wanted(metric: str) -> bool:
            return not metrics or metric in metrics

        scores = {}
        if wanted("accuracy"):
            scores["accuracy"] = self._check_accuracy(response, expected)
        if wanted("relevance"):
            scores["relevance"] = self._check_relevance(response, query)
        if wanted("safety"):
            scores["safety"] = self._check_safety(response)

        return {
            "input": query,
            "response": response,
            "expected": expected,
            "scores": scores
        }

    def _check_accuracy(self, response: str, expected: str) -> float:
        """Return the fraction of expected words that occur in the response.

        Words are whitespace-separated and compared case-insensitively.
        Returns 1.0 when there is nothing to compare against.
        """
        if not expected:
            return 1.0

        expected_words = set(expected.lower().split())
        response_words = set((response or "").lower().split())

        overlap = len(expected_words & response_words)
        return overlap / max(len(expected_words), 1)

    def _check_relevance(self, response: str, query: str) -> float:
        """Ask the LLM to rate relevance on a 0-1 scale.

        Returns:
            The parsed score clamped to ``[0.0, 1.0]``, or 0.5 when the
            reply cannot be parsed as a number.
        """
        prompt = f"""评估以下回答与问题的相关性(0-1分):

问题:{query}
回答:{response}

相关性分数:"""

        score_str = self.llm.generate(prompt)
        try:
            score = float(score_str.strip())
        except (AttributeError, TypeError, ValueError):
            # Non-numeric or non-string reply: fall back to a neutral score.
            return 0.5

        # The judge is asked for 0-1; keep stray replies from skewing means.
        return min(1.0, max(0.0, score))

    def _check_safety(self, response: str) -> float:
        """Return 0.0 if the response contains a blocked substring, else 1.0."""
        text = response or ""
        if any(pattern in text for pattern in self._UNSAFE_PATTERNS):
            return 0.0
        return 1.0

    def _aggregate_results(self, results: List[Dict]) -> Dict:
        """Compute mean / std / min / max per metric across all results.

        ``std`` is the sample standard deviation, or 0 with a single score.
        """
        all_scores: Dict[str, List[float]] = {}
        for result in results:
            for metric, score in result["scores"].items():
                all_scores.setdefault(metric, []).append(score)

        return {
            metric: {
                "mean": statistics.mean(scores),
                "std": statistics.stdev(scores) if len(scores) > 1 else 0,
                "min": min(scores),
                "max": max(scores)
            }
            for metric, scores in all_scores.items()
        }

# Prompt基准测试
class PromptBenchmark:
    """Evaluates several templates on one test suite and ranks them."""

    def __init__(self, evaluator: PromptEvaluator):
        self.evaluator = evaluator
        self.benchmarks = {}

    def run_benchmark(self, template_ids: List[str],
                     test_suite: List[Dict]) -> Dict:
        """Evaluate each template on ``test_suite``.

        Returns:
            ``{"results": {template_id: summary}, "rankings": [...]}`` where
            rankings are ordered best-first.
        """
        summaries = {
            tid: self.evaluator.evaluate(tid, test_suite)["summary"]
            for tid in template_ids
        }
        return {
            "results": summaries,
            "rankings": self._rank_templates(summaries)
        }

    def _rank_templates(self, results: Dict) -> List[Dict]:
        """Order templates by the unweighted average of their metric means."""

        def overall_of(summary: Dict) -> float:
            # An empty summary scores 0 instead of dividing by zero.
            total = sum(stats["mean"] for stats in summary.values())
            return total / max(len(summary), 1)

        board = [
            {
                "template_id": tid,
                "overall_score": overall_of(summary),
                "details": summary
            }
            for tid, summary in results.items()
        ]
        return sorted(board, key=lambda row: row["overall_score"], reverse=True)

Prompt优化与最佳实践

Prompt优化是持续改进的过程,包括结构化设计、示例选择、参数调优和失败分析。建立最佳实践库帮助团队快速编写高质量Prompt。

# Prompt优化器
class PromptOptimizer:
    """Iteratively rewrites a template with an LLM and keeps the best one.

    Args:
        llm_client: Object exposing ``generate(prompt)``; it may be either a
            coroutine function or a plain synchronous function.
        evaluator: ``PromptEvaluator`` whose ``manager`` holds the template.
    """

    def __init__(self, llm_client, evaluator: "PromptEvaluator"):
        self.llm = llm_client
        self.evaluator = evaluator

    async def _call_llm(self, prompt: str) -> str:
        """Call ``llm.generate`` and await the result only if it is awaitable.

        PromptEvaluator calls the same client synchronously, so both client
        flavours have to work here.
        """
        import inspect  # local import: only this helper needs it

        result = self.llm.generate(prompt)
        if inspect.isawaitable(result):
            result = await result
        return result

    @staticmethod
    def _accuracy(result: Dict) -> float:
        """Extract the mean accuracy from an evaluation result (0 if absent)."""
        return result["summary"].get("accuracy", {}).get("mean", 0)

    async def optimize(self, template_id: str,
                       test_cases: List[Dict],
                       iterations: int = 5) -> Dict:
        """Search for a better-scoring template text.

        The current text is scored first as a baseline. Each iteration asks
        the LLM for suggestions on the best text found so far, applies them,
        publishes the candidate through the manager (evaluation renders the
        live template) and scores it. When the loop ends the live template is
        set to the best text found, which is the original text if no
        candidate beat the baseline.

        Returns:
            Dict with ``original`` (text before optimization), ``optimized``
            (best text), ``score_improvement`` (best minus baseline mean
            accuracy), ``baseline_score`` and ``best_score``.

        Raises:
            RuntimeError: if the evaluator has no manager.
            ValueError: if the template does not exist.
        """
        manager = getattr(self.evaluator, "manager", None)
        if manager is None:
            raise RuntimeError("PromptOptimizer requires an evaluator with a manager")

        template = manager.get_template(template_id)
        if template is None:
            raise ValueError(f"Template not found: {template_id}")

        # Copy the text now: ``template`` is the live object and is mutated
        # by every update_template() call below.
        original_text = template.template
        baseline_score = self._accuracy(
            self.evaluator.evaluate(template_id, test_cases)
        )

        best_score = baseline_score
        best_template = original_text
        live_text = original_text

        for i in range(iterations):
            # Build on the best text so far so iterations compound instead
            # of re-optimizing the same starting point every time.
            suggestions = await self._generate_suggestions(
                best_template, test_cases
            )
            candidate = await self._apply_suggestions(
                best_template, suggestions
            )

            manager.update_template(template_id, candidate)
            live_text = candidate

            score = self._accuracy(
                self.evaluator.evaluate(template_id, test_cases)
            )

            if score > best_score:
                best_score = score
                best_template = candidate

            print(f"Iteration {i+1}: score = {score:.3f}")

        # Do not leave a losing candidate live.
        if live_text != best_template:
            manager.update_template(
                template_id,
                best_template,
                changelog="Restore best-scoring template after optimization"
            )

        return {
            "original": original_text,
            "optimized": best_template,
            "score_improvement": best_score - baseline_score,
            "baseline_score": baseline_score,
            "best_score": best_score
        }

    async def _generate_suggestions(self, template: str,
                                    test_cases: List[Dict]) -> List[str]:
        """Ask the LLM for improvement suggestions, one per non-empty line.

        ``test_cases`` is accepted for interface compatibility and is not
        included in the prompt.
        """
        prompt = f"""分析以下Prompt模板,提供优化建议:

当前模板:
{template}

请提供3个具体的改进建议:"""

        response = await self._call_llm(prompt)
        return [s.strip() for s in response.split("\n") if s.strip()]

    async def _apply_suggestions(self, template: str,
                                 suggestions: List[str]) -> str:
        """Ask the LLM to rewrite ``template`` according to ``suggestions``."""
        prompt = f"""根据以下建议改进Prompt模板:

当前模板:
{template}

改进建议:
{chr(10).join(f'- {s}' for s in suggestions)}

改进后的模板:"""

        return await self._call_llm(prompt)

# Prompt模板库
class PromptLibrary:
    """Catalogue of reusable prompt templates grouped by category.

    ``templates`` maps category -> name -> entry dict with the keys
    ``template``, ``description`` and ``usage_count``. ``best_practices`` is
    a plain list of free-text notes.
    """

    def __init__(self):
        self.templates = {}
        self.best_practices = []

    def register_template(self, category: str, name: str,
                         template: str, description: str):
        """Store (or replace) a template, resetting its usage counter to 0."""
        bucket = self.templates.setdefault(category, {})
        bucket[name] = {
            "template": template,
            "description": description,
            "usage_count": 0
        }

    def get_template(self, category: str, name: str) -> Optional[str]:
        """Return the template text and count the access, or ``None`` if unknown."""
        try:
            entry = self.templates[category][name]
        except KeyError:
            return None
        entry["usage_count"] += 1
        return entry["template"]

    def add_best_practice(self, practice: str):
        """Append a best-practice note."""
        self.best_practices += [practice]

    def search(self, query: str) -> List[Dict]:
        """Find templates whose name or description contains ``query``.

        Matching is case-insensitive. Results keep registration order and do
        not include the template text itself.
        """

        def matches(name: str, info: Dict) -> bool:
            return (query.lower() in name.lower()
                    or query.lower() in info["description"].lower())

        return [
            {
                "category": category,
                "name": name,
                "description": info["description"],
                "usage_count": info["usage_count"]
            }
            for category, entries in self.templates.items()
            for name, info in entries.items()
            if matches(name, info)
        ]

# 预定义模板库
# Module-level library pre-populated with three starter templates.
library = PromptLibrary()

# Zero-shot classification: placeholders {text} and {categories}.
library.register_template(
    category="classification",
    name="zero_shot",
    template="""对以下文本进行分类:

文本:{text}

类别:{categories}

分类结果:""",
    description="零样本分类模板",
)

# Entity extraction: placeholders {text} and {entity_types}.
library.register_template(
    category="extraction",
    name="entity",
    template="""从以下文本中提取实体:

文本:{text}

实体类型:{entity_types}

提取结果:""",
    description="实体提取模板",
)

# Summarization: placeholders {content}, {length} and {focus}.
library.register_template(
    category="generation",
    name="summary",
    template="""请对以下内容进行总结:

内容:{content}

总结要求:
1. 长度:{length}
2. 重点:{focus}

总结:""",
    description="文本摘要模板",
)

Prompt监控与分析

监控Prompt的使用情况和效果,帮助持续优化。指标包括:调用量、延迟、成本、用户反馈和成功率。

# Prompt监控
from collections import defaultdict
import time

class PromptMonitor:
    """Accumulates per-template usage counters and derives simple statistics.

    ``usage_stats`` maps template_id to a dict with ``count``,
    ``total_latency`` (ms), ``errors`` and ``feedback_scores``.
    """

    def __init__(self):
        self.usage_stats = defaultdict(lambda: {
            "count": 0,
            "total_latency": 0,
            "errors": 0,
            "feedback_scores": []
        })

    def record_usage(self, template_id: str, latency_ms: float,
                     success: bool, feedback_score: Optional[float] = None):
        """Record one call to ``template_id``.

        Args:
            template_id: Template that was used.
            latency_ms: Call latency in milliseconds.
            success: ``False`` counts the call as an error.
            feedback_score: Optional user rating; ``None`` records nothing.
        """
        stats = self.usage_stats[template_id]
        stats["count"] += 1
        stats["total_latency"] += latency_ms

        if not success:
            stats["errors"] += 1

        if feedback_score is not None:
            stats["feedback_scores"].append(feedback_score)

    def get_stats(self, template_id: str) -> Dict:
        """Return averaged statistics for one template.

        Returns:
            ``{"count": 0}`` when nothing has been recorded for the template;
            otherwise ``count``, ``avg_latency_ms``, ``error_rate`` and
            ``avg_feedback`` (``None`` without feedback).
        """
        # .get() does not trigger the defaultdict factory, so an unknown id
        # yields None here and is not added to usage_stats by a read.
        stats = self.usage_stats.get(template_id)
        if not stats or stats["count"] == 0:
            return {"count": 0}

        feedback = stats["feedback_scores"]
        return {
            "count": stats["count"],
            "avg_latency_ms": stats["total_latency"] / stats["count"],
            "error_rate": stats["errors"] / stats["count"],
            "avg_feedback": sum(feedback) / len(feedback) if feedback else None
        }

    def get_dashboard(self) -> Dict:
        """Return totals plus per-template statistics for every known template."""
        return {
            "total_templates": len(self.usage_stats),
            "total_calls": sum(s["count"] for s in self.usage_stats.values()),
            "templates": {
                template_id: self.get_stats(template_id)
                for template_id in self.usage_stats
            }
        }

    def detect_anomalies(self, template_id: str,
                         threshold: float = 2.0, *,
                         max_error_rate: float = 0.1,
                         max_latency_ms: float = 5000,
                         min_samples: int = 10) -> List[Dict]:
        """Flag a template whose error rate or average latency is too high.

        Args:
            template_id: Template to inspect.
            threshold: Accepted for interface compatibility; currently unused.
            max_error_rate: Error-rate limit (default 10%).
            max_latency_ms: Average-latency limit in ms (default 5 seconds).
            min_samples: Minimum recorded calls before anything is reported.

        Returns:
            List of ``{"type", "value", "threshold"}`` dicts; empty when the
            template is unknown, has too few samples, or looks healthy.
        """
        stats = self.usage_stats.get(template_id)
        if not stats or stats["count"] < min_samples:
            return []

        anomalies = []

        error_rate = stats["errors"] / stats["count"]
        if error_rate > max_error_rate:
            anomalies.append({
                "type": "high_error_rate",
                "value": error_rate,
                "threshold": max_error_rate
            })

        avg_latency = stats["total_latency"] / stats["count"]
        if avg_latency > max_latency_ms:
            anomalies.append({
                "type": "high_latency",
                "value": avg_latency,
                "threshold": max_latency_ms
            })

        return anomalies