Prompt工程架构:模板管理、版本控制与评估
Prompt工程架构:模板管理、版本控制与评估
Prompt管理系统概述
随着LLM应用的普及,Prompt已成为一种重要的代码资产。Prompt管理系统需要支持模板化、版本控制、A/B测试、性能评估和协作开发,确保Prompt质量和可追溯性。
# Prompt管理系统核心
import inspect
import json
import re
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from itertools import zip_longest
from typing import Dict, List, Optional, Any, Callable
@dataclass
class PromptTemplate:
    """A named prompt template together with its current version.

    The first five fields are required and positional; the remaining ones
    are optional. ``tags`` and ``metadata`` get a fresh container per
    instance, and ``created_at`` defaults to ``datetime.now()`` at
    construction time.
    """

    # --- required ---
    template_id: str
    name: str
    template: str  # the template text itself
    version: str
    variables: List[str]

    # --- optional ---
    description: str = ""
    author: str = ""
    tags: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)
    is_active: bool = True
@dataclass
class PromptVersion:
    """One recorded version of a template's text.

    ``metrics`` gets a fresh dict per instance and ``created_at`` defaults
    to ``datetime.now()`` at construction time.
    """

    # --- required ---
    version_id: str
    template_id: str
    version: str
    template: str  # the template text as of this version
    changelog: str

    # --- optional ---
    metrics: Dict[str, float] = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)
class PromptManager:
    """In-memory registry of prompt templates and their version history.

    ``templates`` maps a template id to its current ``PromptTemplate``;
    ``versions`` maps the same id to the list of ``PromptVersion`` records,
    oldest first. Every version, including the current one, appears in that
    list exactly once with the changelog that introduced it.
    """

    def __init__(self):
        self.templates: Dict[str, "PromptTemplate"] = {}
        self.versions: Dict[str, List["PromptVersion"]] = {}

    @staticmethod
    def _short_id(taken=()) -> str:
        """Return an 8-character id that is not already in ``taken``."""
        while True:
            candidate = str(uuid.uuid4())[:8]
            # 8 hex characters can collide; retry instead of overwriting.
            if candidate not in taken:
                return candidate

    def create_template(self, name: str, template: str,
                        variables: List[str], **kwargs) -> "PromptTemplate":
        """Create a template at version ``1.0.0`` and record that version.

        Args:
            name: Template name.
            template: Template text.
            variables: Variable names declared for the template.
            **kwargs: Passed through to ``PromptTemplate``.

        Returns:
            The newly registered template.
        """
        template_id = self._short_id(self.templates)
        prompt = PromptTemplate(
            template_id=template_id,
            name=name,
            template=template,
            version="1.0.0",
            variables=variables,
            **kwargs
        )
        self.templates[template_id] = prompt
        self.versions[template_id] = [PromptVersion(
            version_id=self._short_id(),
            template_id=template_id,
            version="1.0.0",
            template=template,
            changelog="Initial version"
        )]
        return prompt

    def update_template(self, template_id: str, new_template: str,
                        changelog: str = "") -> "PromptTemplate":
        """Replace the template text and record it as a new patch version.

        Args:
            template_id: Id of the template to update.
            new_template: New template text.
            changelog: Reason for the change; defaults to
                ``"Updated template"`` when empty.

        Returns:
            The (mutated) current template.

        Raises:
            ValueError: If ``template_id`` is unknown.
        """
        template = self.templates.get(template_id)
        if not template:
            raise ValueError(f"Template not found: {template_id}")

        history = self.versions.setdefault(template_id, [])
        if all(v.version != template.version for v in history):
            # The template was registered without a history entry; snapshot
            # it now so the text being replaced is not lost.
            history.append(PromptVersion(
                version_id=self._short_id(),
                template_id=template_id,
                version=template.version,
                template=template.template,
                changelog="Snapshot before update",
                created_at=template.created_at
            ))

        # Bump the patch component of "major.minor.patch".
        parts = template.version.split(".")
        parts[2] = str(int(parts[2]) + 1)
        new_version = ".".join(parts)

        template.template = new_template
        template.version = new_version
        template.created_at = datetime.now()

        # Record the NEW version with its own changelog. Archiving the old
        # version again here would duplicate it and leave the current
        # version missing from the history.
        history.append(PromptVersion(
            version_id=self._short_id(),
            template_id=template_id,
            version=new_version,
            template=new_template,
            changelog=changelog or "Updated template",
            created_at=template.created_at
        ))
        return template

    def get_template(self, template_id: str,
                     version: Optional[str] = None) -> Optional["PromptTemplate"]:
        """Return the current template, or a view of a recorded version.

        Args:
            template_id: Id of the template.
            version: Version to fetch; ``None`` (or the current version)
                returns the live template object.

        Returns:
            The template, or ``None`` if the id or the version is unknown.
            A historical version is returned as a new ``PromptTemplate``
            that keeps the template's descriptive fields.
        """
        template = self.templates.get(template_id)
        if not template:
            return None
        if not version or version == template.version:
            return template

        for recorded in self.versions.get(template_id, []):
            if recorded.version == version:
                # Copy the containers so edits to the historical view cannot
                # leak into the live template.
                return PromptTemplate(
                    template_id=template_id,
                    name=template.name,
                    template=recorded.template,
                    version=recorded.version,
                    variables=list(template.variables),
                    description=template.description,
                    author=template.author,
                    tags=list(template.tags),
                    metadata=dict(template.metadata),
                    created_at=recorded.created_at,
                    is_active=template.is_active
                )
        return None

    def render(self, template_id: str, variables: Dict[str, Any]) -> str:
        """Fill ``{name}`` placeholders of the current template.

        All placeholders are substituted in a single pass, so a value that
        itself contains ``{other}`` is inserted verbatim and never expanded
        again. Placeholders without a matching variable are left untouched.

        Raises:
            ValueError: If ``template_id`` is unknown.
        """
        template = self.get_template(template_id)
        if not template:
            raise ValueError(f"Template not found: {template_id}")
        if not variables:
            return template.template

        replacements = {
            f"{{{var_name}}}": str(var_value)
            for var_name, var_value in variables.items()
        }
        # Longest token first so overlapping tokens resolve deterministically.
        tokens = sorted(replacements, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(token) for token in tokens))
        return pattern.sub(lambda m: replacements[m.group(0)], template.template)

    def list_templates(self, tags: Optional[List[str]] = None) -> List["PromptTemplate"]:
        """Return all templates, or those carrying at least one of ``tags``."""
        templates = list(self.templates.values())
        if tags:
            templates = [t for t in templates
                         if any(tag in t.tags for tag in tags)]
        return templates
Prompt版本控制
版本控制是Prompt管理的核心功能,支持版本历史、差异比较、回滚和标签管理。每次修改都应记录变更原因和性能影响。
# Prompt版本控制器
class PromptVersionControl:
    """Commit, diff, rollback, tag and history operations over a manager.

    ``commits`` is an in-memory log with one dict per successful
    ``commit`` call.
    """

    def __init__(self, manager: "PromptManager"):
        self.manager = manager
        self.commits: List[Dict[str, Any]] = []

    def _find_version(self, template_id: str, version: str):
        """Return the object holding ``version``'s text, or ``None``.

        The recorded history is searched first. The live template is used
        as a fallback so the current version can be diffed or rolled back
        to even when it has no history entry of its own.
        """
        for recorded in self.manager.versions.get(template_id, []):
            if recorded.version == version:
                return recorded
        current = self.manager.templates.get(template_id)
        if current is not None and current.version == version:
            return current
        return None

    def commit(self, template_id: str, changes: Dict,
               message: str, author: str = "") -> str:
        """Apply ``changes`` as a new version and log the commit.

        Args:
            template_id: Id of the template to change.
            changes: Only the ``"template"`` key is read; without it the
                current text is committed again unchanged.
            message: Used as the changelog of the new version.
            author: Stored in the commit log entry.

        Returns:
            The 8-character commit id.

        Raises:
            ValueError: If ``template_id`` is unknown.
        """
        template = self.manager.templates.get(template_id)
        if not template:
            raise ValueError(f"Template not found: {template_id}")

        new_template = changes.get("template", template.template)
        updated = self.manager.update_template(
            template_id,
            new_template,
            changelog=message
        )

        commit_id = str(uuid.uuid4())[:8]
        self.commits.append({
            "commit_id": commit_id,
            "template_id": template_id,
            "version": updated.version,
            "message": message,
            "author": author,
            "created_at": datetime.now().isoformat()
        })
        print(f"Committed {commit_id}: {message}")
        return commit_id

    def diff(self, template_id: str, version1: str,
             version2: str) -> Dict:
        """Compare two versions line by line (positional, not an LCS diff).

        Returns:
            ``{"error": "Version not found"}`` if either version is unknown;
            otherwise a dict with ``version1``, ``version2``, ``changes``
            (one entry per differing 1-based line, with ``""`` standing in
            for a line missing on one side) and ``summary``.
        """
        v1 = self._find_version(template_id, version1)
        v2 = self._find_version(template_id, version2)
        if v1 is None or v2 is None:
            return {"error": "Version not found"}

        line_pairs = zip_longest(
            v1.template.split("\n"), v2.template.split("\n"), fillvalue=""
        )
        changes = [
            {"line": number, "old": old, "new": new}
            for number, (old, new) in enumerate(line_pairs, start=1)
            if old != new
        ]
        return {
            "version1": version1,
            "version2": version2,
            "changes": changes,
            "summary": f"{len(changes)} lines changed"
        }

    def rollback(self, template_id: str, target_version: str) -> bool:
        """Re-publish the text of ``target_version`` as a new version.

        Returns:
            ``True`` on success, ``False`` if the template or the target
            version is unknown.
        """
        if not self.manager.templates.get(template_id):
            return False
        target = self._find_version(template_id, target_version)
        if target is None:
            return False

        # History is append-only: a rollback is a new version carrying the
        # old text, not a rewrite of the past.
        self.manager.update_template(
            template_id,
            target.template,
            changelog=f"Rollback to version {target_version}"
        )
        return True

    def tag(self, template_id: str, version: str, tag_name: str):
        """Point ``tag_name`` at ``version`` in the template's metadata.

        The mapping is stored under ``metadata["tags"]``. Unknown template
        ids are ignored.
        """
        template = self.manager.templates.get(template_id)
        if template:
            template.metadata.setdefault("tags", {})[tag_name] = version

    def get_history(self, template_id: str) -> List[Dict]:
        """Return the recorded versions as dicts, newest first."""
        history = [
            {
                "version": v.version,
                "changelog": v.changelog,
                "created_at": v.created_at.isoformat(),
                "metrics": v.metrics
            }
            for v in self.manager.versions.get(template_id, [])
        ]
        return sorted(history, key=lambda entry: entry["created_at"], reverse=True)
Prompt评估框架
Prompt评估包括多个维度:准确性、相关性、一致性、安全性和延迟。自动化评估框架需要支持批量测试、统计分析和性能基准比较。
# Prompt评估器
import statistics
from typing import Callable
class PromptEvaluator:
    """Scores a prompt template against test cases using an LLM client.

    The client must expose a synchronous ``generate(prompt)`` that returns
    text. ``manager`` must expose ``render(template_id, variables)``; it can
    be passed to the constructor or assigned to ``self.manager`` afterwards.
    """

    # Substrings that make a response count as unsafe.
    UNSAFE_PATTERNS = ("暴力", "歧视", "违法", "有害")
    # First signed decimal number in a piece of text.
    _NUMBER = re.compile(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)")

    def __init__(self, llm_client, manager=None):
        self.llm = llm_client
        # _evaluate_single renders through this; it used to be read without
        # ever being assigned.
        self.manager = manager
        self.evaluation_metrics = {}

    def evaluate(self, template_id: str, test_cases: List[Dict],
                 metrics: Optional[List[str]] = None) -> Dict:
        """Evaluate every test case and aggregate the scores.

        Args:
            template_id: Id of the template to render.
            test_cases: Dicts with ``"variables"`` and ``"input"`` keys and
                an optional ``"expected"`` key.
            metrics: Subset of ``"accuracy"``, ``"relevance"``, ``"safety"``;
                ``None`` or empty computes all three.

        Returns:
            Dict with ``template_id``, ``test_cases`` (count), ``results``
            and ``summary``.
        """
        results = [
            self._evaluate_single(template_id, test_case, metrics)
            for test_case in test_cases
        ]
        return {
            "template_id": template_id,
            "test_cases": len(test_cases),
            "results": results,
            "summary": self._aggregate_results(results)
        }

    def _evaluate_single(self, template_id: str, test_case: Dict,
                         metrics: Optional[List[str]] = None) -> Dict:
        """Render the prompt, call the LLM once and score the response.

        Raises:
            RuntimeError: If no manager has been configured.
        """
        if self.manager is None:
            raise RuntimeError(
                "PromptEvaluator.manager is not set; pass manager=... to the "
                "constructor or assign evaluator.manager before evaluating"
            )
        prompt = self.manager.render(template_id, test_case["variables"])
        response = self.llm.generate(prompt)

        scores = {}
        if not metrics or "accuracy" in metrics:
            scores["accuracy"] = self._check_accuracy(response, test_case.get("expected"))
        if not metrics or "relevance" in metrics:
            scores["relevance"] = self._check_relevance(response, test_case["input"])
        if not metrics or "safety" in metrics:
            scores["safety"] = self._check_safety(response)

        return {
            "input": test_case["input"],
            "response": response,
            "expected": test_case.get("expected"),
            "scores": scores
        }

    def _check_accuracy(self, response: str, expected: str) -> float:
        """Return the fraction of expected words present in the response.

        Words are whitespace-separated and compared case-insensitively.
        Returns 1.0 when there is no expectation.
        """
        if not expected:
            return 1.0
        expected_words = set(expected.lower().split())
        response_words = set(response.lower().split())
        return len(expected_words & response_words) / max(len(expected_words), 1)

    @classmethod
    def _parse_score(cls, raw, default: float = 0.5) -> float:
        """Extract the first number from ``raw`` and clamp it to [0, 1].

        Returns ``default`` when ``raw`` is not a string or holds no number.
        """
        if not isinstance(raw, str):
            return default
        match = cls._NUMBER.search(raw)
        if match is None:
            return default
        return min(1.0, max(0.0, float(match.group(0))))

    def _check_relevance(self, response: str, query: str) -> float:
        """Ask the LLM to rate relevance; 0.5 when the reply is unusable.

        The reply is searched for its first number, so a reply that echoes
        a label around the score still parses. Out-of-range values are
        clamped to [0, 1].
        """
        prompt = (
            "评估以下回答与问题的相关性(0-1分):\n"
            f"问题:{query}\n"
            f"回答:{response}\n"
            "相关性分数:"
        )
        return self._parse_score(self.llm.generate(prompt))

    def _check_safety(self, response: str) -> float:
        """Return 0.0 if the response contains an unsafe substring, else 1.0."""
        if any(pattern in response for pattern in self.UNSAFE_PATTERNS):
            return 0.0
        return 1.0

    def _aggregate_results(self, results: List[Dict]) -> Dict:
        """Return mean/std/min/max per metric across ``results``.

        ``std`` is the sample standard deviation, or 0 for a single score.
        """
        scores_by_metric = {}
        for result in results:
            for metric, score in result["scores"].items():
                scores_by_metric.setdefault(metric, []).append(score)

        return {
            metric: {
                "mean": statistics.mean(scores),
                "std": statistics.stdev(scores) if len(scores) > 1 else 0,
                "min": min(scores),
                "max": max(scores)
            }
            for metric, scores in scores_by_metric.items()
        }
# Prompt基准测试
class PromptBenchmark:
    """Runs one test suite against several templates and ranks them."""

    def __init__(self, evaluator: "PromptEvaluator"):
        self.evaluator = evaluator
        self.benchmarks = {}

    def run_benchmark(self, template_ids: List[str],
                      test_suite: List[Dict]) -> Dict:
        """Evaluate each template and return summaries plus a ranking.

        Returns:
            ``{"results": {template_id: summary}, "rankings": [...]}``.
        """
        summaries = {
            template_id: self.evaluator.evaluate(template_id, test_suite)["summary"]
            for template_id in template_ids
        }
        return {
            "results": summaries,
            "rankings": self._rank_templates(summaries)
        }

    def _rank_templates(self, results: Dict) -> List[Dict]:
        """Order templates by the average of their per-metric means.

        A template whose summary is empty gets an overall score of 0.
        """
        ranked = []
        for template_id, summary in results.items():
            means = [stats["mean"] for stats in summary.values()]
            ranked.append({
                "template_id": template_id,
                "overall_score": sum(means) / max(len(means), 1),
                "details": summary
            })
        return sorted(ranked, key=lambda entry: entry["overall_score"], reverse=True)
Prompt优化与最佳实践
Prompt优化是持续改进的过程,包括结构化设计、示例选择、参数调优和失败分析。建立最佳实践库帮助团队快速编写高质量Prompt。
# Prompt优化器
class PromptOptimizer:
    """Iteratively rewrites a template with an LLM and keeps the best one.

    ``llm_client.generate(prompt)`` may be a coroutine function or a plain
    function; both are supported. The evaluator must expose ``manager`` and
    ``evaluate``.
    """

    def __init__(self, llm_client, evaluator: "PromptEvaluator"):
        self.llm = llm_client
        self.evaluator = evaluator

    async def _ask_llm(self, prompt: str) -> str:
        """Call the LLM, awaiting the result only if it is awaitable."""
        reply = self.llm.generate(prompt)
        if inspect.isawaitable(reply):
            reply = await reply
        return reply

    def _accuracy(self, template_id: str, test_cases: List[Dict]) -> float:
        """Return the mean accuracy of the template as currently stored."""
        result = self.evaluator.evaluate(template_id, test_cases)
        return result["summary"].get("accuracy", {}).get("mean", 0)

    async def optimize(self, template_id: str,
                       test_cases: List[Dict],
                       iterations: int = 5) -> Dict:
        """Search for a better-scoring template text.

        The stored template is scored first as a baseline. Each iteration
        then asks the LLM to improve the best text found so far, stores the
        candidate as a new version and scores it. When the loop ends the
        store is left on the best-scoring text, not the last candidate.

        Args:
            template_id: Id of the template to optimise.
            test_cases: Passed to the evaluator.
            iterations: Number of candidates to try.

        Returns:
            Dict with ``original`` and ``optimized`` texts,
            ``score_improvement`` (best minus baseline, never negative),
            and the absolute ``baseline_score`` and ``best_score``.

        Raises:
            ValueError: If ``template_id`` is unknown.
        """
        manager = self.evaluator.manager
        template = manager.get_template(template_id)
        if template is None:
            raise ValueError(f"Template not found: {template_id}")

        # Capture the text now: update_template may mutate this same object,
        # so reading template.template later would return a candidate.
        original_template = template.template
        baseline_score = self._accuracy(template_id, test_cases)

        best_score = baseline_score
        best_template = original_template
        stored_template = original_template

        for i in range(iterations):
            # Build on the best text so far rather than restarting from the
            # original every round.
            suggestions = await self._generate_suggestions(
                best_template, test_cases
            )
            candidate = await self._apply_suggestions(
                best_template, suggestions
            )

            # The evaluator renders from the store, so the candidate has to
            # be stored before it can be scored.
            manager.update_template(template_id, candidate)
            stored_template = candidate
            score = self._accuracy(template_id, test_cases)

            if score > best_score:
                best_score = score
                best_template = candidate
            print(f"Iteration {i+1}: score = {score:.3f}")

        if stored_template != best_template:
            # The last candidate lost; put the winner back.
            manager.update_template(
                template_id, best_template,
                changelog="Restore best-scoring template after optimization"
            )

        return {
            "original": original_template,
            "optimized": best_template,
            "score_improvement": best_score - baseline_score,
            "baseline_score": baseline_score,
            "best_score": best_score
        }

    async def _generate_suggestions(self, template: str,
                                    test_cases: List[Dict]) -> List[str]:
        """Ask the LLM for suggestions; one per non-empty reply line.

        ``test_cases`` is accepted but not used in the prompt.
        """
        prompt = (
            "分析以下Prompt模板,提供优化建议:\n"
            "当前模板:\n"
            f"{template}\n"
            "请提供3个具体的改进建议:"
        )
        response = await self._ask_llm(prompt)
        return [line.strip() for line in response.split("\n") if line.strip()]

    async def _apply_suggestions(self, template: str,
                                 suggestions: List[str]) -> str:
        """Ask the LLM to rewrite ``template`` following ``suggestions``."""
        bullet_list = "\n".join(f"- {s}" for s in suggestions)
        prompt = (
            "根据以下建议改进Prompt模板:\n"
            "当前模板:\n"
            f"{template}\n"
            "改进建议:\n"
            f"{bullet_list}\n"
            "改进后的模板:"
        )
        return await self._ask_llm(prompt)
# Prompt模板库
class PromptLibrary:
    """Category-organised store of reusable templates and best practices."""

    def __init__(self):
        self.templates = {}
        self.best_practices = []

    def register_template(self, category: str, name: str,
                          template: str, description: str):
        """Store a template under ``category``/``name`` with a zero usage count.

        Registering an existing name replaces the entry and resets its count.
        """
        bucket = self.templates.setdefault(category, {})
        bucket[name] = {
            "template": template,
            "description": description,
            "usage_count": 0
        }

    def get_template(self, category: str, name: str) -> Optional[str]:
        """Return the template text and count the access, or ``None`` if absent."""
        entry = self.templates.get(category, {}).get(name)
        if entry is None:
            return None
        entry["usage_count"] += 1
        return entry["template"]

    def add_best_practice(self, practice: str):
        """Append ``practice`` to the best-practice list."""
        self.best_practices.append(practice)

    def search(self, query: str) -> List[Dict]:
        """Return entries whose name or description contains ``query``.

        Matching is a case-insensitive substring test. Results carry
        ``category``, ``name``, ``description`` and ``usage_count``.
        """
        def matches(name, info):
            needle = query.lower()
            return needle in name.lower() or needle in info["description"].lower()

        return [
            {
                "category": category,
                "name": name,
                "description": info["description"],
                "usage_count": info["usage_count"]
            }
            for category, group in self.templates.items()
            for name, info in group.items()
            if matches(name, info)
        ]
# 预定义模板库
# Module-level library pre-populated with three built-in templates.
library = PromptLibrary()

# Zero-shot text classification.
library.register_template(
    category="classification",
    name="zero_shot",
    template="""对以下文本进行分类:
文本:{text}
类别:{categories}
分类结果:""",
    description="零样本分类模板",
)

# Entity extraction.
library.register_template(
    category="extraction",
    name="entity",
    template="""从以下文本中提取实体:
文本:{text}
实体类型:{entity_types}
提取结果:""",
    description="实体提取模板",
)

# Summarisation with length and focus requirements.
library.register_template(
    category="generation",
    name="summary",
    template="""请对以下内容进行总结:
内容:{content}
总结要求:
1. 长度:{length}
2. 重点:{focus}
总结:""",
    description="文本摘要模板",
)
Prompt监控与分析
监控Prompt的使用情况和效果,帮助持续优化。指标包括:调用量、延迟、成本、用户反馈和成功率。
# Prompt监控
from collections import defaultdict
import time
class PromptMonitor:
    """Accumulates per-template usage counters and derives simple statistics.

    ``usage_stats`` maps a template id to its raw counters. Read-only
    queries never create an entry for an id that has not been recorded.
    """

    def __init__(self):
        self.usage_stats = defaultdict(lambda: {
            "count": 0,
            "total_latency": 0,
            "errors": 0,
            "feedback_scores": []
        })

    def record_usage(self, template_id: str, latency_ms: float,
                     success: bool, feedback_score: Optional[float] = None):
        """Record one call: its latency, outcome and optional feedback score."""
        stats = self.usage_stats[template_id]
        stats["count"] += 1
        stats["total_latency"] += latency_ms
        if not success:
            stats["errors"] += 1
        if feedback_score is not None:
            stats["feedback_scores"].append(feedback_score)

    def get_stats(self, template_id: str) -> Dict:
        """Return derived statistics for one template.

        Returns:
            ``{"count": 0}`` when nothing has been recorded for the id;
            otherwise ``count``, ``avg_latency_ms``, ``error_rate`` and
            ``avg_feedback`` (``None`` without feedback scores).
        """
        # .get() does not trigger the defaultdict factory, so an unknown id
        # yields None here instead of a KeyError further down.
        stats = self.usage_stats.get(template_id)
        if not stats or stats["count"] == 0:
            return {"count": 0}

        feedback = stats["feedback_scores"]
        return {
            "count": stats["count"],
            "avg_latency_ms": stats["total_latency"] / stats["count"],
            "error_rate": stats["errors"] / stats["count"],
            "avg_feedback": sum(feedback) / len(feedback) if feedback else None
        }

    def get_dashboard(self) -> Dict:
        """Return totals plus the per-template statistics of every template."""
        return {
            "total_templates": len(self.usage_stats),
            "total_calls": sum(s["count"] for s in self.usage_stats.values()),
            "templates": {
                template_id: self.get_stats(template_id)
                for template_id in self.usage_stats
            }
        }

    def detect_anomalies(self, template_id: str,
                         threshold: float = 2.0,
                         error_rate_threshold: float = 0.1,
                         latency_threshold_ms: float = 5000) -> List[Dict]:
        """Flag a high error rate or high average latency.

        Args:
            template_id: Id of the template to inspect.
            threshold: Accepted for backward compatibility; not used.
            error_rate_threshold: Error rate above which
                ``high_error_rate`` is reported.
            latency_threshold_ms: Average latency above which
                ``high_latency`` is reported.

        Returns:
            A list of anomaly dicts (``type``, ``value``, ``threshold``);
            empty when fewer than 10 calls have been recorded.
        """
        stats = self.usage_stats.get(template_id)
        if not stats or stats["count"] < 10:
            # Too few samples for the rates to mean anything.
            return []

        anomalies = []
        error_rate = stats["errors"] / stats["count"]
        if error_rate > error_rate_threshold:
            anomalies.append({
                "type": "high_error_rate",
                "value": error_rate,
                "threshold": error_rate_threshold
            })

        avg_latency = stats["total_latency"] / stats["count"]
        if avg_latency > latency_threshold_ms:
            anomalies.append({
                "type": "high_latency",
                "value": avg_latency,
                "threshold": latency_threshold_ms
            })
        return anomalies