结构化输出
---
title: "结构化输出"
description: "LLM结构化输出技术,包括JSON Schema和约束解码"
tags: ["结构化输出", "JSON Schema", "约束解码", "输出格式", "数据提取"]
category: "llm"
icon: "🧠"
---
# 结构化输出
结构化输出是确保LLM输出符合预定义格式的关键技术。在生产环境中,LLM的输出需要被程序解析和处理,结构化输出保证了输出的可靠性和一致性。JSON Schema和约束解码是实现结构化输出的主要方法。
## JSON Schema定义
### 基础Schema定义
from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum
class SentimentType(str, Enum):
    """Closed set of sentiment labels used by the analysis schema.

    The ``str`` mixin makes every member compare equal to its string value.
    """

    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"
class Entity(BaseModel):
    """One entity recognised in the analysed text."""

    # Surface form of the entity.
    name: str = Field(description="实体名称")
    # Free-form category label (the description lists person, place, organisation).
    type: str = Field(description="实体类型,如人名、地名、组织")
    # Score restricted to the closed interval [0, 1].
    confidence: float = Field(
        ge=0,
        le=1,
        description="置信度",
    )
class AnalysisResult(BaseModel):
    """Top-level result of a text analysis: sentiment, summary, entities, keywords."""

    sentiment: SentimentType = Field(description="情感倾向")
    # Summary text is capped at 200 characters.
    summary: str = Field(
        max_length=200,
        description="摘要",
    )
    entities: List[Entity] = Field(description="识别到的实体")
    # Between 1 and 10 keywords are accepted.
    keywords: List[str] = Field(
        min_length=1,
        max_length=10,
        description="关键词",
    )
    # Overall score restricted to the closed interval [0, 1].
    confidence: float = Field(
        ge=0,
        le=1,
        description="整体置信度",
    )
# json.dumps is used below; without this import the snippet raises NameError.
import json

# Export the model as a JSON Schema dict and pretty-print it, keeping
# non-ASCII descriptions readable instead of \u-escaped.
schema = AnalysisResult.model_json_schema()
print("Schema定义:")
print(json.dumps(schema, indent=2, ensure_ascii=False))
### 动态Schema生成
from typing import Dict, Any, Type
from pydantic import create_model, Field
class SchemaGenerator:
    """Build Pydantic model classes at runtime from a plain field specification."""

    @staticmethod
    def create_extraction_schema(fields: Dict[str, Dict[str, Any]]) -> Type[BaseModel]:
        """Create a model class named ``DynamicModel`` from ``fields``.

        Args:
            fields: Mapping of field name to a config dict. Recognised keys:
                ``type`` (defaults to ``str``), ``description`` (defaults to
                ``""``), ``required`` (defaults to ``True``) and ``default``.

        Returns:
            A new ``BaseModel`` subclass with one field per entry. Required
            fields carry no default; non-required fields default to the given
            ``default``, or to ``None`` (typed ``Optional``) when none is given.
        """
        field_definitions = {}
        for field_name, field_config in fields.items():
            field_type = field_config.get("type", str)
            description = field_config.get("description", "")
            required = field_config.get("required", True)
            default = field_config.get("default")

            if required:
                field_definitions[field_name] = (
                    field_type,
                    Field(description=description),
                )
            elif default is None:
                # Optional[...] only widens the type; the explicit default is
                # what actually lets callers omit the field.
                field_definitions[field_name] = (
                    Optional[field_type],
                    Field(default=None, description=description),
                )
            else:
                # Honour the configured default so the field is truly optional.
                # Per the Pydantic field docs, unhashable defaults such as
                # lists are copied for each instance.
                field_definitions[field_name] = (
                    field_type,
                    Field(default=default, description=description),
                )
        return create_model("DynamicModel", **field_definitions)
# Field specification for the demo model: two required and two optional fields.
field_specs = {
    "title": {"type": str, "description": "文档标题", "required": True},
    "author": {"type": str, "description": "作者", "required": False},
    "word_count": {"type": int, "description": "字数", "required": True},
    "tags": {"type": list, "description": "标签列表", "required": False, "default": []},
}

generator = SchemaGenerator()
# Build the model class, then print the JSON Schema it produces.
schema = generator.create_extraction_schema(field_specs)
print("动态Schema:", schema.model_json_schema())
## OpenAI结构化输出
from openai import OpenAI
import json
# Create the OpenAI client with default constructor arguments.
client = OpenAI()
# Structured-output request format: the reply must match this JSON Schema.
# Strict mode requires every object in the schema to list all of its
# properties under "required" and to set "additionalProperties" to False;
# the API rejects the schema otherwise.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "extraction_result",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
                "entities": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "type": {"type": "string"},
                        },
                        "required": ["name", "type"],
                        "additionalProperties": False,
                    },
                },
                "summary": {"type": "string"},
            },
            "required": ["sentiment", "entities", "summary"],
            "additionalProperties": False,
        },
    },
}
# Request a completion whose content is constrained by ``response_format``.
result = client.chat.completions.create(
    model="gpt-4o-2024-08-06",
    messages=[{"role": "user", "content": "分析这段文本的情感和实体"}],
    response_format=response_format,
)
message = result.choices[0].message

# When the model refuses, the message carries a refusal text instead of a
# JSON payload; feeding the empty content to json.loads would raise.
refusal = getattr(message, "refusal", None)
if refusal:
    print("模型拒绝回答:", refusal)
else:
    parsed = json.loads(message.content)
    print("解析结果:", json.dumps(parsed, ensure_ascii=False, indent=2))
## 约束解码
### Outlines集成
import outlines
from pydantic import BaseModel
from typing import List
class Recipe(BaseModel):
    """Schema for a recipe: name, ingredient list, ordered steps, cooking time."""

    name: str
    ingredients: List[str]
    steps: List[str]
    # Integer duration; the unit is not specified by the schema.
    cooking_time: int
# Load the model through Outlines' transformers backend.
model = outlines.models.transformers("Qwen/Qwen2.5-7B-Instruct")

# Build a generator whose output is constrained to the Recipe schema.
generator = outlines.generate.json(model, Recipe)

recipe_prompt = "请提供一个简单的番茄炒蛋食谱"
result = generator(recipe_prompt)
print("结构化输出:", result)
## LangChain结构化输出
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
class MovieReview(BaseModel):
    """Schema for a structured movie review returned by the LLM."""

    title: str = Field(description="电影标题")
    # Rating restricted to the closed interval [0, 10].
    rating: float = Field(
        ge=0,
        le=10,
        description="评分0-10",
    )
    genres: List[str] = Field(description="电影类型")
    summary: str = Field(description="一句话总结")
# Wrap the chat model so its replies are returned as MovieReview objects.
llm = ChatOpenAI(model="gpt-4o")
structured_llm = llm.with_structured_output(MovieReview)

# Pipe the prompt template into the structured model.
prompt = ChatPromptTemplate.from_template("请评价电影《{movie}》")
chain = prompt | structured_llm

result = chain.invoke({"movie": "流浪地球"})

genre_text = ", ".join(result.genres)
print(f"电影: {result.title}")
print(f"评分: {result.rating}")
print(f"类型: {genre_text}")
## 错误处理与重试
import json
from typing import Optional
from pydantic import ValidationError
class StructuredOutputHandler:
    """Call an LLM and validate its reply against a schema, retrying on validation errors."""

    def __init__(self, max_retries: int = 3):
        """Store the maximum number of generation attempts."""
        self.max_retries = max_retries

    async def generate_with_retry(self, llm, prompt: str, schema: type) -> Optional[object]:
        """Generate a reply and parse it with ``schema.model_validate_json``.

        Args:
            llm: Object exposing an awaitable ``ainvoke(prompt)`` whose result
                has a ``content`` attribute holding the JSON text.
            prompt: Prompt sent unchanged on every attempt.
            schema: Class providing ``model_validate_json``.

        Returns:
            The validated instance produced by ``schema``. ``None`` when every
            attempt fails validation, when any other exception is raised
            (those are not retried), or when ``max_retries`` is below 1.
        """
        for attempt in range(self.max_retries):
            try:
                response = await llm.ainvoke(prompt)
                return schema.model_validate_json(response.content)
            except ValidationError as e:
                # Malformed output: report it and let the loop try again. After
                # the final attempt the loop ends and None is returned below.
                print(f"验证失败 (尝试 {attempt + 1}/{self.max_retries}): {e}")
            except Exception as e:
                # Non-validation failures abort immediately without a retry.
                print(f"生成失败: {e}")
                return None
        return None


handler = StructuredOutputHandler(max_retries=3)
## 应用场景
结构化输出广泛应用于数据提取、表单填写、API响应生成、报告生成等场景。选择合适的实现方式需要考虑模型支持、Schema复杂度和性能需求。