← 返回首页
🧠

结构化输出

📂 llm ⏱ 3 min 456 words

---
title: "结构化输出"
description: "LLM结构化输出技术,包括JSON Schema和约束解码"
tags: ["结构化输出", "JSON Schema", "约束解码", "输出格式", "数据提取"]
category: "llm"
icon: "🧠"
---

结构化输出

结构化输出是确保LLM输出符合预定义格式的关键技术。在生产环境中,LLM的输出需要被程序解析和处理,结构化输出保证了输出的可靠性和一致性。JSON Schema和约束解码是实现结构化输出的主要方法。

JSON Schema定义

基础Schema定义

import json
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel, Field

# Closed set of sentiment labels. The str mixin makes every member a real
# string as well as an enum member.
# NOTE(review): documented with comments rather than a docstring on purpose;
# pydantic is expected to copy class docstrings into the generated JSON
# schema, which would change the printed schema. Confirm before adding one.
class SentimentType(str, Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

# A single recognised entity together with a confidence score.
class Entity(BaseModel):
    # Entity name.
    name: str = Field(description="实体名称")
    # Entity category, e.g. person, place or organisation (free-form string).
    type: str = Field(description="实体类型,如人名、地名、组织")
    # Confidence score, constrained to the closed range [0, 1].
    confidence: float = Field(ge=0, le=1, description="置信度")

# Top-level analysis payload: sentiment, summary, entities, keywords and an
# overall confidence score.
class AnalysisResult(BaseModel):
    # Sentiment label, restricted to the SentimentType members.
    sentiment: SentimentType = Field(description="情感倾向")
    # Summary text, limited to 200 characters.
    summary: str = Field(max_length=200, description="摘要")
    # Recognised entities; the list may be empty.
    entities: List[Entity] = Field(description="识别到的实体")
    # Keywords; the list must hold between 1 and 10 items.
    keywords: List[str] = Field(min_length=1, max_length=10, description="关键词")
    # Overall confidence score, constrained to the closed range [0, 1].
    confidence: float = Field(ge=0, le=1, description="整体置信度")

# Export the JSON Schema derived from AnalysisResult and pretty-print it.
# ensure_ascii=False keeps the Chinese field descriptions readable instead of
# emitting \uXXXX escapes. json comes from the import block at the top of
# this snippet; without that import this call raised NameError.
schema = AnalysisResult.model_json_schema()
print("Schema定义:")
print(json.dumps(schema, indent=2, ensure_ascii=False))

动态Schema生成

from typing import Dict, Any, Type
from pydantic import create_model, Field

class SchemaGenerator:
    """Build pydantic model classes at runtime from a plain field specification."""

    @staticmethod
    def create_extraction_schema(fields: Dict[str, Dict[str, Any]]) -> Type[BaseModel]:
        """Create a pydantic model class from a mapping of field configurations.

        Args:
            fields: Maps each field name to a config dict. Recognised keys:
                "type" (Python type, defaults to ``str``), "description"
                (defaults to ""), "required" (defaults to ``True``) and
                "default" (defaults to ``None``).

        Returns:
            A new model class named "DynamicModel" with one field per entry.
            Required fields carry no default; a supplied "default" is ignored
            for them. Non-required fields default to the supplied "default",
            or to ``None`` (typed ``Optional[type]``) when none is given.
        """
        field_definitions = {}
        for field_name, field_config in fields.items():
            field_type = field_config.get("type", str)
            description = field_config.get("description", "")
            required = field_config.get("required", True)
            default = field_config.get("default")

            if required:
                # No default value: pydantic treats the field as mandatory.
                definition = (field_type, Field(description=description))
            elif default is None:
                # Optional[...] alone only allows None as a value; the explicit
                # default is what lets callers omit the field entirely.
                definition = (
                    Optional[field_type],
                    Field(default=None, description=description),
                )
            else:
                # Honour the caller-supplied default instead of dropping it.
                definition = (
                    field_type,
                    Field(default=default, description=description),
                )
            field_definitions[field_name] = definition

        return create_model("DynamicModel", **field_definitions)

# Field specification for the demo model: title and word_count are flagged as
# required, author and tags as not required (tags also supplies a default).
field_specs = {
    "title": dict(type=str, description="文档标题", required=True),
    "author": dict(type=str, description="作者", required=False),
    "word_count": dict(type=int, description="字数", required=True),
    "tags": dict(type=list, description="标签列表", required=False, default=[]),
}

# Build the model class from the specification and show its JSON Schema.
generator = SchemaGenerator()
schema = generator.create_extraction_schema(field_specs)
print("动态Schema:", schema.model_json_schema())

OpenAI结构化输出

from openai import OpenAI
import json

client = OpenAI()

# JSON Schema passed to the API as a strict structured-output format.
# Strict mode requires every object in the schema to list all of its
# properties under "required" and to set "additionalProperties" to False;
# a schema without it is rejected by the API.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "extraction_result",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
                "entities": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "type": {"type": "string"}
                        },
                        "required": ["name", "type"],
                        "additionalProperties": False
                    }
                },
                "summary": {"type": "string"}
            },
            "required": ["sentiment", "entities", "summary"],
            "additionalProperties": False
        }
    }
}

result = client.chat.completions.create(
    model="gpt-4o-2024-08-06",
    messages=[{"role": "user", "content": "分析这段文本的情感和实体"}],
    response_format=response_format
)

message = result.choices[0].message
# When the model refuses, content is None and the reason is in `refusal`.
# Fail with a clear message instead of an opaque TypeError from json.loads.
if message.content is None:
    raise RuntimeError(f"模型未返回可解析的内容: {message.refusal}")

parsed = json.loads(message.content)
print("解析结果:", json.dumps(parsed, ensure_ascii=False, indent=2))

约束解码

Outlines集成

import outlines
from pydantic import BaseModel
from typing import List

# Target structure for the constrained-decoding example below; it is passed
# to outlines.generate.json as the output schema.
class Recipe(BaseModel):
    # Name of the dish.
    name: str
    # Ingredient entries, one string each.
    ingredients: List[str]
    # Preparation steps, one string each.
    steps: List[str]
    # Cooking time as an integer; the unit is not specified in this snippet.
    cooking_time: int

# Load the local transformers model, then build a generator whose output is
# constrained to the Recipe schema.
model = outlines.models.transformers("Qwen/Qwen2.5-7B-Instruct")
generator = outlines.generate.json(model, Recipe)

# Ask for a recipe and print whatever the constrained generator returns.
recipe_prompt = "请提供一个简单的番茄炒蛋食谱"
result = generator(recipe_prompt)
print("结构化输出:", result)

LangChain结构化输出

from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Schema for a structured movie review. BaseModel and Field here are the ones
# imported from langchain_core.pydantic_v1 directly above this class.
class MovieReview(BaseModel):
    # Movie title.
    title: str = Field(description="电影标题")
    # Rating, constrained to the closed range [0, 10].
    rating: float = Field(ge=0, le=10, description="评分0-10")
    # Genres of the movie, one string each.
    genres: List[str] = Field(description="电影类型")
    # One-sentence summary.
    summary: str = Field(description="一句话总结")

# Prompt template with a single {movie} placeholder.
prompt = ChatPromptTemplate.from_template("请评价电影《{movie}》")

# Chat model wrapped so that its replies come back as MovieReview objects.
llm = ChatOpenAI(model="gpt-4o")
structured_llm = llm.with_structured_output(MovieReview)

# Pipe the prompt into the structured model and run it for one movie.
chain = prompt | structured_llm
movie_input = {"movie": "流浪地球"}
result = chain.invoke(movie_input)

# Report the parsed fields one per line.
print(f"电影: {result.title}")
print(f"评分: {result.rating}")
print(f"类型: {', '.join(result.genres)}")

错误处理与重试

import json
from typing import Optional
from pydantic import ValidationError

class StructuredOutputHandler:
    """Call an LLM and validate its reply against a schema, retrying on validation errors."""

    def __init__(self, max_retries: int = 3):
        """Store the attempt budget.

        Args:
            max_retries: Total number of attempts made per request. Values
                below 1 are treated as 1 by ``generate_with_retry``.
        """
        self.max_retries = max_retries

    async def generate_with_retry(self, llm, prompt: str, schema: type) -> Optional[object]:
        """Invoke ``llm`` and parse its reply with ``schema``, retrying on invalid output.

        Args:
            llm: Object exposing an awaitable ``ainvoke(prompt)`` whose result
                has a ``content`` attribute holding the JSON text.
            prompt: Prompt sent unchanged on every attempt.
            schema: Class exposing ``model_validate_json``.

        Returns:
            Whatever ``schema.model_validate_json`` returns on the first
            successful attempt, or ``None`` when every attempt fails
            validation or any other exception is raised.
        """
        # Always make at least one attempt: a non-positive budget used to skip
        # the loop and return None without ever calling the LLM.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                response = await llm.ainvoke(prompt)
                return schema.model_validate_json(response.content)
            except ValidationError as e:
                # Invalid output is the only failure that is retried.
                print(f"验证失败 (尝试 {attempt + 1}/{attempts}): {e}")
            except Exception as e:
                # Any other failure ends the request immediately.
                print(f"生成失败: {e}")
                return None
        return None

# Shared handler instance configured with an attempt budget of three.
handler = StructuredOutputHandler(max_retries=3)

应用场景

结构化输出广泛应用于数据提取、表单填写、API响应生成、报告生成等场景。选择合适的实现方式需要考虑模型支持、Schema复杂度和性能需求。