← 返回首页
🧠

LlamaIndex:数据驱动的LLM应用框架

📂 llm ⏱ 2 min 321 words

---
title: "LlamaIndex:数据驱动的LLM应用框架"
description: "深入掌握LlamaIndex的核心概念、数据索引策略和查询引擎,构建强大的RAG应用"
tags: ["LlamaIndex", "RAG", "LLM应用", "数据索引"]
category: "llm"
icon: "🧠"
---

LlamaIndex:数据驱动的LLM应用框架

LlamaIndex简介

LlamaIndex(原名GPT Index)是一个专为大语言模型设计的数据框架,用于连接LLM与外部数据源。它提供了简洁的API来索引、查询和分析各种数据格式,是构建RAG应用的首选框架之一。

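LlamaIndex的核心价值:

- 数据连接:通过数据连接器读取本地文件、网页、数据库、Notion等多种数据源
- 数据索引:提供向量存储索引、关键词索引、知识图谱索引等多种索引类型
- 查询引擎:支持基本查询、子问题查询和路由查询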

安装与快速开始

安装

pip install llama-index
# Optional: install commonly used integrations.
# PyPI distribution names are hyphenated; the dotted form
# (llama_index.readers.file) is only the Python import path.
pip install llama-index-readers-file
pip install llama-index-vector-stores-chroma
# Required by the web / database / Notion reader examples.
pip install llama-index-readers-web llama-index-readers-database llama-index-readers-notion

基本使用

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Load the documents found under ./data.
directory_reader = SimpleDirectoryReader("./data")
documents = directory_reader.load_data()

# Build the index (OpenAI embeddings are used automatically).
index = VectorStoreIndex.from_documents(documents)

# Ask a question against the index and print the answer.
query_engine = index.as_query_engine()
question = "文档的主要内容是什么?"
response = query_engine.query(question)
print(response)

数据连接器

LlamaIndex支持多种数据源的读取:

# Local files
from pathlib import Path

from llama_index.readers.file import PDFReader, DocxReader

reader = PDFReader()
# PDFReader.load_data names its path parameter `file`; passing `file_path=`
# raises TypeError.
documents = reader.load_data(file=Path("./document.pdf"))

# Web pages
from llama_index.readers.web import BeautifulSoupWebReader

reader = BeautifulSoupWebReader()
documents = reader.load_data(urls=["https://example.com"])

# Databases
from llama_index.readers.database import DatabaseReader

reader = DatabaseReader(uri="sqlite:///mydb.db")
documents = reader.load_data(query="SELECT * FROM articles")

# Notion
from llama_index.readers.notion import NotionPageReader

reader = NotionPageReader(integration_token="YOUR_TOKEN")
documents = reader.load_data(page_ids=["page1", "page2"])

文档处理与分块

文本分割

from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import TokenTextSplitter

# Sentence splitter
splitter = SentenceSplitter(chunk_overlap=20, chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)

# Token splitter
token_splitter = TokenTextSplitter(chunk_overlap=50, chunk_size=500)
nodes = token_splitter.get_nodes_from_documents(documents)

# Hierarchical parser (produces parent/child nodes)
level_chunk_sizes = [2048, 512, 128]
hierarchical_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=level_chunk_sizes,
)
nodes = hierarchical_parser.get_nodes_from_documents(documents)

索引类型

向量存储索引

import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# Use Chroma as the vector store.
chroma_client = chromadb.Client()
# get_or_create_collection keeps the snippet re-runnable; create_collection
# raises if "my_docs" already exists on this client.
chroma_collection = chroma_client.get_or_create_collection("my_docs")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# The vector store is handed to the index through a StorageContext;
# from_documents takes `storage_context`, not a `vector_store` keyword.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

关键词索引

from llama_index.core import KeywordTableIndex

# Build a keyword-based index over the loaded documents.
index = KeywordTableIndex.from_documents(documents=documents)

知识图谱索引

from llama_index.llms.openai import OpenAI
from llama_index.core import KnowledgeGraphIndex

# Build a knowledge graph from the documents using the given LLM.
llm = OpenAI(model="gpt-4")
kg_options = {"llm": llm, "max_triplets_per_chunk": 10}
kg_index = KnowledgeGraphIndex.from_documents(documents, **kg_options)

查询引擎

基本查询

# Semantic search over the index.
query_engine = index.as_query_engine(response_mode="compact", similarity_top_k=5)
question = "查询问题"
response = query_engine.query(question)

多查询引擎

from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool

# One tool per underlying query engine.
# NOTE: summary_index and detail_index are not defined in this snippet;
# they are assumed to be indexes built beforehand.
# Each tool needs a distinct `name`: without one, both tools get the same
# default name and the sub-question engine cannot tell them apart.
tools = [
    QueryEngineTool.from_defaults(
        query_engine=summary_index.as_query_engine(),
        name="summary_tool",
        description="用于总结文档"
    ),
    QueryEngineTool.from_defaults(
        query_engine=detail_index.as_query_engine(),
        name="detail_tool",
        description="用于查找具体细节"
    )
]

# Break the question into sub-questions, answer each with a matching tool,
# then combine the partial answers into one response.
query_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
response = query_engine.query("总结文档并给出关键数据")

路由查询

from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.selectors import PydanticSingleSelector

# Selector that picks a single tool (PydanticSingleSelector is imported
# as well but is not used below).
selector = LLMSingleSelector.from_defaults()

# Router engine over `tools`, which this snippet expects to exist already.
router_options = {"selector": selector, "query_engine_tools": tools}
router_engine = RouterQueryEngine(**router_options)

# The router chooses a suitable query engine for the question.
question = "查询问题"
response = router_engine.query(question)

构建完整RAG应用

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

PERSIST_DIR = "./storage"

# Global settings: LLM, embedding model and chunk size.
Settings.llm = OpenAI(model="gpt-4", temperature=0)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.chunk_size = 1024

# Load the documents and index them.
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Create the query engine.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    response_mode="tree_summarize",
)

# Persist the index to disk.
index.storage_context.persist(persist_dir=PERSIST_DIR)

# Load the index back from disk.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

LlamaIndex通过其丰富的功能和灵活的设计,成为构建生产级RAG应用的强大工具。