# Building a Knowledge Base

With a vector database in place, the next step is to turn raw documents into a searchable knowledge base. This section walks through the complete pipeline: loading documents, splitting them into chunks, and indexing them.

## Pipeline Overview

Raw documents (PDF, Word, web pages, databases...) → Load → Split → Vectorize → Store in the vector database

## Loading Documents
### Loading Text Files

```python
from langchain_community.document_loaders import TextLoader

loader = TextLoader("document.txt", encoding="utf-8")
documents = loader.load()
```
### Loading PDFs

```python
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("document.pdf")
pages = loader.load()  # one Document object per page
print(f"Loaded {len(pages)} pages")
print(pages[0].page_content[:200])
```
### Loading Word Documents

```python
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("document.docx")
documents = loader.load()
```
### Loading Web Pages

```python
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://example.com/article")
documents = loader.load()
```
### Loading Markdown

```python
from langchain_community.document_loaders import UnstructuredMarkdownLoader

loader = UnstructuredMarkdownLoader("document.md")
documents = loader.load()
```
### Loading a Directory

```python
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    "./docs",
    glob="**/*.md",      # match all Markdown files, recursively
    show_progress=True
)
documents = loader.load()
```
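By default, `DirectoryLoader` hands each matched file to an unstructured-based loader. If you want deterministic parsing (or want to avoid the `unstructured` dependency), you can pin the loader class yourself; this sketch assumes the `.md` files are fine to read as plain text and uses `DirectoryLoader`'s `loader_cls` / `loader_kwargs` parameters:

```python
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Use TextLoader for every matched file instead of the
# default unstructured-based loader
loader = DirectoryLoader(
    "./docs",
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True
)
documents = loader.load()
```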
### Loading Code Files

```python
from langchain_community.document_loaders import PythonLoader

loader = PythonLoader("script.py")
documents = loader.load()
```

## Document Splitting
### Why Split?

The problems:
- Long documents exceed the model's context window
- Retrieval needs to pinpoint the relevant passage, not return a whole file
- Indexing whole documents hurts retrieval efficiency

The solution:
- Split long documents into small chunks
- Index each chunk independently
- Return only the relevant chunks at retrieval time (see the sketch below)
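To make `chunk_size` and `chunk_overlap` concrete before the real examples, here is a minimal, self-contained sketch; the sample text and the deliberately tiny sizes are made up purely for illustration:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Tiny chunk_size so the splitting behavior is easy to see
text = "Retrieval-augmented generation answers questions from retrieved chunks. " * 5
splitter = RecursiveCharacterTextSplitter(chunk_size=80, chunk_overlap=20)

for i, chunk in enumerate(splitter.split_text(text)):
    # each chunk is at most 80 characters; adjacent chunks can
    # share up to 20 characters of overlapping text
    print(f"chunk {i}: {len(chunk)} chars")
```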
### Splitting by Character

```python
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(
    chunk_size=500,     # at most 500 characters per chunk
    chunk_overlap=50,   # 50 characters of overlap between adjacent chunks
    separator="\n\n"    # split on paragraph boundaries
)
chunks = splitter.split_documents(documents)
```
### Recursive Character Splitting (Recommended)

This splitter tries the separators in order, falling back to the next one only when a piece is still larger than `chunk_size`, so chunks tend to break at natural boundaries.

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # includes Chinese sentence-ending punctuation for mixed-language text
    separators=["\n\n", "\n", "。", "!", "?", " ", ""]
)
chunks = splitter.split_documents(documents)
```
### Splitting by Token

```python
from langchain_text_splitters import TokenTextSplitter

splitter = TokenTextSplitter(
    chunk_size=500,    # 500 tokens per chunk
    chunk_overlap=50
)
chunks = splitter.split_documents(documents)
```
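`TokenTextSplitter` counts tokens with `tiktoken`, which defaults to the older `gpt2` encoding. If chunk sizes should match the way a newer model tokenizes, you can pass the encoding explicitly; in this sketch, `cl100k_base` is the encoding used by the GPT-3.5/GPT-4 family:

```python
from langchain_text_splitters import TokenTextSplitter

# Count tokens with the cl100k_base encoding instead of the gpt2 default
splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=500,
    chunk_overlap=50
)
chunks = splitter.split_documents(documents)
```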
### Splitting Code

```python
from langchain_text_splitters import PythonCodeTextSplitter

splitter = PythonCodeTextSplitter(chunk_size=500)
chunks = splitter.split_documents(documents)
```
### Splitting Markdown by Header

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown_document = """
# Title 1
Content 1
## Subtitle 1
Content 2
"""

headers_to_split_on = [
    ("#", "header1"),
    ("##", "header2"),
]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
chunks = splitter.split_text(markdown_document)
```
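Each resulting chunk records the headers it falls under in its metadata, which is what makes header-aware filtering possible later. Printing the chunks from the example above produces output along these lines:

```python
for chunk in chunks:
    print(chunk.metadata, "->", chunk.page_content)
# Roughly:
# {'header1': 'Title 1'} -> Content 1
# {'header1': 'Title 1', 'header2': 'Subtitle 1'} -> Content 2
```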
## Building the Knowledge Base

### Complete Example

```python
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# 1. Load documents (parse each matched PDF with PyPDFLoader)
loader = DirectoryLoader("./knowledge", glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
print(f"Loaded {len(documents)} documents")

# 2. Split documents
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks")

# 3. Create the vector store (requires OPENAI_API_KEY in the environment)
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./knowledge_db"
)
print("Knowledge base built!")

# 4. Test retrieval
results = vectorstore.similarity_search("What is machine learning?", k=3)
for i, doc in enumerate(results):
    print(f"\nResult {i+1}:")
    print(doc.page_content[:200])
```
### Adding Metadata

```python
from langchain_core.documents import Document

# Create documents with metadata attached
docs = [
    Document(
        page_content="Content...",
        metadata={
            "source": "manual.pdf",
            "page": 1,
            "category": "technical-docs"
        }
    )
]
vectorstore.add_documents(docs)

# Filter the search by metadata
results = vectorstore.similarity_search(
    "query",
    filter={"category": "technical-docs"}
)
```
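With Chroma as the backing store, the `filter` argument follows Chroma's `where` syntax, so multiple conditions can be combined with operators such as `$and`, `$eq`, and `$gte`. This is a sketch of that syntax; other vector stores use different filter formats:

```python
# Only search chunks that are technical docs from page 1 onward
# (Chroma-style "where" operators; not portable across vector stores)
results = vectorstore.similarity_search(
    "query",
    filter={"$and": [
        {"category": {"$eq": "technical-docs"}},
        {"page": {"$gte": 1}}
    ]}
)
```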
### Incremental Updates

```python
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Reopen the existing persisted vector store
vectorstore = Chroma(
    persist_directory="./knowledge_db",
    embedding_function=OpenAIEmbeddings()
)

# Add new documents
new_docs = [...]  # list of new Document objects
vectorstore.add_documents(new_docs)

# Delete old documents by ID
vectorstore.delete(["doc_id_1", "doc_id_2"])
```
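Deletion needs the chunk IDs, and the simplest way to have them is to record what `add_documents` returns, since it gives back the ID assigned to each inserted chunk:

```python
# add_documents returns the IDs assigned to the inserted chunks;
# keep them if you plan to delete or replace those chunks later
ids = vectorstore.add_documents(new_docs)

# ...later, remove exactly those chunks
vectorstore.delete(ids)
```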
### A Knowledge Base Management Class

```python
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma


class KnowledgeBase:
    def __init__(self, persist_directory: str):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings
        )
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def add_pdf(self, file_path: str):
        """Add a PDF document."""
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        chunks = self.splitter.split_documents(documents)
        self.vectorstore.add_documents(chunks)
        return len(chunks)

    def add_text(self, file_path: str):
        """Add a plain-text document."""
        loader = TextLoader(file_path)
        documents = loader.load()
        chunks = self.splitter.split_documents(documents)
        self.vectorstore.add_documents(chunks)
        return len(chunks)

    def search(self, query: str, k: int = 5):
        """Search for relevant content."""
        return self.vectorstore.similarity_search(query, k=k)

    def get_retriever(self):
        """Return a retriever for use in chains."""
        return self.vectorstore.as_retriever()


# Usage
kb = KnowledgeBase("./knowledge_db")
kb.add_pdf("document.pdf")
results = kb.search("your query")
```
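`get_retriever` is the hook that connects this class to the RAG chains in the next section; a retriever is a LangChain Runnable, so it is queried with `invoke` rather than `similarity_search`:

```python
retriever = kb.get_retriever()
docs = retriever.invoke("your query")  # returns the top matching chunks as Document objects
```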
## Summary

| Step | Notes |
|---|---|
| Load | Supports PDF, Word, web pages, and more |
| Split | By character, by token, or by semantics |
| Vectorize | Encode chunks with an embedding model |
| Store | Write the vectors into the vector database |
| Update | Incremental additions and deletions |
## Next Steps

Continue with Retrieval-Augmented Generation (RAG), which connects the knowledge base to a large language model.