mirror of
https://github.com/open-webui/open-webui
synced 2025-06-09 07:56:42 +00:00
feat: add langchain markdown document splitter
This commit is contained in:
parent
30d15c1b4b
commit
ac26da74f9
@ -29,6 +29,7 @@ import tiktoken
|
|||||||
|
|
||||||
|
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
|
||||||
|
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from open_webui.models.files import FileModel, Files
|
from open_webui.models.files import FileModel, Files
|
||||||
@ -982,6 +983,7 @@ def save_docs_to_vector_db(
|
|||||||
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
||||||
add_start_index=True,
|
add_start_index=True,
|
||||||
)
|
)
|
||||||
|
docs = text_splitter.split_documents(docs)
|
||||||
elif request.app.state.config.TEXT_SPLITTER == "token":
|
elif request.app.state.config.TEXT_SPLITTER == "token":
|
||||||
log.info(
|
log.info(
|
||||||
f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
|
f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
|
||||||
@ -994,11 +996,40 @@ def save_docs_to_vector_db(
|
|||||||
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
||||||
add_start_index=True,
|
add_start_index=True,
|
||||||
)
|
)
|
||||||
|
docs = text_splitter.split_documents(docs)
|
||||||
|
elif request.app.state.config.TEXT_SPLITTER == "markdown_header":
|
||||||
|
log.info("Using markdown header text splitter")
|
||||||
|
|
||||||
|
# Define headers to split on - covering most common markdown header levels
|
||||||
|
headers_to_split_on = [
|
||||||
|
("#", "Header 1"),
|
||||||
|
("##", "Header 2"),
|
||||||
|
("###", "Header 3"),
|
||||||
|
("####", "Header 4"),
|
||||||
|
("#####", "Header 5"),
|
||||||
|
("######", "Header 6"),
|
||||||
|
]
|
||||||
|
|
||||||
|
markdown_splitter = MarkdownHeaderTextSplitter(
|
||||||
|
headers_to_split_on=headers_to_split_on,
|
||||||
|
strip_headers=False, # Keep headers in content for context
|
||||||
|
)
|
||||||
|
|
||||||
|
md_split_docs = []
|
||||||
|
for doc in docs:
|
||||||
|
md_header_splits = markdown_splitter.split_text(doc.page_content)
|
||||||
|
|
||||||
|
# Convert back to Document objects, preserving original metadata
|
||||||
|
for split in md_header_splits:
|
||||||
|
md_split_docs.append(Document(
|
||||||
|
page_content=split.page_content,
|
||||||
|
metadata={**doc.metadata, **split.metadata}
|
||||||
|
))
|
||||||
|
|
||||||
|
docs = md_split_docs
|
||||||
else:
|
else:
|
||||||
raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
|
raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
|
||||||
|
|
||||||
docs = text_splitter.split_documents(docs)
|
|
||||||
|
|
||||||
if len(docs) == 0:
|
if len(docs) == 0:
|
||||||
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
||||||
|
|
||||||
|
@ -398,6 +398,7 @@
|
|||||||
>
|
>
|
||||||
<option value="">{$i18n.t('Default')} ({$i18n.t('Character')})</option>
|
<option value="">{$i18n.t('Default')} ({$i18n.t('Character')})</option>
|
||||||
<option value="token">{$i18n.t('Token')} ({$i18n.t('Tiktoken')})</option>
|
<option value="token">{$i18n.t('Token')} ({$i18n.t('Tiktoken')})</option>
|
||||||
|
<option value="markdown_header">{$i18n.t('Markdown (Header)')}</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
Loading…
Reference in New Issue
Block a user