diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 82114d755..96e660a42 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -29,6 +29,7 @@ import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
+from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from open_webui.models.files import FileModel, Files
@@ -982,6 +983,7 @@ def save_docs_to_vector_db(
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
add_start_index=True,
)
+ docs = text_splitter.split_documents(docs)
elif request.app.state.config.TEXT_SPLITTER == "token":
log.info(
f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
@@ -994,11 +996,40 @@ def save_docs_to_vector_db(
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
add_start_index=True,
)
+ docs = text_splitter.split_documents(docs)
+ elif request.app.state.config.TEXT_SPLITTER == "markdown_header":
+ log.info("Using markdown header text splitter")
+
+ # Split on all six Markdown header levels (# through ######)
+ headers_to_split_on = [
+ ("#", "Header 1"),
+ ("##", "Header 2"),
+ ("###", "Header 3"),
+ ("####", "Header 4"),
+ ("#####", "Header 5"),
+ ("######", "Header 6"),
+ ]
+
+ markdown_splitter = MarkdownHeaderTextSplitter(
+ headers_to_split_on=headers_to_split_on,
+ strip_headers=False, # Keep headers in content for context
+ )
+
+ md_split_docs = []
+ for doc in docs:
+ md_header_splits = markdown_splitter.split_text(doc.page_content)
+
+ # Re-wrap each split so it keeps the source document's metadata; the split's own metadata keys win on conflict
+ for split in md_header_splits:
+ md_split_docs.append(Document(
+ page_content=split.page_content,
+ metadata={**doc.metadata, **split.metadata}
+ ))
+
+ docs = md_split_docs
else:
raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
- docs = text_splitter.split_documents(docs)
-
if len(docs) == 0:
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte
index 0660dc7ae..a7ef572a7 100644
--- a/src/lib/components/admin/Settings/Documents.svelte
+++ b/src/lib/components/admin/Settings/Documents.svelte
@@ -398,6 +398,7 @@
>
+