From 871efb4ad92f75125152b6296192754cbac6d571 Mon Sep 17 00:00:00 2001 From: Diwakar Singh Maurya Date: Sat, 24 May 2025 00:19:44 +0000 Subject: [PATCH] feat: add langchain markdown document splitter --- backend/open_webui/routers/retrieval.py | 47 ++++++++++++++++++- .../admin/Settings/Documents.svelte | 1 + 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 22b264bfa..cf58b87fe 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -29,6 +29,7 @@ import tiktoken from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter +from langchain_text_splitters import MarkdownHeaderTextSplitter from langchain_core.documents import Document from open_webui.models.files import FileModel, Files @@ -1117,6 +1118,7 @@ def save_docs_to_vector_db( chunk_overlap=request.app.state.config.CHUNK_OVERLAP, add_start_index=True, ) + docs = text_splitter.split_documents(docs) elif request.app.state.config.TEXT_SPLITTER == "token": log.info( f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}" @@ -1129,11 +1131,52 @@ def save_docs_to_vector_db( chunk_overlap=request.app.state.config.CHUNK_OVERLAP, add_start_index=True, ) + docs = text_splitter.split_documents(docs) + elif request.app.state.config.TEXT_SPLITTER == "markdown_header": + log.info("Using markdown header text splitter") + + # Define headers to split on - covering most common markdown header levels + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), + ("####", "Header 4"), + ("#####", "Header 5"), + ("######", "Header 6"), + ] + + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + strip_headers=False, # Keep headers in content for context + ) + + md_split_docs = [] + for doc in docs: + md_header_splits = markdown_splitter.split_text(doc.page_content) + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=request.app.state.config.CHUNK_SIZE, + chunk_overlap=request.app.state.config.CHUNK_OVERLAP, + add_start_index=True, + ) + md_header_splits = text_splitter.split_documents(md_header_splits) + + # Convert back to Document objects, preserving original metadata + for split_chunk in md_header_splits: + headings_list = [] + # Extract header values in order based on headers_to_split_on + for _, header_meta_key_name in headers_to_split_on: + if header_meta_key_name in split_chunk.metadata: + headings_list.append(split_chunk.metadata[header_meta_key_name]) + + md_split_docs.append(Document( + page_content=split_chunk.page_content, + metadata={**doc.metadata, "headings": headings_list} + )) + + docs = md_split_docs else: raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter")) - docs = text_splitter.split_documents(docs) - if len(docs) == 0: raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 4d5818f71..2253e3bb2 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -563,6 +563,7 @@ > +