From ecbdef732bc71a07c21bbb679edb420f26eac181 Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Wed, 21 Jan 2026 23:51:36 +0400 Subject: [PATCH] enh: PDF_LOADER_MODE --- backend/open_webui/config.py | 6 +++++ backend/open_webui/main.py | 2 ++ backend/open_webui/retrieval/loaders/main.py | 4 +++- backend/open_webui/routers/retrieval.py | 8 +++++++ .../admin/Settings/Documents.svelte | 24 +++++++++++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index ed7df33ed..c06297765 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2809,6 +2809,12 @@ PDF_EXTRACT_IMAGES = PersistentConfig( os.environ.get("PDF_EXTRACT_IMAGES", "False").lower() == "true", ) +PDF_LOADER_MODE = PersistentConfig( + "PDF_LOADER_MODE", + "rag.pdf_loader_mode", + os.environ.get("PDF_LOADER_MODE", "page"), +) + RAG_EMBEDDING_MODEL = PersistentConfig( "RAG_EMBEDDING_MODEL", "rag.embedding_model", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 82a87e3fd..9d1d85060 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -288,6 +288,7 @@ from open_webui.config import ( ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER, TIKTOKEN_ENCODING_NAME, PDF_EXTRACT_IMAGES, + PDF_LOADER_MODE, YOUTUBE_LOADER_LANGUAGE, YOUTUBE_LOADER_PROXY_URL, # Retrieval (Web Search) @@ -944,6 +945,7 @@ app.state.config.RAG_OLLAMA_BASE_URL = RAG_OLLAMA_BASE_URL app.state.config.RAG_OLLAMA_API_KEY = RAG_OLLAMA_API_KEY app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES +app.state.config.PDF_LOADER_MODE = PDF_LOADER_MODE app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index bf7c7286b..83adb8823 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -361,7 +361,9 @@ class Loader: else: if file_ext == "pdf": loader = PyPDFLoader( - file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES") + file_path, + extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"), + mode=self.kwargs.get("PDF_LOADER_MODE", "page"), ) elif file_ext == "csv": loader = CSVLoader(file_path, autodetect_encoding=True) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 763a9aacc..318e7bf8c 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -468,6 +468,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): # Content extraction settings "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL, "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, @@ -659,6 +660,7 @@ class ConfigForm(BaseModel): # Content extraction settings CONTENT_EXTRACTION_ENGINE: Optional[str] = None PDF_EXTRACT_IMAGES: Optional[bool] = None + PDF_LOADER_MODE: Optional[str] = None DATALAB_MARKER_API_KEY: Optional[str] = None DATALAB_MARKER_API_BASE_URL: Optional[str] = None @@ -786,6 +788,11 @@ async def update_rag_config( if form_data.PDF_EXTRACT_IMAGES is not None else request.app.state.config.PDF_EXTRACT_IMAGES ) + request.app.state.config.PDF_LOADER_MODE = ( + form_data.PDF_LOADER_MODE + if form_data.PDF_LOADER_MODE is not None + else request.app.state.config.PDF_LOADER_MODE + ) request.app.state.config.DATALAB_MARKER_API_KEY = ( form_data.DATALAB_MARKER_API_KEY if form_data.DATALAB_MARKER_API_KEY is not None @@ -1180,6 +1187,7 @@ async def update_rag_config( # Content extraction settings "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL, "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 732d82469..3f4434084 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -362,6 +362,30 @@ + +
+
+
+ + {$i18n.t('PDF Loader Mode')} + +
+
+ +
+
+
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}