enh: PDF_LOADER_MODE

This commit is contained in:
Timothy Jaeryang Baek
2026-01-21 23:51:36 +04:00
parent 4615e8f92b
commit ecbdef732b
5 changed files with 43 additions and 1 deletions

View File

@@ -2809,6 +2809,12 @@ PDF_EXTRACT_IMAGES = PersistentConfig(
os.environ.get("PDF_EXTRACT_IMAGES", "False").lower() == "true",
)
# Default PDF loader mode, read from the PDF_LOADER_MODE environment variable.
# The value is forwarded to PyPDFLoader as `mode`; the admin settings UI offers
# "page" and "single".
PDF_LOADER_MODE = PersistentConfig(
    "PDF_LOADER_MODE",
    "rag.pdf_loader_mode",
    # Normalize case and surrounding whitespace (PDF_EXTRACT_IMAGES also
    # lowercases its env value) so "Page" or " single " match the expected
    # spellings, and treat an empty variable the same as an unset one.
    os.environ.get("PDF_LOADER_MODE", "page").strip().lower() or "page",
)
RAG_EMBEDDING_MODEL = PersistentConfig(
"RAG_EMBEDDING_MODEL",
"rag.embedding_model",

View File

@@ -288,6 +288,7 @@ from open_webui.config import (
ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
TIKTOKEN_ENCODING_NAME,
PDF_EXTRACT_IMAGES,
PDF_LOADER_MODE,
YOUTUBE_LOADER_LANGUAGE,
YOUTUBE_LOADER_PROXY_URL,
# Retrieval (Web Search)
@@ -944,6 +945,7 @@ app.state.config.RAG_OLLAMA_BASE_URL = RAG_OLLAMA_BASE_URL
# Register each setting on app.state.config under its own name.
app.state.config.RAG_OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
# PDF_LOADER_MODE is imported from open_webui.config next to PDF_EXTRACT_IMAGES.
app.state.config.PDF_LOADER_MODE = PDF_LOADER_MODE
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL

View File

@@ -361,7 +361,9 @@ class Loader:
else:
if file_ext == "pdf":
loader = PyPDFLoader(
file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
file_path,
extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
mode=self.kwargs.get("PDF_LOADER_MODE", "page"),
)
elif file_ext == "csv":
loader = CSVLoader(file_path, autodetect_encoding=True)

View File

@@ -468,6 +468,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
# Content extraction settings
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
@@ -659,6 +660,7 @@ class ConfigForm(BaseModel):
# Content extraction settings
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
PDF_EXTRACT_IMAGES: Optional[bool] = None
PDF_LOADER_MODE: Optional[str] = None
DATALAB_MARKER_API_KEY: Optional[str] = None
DATALAB_MARKER_API_BASE_URL: Optional[str] = None
@@ -786,6 +788,11 @@ async def update_rag_config(
if form_data.PDF_EXTRACT_IMAGES is not None
else request.app.state.config.PDF_EXTRACT_IMAGES
)
request.app.state.config.PDF_LOADER_MODE = (
form_data.PDF_LOADER_MODE
if form_data.PDF_LOADER_MODE is not None
else request.app.state.config.PDF_LOADER_MODE
)
request.app.state.config.DATALAB_MARKER_API_KEY = (
form_data.DATALAB_MARKER_API_KEY
if form_data.DATALAB_MARKER_API_KEY is not None
@@ -1180,6 +1187,7 @@ async def update_rag_config(
# Content extraction settings
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,

View File

@@ -362,6 +362,30 @@
</div>
</div>
</div>
<!-- PDF loader mode row: a tooltip-labelled select bound to RAGConfig.PDF_LOADER_MODE,
     offering the values "page" and "single". The tooltip text describes both modes. -->
<div class="flex w-full mt-2">
<div class="flex-1 flex justify-between">
<div class=" self-center text-xs font-medium">
<Tooltip
content={$i18n.t(
'Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.'
)}
placement="top-start"
>
{$i18n.t('PDF Loader Mode')}
</Tooltip>
</div>
<div class="">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden text-right"
bind:value={RAGConfig.PDF_LOADER_MODE}
>
<option value="page">{$i18n.t('Page')}</option>
<option value="single">{$i18n.t('Single')}</option>
</select>
</div>
</div>
</div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
<div class="my-0.5 flex gap-2 pr-2">
<Tooltip