enh: PDF_LOADER_MODE
This commit is contained in:
@@ -2809,6 +2809,12 @@ PDF_EXTRACT_IMAGES = PersistentConfig(
|
||||
os.environ.get("PDF_EXTRACT_IMAGES", "False").lower() == "true",
|
||||
)
|
||||
|
||||
PDF_LOADER_MODE = PersistentConfig(
|
||||
"PDF_LOADER_MODE",
|
||||
"rag.pdf_loader_mode",
|
||||
os.environ.get("PDF_LOADER_MODE", "page"),
|
||||
)
|
||||
|
||||
RAG_EMBEDDING_MODEL = PersistentConfig(
|
||||
"RAG_EMBEDDING_MODEL",
|
||||
"rag.embedding_model",
|
||||
|
||||
@@ -288,6 +288,7 @@ from open_webui.config import (
|
||||
ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
|
||||
TIKTOKEN_ENCODING_NAME,
|
||||
PDF_EXTRACT_IMAGES,
|
||||
PDF_LOADER_MODE,
|
||||
YOUTUBE_LOADER_LANGUAGE,
|
||||
YOUTUBE_LOADER_PROXY_URL,
|
||||
# Retrieval (Web Search)
|
||||
@@ -944,6 +945,7 @@ app.state.config.RAG_OLLAMA_BASE_URL = RAG_OLLAMA_BASE_URL
|
||||
app.state.config.RAG_OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
|
||||
|
||||
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
|
||||
app.state.config.PDF_LOADER_MODE = PDF_LOADER_MODE
|
||||
|
||||
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
|
||||
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
|
||||
|
||||
@@ -361,7 +361,9 @@ class Loader:
|
||||
else:
|
||||
if file_ext == "pdf":
|
||||
loader = PyPDFLoader(
|
||||
- file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
|
||||
+ file_path,
|
||||
+ extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
|
||||
+ mode=self.kwargs.get("PDF_LOADER_MODE", "page"),
|
||||
)
|
||||
elif file_ext == "csv":
|
||||
loader = CSVLoader(file_path, autodetect_encoding=True)
|
||||
|
||||
@@ -468,6 +468,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||
# Content extraction settings
|
||||
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
"PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE,
|
||||
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
@@ -659,6 +660,7 @@ class ConfigForm(BaseModel):
|
||||
# Content extraction settings
|
||||
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
|
||||
PDF_EXTRACT_IMAGES: Optional[bool] = None
|
||||
PDF_LOADER_MODE: Optional[str] = None
|
||||
|
||||
DATALAB_MARKER_API_KEY: Optional[str] = None
|
||||
DATALAB_MARKER_API_BASE_URL: Optional[str] = None
|
||||
@@ -786,6 +788,11 @@ async def update_rag_config(
|
||||
if form_data.PDF_EXTRACT_IMAGES is not None
|
||||
else request.app.state.config.PDF_EXTRACT_IMAGES
|
||||
)
|
||||
request.app.state.config.PDF_LOADER_MODE = (
|
||||
form_data.PDF_LOADER_MODE
|
||||
if form_data.PDF_LOADER_MODE is not None
|
||||
else request.app.state.config.PDF_LOADER_MODE
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_API_KEY = (
|
||||
form_data.DATALAB_MARKER_API_KEY
|
||||
if form_data.DATALAB_MARKER_API_KEY is not None
|
||||
@@ -1180,6 +1187,7 @@ async def update_rag_config(
|
||||
# Content extraction settings
|
||||
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
"PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE,
|
||||
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
|
||||
@@ -362,6 +362,30 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex w-full mt-2">
|
||||
<div class="flex-1 flex justify-between">
|
||||
<div class=" self-center text-xs font-medium">
|
||||
<Tooltip
|
||||
content={$i18n.t(
|
||||
'Page mode creates one document per page. Single mode combines all pages into one document for better chunking across page boundaries.'
|
||||
)}
|
||||
placement="top-start"
|
||||
>
|
||||
{$i18n.t('PDF Loader Mode')}
|
||||
</Tooltip>
|
||||
</div>
|
||||
<div class="">
|
||||
<select
|
||||
class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 text-xs bg-transparent outline-hidden text-right"
|
||||
bind:value={RAGConfig.PDF_LOADER_MODE}
|
||||
>
|
||||
<option value="page">{$i18n.t('Page')}</option>
|
||||
<option value="single">{$i18n.t('Single')}</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
|
||||
<div class="my-0.5 flex gap-2 pr-2">
|
||||
<Tooltip
|
||||
|
||||
Reference in New Issue
Block a user