enh: PDF_LOADER_MODE
This commit is contained in:
@@ -2809,6 +2809,12 @@ PDF_EXTRACT_IMAGES = PersistentConfig(
|
||||
os.environ.get("PDF_EXTRACT_IMAGES", "False").lower() == "true",
|
||||
)
|
||||
|
||||
PDF_LOADER_MODE = PersistentConfig(
|
||||
"PDF_LOADER_MODE",
|
||||
"rag.pdf_loader_mode",
|
||||
os.environ.get("PDF_LOADER_MODE", "page"),
|
||||
)
|
||||
|
||||
RAG_EMBEDDING_MODEL = PersistentConfig(
|
||||
"RAG_EMBEDDING_MODEL",
|
||||
"rag.embedding_model",
|
||||
|
||||
@@ -288,6 +288,7 @@ from open_webui.config import (
|
||||
ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
|
||||
TIKTOKEN_ENCODING_NAME,
|
||||
PDF_EXTRACT_IMAGES,
|
||||
PDF_LOADER_MODE,
|
||||
YOUTUBE_LOADER_LANGUAGE,
|
||||
YOUTUBE_LOADER_PROXY_URL,
|
||||
# Retrieval (Web Search)
|
||||
@@ -944,6 +945,7 @@ app.state.config.RAG_OLLAMA_BASE_URL = RAG_OLLAMA_BASE_URL
|
||||
app.state.config.RAG_OLLAMA_API_KEY = RAG_OLLAMA_API_KEY
|
||||
|
||||
app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES
|
||||
app.state.config.PDF_LOADER_MODE = PDF_LOADER_MODE
|
||||
|
||||
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
|
||||
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
|
||||
|
||||
@@ -361,7 +361,9 @@ class Loader:
|
||||
else:
|
||||
if file_ext == "pdf":
|
||||
loader = PyPDFLoader(
|
||||
- file_path, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES")
|
||||
+ file_path,
|
||||
+ extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
|
||||
+ mode=self.kwargs.get("PDF_LOADER_MODE", "page"),
|
||||
)
|
||||
elif file_ext == "csv":
|
||||
loader = CSVLoader(file_path, autodetect_encoding=True)
|
||||
|
||||
@@ -468,6 +468,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
||||
# Content extraction settings
|
||||
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
"PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE,
|
||||
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
@@ -659,6 +660,7 @@ class ConfigForm(BaseModel):
|
||||
# Content extraction settings
|
||||
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
|
||||
PDF_EXTRACT_IMAGES: Optional[bool] = None
|
||||
PDF_LOADER_MODE: Optional[str] = None
|
||||
|
||||
DATALAB_MARKER_API_KEY: Optional[str] = None
|
||||
DATALAB_MARKER_API_BASE_URL: Optional[str] = None
|
||||
@@ -786,6 +788,11 @@ async def update_rag_config(
|
||||
if form_data.PDF_EXTRACT_IMAGES is not None
|
||||
else request.app.state.config.PDF_EXTRACT_IMAGES
|
||||
)
|
||||
request.app.state.config.PDF_LOADER_MODE = (
|
||||
form_data.PDF_LOADER_MODE
|
||||
if form_data.PDF_LOADER_MODE is not None
|
||||
else request.app.state.config.PDF_LOADER_MODE
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_API_KEY = (
|
||||
form_data.DATALAB_MARKER_API_KEY
|
||||
if form_data.DATALAB_MARKER_API_KEY is not None
|
||||
@@ -1180,6 +1187,7 @@ async def update_rag_config(
|
||||
# Content extraction settings
|
||||
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
"PDF_LOADER_MODE": request.app.state.config.PDF_LOADER_MODE,
|
||||
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
|
||||
Reference in New Issue
Block a user