feat: external document loader support
Some checks are pending
Deploy to HuggingFace Spaces / check-secret (push) Waiting to run
Deploy to HuggingFace Spaces / deploy (push) Blocked by required conditions
Create and publish Docker images with specific build args / build-main-image (linux/amd64) (push) Waiting to run
Create and publish Docker images with specific build args / build-main-image (linux/arm64) (push) Waiting to run
Create and publish Docker images with specific build args / build-cuda-image (linux/amd64) (push) Waiting to run
Create and publish Docker images with specific build args / build-cuda-image (linux/arm64) (push) Waiting to run
Create and publish Docker images with specific build args / build-ollama-image (linux/amd64) (push) Waiting to run
Create and publish Docker images with specific build args / build-ollama-image (linux/arm64) (push) Waiting to run
Create and publish Docker images with specific build args / merge-main-images (push) Blocked by required conditions
Create and publish Docker images with specific build args / merge-cuda-images (push) Blocked by required conditions
Create and publish Docker images with specific build args / merge-ollama-images (push) Blocked by required conditions
Python CI / Format Backend (3.11.x) (push) Waiting to run
Python CI / Format Backend (3.12.x) (push) Waiting to run
Frontend Build / Format & Build Frontend (push) Waiting to run
Frontend Build / Frontend Unit Tests (push) Waiting to run

This commit is contained in:
Timothy Jaeryang Baek
2025-05-14 22:28:40 +04:00
parent 32ea31144e
commit 8732b64b6b
8 changed files with 140 additions and 4 deletions

View File

@@ -352,6 +352,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
# Content extraction settings
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
"EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
"TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
"DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
@@ -493,6 +495,9 @@ class ConfigForm(BaseModel):
# Content extraction settings
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
PDF_EXTRACT_IMAGES: Optional[bool] = None
EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
EXTERNAL_DOCUMENT_LOADER_API_KEY: Optional[str] = None
TIKA_SERVER_URL: Optional[str] = None
DOCLING_SERVER_URL: Optional[str] = None
DOCLING_OCR_ENGINE: Optional[str] = None
@@ -583,6 +588,16 @@ async def update_rag_config(
if form_data.PDF_EXTRACT_IMAGES is not None
else request.app.state.config.PDF_EXTRACT_IMAGES
)
request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = (
form_data.EXTERNAL_DOCUMENT_LOADER_URL
if form_data.EXTERNAL_DOCUMENT_LOADER_URL is not None
else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL
)
request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY = (
form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY
if form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY is not None
else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY
)
request.app.state.config.TIKA_SERVER_URL = (
form_data.TIKA_SERVER_URL
if form_data.TIKA_SERVER_URL is not None
@@ -818,6 +833,8 @@ async def update_rag_config(
# Content extraction settings
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
"EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
"TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL,
"DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL,
"DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE,
@@ -1139,6 +1156,8 @@ def process_file(
file_path = Storage.get_file(file_path)
loader = Loader(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE,