From de70d0cb6404154b8cc224baefd2923e3798af46 Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Wed, 14 May 2025 21:26:49 +0400 Subject: [PATCH] feat: docling do picture description support --- backend/open_webui/config.py | 6 +++ backend/open_webui/main.py | 2 + backend/open_webui/retrieval/loaders/main.py | 35 ++++++++++----- backend/open_webui/routers/retrieval.py | 11 +++++ .../admin/Settings/Documents.svelte | 45 ++++++++++++------- 5 files changed, 70 insertions(+), 29 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 38bd709f1..02c3604d2 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1849,6 +1849,12 @@ DOCLING_OCR_LANG = PersistentConfig( os.getenv("DOCLING_OCR_LANG", "eng,fra,deu,spa"), ) +DOCLING_DO_PICTURE_DESCRIPTION = PersistentConfig( + "DOCLING_DO_PICTURE_DESCRIPTION", + "rag.docling_do_picture_description", + os.getenv("DOCLING_DO_PICTURE_DESCRIPTION", "False").lower() == "true", +) + DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( "DOCUMENT_INTELLIGENCE_ENDPOINT", "rag.document_intelligence_endpoint", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 8c6d6c22f..d50b5d319 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -210,6 +210,7 @@ from open_webui.config import ( DOCLING_SERVER_URL, DOCLING_OCR_ENGINE, DOCLING_OCR_LANG, + DOCLING_DO_PICTURE_DESCRIPTION, DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY, MISTRAL_OCR_API_KEY, @@ -645,6 +646,7 @@ app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG +app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = DOCLING_DO_PICTURE_DESCRIPTION app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY 
app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 8e7b5a3da..1f34c9f79 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -126,14 +126,12 @@ class TikaLoader: class DoclingLoader: - def __init__( - self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None - ): + def __init__(self, url, file_path=None, mime_type=None, params=None): self.url = url.rstrip("/") self.file_path = file_path self.mime_type = mime_type - self.ocr_engine = ocr_engine - self.ocr_lang = ocr_lang + + self.params = params or {} def load(self) -> list[Document]: with open(self.file_path, "rb") as f: @@ -150,11 +148,19 @@ class DoclingLoader: "table_mode": "accurate", } - if self.ocr_engine and self.ocr_lang: - params["ocr_engine"] = self.ocr_engine - params["ocr_lang"] = [ - lang.strip() for lang in self.ocr_lang.split(",") if lang.strip() - ] + if self.params: + if self.params.get("do_picture_description"): + params["do_picture_description"] = self.params.get( + "do_picture_description" + ) + + if self.params.get("ocr_engine") and self.params.get("ocr_lang"): + params["ocr_engine"] = self.params.get("ocr_engine") + params["ocr_lang"] = [ + lang.strip() + for lang in self.params.get("ocr_lang").split(",") + if lang.strip() + ] endpoint = f"{self.url}/v1alpha/convert/file" r = requests.post(endpoint, files=files, data=params) @@ -225,8 +231,13 @@ class Loader: url=self.kwargs.get("DOCLING_SERVER_URL"), file_path=file_path, mime_type=file_content_type, - ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"), - ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"), + params={ + "ocr_engine": self.kwargs.get("DOCLING_OCR_ENGINE"), + "ocr_lang": self.kwargs.get("DOCLING_OCR_LANG"), + "do_picture_description": self.kwargs.get( + "DOCLING_DO_PICTURE_DESCRIPTION" + ), + }, ) elif ( self.engine == 
"document_intelligence" diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 0b414a551..5f867dadf 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -356,6 +356,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, + "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, @@ -496,6 +497,7 @@ class ConfigForm(BaseModel): DOCLING_SERVER_URL: Optional[str] = None DOCLING_OCR_ENGINE: Optional[str] = None DOCLING_OCR_LANG: Optional[str] = None + DOCLING_DO_PICTURE_DESCRIPTION: Optional[bool] = None DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None @@ -601,6 +603,13 @@ async def update_rag_config( if form_data.DOCLING_OCR_LANG is not None else request.app.state.config.DOCLING_OCR_LANG ) + + request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION = ( + form_data.DOCLING_DO_PICTURE_DESCRIPTION + if form_data.DOCLING_DO_PICTURE_DESCRIPTION is not None + else request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION + ) + request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( form_data.DOCUMENT_INTELLIGENCE_ENDPOINT if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None @@ -813,6 +822,7 @@ async def update_rag_config( "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, "DOCLING_OCR_LANG": 
request.app.state.config.DOCLING_OCR_LANG, + "DOCLING_DO_PICTURE_DESCRIPTION": request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, @@ -1133,6 +1143,7 @@ def process_file( DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE, DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG, + DOCLING_DO_PICTURE_DESCRIPTION=request.app.state.config.DOCLING_DO_PICTURE_DESCRIPTION, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index cc56356fa..8f6a68cf4 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -246,7 +246,7 @@
-
+
{$i18n.t('Content Extraction Engine')}
@@ -279,7 +279,7 @@
@@ -288,27 +288,38 @@ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling'}
+ +
+
+
+ {$i18n.t('Describe Pictures in Documents')} +
+
+ +
+
+
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'}
@@ -437,7 +448,7 @@ {#if embeddingEngine === 'openai'}
{$i18n.t('Top K')}
{$i18n.t('Top K Reranker')}