diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 7f593be2a..d0b4ed8f3 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1839,6 +1839,18 @@ DOCLING_SERVER_URL = PersistentConfig( os.getenv("DOCLING_SERVER_URL", "http://docling:5001"), ) +DOCLING_OCR_ENGINE = PersistentConfig( + "DOCLING_OCR_ENGINE", + "rag.docling_ocr_engine", + os.getenv("DOCLING_OCR_ENGINE", "tesseract"), +) + +DOCLING_OCR_LANG = PersistentConfig( + "DOCLING_OCR_LANG", + "rag.docling_ocr_lang", + os.getenv("DOCLING_OCR_LANG", "eng,fra,deu,spa"), +) + DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( "DOCUMENT_INTELLIGENCE_ENDPOINT", "rag.document_intelligence_endpoint", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index ef38904c0..83f5e6f15 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -202,6 +202,8 @@ from open_webui.config import ( CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL, DOCLING_SERVER_URL, + DOCLING_OCR_ENGINE, + DOCLING_OCR_LANG, DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY, MISTRAL_OCR_API_KEY, @@ -635,6 +637,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL +app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE +app.state.config.DOCLING_OCR_LANG = DOCLING_OCR_LANG app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.MISTRAL_OCR_API_KEY = MISTRAL_OCR_API_KEY diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 0403c5c51..fa996e16d 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -100,7 +100,7 @@ class TikaLoader: headers = {} if self.kwargs.get("PDF_EXTRACT_IMAGES") == True: - headers['X-Tika-PDFextractInlineImages'] = 'true' + headers["X-Tika-PDFextractInlineImages"] = "true" endpoint = self.url if not endpoint.endswith("/"): @@ -124,10 +124,14 @@ class TikaLoader: class DoclingLoader: - def __init__(self, url, file_path=None, mime_type=None): + def __init__( + self, url, file_path=None, mime_type=None, ocr_engine=None, ocr_lang=None + ): self.url = url.rstrip("/") self.file_path = file_path self.mime_type = mime_type + self.ocr_engine = ocr_engine + self.ocr_lang = ocr_lang def load(self) -> list[Document]: with open(self.file_path, "rb") as f: @@ -144,6 +148,12 @@ class DoclingLoader: "table_mode": "accurate", } + if self.ocr_engine and self.ocr_lang: + params["ocr_engine"] = self.ocr_engine + params["ocr_lang"] = [ + lang.strip() for lang in self.ocr_lang.split(",") if lang.strip() + ] + endpoint = f"{self.url}/v1alpha/convert/file" r = requests.post(endpoint, files=files, data=params) @@ -212,6 +222,8 @@ class Loader: url=self.kwargs.get("DOCLING_SERVER_URL"), file_path=file_path, mime_type=file_content_type, + ocr_engine=self.kwargs.get("DOCLING_OCR_ENGINE"), + ocr_lang=self.kwargs.get("DOCLING_OCR_LANG"), ) elif ( self.engine == "document_intelligence" diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 36897cdea..f75b03483 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -378,6 +378,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, + "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, + "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, @@ -511,6 +513,8 @@ class ConfigForm(BaseModel): PDF_EXTRACT_IMAGES: Optional[bool] = None TIKA_SERVER_URL: Optional[str] = None DOCLING_SERVER_URL: Optional[str] = None + DOCLING_OCR_ENGINE: Optional[str] = None + DOCLING_OCR_LANG: Optional[str] = None DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None MISTRAL_OCR_API_KEY: Optional[str] = None @@ -600,6 +604,16 @@ async def update_rag_config( if form_data.DOCLING_SERVER_URL is not None else request.app.state.config.DOCLING_SERVER_URL ) + request.app.state.config.DOCLING_OCR_ENGINE = ( + form_data.DOCLING_OCR_ENGINE + if form_data.DOCLING_OCR_ENGINE is not None + else request.app.state.config.DOCLING_OCR_ENGINE + ) + request.app.state.config.DOCLING_OCR_LANG = ( + form_data.DOCLING_OCR_LANG + if form_data.DOCLING_OCR_LANG is not None + else request.app.state.config.DOCLING_OCR_LANG + ) request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( form_data.DOCUMENT_INTELLIGENCE_ENDPOINT if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None @@ -767,6 +781,8 @@ async def update_rag_config( "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, + "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, + "DOCLING_OCR_LANG": request.app.state.config.DOCLING_OCR_LANG, "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, @@ -1080,6 +1096,8 @@ def process_file( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, + DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE, + DOCLING_OCR_LANG=request.app.state.config.DOCLING_OCR_LANG, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 2047a07e7..ed314e658 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -161,6 +161,12 @@ toast.error($i18n.t('Docling Server URL required.')); return; } + if (RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling' && + ((RAGConfig.DOCLING_OCR_ENGINE === '' && RAGConfig.DOCLING_OCR_LANG !== '') || + (RAGConfig.DOCLING_OCR_ENGINE !== '' && RAGConfig.DOCLING_OCR_LANG === ''))) { + toast.error($i18n.t('Both Docling OCR Engine and Language(s) must be provided or both left empty.')); + return; + } if ( RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' && @@ -326,6 +332,18 @@ bind:value={RAGConfig.DOCLING_SERVER_URL} /> +