diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 2f45378e1..1cb6ab56a 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1654,6 +1654,12 @@ TIKA_SERVER_URL = PersistentConfig( os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment ) +DOCLING_SERVER_URL = PersistentConfig( + "DOCLING_SERVER_URL", + "rag.docling_server_url", + os.getenv("DOCLING_SERVER_URL", "http://docling:5001"), +) + DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( "DOCUMENT_INTELLIGENCE_ENDPOINT", "rag.document_intelligence_endpoint", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 46a882f65..a453df0d7 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -186,6 +186,7 @@ from open_webui.config import ( CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL, + DOCLING_SERVER_URL, DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY, RAG_TOP_K, @@ -551,6 +552,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL +app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 7fa24ced3..5bcd2d321 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -117,6 +117,52 @@ class TikaLoader: raise Exception(f"Error calling Tika: {r.reason}") +class DoclingLoader: + def __init__(self, url, file_path=None, mime_type=None): + self.url = url.rstrip("/") + self.file_path = file_path + self.mime_type = mime_type + + def load(self) -> list[Document]: + with open(self.file_path, "rb") as f: + files = { + "files": ( + self.file_path, + f, + self.mime_type or "application/octet-stream", + ) + } + + params = { + "image_export_mode": "placeholder", + "table_mode": "accurate", + } + + endpoint = f"{self.url}/v1alpha/convert/file" + r = requests.post(endpoint, files=files, data=params) + + if r.ok: + result = r.json() + document_data = result.get("document", {}) + text = document_data.get("md_content", "") + + metadata = {"Content-Type": self.mime_type} if self.mime_type else {} + + log.debug("Docling extracted text: %s", text) + + return [Document(page_content=text, metadata=metadata)] + else: + error_msg = f"Error calling Docling API: {r.reason}" + if r.text: + try: + error_data = r.json() + if "detail" in error_data: + error_msg += f" - {error_data['detail']}" + except Exception: + error_msg += f" - {r.text}" + raise Exception(f"Error calling Docling: {error_msg}") + + class Loader: def __init__(self, engine: str = "", **kwargs): self.engine = engine @@ -149,6 +195,12 @@ class Loader: file_path=file_path, mime_type=file_content_type, ) + elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"): + loader = DoclingLoader( + url=self.kwargs.get("DOCLING_SERVER_URL"), + file_path=file_path, + mime_type=file_content_type, + ) elif ( self.engine == "document_intelligence" and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != "" diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index ac38c236e..85ffdde74 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -358,6 +358,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "content_extraction": { "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": request.app.state.config.TIKA_SERVER_URL, + "docling_server_url": request.app.state.config.DOCLING_SERVER_URL, "document_intelligence_config": { "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, @@ -428,6 +429,7 @@ class DocumentIntelligenceConfigForm(BaseModel): class ContentExtractionConfig(BaseModel): engine: str = "" tika_server_url: Optional[str] = None + docling_server_url: Optional[str] = None document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None @@ -540,6 +542,9 @@ async def update_rag_config( request.app.state.config.TIKA_SERVER_URL = ( form_data.content_extraction.tika_server_url ) + request.app.state.config.DOCLING_SERVER_URL = ( + form_data.content_extraction.docling_server_url + ) if form_data.content_extraction.document_intelligence_config is not None: request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( form_data.content_extraction.document_intelligence_config.endpoint @@ -648,6 +653,7 @@ async def update_rag_config( "content_extraction": { "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": request.app.state.config.TIKA_SERVER_URL, + "docling_server_url": request.app.state.config.DOCLING_SERVER_URL, "document_intelligence_config": { "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, @@ -990,6 +996,7 @@ def process_file( loader = Loader( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, + DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 408c344b6..5498d1d08 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -49,6 +49,8 @@ let contentExtractionEngine = 'default'; let tikaServerUrl = ''; let showTikaServerUrl = false; + let doclingServerUrl = ''; + let showDoclingServerUrl = false; let documentIntelligenceEndpoint = ''; let documentIntelligenceKey = ''; let showDocumentIntelligenceConfig = false; @@ -175,6 +177,10 @@ toast.error($i18n.t('Tika Server URL required.')); return; } + if (contentExtractionEngine === 'docling' && doclingServerUrl === '') { + toast.error($i18n.t('Docling Server URL required.')); + return; + } if ( contentExtractionEngine === 'document_intelligence' && (documentIntelligenceEndpoint === '' || documentIntelligenceKey === '') @@ -209,6 +215,7 @@ content_extraction: { engine: contentExtractionEngine, tika_server_url: tikaServerUrl, + docling_server_url: doclingServerUrl, document_intelligence_config: { key: documentIntelligenceKey, endpoint: documentIntelligenceEndpoint @@ -269,7 +276,10 @@ contentExtractionEngine = res.content_extraction.engine; tikaServerUrl = res.content_extraction.tika_server_url; + doclingServerUrl = res.content_extraction.docling_server_url; + showTikaServerUrl = contentExtractionEngine === 'tika'; + showDoclingServerUrl = contentExtractionEngine === 'docling'; documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint; documentIntelligenceKey = res.content_extraction.document_intelligence_config.key; showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence'; @@ -337,6 +347,7 @@ > + @@ -351,6 +362,14 @@ /> + {:else if contentExtractionEngine === 'docling'} +
+ +
{:else if contentExtractionEngine === 'document_intelligence'}