mirror of
				https://github.com/open-webui/open-webui
				synced 2025-06-26 18:26:48 +00:00 
			
		
		
		
	Merge pull request #11464 from FabioPolito24/docling_context_extraction_engine
feat: Docling context extraction engine
This commit is contained in:
		
						commit
						f8ac44cfbd
					
				| @ -1654,6 +1654,12 @@ TIKA_SERVER_URL = PersistentConfig( | ||||
|     os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment | ||||
| ) | ||||
| 
 | ||||
| DOCLING_SERVER_URL = PersistentConfig( | ||||
|     "DOCLING_SERVER_URL", | ||||
|     "rag.docling_server_url", | ||||
|     os.getenv("DOCLING_SERVER_URL", "http://docling:5001"), | ||||
| ) | ||||
| 
 | ||||
| DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( | ||||
|     "DOCUMENT_INTELLIGENCE_ENDPOINT", | ||||
|     "rag.document_intelligence_endpoint", | ||||
|  | ||||
| @ -186,6 +186,7 @@ from open_webui.config import ( | ||||
|     CHUNK_SIZE, | ||||
|     CONTENT_EXTRACTION_ENGINE, | ||||
|     TIKA_SERVER_URL, | ||||
|     DOCLING_SERVER_URL, | ||||
|     DOCUMENT_INTELLIGENCE_ENDPOINT, | ||||
|     DOCUMENT_INTELLIGENCE_KEY, | ||||
|     RAG_TOP_K, | ||||
| @ -551,6 +552,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( | ||||
| 
 | ||||
| app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE | ||||
| app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL | ||||
| app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL | ||||
| app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT | ||||
| app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY | ||||
| 
 | ||||
|  | ||||
| @ -117,6 +117,52 @@ class TikaLoader: | ||||
|             raise Exception(f"Error calling Tika: {r.reason}") | ||||
| 
 | ||||
| 
 | ||||
class DoclingLoader:
    """Extract a file's text by uploading it to a Docling server.

    The file is posted as multipart form data to the server's
    ``/v1alpha/convert/file`` endpoint, and the Markdown content found in the
    JSON response is wrapped in a single ``Document``.
    """

    def __init__(self, url, file_path=None, mime_type=None, timeout=None):
        """Store the connection settings.

        Args:
            url: Base URL of the Docling server; trailing slashes are removed.
            file_path: Path of the file to convert.
            mime_type: MIME type sent with the upload and recorded in the
                resulting metadata. ``application/octet-stream`` is sent when
                it is not given.
            timeout: Optional timeout forwarded to ``requests.post``. The
                default ``None`` waits indefinitely, which keeps the previous
                behaviour for existing callers.
        """
        self.url = url.rstrip("/")
        self.file_path = file_path
        self.mime_type = mime_type
        self.timeout = timeout

    def load(self) -> list[Document]:
        """Convert the file and return its content as one ``Document``.

        Returns:
            A single-element list whose document holds the ``md_content``
            returned by the server (or a placeholder string when that field
            is missing) and, when a MIME type was given, a ``Content-Type``
            metadata entry.

        Raises:
            Exception: If the server answers with a non-success status. The
                message carries the HTTP reason plus the response's
                ``detail`` field or raw body when available.
        """
        # Local import keeps this block self-contained.
        import os

        with open(self.file_path, "rb") as f:
            files = {
                "files": (
                    # Send only the base name: the extension is preserved,
                    # but the local directory layout is not disclosed to the
                    # remote service.
                    os.path.basename(self.file_path),
                    f,
                    self.mime_type or "application/octet-stream",
                )
            }

            params = {
                "image_export_mode": "placeholder",
                "table_mode": "accurate",
            }

            endpoint = f"{self.url}/v1alpha/convert/file"
            # The request must complete while the file handle is still open.
            r = requests.post(
                endpoint, files=files, data=params, timeout=self.timeout
            )

        if not r.ok:
            raise Exception(self._describe_error(r))

        result = r.json()
        # Tolerate an explicit ``"document": null`` in the response.
        document_data = result.get("document") or {}
        text = document_data.get("md_content")
        if text is None:
            text = "<No text content found>"

        metadata = {"Content-Type": self.mime_type} if self.mime_type else {}

        log.debug("Docling extracted text: %s", text)

        return [Document(page_content=text, metadata=metadata)]

    @staticmethod
    def _describe_error(r):
        """Build the exception message for a failed Docling response."""
        error_msg = f"Error calling Docling: {r.reason}"
        if not r.text:
            return error_msg
        try:
            error_data = r.json()
        except ValueError:
            # Body is not JSON; fall back to the raw text.
            return f"{error_msg} - {r.text}"
        if isinstance(error_data, dict):
            if "detail" in error_data:
                error_msg += f" - {error_data['detail']}"
            return error_msg
        # JSON, but not an object: the raw body is the best description.
        return f"{error_msg} - {r.text}"
| 
 | ||||
| 
 | ||||
| class Loader: | ||||
|     def __init__(self, engine: str = "", **kwargs): | ||||
|         self.engine = engine | ||||
| @ -149,6 +195,12 @@ class Loader: | ||||
|                     file_path=file_path, | ||||
|                     mime_type=file_content_type, | ||||
|                 ) | ||||
|         elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"): | ||||
|             loader = DoclingLoader( | ||||
|                 url=self.kwargs.get("DOCLING_SERVER_URL"), | ||||
|                 file_path=file_path, | ||||
|                 mime_type=file_content_type, | ||||
|             ) | ||||
|         elif ( | ||||
|             self.engine == "document_intelligence" | ||||
|             and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != "" | ||||
|  | ||||
| @ -358,6 +358,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): | ||||
|         "content_extraction": { | ||||
|             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, | ||||
|             "tika_server_url": request.app.state.config.TIKA_SERVER_URL, | ||||
|             "docling_server_url": request.app.state.config.DOCLING_SERVER_URL, | ||||
|             "document_intelligence_config": { | ||||
|                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, | ||||
|                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, | ||||
| @ -428,6 +429,7 @@ class DocumentIntelligenceConfigForm(BaseModel): | ||||
| class ContentExtractionConfig(BaseModel): | ||||
|     engine: str = "" | ||||
|     tika_server_url: Optional[str] = None | ||||
|     docling_server_url: Optional[str] = None | ||||
|     document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None | ||||
| 
 | ||||
| 
 | ||||
| @ -540,6 +542,9 @@ async def update_rag_config( | ||||
|         request.app.state.config.TIKA_SERVER_URL = ( | ||||
|             form_data.content_extraction.tika_server_url | ||||
|         ) | ||||
|         request.app.state.config.DOCLING_SERVER_URL = ( | ||||
|             form_data.content_extraction.docling_server_url | ||||
|         ) | ||||
|         if form_data.content_extraction.document_intelligence_config is not None: | ||||
|             request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( | ||||
|                 form_data.content_extraction.document_intelligence_config.endpoint | ||||
| @ -648,6 +653,7 @@ async def update_rag_config( | ||||
|         "content_extraction": { | ||||
|             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, | ||||
|             "tika_server_url": request.app.state.config.TIKA_SERVER_URL, | ||||
|             "docling_server_url": request.app.state.config.DOCLING_SERVER_URL, | ||||
|             "document_intelligence_config": { | ||||
|                 "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, | ||||
|                 "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, | ||||
| @ -990,6 +996,7 @@ def process_file( | ||||
|                 loader = Loader( | ||||
|                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, | ||||
|                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, | ||||
|                     DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, | ||||
|                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, | ||||
|                     DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, | ||||
|                     DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, | ||||
|  | ||||
| @ -49,6 +49,8 @@ | ||||
| 	let contentExtractionEngine = 'default'; | ||||
| 	let tikaServerUrl = ''; | ||||
| 	let showTikaServerUrl = false; | ||||
| 	let doclingServerUrl = ''; | ||||
| 	let showDoclingServerUrl = false; | ||||
| 	let documentIntelligenceEndpoint = ''; | ||||
| 	let documentIntelligenceKey = ''; | ||||
| 	let showDocumentIntelligenceConfig = false; | ||||
| @ -175,6 +177,10 @@ | ||||
| 			toast.error($i18n.t('Tika Server URL required.')); | ||||
| 			return; | ||||
| 		} | ||||
| 		if (contentExtractionEngine === 'docling' && doclingServerUrl === '') { | ||||
| 			toast.error($i18n.t('Docling Server URL required.')); | ||||
| 			return; | ||||
| 		} | ||||
| 		if ( | ||||
| 			contentExtractionEngine === 'document_intelligence' && | ||||
| 			(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '') | ||||
| @ -209,6 +215,7 @@ | ||||
| 			content_extraction: { | ||||
| 				engine: contentExtractionEngine, | ||||
| 				tika_server_url: tikaServerUrl, | ||||
| 				docling_server_url: doclingServerUrl, | ||||
| 				document_intelligence_config: { | ||||
| 					key: documentIntelligenceKey, | ||||
| 					endpoint: documentIntelligenceEndpoint | ||||
| @ -269,7 +276,10 @@ | ||||
| 
 | ||||
| 			contentExtractionEngine = res.content_extraction.engine; | ||||
| 			tikaServerUrl = res.content_extraction.tika_server_url; | ||||
| 			doclingServerUrl = res.content_extraction.docling_server_url; | ||||
| 
 | ||||
| 			showTikaServerUrl = contentExtractionEngine === 'tika'; | ||||
| 			showDoclingServerUrl = contentExtractionEngine === 'docling'; | ||||
| 			documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint; | ||||
| 			documentIntelligenceKey = res.content_extraction.document_intelligence_config.key; | ||||
| 			showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence'; | ||||
| @ -337,6 +347,7 @@ | ||||
| 							> | ||||
| 								<option value="">{$i18n.t('Default')} </option> | ||||
| 								<option value="tika">{$i18n.t('Tika')}</option> | ||||
| 								<option value="docling">{ $i18n.t('Docling') }</option> | ||||
| 								<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option> | ||||
| 							</select> | ||||
| 						</div> | ||||
| @ -351,6 +362,14 @@ | ||||
| 								/> | ||||
| 							</div> | ||||
| 						</div> | ||||
| 					{:else if contentExtractionEngine === 'docling'} | ||||
| 						<div class="flex w-full mt-1"> | ||||
| 							<input | ||||
| 								class="flex-1 w-full rounded-lg text-sm bg-transparent outline-hidden" | ||||
| 								placeholder={$i18n.t('Enter Docling Server URL')} | ||||
| 								bind:value={doclingServerUrl} | ||||
| 							/> | ||||
| 						</div> | ||||
| 					{:else if contentExtractionEngine === 'document_intelligence'} | ||||
| 						<div class="my-0.5 flex gap-2 pr-2"> | ||||
| 							<input | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user