diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 5e0e4f0a1..fe5abbed9 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1579,6 +1579,18 @@ TIKA_SERVER_URL = PersistentConfig( os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment ) +DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( + "DOCUMENT_INTELLIGENCE_ENDPOINT", + "rag.document_intelligence_endpoint", + os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT", ""), +) + +DOCUMENT_INTELLIGENCE_KEY = PersistentConfig( + "DOCUMENT_INTELLIGENCE_KEY", + "rag.document_intelligence_key", + os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""), +) + RAG_TOP_K = PersistentConfig( "RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3")) ) diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 346d28d6c..1371f7d15 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -180,6 +180,8 @@ from open_webui.config import ( CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL, + DOCUMENT_INTELLIGENCE_ENDPOINT, + DOCUMENT_INTELLIGENCE_KEY, RAG_TOP_K, RAG_TEXT_SPLITTER, TIKTOKEN_ENCODING_NAME, @@ -533,6 +535,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL +app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT +app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index a9372f65a..19d590f5c 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -4,6 +4,7 @@ import ftfy import sys from langchain_community.document_loaders import ( + AzureAIDocumentIntelligenceLoader, BSHTMLLoader, CSVLoader, Docx2txtLoader, @@ -147,6 +148,27 @@ class Loader: file_path=file_path, mime_type=file_content_type, ) + elif ( + self.engine == "document_intelligence" + and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != "" + and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != "" + and ( + file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"] + or file_content_type + in [ + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.ms-powerpoint", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ] + ) + ): + loader = AzureAIDocumentIntelligenceLoader( + file_path=file_path, + api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"), + api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"), + ) else: if file_ext == "pdf": loader = PyPDFLoader( diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index e69d2ce96..9611051b3 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -356,6 +356,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "content_extraction": { "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": request.app.state.config.TIKA_SERVER_URL, + "document_intelligence_config": { + "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, + }, }, "chunk": { "text_splitter": request.app.state.config.TEXT_SPLITTER, @@ -411,9 +415,15 @@ class FileConfig(BaseModel): max_count: Optional[int] = None +class DocumentIntelligenceConfigForm(BaseModel): + endpoint: str + key: str + + class ContentExtractionConfig(BaseModel): engine: str = "" tika_server_url: Optional[str] = None + document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None class ChunkParamUpdateForm(BaseModel): @@ -501,13 +511,22 @@ async def update_rag_config( request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count if form_data.content_extraction is not None: - log.info(f"Updating text settings: {form_data.content_extraction}") + log.info( + f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}" + ) request.app.state.config.CONTENT_EXTRACTION_ENGINE = ( form_data.content_extraction.engine ) request.app.state.config.TIKA_SERVER_URL = ( form_data.content_extraction.tika_server_url ) + if form_data.content_extraction.document_intelligence_config is not None: + request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( + form_data.content_extraction.document_intelligence_config.endpoint + ) + request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = ( + form_data.content_extraction.document_intelligence_config.key + ) if form_data.chunk is not None: request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter @@ -604,6 +623,10 @@ async def update_rag_config( "content_extraction": { "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": request.app.state.config.TIKA_SERVER_URL, + "document_intelligence_config": { + "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, + }, }, "chunk": { "text_splitter": request.app.state.config.TEXT_SPLITTER, @@ -937,6 +960,8 @@ def process_file( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, + DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, ) docs = loader.load( file.filename, file.meta.get("content_type"), file_path diff --git a/backend/requirements.txt b/backend/requirements.txt index 965741f78..47d441ad5 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -71,6 +71,7 @@ validators==0.34.0 psutil sentencepiece soundfile==0.13.1 +azure-ai-documentintelligence==1.0.0 opencv-python-headless==4.11.0.86 rapidocr-onnxruntime==1.3.24 diff --git a/pyproject.toml b/pyproject.toml index 5cd54da64..5282a9dba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ dependencies = [ "psutil", "sentencepiece", "soundfile==0.13.1", + "azure-ai-documentintelligence==1.0.0", "opencv-python-headless==4.11.0.86", "rapidocr-onnxruntime==1.3.24", diff --git a/src/lib/apis/retrieval/index.ts b/src/lib/apis/retrieval/index.ts index c35c37847..ed07ab5d0 100644 --- a/src/lib/apis/retrieval/index.ts +++ b/src/lib/apis/retrieval/index.ts @@ -32,9 +32,15 @@ type ChunkConfigForm = { chunk_overlap: number; }; +type DocumentIntelligenceConfigForm = { + key: string; + endpoint: string; +}; + type ContentExtractConfigForm = { engine: string; tika_server_url: string | null; + document_intelligence_config: DocumentIntelligenceConfigForm | null; }; type YoutubeConfigForm = { diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index c7c1f0e8f..b79086309 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -49,6 +49,9 @@ let contentExtractionEngine = 'default'; let tikaServerUrl = ''; let showTikaServerUrl = false; + let documentIntelligenceEndpoint = ''; + let documentIntelligenceKey = ''; + let showDocumentIntelligenceConfig = false; let textSplitter = ''; let chunkSize = 0; @@ -176,6 +179,13 @@ toast.error($i18n.t('Tika Server URL required.')); return; } + if ( + contentExtractionEngine === 'document_intelligence' && + (documentIntelligenceEndpoint === '' || documentIntelligenceKey === '') + ) { + toast.error($i18n.t('Document Intelligence endpoint and key required.')); + return; + } const res = await updateRAGConfig(localStorage.token, { pdf_extract_images: pdfExtractImages, enable_google_drive_integration: enableGoogleDriveIntegration, @@ -191,7 +201,11 @@ }, content_extraction: { engine: contentExtractionEngine, - tika_server_url: tikaServerUrl + tika_server_url: tikaServerUrl, + document_intelligence_config: { + key: documentIntelligenceKey, + endpoint: documentIntelligenceEndpoint + } } }); @@ -249,6 +263,9 @@ contentExtractionEngine = res.content_extraction.engine; tikaServerUrl = res.content_extraction.tika_server_url; showTikaServerUrl = contentExtractionEngine === 'tika'; + documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint; + documentIntelligenceKey = res.content_extraction.document_intelligence_config.key; + showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence'; fileMaxSize = res?.file.max_size ?? ''; fileMaxCount = res?.file.max_count ?? ''; @@ -585,10 +602,12 @@ bind:value={contentExtractionEngine} on:change={(e) => { showTikaServerUrl = e.target.value === 'tika'; + showDocumentIntelligenceConfig = e.target.value === 'document_intelligence'; }} > + @@ -604,6 +623,21 @@ {/if} + + {#if showDocumentIntelligenceConfig} +
+ + + +
+ {/if}