Merge pull request #10486 from Micca/feature/document_intelligence_support

Feat: Adding Support for Azure AI Document Intelligence for Content Extraction (Revised)
2025-06-26 18:26:48 +00:00 · 2025-02-21 10:56:18 -08:00
parent a3c2ff61ed 35f3824932
commit ab1b910d80
8 changed files with 107 additions and 2 deletions
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1579,6 +1579,18 @@ TIKA_SERVER_URL = PersistentConfig(
    os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment
 )

+DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
+    "DOCUMENT_INTELLIGENCE_ENDPOINT",
+    "rag.document_intelligence_endpoint",
+    os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT", ""),
+)
+
+DOCUMENT_INTELLIGENCE_KEY = PersistentConfig(
+    "DOCUMENT_INTELLIGENCE_KEY",
+    "rag.document_intelligence_key",
+    os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),
+)
+
 RAG_TOP_K = PersistentConfig(
    "RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
 )
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -180,6 +180,8 @@ from open_webui.config import (
    CHUNK_SIZE,
    CONTENT_EXTRACTION_ENGINE,
    TIKA_SERVER_URL,
+    DOCUMENT_INTELLIGENCE_ENDPOINT,
+    DOCUMENT_INTELLIGENCE_KEY,
    RAG_TOP_K,
    RAG_TEXT_SPLITTER,
    TIKTOKEN_ENCODING_NAME,
@@ -533,6 +535,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (

 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
 app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
+app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
+app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY

 app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
 app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -4,6 +4,7 @@ import ftfy
 import sys

 from langchain_community.document_loaders import (
+    AzureAIDocumentIntelligenceLoader,
    BSHTMLLoader,
    CSVLoader,
    Docx2txtLoader,
@@ -147,6 +148,27 @@ class Loader:
                    file_path=file_path,
                    mime_type=file_content_type,
                )
+        elif (
+            self.engine == "document_intelligence"
+            and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
+            and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""
+            and (
+                file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]
+                or file_content_type
+                in [
+                    "application/vnd.ms-excel",
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    "application/vnd.ms-powerpoint",
+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                ]
+            )
+        ):
+            loader = AzureAIDocumentIntelligenceLoader(
+                file_path=file_path,
+                api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
+                api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
+            )
        else:
            if file_ext == "pdf":
                loader = PyPDFLoader(
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -356,6 +356,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
        "content_extraction": {
            "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
            "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "document_intelligence_config": {
+                "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+                "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
+            },
        },
        "chunk": {
            "text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -411,9 +415,15 @@ class FileConfig(BaseModel):
    max_count: Optional[int] = None


+class DocumentIntelligenceConfigForm(BaseModel):
+    endpoint: str
+    key: str
+
+
 class ContentExtractionConfig(BaseModel):
    engine: str = ""
    tika_server_url: Optional[str] = None
+    document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None


 class ChunkParamUpdateForm(BaseModel):
@@ -501,13 +511,22 @@ async def update_rag_config(
        request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count

    if form_data.content_extraction is not None:
-        log.info(f"Updating text settings: {form_data.content_extraction}")
+        log.info(
+            f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}"
+        )
        request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
            form_data.content_extraction.engine
        )
        request.app.state.config.TIKA_SERVER_URL = (
            form_data.content_extraction.tika_server_url
        )
+        if form_data.content_extraction.document_intelligence_config is not None:
+            request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
+                form_data.content_extraction.document_intelligence_config.endpoint
+            )
+            request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
+                form_data.content_extraction.document_intelligence_config.key
+            )

    if form_data.chunk is not None:
        request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
@@ -604,6 +623,10 @@ async def update_rag_config(
        "content_extraction": {
            "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
            "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "document_intelligence_config": {
+                "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+                "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
+            },
        },
        "chunk": {
            "text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -937,6 +960,8 @@ def process_file(
                    engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
                    TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
                    PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
+                    DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+                    DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
                )
                docs = loader.load(
                    file.filename, file.meta.get("content_type"), file_path