feat: docling support for document preprocessing

2025-06-26 18:26:48 +00:00 · 2025-02-14 12:08:03 +00:00
parent 22c100bb6b
commit 2419ef06a0
6 changed files with 163 additions and 515 deletions
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1378,6 +1378,12 @@ TIKA_SERVER_URL = PersistentConfig(
    os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment
 )

+DOCLING_SERVER_URL = PersistentConfig(  # Base URL handed to DoclingLoader for content extraction
+    "DOCLING_SERVER_URL",
+    "rag.docling_server_url",
+    os.getenv("DOCLING_SERVER_URL", "http://docling:5001"),  # Env var wins; otherwise this default
+)
+
 RAG_TOP_K = PersistentConfig(
    "RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
 )
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -154,6 +154,7 @@ from open_webui.config import (
    CHUNK_SIZE,
    CONTENT_EXTRACTION_ENGINE,
    TIKA_SERVER_URL,
+    DOCLING_SERVER_URL,
    RAG_TOP_K,
    RAG_TEXT_SPLITTER,
    TIKTOKEN_ENCODING_NAME,
@@ -477,6 +478,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (

 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
 app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
+app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL

 app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
 app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -115,6 +115,61 @@ class TikaLoader:
            raise Exception(f"Error calling Tika: {r.reason}")


+class DoclingLoader:
+    """Extract a file's text as Markdown through a Docling server."""
+
+    def __init__(self, url, file_path=None, mime_type=None):
+        self.url = (url or "").rstrip("/")  # No trailing slash; unset URL fails in load()
+        self.file_path = file_path
+        self.mime_type = mime_type
+
+    def load(self) -> list[Document]:
+        """Convert the file via Docling and return its Markdown as one Document.
+
+        Raises ValueError without a URL/file path, Exception on an error status.
+        """
+        if not self.url:
+            raise ValueError("Docling server URL is required for DoclingLoader")
+        if self.file_path is None:
+            raise ValueError("File path is required for DoclingLoader")
+
+        params = {
+            "from_formats": ["docx", "pptx", "html", "xml_pubmed", "image", "pdf", "asciidoc", "md", "xlsx", "xml_uspto", "json_docling"],
+            "to_formats": ["md"],
+            "image_export_mode": "placeholder",
+            "do_ocr": True,
+            "force_ocr": False,
+            "ocr_engine": "easyocr",
+            "ocr_lang": None,
+            "pdf_backend": "dlparse_v2",
+            "table_mode": "fast",
+            "abort_on_error": False,
+            "return_as_file": False,
+            "do_table_structure": True,
+            "include_images": True,
+            "images_scale": 2.0,
+        }
+        with open(self.file_path, "rb") as f:
+            files = {"files": (self.file_path, f, self.mime_type or "application/octet-stream")}
+            response = requests.post(f"{self.url}/v1alpha/convert/file", files=files, data=params)
+        if response.ok:
+            document_data = response.json().get("document") or {}
+            text = document_data.get("md_content", "<No text content found>")
+            log.debug("Docling extracted text: %s", text)
+            metadata = {"Content-Type": self.mime_type} if self.mime_type else {}
+            return [Document(page_content=text, metadata=metadata)]
+
+        error_msg = f"Error calling Docling API: {response.status_code}"
+        if response.text:
+            try:
+                error_data = response.json()
+            except ValueError:  # Body is not JSON; report it verbatim
+                error_data = {"detail": response.text}
+            if isinstance(error_data, dict) and "detail" in error_data:
+                error_msg += f" - {error_data['detail']}"
+        raise Exception(f"Error calling Docling: {error_msg}")
+
+
 class Loader:
    def __init__(self, engine: str = "", **kwargs):
        self.engine = engine
@@ -147,6 +202,12 @@ class Loader:
                    file_path=file_path,
                    mime_type=file_content_type,
                )
+        elif self.engine == "docling":
+            loader = DoclingLoader(
+                url=self.kwargs.get("DOCLING_SERVER_URL"),
+                file_path=file_path,
+                mime_type=file_content_type,
+            )
        else:
            if file_ext == "pdf":
                loader = PyPDFLoader(
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -351,6 +351,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
        "content_extraction": {
            "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
            "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
        },
        "chunk": {
            "text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -403,6 +404,7 @@ class FileConfig(BaseModel):
 class ContentExtractionConfig(BaseModel):
    engine: str = ""
    tika_server_url: Optional[str] = None
+    docling_server_url: Optional[str] = None


 class ChunkParamUpdateForm(BaseModel):
@@ -483,6 +485,9 @@ async def update_rag_config(
        request.app.state.config.TIKA_SERVER_URL = (
            form_data.content_extraction.tika_server_url
        )
+        request.app.state.config.DOCLING_SERVER_URL = (
+            form_data.content_extraction.docling_server_url
+        )

    if form_data.chunk is not None:
        request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
@@ -559,6 +564,7 @@ async def update_rag_config(
        "content_extraction": {
            "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
            "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "docling_server_url": request.app.state.config.DOCLING_SERVER_URL,
        },
        "chunk": {
            "text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -879,6 +885,7 @@ def process_file(
                loader = Loader(
                    engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
                    TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
+                    DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL,
                    PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
                )
                docs = loader.load(