mirror of
https://github.com/open-webui/open-webui
synced 2025-05-30 18:39:15 +00:00
feat: Implement Document Intelligence as Content Extraction Engine
This commit is contained in:
parent
e9d6ada25c
commit
35f3824932
@ -1431,6 +1431,18 @@ TIKA_SERVER_URL = PersistentConfig(
|
|||||||
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
|
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
|
||||||
)
|
)
|
||||||
|
|
||||||
|
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
|
||||||
|
"DOCUMENT_INTELLIGENCE_ENDPOINT",
|
||||||
|
"rag.document_intelligence_endpoint",
|
||||||
|
os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT", ""),
|
||||||
|
)
|
||||||
|
|
||||||
|
DOCUMENT_INTELLIGENCE_KEY = PersistentConfig(
|
||||||
|
"DOCUMENT_INTELLIGENCE_KEY",
|
||||||
|
"rag.document_intelligence_key",
|
||||||
|
os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),
|
||||||
|
)
|
||||||
|
|
||||||
RAG_TOP_K = PersistentConfig(
|
RAG_TOP_K = PersistentConfig(
|
||||||
"RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
|
"RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
|
||||||
)
|
)
|
||||||
|
@ -154,6 +154,8 @@ from open_webui.config import (
|
|||||||
CHUNK_SIZE,
|
CHUNK_SIZE,
|
||||||
CONTENT_EXTRACTION_ENGINE,
|
CONTENT_EXTRACTION_ENGINE,
|
||||||
TIKA_SERVER_URL,
|
TIKA_SERVER_URL,
|
||||||
|
DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
|
DOCUMENT_INTELLIGENCE_KEY,
|
||||||
RAG_TOP_K,
|
RAG_TOP_K,
|
||||||
RAG_TEXT_SPLITTER,
|
RAG_TEXT_SPLITTER,
|
||||||
TIKTOKEN_ENCODING_NAME,
|
TIKTOKEN_ENCODING_NAME,
|
||||||
@ -478,6 +480,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
|
|||||||
|
|
||||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||||
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
|
||||||
|
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
|
||||||
|
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
|
||||||
|
|
||||||
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
|
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
|
||||||
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
|
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
|
||||||
|
@ -4,6 +4,7 @@ import ftfy
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
from langchain_community.document_loaders import (
|
from langchain_community.document_loaders import (
|
||||||
|
AzureAIDocumentIntelligenceLoader,
|
||||||
BSHTMLLoader,
|
BSHTMLLoader,
|
||||||
CSVLoader,
|
CSVLoader,
|
||||||
Docx2txtLoader,
|
Docx2txtLoader,
|
||||||
@ -147,6 +148,27 @@ class Loader:
|
|||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
mime_type=file_content_type,
|
mime_type=file_content_type,
|
||||||
)
|
)
|
||||||
|
elif (
|
||||||
|
self.engine == "document_intelligence"
|
||||||
|
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
|
||||||
|
and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""
|
||||||
|
and (
|
||||||
|
file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]
|
||||||
|
or file_content_type
|
||||||
|
in [
|
||||||
|
"application/vnd.ms-excel",
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"application/vnd.ms-powerpoint",
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
):
|
||||||
|
loader = AzureAIDocumentIntelligenceLoader(
|
||||||
|
file_path=file_path,
|
||||||
|
api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
|
||||||
|
api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if file_ext == "pdf":
|
if file_ext == "pdf":
|
||||||
loader = PyPDFLoader(
|
loader = PyPDFLoader(
|
||||||
|
@ -352,6 +352,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
|||||||
"content_extraction": {
|
"content_extraction": {
|
||||||
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
||||||
|
"document_intelligence_config": {
|
||||||
|
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
|
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"chunk": {
|
"chunk": {
|
||||||
"text_splitter": request.app.state.config.TEXT_SPLITTER,
|
"text_splitter": request.app.state.config.TEXT_SPLITTER,
|
||||||
@ -402,9 +406,15 @@ class FileConfig(BaseModel):
|
|||||||
max_count: Optional[int] = None
|
max_count: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentIntelligenceConfigForm(BaseModel):
|
||||||
|
endpoint: str
|
||||||
|
key: str
|
||||||
|
|
||||||
|
|
||||||
class ContentExtractionConfig(BaseModel):
|
class ContentExtractionConfig(BaseModel):
|
||||||
engine: str = ""
|
engine: str = ""
|
||||||
tika_server_url: Optional[str] = None
|
tika_server_url: Optional[str] = None
|
||||||
|
document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
|
||||||
|
|
||||||
|
|
||||||
class ChunkParamUpdateForm(BaseModel):
|
class ChunkParamUpdateForm(BaseModel):
|
||||||
@ -479,13 +489,22 @@ async def update_rag_config(
|
|||||||
request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count
|
request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count
|
||||||
|
|
||||||
if form_data.content_extraction is not None:
|
if form_data.content_extraction is not None:
|
||||||
log.info(f"Updating text settings: {form_data.content_extraction}")
|
log.info(
|
||||||
|
f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}"
|
||||||
|
)
|
||||||
request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
|
request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
|
||||||
form_data.content_extraction.engine
|
form_data.content_extraction.engine
|
||||||
)
|
)
|
||||||
request.app.state.config.TIKA_SERVER_URL = (
|
request.app.state.config.TIKA_SERVER_URL = (
|
||||||
form_data.content_extraction.tika_server_url
|
form_data.content_extraction.tika_server_url
|
||||||
)
|
)
|
||||||
|
if form_data.content_extraction.document_intelligence_config is not None:
|
||||||
|
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
|
||||||
|
form_data.content_extraction.document_intelligence_config.endpoint
|
||||||
|
)
|
||||||
|
request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
|
||||||
|
form_data.content_extraction.document_intelligence_config.key
|
||||||
|
)
|
||||||
|
|
||||||
if form_data.chunk is not None:
|
if form_data.chunk is not None:
|
||||||
request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
|
request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
|
||||||
@ -564,6 +583,10 @@ async def update_rag_config(
|
|||||||
"content_extraction": {
|
"content_extraction": {
|
||||||
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
|
||||||
|
"document_intelligence_config": {
|
||||||
|
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
|
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"chunk": {
|
"chunk": {
|
||||||
"text_splitter": request.app.state.config.TEXT_SPLITTER,
|
"text_splitter": request.app.state.config.TEXT_SPLITTER,
|
||||||
@ -887,6 +910,8 @@ def process_file(
|
|||||||
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||||
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
|
||||||
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||||
|
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
||||||
|
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
||||||
)
|
)
|
||||||
docs = loader.load(
|
docs = loader.load(
|
||||||
file.filename, file.meta.get("content_type"), file_path
|
file.filename, file.meta.get("content_type"), file_path
|
||||||
|
@ -72,6 +72,7 @@ validators==0.34.0
|
|||||||
psutil
|
psutil
|
||||||
sentencepiece
|
sentencepiece
|
||||||
soundfile==0.13.1
|
soundfile==0.13.1
|
||||||
|
azure-ai-documentintelligence==1.0.0
|
||||||
|
|
||||||
opencv-python-headless==4.11.0.86
|
opencv-python-headless==4.11.0.86
|
||||||
rapidocr-onnxruntime==1.3.24
|
rapidocr-onnxruntime==1.3.24
|
||||||
|
@ -77,6 +77,7 @@ dependencies = [
|
|||||||
"psutil",
|
"psutil",
|
||||||
"sentencepiece",
|
"sentencepiece",
|
||||||
"soundfile==0.13.1",
|
"soundfile==0.13.1",
|
||||||
|
"azure-ai-documentintelligence==1.0.0",
|
||||||
|
|
||||||
"opencv-python-headless==4.11.0.86",
|
"opencv-python-headless==4.11.0.86",
|
||||||
"rapidocr-onnxruntime==1.3.24",
|
"rapidocr-onnxruntime==1.3.24",
|
||||||
|
@ -32,9 +32,15 @@ type ChunkConfigForm = {
|
|||||||
chunk_overlap: number;
|
chunk_overlap: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
type DocumentIntelligenceConfigForm = {
|
||||||
|
key: string;
|
||||||
|
endpoint: string;
|
||||||
|
};
|
||||||
|
|
||||||
type ContentExtractConfigForm = {
|
type ContentExtractConfigForm = {
|
||||||
engine: string;
|
engine: string;
|
||||||
tika_server_url: string | null;
|
tika_server_url: string | null;
|
||||||
|
document_intelligence_config: DocumentIntelligenceConfigForm | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
type YoutubeConfigForm = {
|
type YoutubeConfigForm = {
|
||||||
|
@ -50,6 +50,9 @@
|
|||||||
let contentExtractionEngine = 'default';
|
let contentExtractionEngine = 'default';
|
||||||
let tikaServerUrl = '';
|
let tikaServerUrl = '';
|
||||||
let showTikaServerUrl = false;
|
let showTikaServerUrl = false;
|
||||||
|
let documentIntelligenceEndpoint = '';
|
||||||
|
let documentIntelligenceKey = '';
|
||||||
|
let showDocumentIntelligenceConfig = false;
|
||||||
|
|
||||||
let textSplitter = '';
|
let textSplitter = '';
|
||||||
let chunkSize = 0;
|
let chunkSize = 0;
|
||||||
@ -175,6 +178,13 @@
|
|||||||
toast.error($i18n.t('Tika Server URL required.'));
|
toast.error($i18n.t('Tika Server URL required.'));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (
|
||||||
|
contentExtractionEngine === 'document_intelligence' &&
|
||||||
|
(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
|
||||||
|
) {
|
||||||
|
toast.error($i18n.t('Document Intelligence endpoint and key required.'));
|
||||||
|
return;
|
||||||
|
}
|
||||||
const res = await updateRAGConfig(localStorage.token, {
|
const res = await updateRAGConfig(localStorage.token, {
|
||||||
pdf_extract_images: pdfExtractImages,
|
pdf_extract_images: pdfExtractImages,
|
||||||
enable_google_drive_integration: enableGoogleDriveIntegration,
|
enable_google_drive_integration: enableGoogleDriveIntegration,
|
||||||
@ -189,7 +199,11 @@
|
|||||||
},
|
},
|
||||||
content_extraction: {
|
content_extraction: {
|
||||||
engine: contentExtractionEngine,
|
engine: contentExtractionEngine,
|
||||||
tika_server_url: tikaServerUrl
|
tika_server_url: tikaServerUrl,
|
||||||
|
document_intelligence_config: {
|
||||||
|
key: documentIntelligenceKey,
|
||||||
|
endpoint: documentIntelligenceEndpoint
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -245,6 +259,9 @@
|
|||||||
contentExtractionEngine = res.content_extraction.engine;
|
contentExtractionEngine = res.content_extraction.engine;
|
||||||
tikaServerUrl = res.content_extraction.tika_server_url;
|
tikaServerUrl = res.content_extraction.tika_server_url;
|
||||||
showTikaServerUrl = contentExtractionEngine === 'tika';
|
showTikaServerUrl = contentExtractionEngine === 'tika';
|
||||||
|
documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
|
||||||
|
documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
|
||||||
|
showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
|
||||||
|
|
||||||
fileMaxSize = res?.file.max_size ?? '';
|
fileMaxSize = res?.file.max_size ?? '';
|
||||||
fileMaxCount = res?.file.max_count ?? '';
|
fileMaxCount = res?.file.max_count ?? '';
|
||||||
@ -568,10 +585,12 @@
|
|||||||
bind:value={contentExtractionEngine}
|
bind:value={contentExtractionEngine}
|
||||||
on:change={(e) => {
|
on:change={(e) => {
|
||||||
showTikaServerUrl = e.target.value === 'tika';
|
showTikaServerUrl = e.target.value === 'tika';
|
||||||
|
showDocumentIntelligenceConfig = e.target.value === 'document_intelligence';
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<option value="">{$i18n.t('Default')} </option>
|
<option value="">{$i18n.t('Default')} </option>
|
||||||
<option value="tika">{$i18n.t('Tika')}</option>
|
<option value="tika">{$i18n.t('Tika')}</option>
|
||||||
|
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -587,6 +606,21 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
|
|
||||||
|
{#if showDocumentIntelligenceConfig}
|
||||||
|
<div class="my-0.5 flex gap-2 pr-2">
|
||||||
|
<input
|
||||||
|
class="flex-1 w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
|
||||||
|
placeholder={$i18n.t('Enter Document Intelligence Endpoint')}
|
||||||
|
bind:value={documentIntelligenceEndpoint}
|
||||||
|
/>
|
||||||
|
|
||||||
|
<SensitiveInput
|
||||||
|
placeholder={$i18n.t('Enter Document Intelligence Key')}
|
||||||
|
bind:value={documentIntelligenceKey}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
{/if}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<hr class=" dark:border-gray-850" />
|
<hr class=" dark:border-gray-850" />
|
||||||
|
Loading…
Reference in New Issue
Block a user