diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index e3bb2bf5e..24b4a7453 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -91,7 +91,7 @@ from config import ( SRC_LOG_LEVELS, UPLOAD_DIR, DOCS_DIR, - TEXT_EXTRACTION_ENGINE, + CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL, RAG_TOP_K, RAG_RELEVANCE_THRESHOLD, @@ -148,7 +148,7 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION ) -app.state.config.TEXT_EXTRACTION_ENGINE = TEXT_EXTRACTION_ENGINE +app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL app.state.config.CHUNK_SIZE = CHUNK_SIZE @@ -395,8 +395,8 @@ async def get_rag_config(user=Depends(get_admin_user)): return { "status": True, "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, - "text_extraction": { - "engine": app.state.config.TEXT_EXTRACTION_ENGINE, + "content_extraction": { + "engine": app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": app.state.config.TIKA_SERVER_URL, }, "chunk": { @@ -428,7 +428,7 @@ async def get_rag_config(user=Depends(get_admin_user)): } -class TextExtractionConfig(BaseModel): +class ContentExtractionConfig(BaseModel): engine: str = "" tika_server_url: Optional[str] = None @@ -466,7 +466,7 @@ class WebConfig(BaseModel): class ConfigUpdateForm(BaseModel): pdf_extract_images: Optional[bool] = None - text_extraction: Optional[TextExtractionConfig] = None + content_extraction: Optional[ContentExtractionConfig] = None chunk: Optional[ChunkParamUpdateForm] = None youtube: Optional[YoutubeLoaderConfig] = None web: Optional[WebConfig] = None @@ -480,10 +480,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_ else app.state.config.PDF_EXTRACT_IMAGES ) - if form_data.text_extraction is not None: - log.info(f"Updating text settings: {form_data.text_extraction}") - app.state.config.TEXT_EXTRACTION_ENGINE = form_data.text_extraction.engine - app.state.config.TIKA_SERVER_URL = form_data.text_extraction.tika_server_url + if form_data.content_extraction is not None: + log.info(f"Updating text settings: {form_data.content_extraction}") + app.state.config.CONTENT_EXTRACTION_ENGINE = form_data.content_extraction.engine + app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url if form_data.chunk is not None: app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size @@ -521,8 +521,8 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_ return { "status": True, "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES, - "text_extraction": { - "engine": app.state.config.TEXT_EXTRACTION_ENGINE, + "content_extraction": { + "engine": app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": app.state.config.TIKA_SERVER_URL, }, "chunk": { @@ -1017,7 +1017,7 @@ class TikaLoader: self.mime_type = mime_type def load(self) -> List[Document]: - with (open(self.file_path, "rb") as f): + with open(self.file_path, "rb") as f: data = f.read() if self.mime_type is not None: @@ -1096,9 +1096,12 @@ def get_loader(filename: str, file_content_type: str, file_path: str): "msg", ] - if app.state.config.TEXT_EXTRACTION_ENGINE == "tika" and app.state.config.TIKA_SERVER_URL: + if ( + app.state.config.CONTENT_EXTRACTION_ENGINE == "tika" + and app.state.config.TIKA_SERVER_URL + ): if file_ext in known_source_ext or ( - file_content_type and file_content_type.find("text/") >= 0 + file_content_type and file_content_type.find("text/") >= 0 ): loader = TextLoader(file_path, autodetect_encoding=True) else: diff --git a/backend/config.py b/backend/config.py index 064ddff33..d6efde563 100644 --- a/backend/config.py +++ b/backend/config.py @@ -886,13 +886,13 @@ if WEBUI_AUTH and WEBUI_SECRET_KEY == "": raise ValueError(ERROR_MESSAGES.ENV_VAR_NOT_FOUND) #################################### -# RAG document text extraction +# RAG document content extraction #################################### -TEXT_EXTRACTION_ENGINE = PersistentConfig( - "TEXT_EXTRACTION_ENGINE", - "rag.text_extraction_engine", - os.environ.get("TEXT_EXTRACTION_ENGINE", "").lower() +CONTENT_EXTRACTION_ENGINE = PersistentConfig( + "CONTENT_EXTRACTION_ENGINE", + "rag.CONTENT_EXTRACTION_ENGINE", + os.environ.get("CONTENT_EXTRACTION_ENGINE", "").lower(), ) TIKA_SERVER_URL = PersistentConfig( diff --git a/src/lib/apis/rag/index.ts b/src/lib/apis/rag/index.ts index 4047c419a..b32e544ee 100644 --- a/src/lib/apis/rag/index.ts +++ b/src/lib/apis/rag/index.ts @@ -32,7 +32,7 @@ type ChunkConfigForm = { chunk_overlap: number; }; -type TextExtractConfigForm = { +type ContentExtractConfigForm = { engine: string; tika_server_url: string | null; }; @@ -45,7 +45,7 @@ type YoutubeConfigForm = { type RAGConfigForm = { pdf_extract_images?: boolean; chunk?: ChunkConfigForm; - text_extraction?: TextExtractConfigForm; + content_extraction?: ContentExtractConfigForm; web_loader_ssl_verification?: boolean; youtube?: YoutubeConfigForm; }; diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 1377eb5bb..2094d0421 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -37,7 +37,7 @@ let embeddingModel = ''; let rerankingModel = ''; - let textExtractionEngine = 'default'; + let contentExtractionEngine = 'default'; let tikaServerUrl = ''; let showTikaServerUrl = false; @@ -167,7 +167,7 @@ rerankingModelUpdateHandler(); } - if (textExtractionEngine === 'tika' && tikaServerUrl === '') { + if (contentExtractionEngine === 'tika' && tikaServerUrl === '') { toast.error($i18n.t('Tika Server URL required.')); return; } @@ -178,8 +178,8 @@ chunk_overlap: chunkOverlap, chunk_size: chunkSize }, - text_extraction: { - engine: textExtractionEngine, + content_extraction: { + engine: contentExtractionEngine, tika_server_url: tikaServerUrl } }); @@ -227,9 +227,9 @@ chunkSize = res.chunk.chunk_size; chunkOverlap = res.chunk.chunk_overlap; - textExtractionEngine = res.text_extraction.engine; - tikaServerUrl = res.text_extraction.tika_server_url; - showTikaServerUrl = textExtractionEngine === 'tika'; + contentExtractionEngine = res.content_extraction.engine; + tikaServerUrl = res.content_extraction.tika_server_url; + showTikaServerUrl = contentExtractionEngine === 'tika'; } }); @@ -414,11 +414,11 @@
{$i18n.t('Engine')}