From b46da1ae868c932fd60c3c9d15b743af87e39ae3 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 09:14:23 +0200 Subject: [PATCH 01/62] Added DEFAULT_RAG_SETTINGS option for individual rag configuration --- backend/open_webui/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 3b40977f2..b8dc91b85 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2013,6 +2013,11 @@ YOUTUBE_LOADER_PROXY_URL = PersistentConfig( os.getenv("YOUTUBE_LOADER_PROXY_URL", ""), ) +DEFAULT_RAG_SETTINGS = PersistentConfig( + "DEFAULT_RAG_SETTINGS", + "rag.default_settings", + os.getenv("DEFAULT_RAG_SETTINGS", "True").lower() == "true", +) #################################### # Web Search (RAG) From 45ee9c33ade1a37f9d7283cb28a98fdcf5054900 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 09:16:40 +0200 Subject: [PATCH 02/62] Function: Added get knowledge by collection name and adjusted insert new knowledge to insert rag config to DB --- backend/open_webui/models/knowledge.py | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/backend/open_webui/models/knowledge.py b/backend/open_webui/models/knowledge.py index bed3d5542..5aa7699a6 100644 --- a/backend/open_webui/models/knowledge.py +++ b/backend/open_webui/models/knowledge.py @@ -99,20 +99,24 @@ class KnowledgeForm(BaseModel): access_control: Optional[dict] = None +class RAGConfigForm(BaseModel): + rag_config: Optional[dict] = None + + class KnowledgeTable: def insert_new_knowledge( - self, user_id: str, form_data: KnowledgeForm + self, user_id: str, form_data: KnowledgeForm, rag_data: RAGConfigForm ) -> Optional[KnowledgeModel]: with get_db() as db: - knowledge = KnowledgeModel( - **{ - **form_data.model_dump(), - "id": str(uuid.uuid4()), - "user_id": user_id, - "created_at": int(time.time()), - "updated_at": int(time.time()), - } - ) + knowledge_data = { + **form_data.model_dump(), + "data": {"rag_config": rag_data.rag_config}, + "id": str(uuid.uuid4()), + "user_id": user_id, + "created_at": int(time.time()), + "updated_at": int(time.time()), + } + knowledge = KnowledgeModel(**knowledge_data) try: result = Knowledge(**knowledge.model_dump()) @@ -217,5 +221,12 @@ class KnowledgeTable: except Exception: return False + def get_knowledge_by_collection_name(self, name: str) -> Optional[KnowledgeModel]: + try: + with get_db() as db: + knowledge = db.query(Knowledge).filter_by(name=name).first() + return KnowledgeModel.model_validate(knowledge) if knowledge else None + except Exception: + return None Knowledges = KnowledgeTable() From 220ad3723eecd15ed5ce392be6b87f42b9071cb4 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 09:17:33 +0200 Subject: [PATCH 03/62] Function: Adjusted create new knowledge to send rag config --- backend/open_webui/routers/knowledge.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/open_webui/routers/knowledge.py b/backend/open_webui/routers/knowledge.py index 15547afa7..ed7fb449d 100644 --- a/backend/open_webui/routers/knowledge.py +++ b/backend/open_webui/routers/knowledge.py @@ -8,6 +8,7 @@ from open_webui.models.knowledge import ( KnowledgeForm, KnowledgeResponse, KnowledgeUserResponse, + RAGConfigForm ) from open_webui.models.files import Files, FileModel from open_webui.retrieval.vector.connector import VECTOR_DB_CLIENT @@ -140,7 +141,7 @@ async def get_knowledge_list(user=Depends(get_verified_user)): @router.post("/create", response_model=Optional[KnowledgeResponse]) async def create_new_knowledge( - request: Request, form_data: KnowledgeForm, user=Depends(get_verified_user) + request: Request, form_data: KnowledgeForm, rag_data: RAGConfigForm, user=Depends(get_verified_user) ): if user.role != "admin" and not has_permission( user.id, "workspace.knowledge", request.app.state.config.USER_PERMISSIONS @@ -150,7 +151,7 @@ async def create_new_knowledge( detail=ERROR_MESSAGES.UNAUTHORIZED, ) - knowledge = Knowledges.insert_new_knowledge(user.id, form_data) + knowledge = Knowledges.insert_new_knowledge(user.id, form_data, rag_data) if knowledge: return knowledge From e3a93b24a01d3084f162af5edd3c43f8c5927be3 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 09:55:53 +0200 Subject: [PATCH 04/62] Feature: Adjusted all necessary functions to handle individual rag configuration - fallback to default configuration if individual config is not used --- backend/open_webui/routers/retrieval.py | 1027 ++++++++++++++--------- 1 file changed, 626 insertions(+), 401 deletions(-) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 13f012483..405d0d820 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -193,30 +193,70 @@ async def get_status(request: Request): } -@router.get("/embedding") -async def get_embedding_config(request: Request, user=Depends(get_admin_user)): - return { - "status": True, - "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE, - "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL, - "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, - "openai_config": { - "url": request.app.state.config.RAG_OPENAI_API_BASE_URL, - "key": request.app.state.config.RAG_OPENAI_API_KEY, - }, - "ollama_config": { - "url": request.app.state.config.RAG_OLLAMA_BASE_URL, - "key": request.app.state.config.RAG_OLLAMA_API_KEY, - }, - } +@router.post("/embedding") +async def get_embedding_config(request: Request, collectionForm: CollectionNameForm, user=Depends(get_verified_user)): + """ + Retrieve the embedding configuration. + If DEFAULT_RAG_SETTINGS is True, return the default embedding settings. + Otherwise, return the embedding configuration stored in the database. + """ + knowledge_base = Knowledges.get_knowledge_by_collection_name(collectionForm.collection_name) + if knowledge_base and not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + # Return the embedding configuration from the database + rag_config = knowledge_base.data.get("rag_config", {}) + return { + "status": True, + "embedding_engine": rag_config.get("embedding_engine", request.app.state.config.RAG_EMBEDDING_ENGINE), + "embedding_model": rag_config.get("embedding_model", request.app.state.config.RAG_EMBEDDING_MODEL), + "embedding_batch_size": rag_config.get("embedding_batch_size", request.app.state.config.RAG_EMBEDDING_BATCH_SIZE), + "openai_config": rag_config.get("openai_config", { + "url": request.app.state.config.RAG_OPENAI_API_BASE_URL, + "key": request.app.state.config.RAG_OPENAI_API_KEY, + }), + "ollama_config": rag_config.get("ollama_config", { + "url": request.app.state.config.RAG_OLLAMA_BASE_URL, + "key": request.app.state.config.RAG_OLLAMA_API_KEY, + }), + } + else: + # Return default embedding settings + return { + "status": True, + "embedding_engine": request.app.state.config.RAG_EMBEDDING_ENGINE, + "embedding_model": request.app.state.config.RAG_EMBEDDING_MODEL, + "embedding_batch_size": request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, + "openai_config": { + "url": request.app.state.config.RAG_OPENAI_API_BASE_URL, + "key": request.app.state.config.RAG_OPENAI_API_KEY, + }, + "ollama_config": { + "url": request.app.state.config.RAG_OLLAMA_BASE_URL, + "key": request.app.state.config.RAG_OLLAMA_API_KEY, + }, + } -@router.get("/reranking") -async def get_reraanking_config(request: Request, user=Depends(get_admin_user)): - return { - "status": True, - "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, - } +@router.post("/reranking") +async def get_reranking_config(request: Request, collectionForm: CollectionNameForm, user=Depends(get_verified_user)): + """ + Retrieve the reranking configuration. + If DEFAULT_RAG_SETTINGS is True, return the default reranking settings. + Otherwise, return the reranking configuration stored in the database. + """ + knowledge_base = Knowledges.get_knowledge_by_collection_name(collectionForm.collection_name) + if knowledge_base and not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + # Return the reranking configuration from the database + rag_config = knowledge_base.data.get("rag_config", {}) + return { + "status": True, + "reranking_model": rag_config.get("reranking_model", request.app.state.config.RAG_RERANKING_MODEL), + } + else: + # Return default reranking settings + return { + "status": True, + "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, + } class OpenAIConfigForm(BaseModel): @@ -321,25 +361,47 @@ class RerankingModelUpdateForm(BaseModel): async def update_reranking_config( request: Request, form_data: RerankingModelUpdateForm, user=Depends(get_admin_user) ): - log.info( - f"Updating reranking model: {request.app.state.config.RAG_RERANKING_MODEL} to {form_data.reranking_model}" - ) + """ + Update the reranking model configuration. + If DEFAULT_RAG_SETTINGS is True, update the global configuration. + Otherwise, update the RAG configuration in the database for the user's knowledge base. + """ try: - request.app.state.config.RAG_RERANKING_MODEL = form_data.reranking_model + knowledge_base = Knowledges.get_knowledge_base_by_user_id(user.id) - try: - request.app.state.rf = get_rf( - request.app.state.config.RAG_RERANKING_MODEL, - True, + if knowledge_base and not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + # Update the RAG configuration in the database + rag_config = knowledge_base.data.get("rag_config", {}) + rag_config["reranking_model"] = form_data.reranking_model + Knowledges.update_knowledge_data_by_id( + id=knowledge_base.id, data={"rag_config": rag_config} ) - except Exception as e: - log.error(f"Error loading reranking model: {e}") - request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = False + return { + "status": True, + "reranking_model": rag_config["reranking_model"], + "message": "Reranking model updated in the database.", + } + else: + # Update the global configuration + log.info( + f"Updating reranking model: {request.app.state.config.RAG_RERANKING_MODEL} to {form_data.reranking_model}" + ) + request.app.state.config.RAG_RERANKING_MODEL = form_data.reranking_model - return { - "status": True, - "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, - } + try: + request.app.state.rf = get_rf( + request.app.state.config.RAG_RERANKING_MODEL, + True, + ) + except Exception as e: + log.error(f"Error loading reranking model: {e}") + request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = False + + return { + "status": True, + "reranking_model": request.app.state.config.RAG_RERANKING_MODEL, + "message": "Reranking model updated globally.", + } except Exception as e: log.exception(f"Problem updating reranking model: {e}") raise HTTPException( @@ -348,81 +410,165 @@ async def update_reranking_config( ) -@router.get("/config") -async def get_rag_config(request: Request, user=Depends(get_admin_user)): - return { - "status": True, - # RAG settings - "RAG_TEMPLATE": request.app.state.config.RAG_TEMPLATE, - "TOP_K": request.app.state.config.TOP_K, - "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL, - "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, - # Hybrid search settings - "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, - "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER, - "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD, - # Content extraction settings - "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, - "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, - "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, - "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, - "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, - "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, - "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, - # Chunking settings - "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER, - "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE, - "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP, - # File upload settings - "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE, - "FILE_MAX_COUNT": request.app.state.config.FILE_MAX_COUNT, - # Integration settings - "ENABLE_GOOGLE_DRIVE_INTEGRATION": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, - "ENABLE_ONEDRIVE_INTEGRATION": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION, - # Web search settings - "web": { - "ENABLE_WEB_SEARCH": request.app.state.config.ENABLE_WEB_SEARCH, - "WEB_SEARCH_ENGINE": request.app.state.config.WEB_SEARCH_ENGINE, - "WEB_SEARCH_TRUST_ENV": request.app.state.config.WEB_SEARCH_TRUST_ENV, - "WEB_SEARCH_RESULT_COUNT": request.app.state.config.WEB_SEARCH_RESULT_COUNT, - "WEB_SEARCH_CONCURRENT_REQUESTS": request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS, - "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, - "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, - "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL, - "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY, - "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID, - "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY, - "KAGI_SEARCH_API_KEY": request.app.state.config.KAGI_SEARCH_API_KEY, - "MOJEEK_SEARCH_API_KEY": request.app.state.config.MOJEEK_SEARCH_API_KEY, - "BOCHA_SEARCH_API_KEY": request.app.state.config.BOCHA_SEARCH_API_KEY, - "SERPSTACK_API_KEY": request.app.state.config.SERPSTACK_API_KEY, - "SERPSTACK_HTTPS": request.app.state.config.SERPSTACK_HTTPS, - "SERPER_API_KEY": request.app.state.config.SERPER_API_KEY, - "SERPLY_API_KEY": request.app.state.config.SERPLY_API_KEY, - "TAVILY_API_KEY": request.app.state.config.TAVILY_API_KEY, - "SEARCHAPI_API_KEY": request.app.state.config.SEARCHAPI_API_KEY, - "SEARCHAPI_ENGINE": request.app.state.config.SEARCHAPI_ENGINE, - "SERPAPI_API_KEY": request.app.state.config.SERPAPI_API_KEY, - "SERPAPI_ENGINE": request.app.state.config.SERPAPI_ENGINE, - "JINA_API_KEY": request.app.state.config.JINA_API_KEY, - "BING_SEARCH_V7_ENDPOINT": request.app.state.config.BING_SEARCH_V7_ENDPOINT, - "BING_SEARCH_V7_SUBSCRIPTION_KEY": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, - "EXA_API_KEY": request.app.state.config.EXA_API_KEY, - "PERPLEXITY_API_KEY": request.app.state.config.PERPLEXITY_API_KEY, - "SOUGOU_API_SID": request.app.state.config.SOUGOU_API_SID, - "SOUGOU_API_SK": request.app.state.config.SOUGOU_API_SK, - "WEB_LOADER_ENGINE": request.app.state.config.WEB_LOADER_ENGINE, - "ENABLE_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION, - "PLAYWRIGHT_WS_URL": request.app.state.config.PLAYWRIGHT_WS_URL, - "PLAYWRIGHT_TIMEOUT": request.app.state.config.PLAYWRIGHT_TIMEOUT, - "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, - "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, - "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, - "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, - "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, - "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, - }, - } +@router.post("/config") +async def get_rag_config(request: Request, collectionForm: CollectionNameForm, user=Depends(get_admin_user)): + """ + Retrieve the full RAG configuration. + If DEFAULT_RAG_SETTINGS is True, return the default settings. + Otherwise, return the RAG configuration stored in the database. + """ + knowledge_base = Knowledges.get_knowledge_by_collection_name(collectionForm.collection_name) + if knowledge_base and not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + # Return the RAG configuration from the database + rag_config = knowledge_base.data.get("rag_config", {}) + return { + "status": True, + # RAG settings + "RAG_TEMPLATE": rag_config.get("template", request.app.state.config.RAG_TEMPLATE), + "TOP_K": rag_config.get("top_k", request.app.state.config.TOP_K), + "BYPASS_EMBEDDING_AND_RETRIEVAL": rag_config.get("bypass_embedding_and_retrieval", request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL), + "RAG_FULL_CONTEXT": rag_config.get("rag_full_context", request.app.state.config.RAG_FULL_CONTEXT), + # Hybrid search settings + "ENABLE_RAG_HYBRID_SEARCH": rag_config.get("enable_rag_hybrid_search", request.app.state.config.ENABLE_RAG_HYBRID_SEARCH), + "TOP_K_RERANKER": rag_config.get("top_k_reranker", request.app.state.config.TOP_K_RERANKER), + "RELEVANCE_THRESHOLD": rag_config.get("relevance_threshold", request.app.state.config.RELEVANCE_THRESHOLD), + # Content extraction settings + "CONTENT_EXTRACTION_ENGINE": rag_config.get("content_extraction_engine", request.app.state.config.CONTENT_EXTRACTION_ENGINE), + "PDF_EXTRACT_IMAGES": rag_config.get("pdf_extract_images", request.app.state.config.PDF_EXTRACT_IMAGES), + "TIKA_SERVER_URL": rag_config.get("tika_server_url", request.app.state.config.TIKA_SERVER_URL), + "DOCLING_SERVER_URL": rag_config.get("docling_server_url", request.app.state.config.DOCLING_SERVER_URL), + "DOCUMENT_INTELLIGENCE_ENDPOINT": rag_config.get("document_intelligence_endpoint", request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT), + "DOCUMENT_INTELLIGENCE_KEY": rag_config.get("document_intelligence_key", request.app.state.config.DOCUMENT_INTELLIGENCE_KEY), + "MISTRAL_OCR_API_KEY": rag_config.get("mistral_ocr_api_key", request.app.state.config.MISTRAL_OCR_API_KEY), + # Chunking settings + "TEXT_SPLITTER": rag_config.get("text_splitter", request.app.state.config.TEXT_SPLITTER), + "CHUNK_SIZE": rag_config.get("chunk_size", request.app.state.config.CHUNK_SIZE), + "CHUNK_OVERLAP": rag_config.get("chunk_overlap", request.app.state.config.CHUNK_OVERLAP), + # File upload settings + "FILE_MAX_SIZE": rag_config.get("file_max_size", request.app.state.config.FILE_MAX_SIZE), + "FILE_MAX_COUNT": rag_config.get("file_max_count", request.app.state.config.FILE_MAX_COUNT), + # Integration settings + "ENABLE_GOOGLE_DRIVE_INTEGRATION": rag_config.get("enable_google_drive_integration", request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION), + "ENABLE_ONEDRIVE_INTEGRATION": rag_config.get("enable_onedrive_integration", request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION), + # Web search settings + "web": { + "ENABLE_WEB_SEARCH": rag_config.get("enable_web_search", request.app.state.config.ENABLE_WEB_SEARCH), + "WEB_SEARCH_ENGINE": rag_config.get("web_search_engine", request.app.state.config.WEB_SEARCH_ENGINE), + "WEB_SEARCH_TRUST_ENV": rag_config.get("web_search_trust_env", request.app.state.config.WEB_SEARCH_TRUST_ENV), + "WEB_SEARCH_RESULT_COUNT": rag_config.get("web_search_result_count", request.app.state.config.WEB_SEARCH_RESULT_COUNT), + "WEB_SEARCH_CONCURRENT_REQUESTS": rag_config.get("web_search_concurrent_requests", request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS), + "WEB_SEARCH_DOMAIN_FILTER_LIST": rag_config.get("web_search_domain_filter_list", request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST), + "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": rag_config.get("bypass_web_search_embedding_and_retrieval", request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL), + "SEARXNG_QUERY_URL": rag_config.get("searxng_query_url", request.app.state.config.SEARXNG_QUERY_URL), + "GOOGLE_PSE_API_KEY": rag_config.get("google_pse_api_key", request.app.state.config.GOOGLE_PSE_API_KEY), + "GOOGLE_PSE_ENGINE_ID": rag_config.get("google_pse_engine_id", request.app.state.config.GOOGLE_PSE_ENGINE_ID), + "BRAVE_SEARCH_API_KEY": rag_config.get("brave_search_api_key", request.app.state.config.BRAVE_SEARCH_API_KEY), + "KAGI_SEARCH_API_KEY": rag_config.get("kagi_search_api_key", request.app.state.config.KAGI_SEARCH_API_KEY), + "MOJEEK_SEARCH_API_KEY": rag_config.get("mojeek_search_api_key", request.app.state.config.MOJEEK_SEARCH_API_KEY), + "BOCHA_SEARCH_API_KEY": rag_config.get("bocha_search_api_key", request.app.state.config.BOCHA_SEARCH_API_KEY), + "SERPSTACK_API_KEY": rag_config.get("serpstack_api_key", request.app.state.config.SERPSTACK_API_KEY), + "SERPSTACK_HTTPS": rag_config.get("serpstack_https", request.app.state.config.SERPSTACK_HTTPS), + "SERPER_API_KEY": rag_config.get("serper_api_key", request.app.state.config.SERPER_API_KEY), + "SERPLY_API_KEY": rag_config.get("serply_api_key", request.app.state.config.SERPLY_API_KEY), + "TAVILY_API_KEY": rag_config.get("tavily_api_key", request.app.state.config.TAVILY_API_KEY), + "SEARCHAPI_API_KEY": rag_config.get("searchapi_api_key", request.app.state.config.SEARCHAPI_API_KEY), + "SEARCHAPI_ENGINE": rag_config.get("searchapi_engine", request.app.state.config.SEARCHAPI_ENGINE), + "SERPAPI_API_KEY": rag_config.get("serpapi_api_key", request.app.state.config.SERPAPI_API_KEY), + "SERPAPI_ENGINE": rag_config.get("serpapi_engine", request.app.state.config.SERPAPI_ENGINE), + "JINA_API_KEY": rag_config.get("jina_api_key", request.app.state.config.JINA_API_KEY), + "BING_SEARCH_V7_ENDPOINT": rag_config.get("bing_search_v7_endpoint", request.app.state.config.BING_SEARCH_V7_ENDPOINT), + "BING_SEARCH_V7_SUBSCRIPTION_KEY": rag_config.get("bing_search_v7_subscription_key", request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY), + "EXA_API_KEY": rag_config.get("exa_api_key", request.app.state.config.EXA_API_KEY), + "PERPLEXITY_API_KEY": rag_config.get("perplexity_api_key", request.app.state.config.PERPLEXITY_API_KEY), + "SOUGOU_API_SID": rag_config.get("sougou_api_sid", request.app.state.config.SOUGOU_API_SID), + "SOUGOU_API_SK": rag_config.get("sougou_api_sk", request.app.state.config.SOUGOU_API_SK), + "WEB_LOADER_ENGINE": rag_config.get("web_loader_engine", request.app.state.config.WEB_LOADER_ENGINE), + "ENABLE_WEB_LOADER_SSL_VERIFICATION": rag_config.get("enable_web_loader_ssl_verification", request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION), + "PLAYWRIGHT_WS_URL": rag_config.get("playwright_ws_url", request.app.state.config.PLAYWRIGHT_WS_URL), + "PLAYWRIGHT_TIMEOUT": rag_config.get("playwright_timeout", request.app.state.config.PLAYWRIGHT_TIMEOUT), + "FIRECRAWL_API_KEY": rag_config.get("firecrawl_api_key", request.app.state.config.FIRECRAWL_API_KEY), + "FIRECRAWL_API_BASE_URL": rag_config.get("firecrawl_api_base_url", request.app.state.config.FIRECRAWL_API_BASE_URL), + "TAVILY_EXTRACT_DEPTH": rag_config.get("tavily_extract_depth", request.app.state.config.TAVILY_EXTRACT_DEPTH), + "YOUTUBE_LOADER_LANGUAGE": rag_config.get("youtube_loader_language", request.app.state.config.YOUTUBE_LOADER_LANGUAGE), + "YOUTUBE_LOADER_PROXY_URL": rag_config.get("youtube_loader_proxy_url", request.app.state.config.YOUTUBE_LOADER_PROXY_URL), + "YOUTUBE_LOADER_TRANSLATION": rag_config.get("youtube_loader_translation", request.app.state.config.YOUTUBE_LOADER_TRANSLATION), + }, + } + else: + # Return default RAG settings + return { + "status": True, + # RAG settings + "RAG_TEMPLATE": request.app.state.config.RAG_TEMPLATE, + "TOP_K": request.app.state.config.TOP_K, + "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL, + "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, + # Hybrid search settings + "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, + "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER, + "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD, + # Content extraction settings + "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, + "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, + "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, + "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, + "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, + # Chunking settings + "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER, + "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE, + "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP, + # File upload settings + "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE, + "FILE_MAX_COUNT": request.app.state.config.FILE_MAX_COUNT, + # Integration settings + "ENABLE_GOOGLE_DRIVE_INTEGRATION": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, + "ENABLE_ONEDRIVE_INTEGRATION": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION, + # Web search settings + "web": { + "ENABLE_WEB_SEARCH": request.app.state.config.ENABLE_WEB_SEARCH, + "WEB_SEARCH_ENGINE": request.app.state.config.WEB_SEARCH_ENGINE, + "WEB_SEARCH_TRUST_ENV": request.app.state.config.WEB_SEARCH_TRUST_ENV, + "WEB_SEARCH_RESULT_COUNT": request.app.state.config.WEB_SEARCH_RESULT_COUNT, + "WEB_SEARCH_CONCURRENT_REQUESTS": request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS, + "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, + "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, + "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL, + "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY, + "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID, + "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY, + "KAGI_SEARCH_API_KEY": request.app.state.config.KAGI_SEARCH_API_KEY, + "MOJEEK_SEARCH_API_KEY": request.app.state.config.MOJEEK_SEARCH_API_KEY, + "BOCHA_SEARCH_API_KEY": request.app.state.config.BOCHA_SEARCH_API_KEY, + "SERPSTACK_API_KEY": request.app.state.config.SERPSTACK_API_KEY, + "SERPSTACK_HTTPS": request.app.state.config.SERPSTACK_HTTPS, + "SERPER_API_KEY": request.app.state.config.SERPER_API_KEY, + "SERPLY_API_KEY": request.app.state.config.SERPLY_API_KEY, + "TAVILY_API_KEY": request.app.state.config.TAVILY_API_KEY, + "SEARCHAPI_API_KEY": request.app.state.config.SEARCHAPI_API_KEY, + "SEARCHAPI_ENGINE": request.app.state.config.SEARCHAPI_ENGINE, + "SERPAPI_API_KEY": request.app.state.config.SERPAPI_API_KEY, + "SERPAPI_ENGINE": request.app.state.config.SERPAPI_ENGINE, + "JINA_API_KEY": request.app.state.config.JINA_API_KEY, + "BING_SEARCH_V7_ENDPOINT": request.app.state.config.BING_SEARCH_V7_ENDPOINT, + "BING_SEARCH_V7_SUBSCRIPTION_KEY": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, + "EXA_API_KEY": request.app.state.config.EXA_API_KEY, + "PERPLEXITY_API_KEY": request.app.state.config.PERPLEXITY_API_KEY, + "SOUGOU_API_SID": request.app.state.config.SOUGOU_API_SID, + "SOUGOU_API_SK": request.app.state.config.SOUGOU_API_SK, + "WEB_LOADER_ENGINE": request.app.state.config.WEB_LOADER_ENGINE, + "ENABLE_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION, + "PLAYWRIGHT_WS_URL": request.app.state.config.PLAYWRIGHT_WS_URL, + "PLAYWRIGHT_TIMEOUT": request.app.state.config.PLAYWRIGHT_TIMEOUT, + "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, + "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, + "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, + "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, + "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, + "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, + }, + } class WebConfig(BaseModel): @@ -508,281 +654,305 @@ class ConfigForm(BaseModel): @router.post("/config/update") async def update_rag_config( - request: Request, form_data: ConfigForm, user=Depends(get_admin_user) + request: Request, form_data: ConfigForm, collectionForm: CollectionNameForm, user=Depends(get_admin_user) ): - # RAG settings - request.app.state.config.RAG_TEMPLATE = ( - form_data.RAG_TEMPLATE - if form_data.RAG_TEMPLATE is not None - else request.app.state.config.RAG_TEMPLATE - ) - request.app.state.config.TOP_K = ( - form_data.TOP_K - if form_data.TOP_K is not None - else request.app.state.config.TOP_K - ) - request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = ( - form_data.BYPASS_EMBEDDING_AND_RETRIEVAL - if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None - else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL - ) - request.app.state.config.RAG_FULL_CONTEXT = ( - form_data.RAG_FULL_CONTEXT - if form_data.RAG_FULL_CONTEXT is not None - else request.app.state.config.RAG_FULL_CONTEXT - ) + """ + Update the RAG configuration. + If DEFAULT_RAG_SETTINGS is True, update the global configuration. + Otherwise, update the RAG configuration in the database for the user's knowledge base. + """ + knowledge_base = Knowledges.get_knowledge_by_collection_name(collectionForm.collection_name) + + if knowledge_base and not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + # Update the RAG configuration in the database + rag_config = knowledge_base.data.get("rag_config", {}) - # Hybrid search settings - request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = ( - form_data.ENABLE_RAG_HYBRID_SEARCH - if form_data.ENABLE_RAG_HYBRID_SEARCH is not None - else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH - ) - # Free up memory if hybrid search is disabled - if not request.app.state.config.ENABLE_RAG_HYBRID_SEARCH: - request.app.state.rf = None + # Update only the provided fields in the rag_config + for field, value in form_data.dict(exclude_unset=True).items(): + if field == "web" and value is not None: + rag_config["web"] = {**rag_config.get("web", {}), **value.dict(exclude_unset=True)} + else: + rag_config[field] = value - request.app.state.config.TOP_K_RERANKER = ( - form_data.TOP_K_RERANKER - if form_data.TOP_K_RERANKER is not None - else request.app.state.config.TOP_K_RERANKER - ) - request.app.state.config.RELEVANCE_THRESHOLD = ( - form_data.RELEVANCE_THRESHOLD - if form_data.RELEVANCE_THRESHOLD is not None - else request.app.state.config.RELEVANCE_THRESHOLD - ) - - # Content extraction settings - request.app.state.config.CONTENT_EXTRACTION_ENGINE = ( - form_data.CONTENT_EXTRACTION_ENGINE - if form_data.CONTENT_EXTRACTION_ENGINE is not None - else request.app.state.config.CONTENT_EXTRACTION_ENGINE - ) - request.app.state.config.PDF_EXTRACT_IMAGES = ( - form_data.PDF_EXTRACT_IMAGES - if form_data.PDF_EXTRACT_IMAGES is not None - else request.app.state.config.PDF_EXTRACT_IMAGES - ) - request.app.state.config.TIKA_SERVER_URL = ( - form_data.TIKA_SERVER_URL - if form_data.TIKA_SERVER_URL is not None - else request.app.state.config.TIKA_SERVER_URL - ) - request.app.state.config.DOCLING_SERVER_URL = ( - form_data.DOCLING_SERVER_URL - if form_data.DOCLING_SERVER_URL is not None - else request.app.state.config.DOCLING_SERVER_URL - ) - request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( - form_data.DOCUMENT_INTELLIGENCE_ENDPOINT - if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None - else request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT - ) - request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = ( - form_data.DOCUMENT_INTELLIGENCE_KEY - if form_data.DOCUMENT_INTELLIGENCE_KEY is not None - else request.app.state.config.DOCUMENT_INTELLIGENCE_KEY - ) - request.app.state.config.MISTRAL_OCR_API_KEY = ( - form_data.MISTRAL_OCR_API_KEY - if form_data.MISTRAL_OCR_API_KEY is not None - else request.app.state.config.MISTRAL_OCR_API_KEY - ) - - # Chunking settings - request.app.state.config.TEXT_SPLITTER = ( - form_data.TEXT_SPLITTER - if form_data.TEXT_SPLITTER is not None - else request.app.state.config.TEXT_SPLITTER - ) - request.app.state.config.CHUNK_SIZE = ( - form_data.CHUNK_SIZE - if form_data.CHUNK_SIZE is not None - else request.app.state.config.CHUNK_SIZE - ) - request.app.state.config.CHUNK_OVERLAP = ( - form_data.CHUNK_OVERLAP - if form_data.CHUNK_OVERLAP is not None - else request.app.state.config.CHUNK_OVERLAP - ) - - # File upload settings - request.app.state.config.FILE_MAX_SIZE = ( - form_data.FILE_MAX_SIZE - if form_data.FILE_MAX_SIZE is not None - else request.app.state.config.FILE_MAX_SIZE - ) - request.app.state.config.FILE_MAX_COUNT = ( - form_data.FILE_MAX_COUNT - if form_data.FILE_MAX_COUNT is not None - else request.app.state.config.FILE_MAX_COUNT - ) - - # Integration settings - request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ( - form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION - if form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION is not None - else request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION - ) - request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION = ( - form_data.ENABLE_ONEDRIVE_INTEGRATION - if form_data.ENABLE_ONEDRIVE_INTEGRATION is not None - else request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION - ) - - if form_data.web is not None: - # Web search settings - request.app.state.config.ENABLE_WEB_SEARCH = form_data.web.ENABLE_WEB_SEARCH - request.app.state.config.WEB_SEARCH_ENGINE = form_data.web.WEB_SEARCH_ENGINE - request.app.state.config.WEB_SEARCH_TRUST_ENV = ( - form_data.web.WEB_SEARCH_TRUST_ENV - ) - request.app.state.config.WEB_SEARCH_RESULT_COUNT = ( - form_data.web.WEB_SEARCH_RESULT_COUNT - ) - request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS = ( - form_data.web.WEB_SEARCH_CONCURRENT_REQUESTS - ) - request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST = ( - form_data.web.WEB_SEARCH_DOMAIN_FILTER_LIST - ) - request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = ( - form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL - ) - request.app.state.config.SEARXNG_QUERY_URL = form_data.web.SEARXNG_QUERY_URL - request.app.state.config.GOOGLE_PSE_API_KEY = form_data.web.GOOGLE_PSE_API_KEY - request.app.state.config.GOOGLE_PSE_ENGINE_ID = ( - form_data.web.GOOGLE_PSE_ENGINE_ID - ) - request.app.state.config.BRAVE_SEARCH_API_KEY = ( - form_data.web.BRAVE_SEARCH_API_KEY - ) - request.app.state.config.KAGI_SEARCH_API_KEY = form_data.web.KAGI_SEARCH_API_KEY - request.app.state.config.MOJEEK_SEARCH_API_KEY = ( - form_data.web.MOJEEK_SEARCH_API_KEY - ) - request.app.state.config.BOCHA_SEARCH_API_KEY = ( - form_data.web.BOCHA_SEARCH_API_KEY - ) - request.app.state.config.SERPSTACK_API_KEY = form_data.web.SERPSTACK_API_KEY - request.app.state.config.SERPSTACK_HTTPS = form_data.web.SERPSTACK_HTTPS - request.app.state.config.SERPER_API_KEY = form_data.web.SERPER_API_KEY - request.app.state.config.SERPLY_API_KEY = form_data.web.SERPLY_API_KEY - request.app.state.config.TAVILY_API_KEY = form_data.web.TAVILY_API_KEY - request.app.state.config.SEARCHAPI_API_KEY = form_data.web.SEARCHAPI_API_KEY - request.app.state.config.SEARCHAPI_ENGINE = form_data.web.SEARCHAPI_ENGINE - request.app.state.config.SERPAPI_API_KEY = form_data.web.SERPAPI_API_KEY - request.app.state.config.SERPAPI_ENGINE = form_data.web.SERPAPI_ENGINE - request.app.state.config.JINA_API_KEY = form_data.web.JINA_API_KEY - request.app.state.config.BING_SEARCH_V7_ENDPOINT = ( - form_data.web.BING_SEARCH_V7_ENDPOINT - ) - request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY = ( - form_data.web.BING_SEARCH_V7_SUBSCRIPTION_KEY - ) - request.app.state.config.EXA_API_KEY = form_data.web.EXA_API_KEY - request.app.state.config.PERPLEXITY_API_KEY = form_data.web.PERPLEXITY_API_KEY - request.app.state.config.SOUGOU_API_SID = form_data.web.SOUGOU_API_SID - request.app.state.config.SOUGOU_API_SK = form_data.web.SOUGOU_API_SK - - # Web loader settings - request.app.state.config.WEB_LOADER_ENGINE = form_data.web.WEB_LOADER_ENGINE - request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ( - form_data.web.ENABLE_WEB_LOADER_SSL_VERIFICATION - ) - request.app.state.config.PLAYWRIGHT_WS_URL = form_data.web.PLAYWRIGHT_WS_URL - request.app.state.config.PLAYWRIGHT_TIMEOUT = form_data.web.PLAYWRIGHT_TIMEOUT - request.app.state.config.FIRECRAWL_API_KEY = form_data.web.FIRECRAWL_API_KEY - request.app.state.config.FIRECRAWL_API_BASE_URL = ( - form_data.web.FIRECRAWL_API_BASE_URL - ) - request.app.state.config.TAVILY_EXTRACT_DEPTH = ( - form_data.web.TAVILY_EXTRACT_DEPTH - ) - request.app.state.config.YOUTUBE_LOADER_LANGUAGE = ( - form_data.web.YOUTUBE_LOADER_LANGUAGE - ) - request.app.state.config.YOUTUBE_LOADER_PROXY_URL = ( - form_data.web.YOUTUBE_LOADER_PROXY_URL - ) - request.app.state.YOUTUBE_LOADER_TRANSLATION = ( - form_data.web.YOUTUBE_LOADER_TRANSLATION + Knowledges.update_knowledge_data_by_id( + id=knowledge_base.id, data={"rag_config": rag_config} ) - return { - "status": True, + return rag_config + else: + # Update the global configuration # RAG settings - "RAG_TEMPLATE": request.app.state.config.RAG_TEMPLATE, - "TOP_K": request.app.state.config.TOP_K, - "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL, - "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, + request.app.state.config.RAG_TEMPLATE = ( + form_data.RAG_TEMPLATE + if form_data.RAG_TEMPLATE is not None + else request.app.state.config.RAG_TEMPLATE + ) + request.app.state.config.TOP_K = ( + form_data.TOP_K + if form_data.TOP_K is not None + else request.app.state.config.TOP_K + ) + request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = ( + form_data.BYPASS_EMBEDDING_AND_RETRIEVAL + if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None + else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL + ) + request.app.state.config.RAG_FULL_CONTEXT = ( + form_data.RAG_FULL_CONTEXT + if form_data.RAG_FULL_CONTEXT is not None + else request.app.state.config.RAG_FULL_CONTEXT + ) + # Hybrid search settings - "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, - "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER, - "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD, + request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = ( + form_data.ENABLE_RAG_HYBRID_SEARCH + if form_data.ENABLE_RAG_HYBRID_SEARCH is not None + else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH + ) + if not request.app.state.config.ENABLE_RAG_HYBRID_SEARCH: + request.app.state.rf = None + + request.app.state.config.TOP_K_RERANKER = ( + form_data.TOP_K_RERANKER + if form_data.TOP_K_RERANKER is not None + else request.app.state.config.TOP_K_RERANKER + ) + request.app.state.config.RELEVANCE_THRESHOLD = ( + form_data.RELEVANCE_THRESHOLD + if form_data.RELEVANCE_THRESHOLD is not None + else request.app.state.config.RELEVANCE_THRESHOLD + ) + # Content extraction settings - "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, - "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, - "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, - "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, - "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, - "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, - "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, + request.app.state.config.CONTENT_EXTRACTION_ENGINE = ( + form_data.CONTENT_EXTRACTION_ENGINE + if form_data.CONTENT_EXTRACTION_ENGINE is not None + else request.app.state.config.CONTENT_EXTRACTION_ENGINE + ) + request.app.state.config.PDF_EXTRACT_IMAGES = ( + form_data.PDF_EXTRACT_IMAGES + if form_data.PDF_EXTRACT_IMAGES is not None + else request.app.state.config.PDF_EXTRACT_IMAGES + ) + request.app.state.config.TIKA_SERVER_URL = ( + form_data.TIKA_SERVER_URL + if form_data.TIKA_SERVER_URL is not None + else request.app.state.config.TIKA_SERVER_URL + ) + request.app.state.config.DOCLING_SERVER_URL = ( + form_data.DOCLING_SERVER_URL + if form_data.DOCLING_SERVER_URL is not None + else request.app.state.config.DOCLING_SERVER_URL + ) + request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( + form_data.DOCUMENT_INTELLIGENCE_ENDPOINT + if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None + else request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT + ) + request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = ( + form_data.DOCUMENT_INTELLIGENCE_KEY + if form_data.DOCUMENT_INTELLIGENCE_KEY is not None + else request.app.state.config.DOCUMENT_INTELLIGENCE_KEY + ) + request.app.state.config.MISTRAL_OCR_API_KEY = ( + form_data.MISTRAL_OCR_API_KEY + if form_data.MISTRAL_OCR_API_KEY is not None + else request.app.state.config.MISTRAL_OCR_API_KEY + ) + # Chunking settings - "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER, - "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE, - "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP, + request.app.state.config.TEXT_SPLITTER = ( + form_data.TEXT_SPLITTER + if form_data.TEXT_SPLITTER is not None + else request.app.state.config.TEXT_SPLITTER + ) + request.app.state.config.CHUNK_SIZE = ( + form_data.CHUNK_SIZE + if form_data.CHUNK_SIZE is not None + else request.app.state.config.CHUNK_SIZE + ) + request.app.state.config.CHUNK_OVERLAP = ( + form_data.CHUNK_OVERLAP + if form_data.CHUNK_OVERLAP is not None + else request.app.state.config.CHUNK_OVERLAP + ) + # File upload settings - "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE, - "FILE_MAX_COUNT": request.app.state.config.FILE_MAX_COUNT, + request.app.state.config.FILE_MAX_SIZE = ( + form_data.FILE_MAX_SIZE + if form_data.FILE_MAX_SIZE is not None + else request.app.state.config.FILE_MAX_SIZE + ) + request.app.state.config.FILE_MAX_COUNT = ( + form_data.FILE_MAX_COUNT + if form_data.FILE_MAX_COUNT is not None + else request.app.state.config.FILE_MAX_COUNT + ) + # Integration settings - "ENABLE_GOOGLE_DRIVE_INTEGRATION": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, - "ENABLE_ONEDRIVE_INTEGRATION": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION, - # Web search settings - "web": { - "ENABLE_WEB_SEARCH": request.app.state.config.ENABLE_WEB_SEARCH, - "WEB_SEARCH_ENGINE": request.app.state.config.WEB_SEARCH_ENGINE, - "WEB_SEARCH_TRUST_ENV": request.app.state.config.WEB_SEARCH_TRUST_ENV, - "WEB_SEARCH_RESULT_COUNT": request.app.state.config.WEB_SEARCH_RESULT_COUNT, - "WEB_SEARCH_CONCURRENT_REQUESTS": request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS, - "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, - "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, - "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL, - "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY, - "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID, - "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY, - "KAGI_SEARCH_API_KEY": request.app.state.config.KAGI_SEARCH_API_KEY, - "MOJEEK_SEARCH_API_KEY": request.app.state.config.MOJEEK_SEARCH_API_KEY, - "BOCHA_SEARCH_API_KEY": request.app.state.config.BOCHA_SEARCH_API_KEY, - "SERPSTACK_API_KEY": request.app.state.config.SERPSTACK_API_KEY, - "SERPSTACK_HTTPS": request.app.state.config.SERPSTACK_HTTPS, - "SERPER_API_KEY": request.app.state.config.SERPER_API_KEY, - "SERPLY_API_KEY": request.app.state.config.SERPLY_API_KEY, - "TAVILY_API_KEY": request.app.state.config.TAVILY_API_KEY, - "SEARCHAPI_API_KEY": request.app.state.config.SEARCHAPI_API_KEY, - "SEARCHAPI_ENGINE": request.app.state.config.SEARCHAPI_ENGINE, - "SERPAPI_API_KEY": request.app.state.config.SERPAPI_API_KEY, - "SERPAPI_ENGINE": request.app.state.config.SERPAPI_ENGINE, - "JINA_API_KEY": request.app.state.config.JINA_API_KEY, - "BING_SEARCH_V7_ENDPOINT": request.app.state.config.BING_SEARCH_V7_ENDPOINT, - "BING_SEARCH_V7_SUBSCRIPTION_KEY": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, - "EXA_API_KEY": request.app.state.config.EXA_API_KEY, - "PERPLEXITY_API_KEY": request.app.state.config.PERPLEXITY_API_KEY, - "SOUGOU_API_SID": request.app.state.config.SOUGOU_API_SID, - "SOUGOU_API_SK": request.app.state.config.SOUGOU_API_SK, - "WEB_LOADER_ENGINE": request.app.state.config.WEB_LOADER_ENGINE, - "ENABLE_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION, - "PLAYWRIGHT_WS_URL": request.app.state.config.PLAYWRIGHT_WS_URL, - "PLAYWRIGHT_TIMEOUT": request.app.state.config.PLAYWRIGHT_TIMEOUT, - "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, - "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, - "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, - "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, - "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, - "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, - }, - } + request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ( + form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION + if form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION is not None + else request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION + ) + request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION = ( + form_data.ENABLE_ONEDRIVE_INTEGRATION + if form_data.ENABLE_ONEDRIVE_INTEGRATION is not None + else request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION + ) + + if form_data.web is not None: + # Web search settings + request.app.state.config.ENABLE_WEB_SEARCH = form_data.web.ENABLE_WEB_SEARCH + request.app.state.config.WEB_SEARCH_ENGINE = form_data.web.WEB_SEARCH_ENGINE + request.app.state.config.WEB_SEARCH_TRUST_ENV = ( + form_data.web.WEB_SEARCH_TRUST_ENV + ) + request.app.state.config.WEB_SEARCH_RESULT_COUNT = ( + form_data.web.WEB_SEARCH_RESULT_COUNT + ) + request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS = ( + form_data.web.WEB_SEARCH_CONCURRENT_REQUESTS + ) + request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST = ( + form_data.web.WEB_SEARCH_DOMAIN_FILTER_LIST + ) + request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = ( + form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL + ) + request.app.state.config.SEARXNG_QUERY_URL = form_data.web.SEARXNG_QUERY_URL + request.app.state.config.GOOGLE_PSE_API_KEY = form_data.web.GOOGLE_PSE_API_KEY + request.app.state.config.GOOGLE_PSE_ENGINE_ID = ( + form_data.web.GOOGLE_PSE_ENGINE_ID + ) + request.app.state.config.BRAVE_SEARCH_API_KEY = ( + form_data.web.BRAVE_SEARCH_API_KEY + ) + request.app.state.config.KAGI_SEARCH_API_KEY = form_data.web.KAGI_SEARCH_API_KEY + request.app.state.config.MOJEEK_SEARCH_API_KEY = ( + form_data.web.MOJEEK_SEARCH_API_KEY + ) + request.app.state.config.BOCHA_SEARCH_API_KEY = ( + form_data.web.BOCHA_SEARCH_API_KEY + ) + request.app.state.config.SERPSTACK_API_KEY = form_data.web.SERPSTACK_API_KEY + request.app.state.config.SERPSTACK_HTTPS = form_data.web.SERPSTACK_HTTPS + request.app.state.config.SERPER_API_KEY = form_data.web.SERPER_API_KEY + request.app.state.config.SERPLY_API_KEY = form_data.web.SERPLY_API_KEY + request.app.state.config.TAVILY_API_KEY = form_data.web.TAVILY_API_KEY + request.app.state.config.SEARCHAPI_API_KEY = form_data.web.SEARCHAPI_API_KEY + request.app.state.config.SEARCHAPI_ENGINE = form_data.web.SEARCHAPI_ENGINE + request.app.state.config.SERPAPI_API_KEY = form_data.web.SERPAPI_API_KEY + request.app.state.config.SERPAPI_ENGINE = form_data.web.SERPAPI_ENGINE + request.app.state.config.JINA_API_KEY = form_data.web.JINA_API_KEY + request.app.state.config.BING_SEARCH_V7_ENDPOINT = ( + form_data.web.BING_SEARCH_V7_ENDPOINT + ) + request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY = ( + form_data.web.BING_SEARCH_V7_SUBSCRIPTION_KEY + ) + request.app.state.config.EXA_API_KEY = form_data.web.EXA_API_KEY + request.app.state.config.PERPLEXITY_API_KEY = form_data.web.PERPLEXITY_API_KEY + request.app.state.config.SOUGOU_API_SID = form_data.web.SOUGOU_API_SID + request.app.state.config.SOUGOU_API_SK = form_data.web.SOUGOU_API_SK + + # Web loader settings + request.app.state.config.WEB_LOADER_ENGINE = form_data.web.WEB_LOADER_ENGINE + request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ( + form_data.web.ENABLE_WEB_LOADER_SSL_VERIFICATION + ) + request.app.state.config.PLAYWRIGHT_WS_URL = form_data.web.PLAYWRIGHT_WS_URL + request.app.state.config.PLAYWRIGHT_TIMEOUT = form_data.web.PLAYWRIGHT_TIMEOUT + request.app.state.config.FIRECRAWL_API_KEY = form_data.web.FIRECRAWL_API_KEY + request.app.state.config.FIRECRAWL_API_BASE_URL = ( + form_data.web.FIRECRAWL_API_BASE_URL + ) + request.app.state.config.TAVILY_EXTRACT_DEPTH = ( + form_data.web.TAVILY_EXTRACT_DEPTH + ) + request.app.state.config.YOUTUBE_LOADER_LANGUAGE = ( + form_data.web.YOUTUBE_LOADER_LANGUAGE + ) + request.app.state.config.YOUTUBE_LOADER_PROXY_URL = ( + form_data.web.YOUTUBE_LOADER_PROXY_URL + ) + request.app.state.YOUTUBE_LOADER_TRANSLATION = ( + form_data.web.YOUTUBE_LOADER_TRANSLATION + ) + + return { + "status": True, + # RAG settings + "RAG_TEMPLATE": request.app.state.config.RAG_TEMPLATE, + "TOP_K": request.app.state.config.TOP_K, + "BYPASS_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL, + "RAG_FULL_CONTEXT": request.app.state.config.RAG_FULL_CONTEXT, + # Hybrid search settings + "ENABLE_RAG_HYBRID_SEARCH": request.app.state.config.ENABLE_RAG_HYBRID_SEARCH, + "TOP_K_RERANKER": request.app.state.config.TOP_K_RERANKER, + "RELEVANCE_THRESHOLD": request.app.state.config.RELEVANCE_THRESHOLD, + # Content extraction settings + "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, + "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, + "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, + "DOCUMENT_INTELLIGENCE_ENDPOINT": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + "DOCUMENT_INTELLIGENCE_KEY": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, + "MISTRAL_OCR_API_KEY": request.app.state.config.MISTRAL_OCR_API_KEY, + # Chunking settings + "TEXT_SPLITTER": request.app.state.config.TEXT_SPLITTER, + "CHUNK_SIZE": request.app.state.config.CHUNK_SIZE, + "CHUNK_OVERLAP": request.app.state.config.CHUNK_OVERLAP, + # File upload settings + "FILE_MAX_SIZE": request.app.state.config.FILE_MAX_SIZE, + "FILE_MAX_COUNT": request.app.state.config.FILE_MAX_COUNT, + # Integration settings + "ENABLE_GOOGLE_DRIVE_INTEGRATION": request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION, + "ENABLE_ONEDRIVE_INTEGRATION": request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION, + # Web search settings + "web": { + "ENABLE_WEB_SEARCH": request.app.state.config.ENABLE_WEB_SEARCH, + "WEB_SEARCH_ENGINE": request.app.state.config.WEB_SEARCH_ENGINE, + "WEB_SEARCH_TRUST_ENV": request.app.state.config.WEB_SEARCH_TRUST_ENV, + "WEB_SEARCH_RESULT_COUNT": request.app.state.config.WEB_SEARCH_RESULT_COUNT, + "WEB_SEARCH_CONCURRENT_REQUESTS": request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS, + "WEB_SEARCH_DOMAIN_FILTER_LIST": request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, + "BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL": request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL, + "SEARXNG_QUERY_URL": request.app.state.config.SEARXNG_QUERY_URL, + "GOOGLE_PSE_API_KEY": request.app.state.config.GOOGLE_PSE_API_KEY, + "GOOGLE_PSE_ENGINE_ID": request.app.state.config.GOOGLE_PSE_ENGINE_ID, + "BRAVE_SEARCH_API_KEY": request.app.state.config.BRAVE_SEARCH_API_KEY, + "KAGI_SEARCH_API_KEY": request.app.state.config.KAGI_SEARCH_API_KEY, + "MOJEEK_SEARCH_API_KEY": request.app.state.config.MOJEEK_SEARCH_API_KEY, + "BOCHA_SEARCH_API_KEY": request.app.state.config.BOCHA_SEARCH_API_KEY, + "SERPSTACK_API_KEY": request.app.state.config.SERPSTACK_API_KEY, + "SERPSTACK_HTTPS": request.app.state.config.SERPSTACK_HTTPS, + "SERPER_API_KEY": request.app.state.config.SERPER_API_KEY, + "SERPLY_API_KEY": request.app.state.config.SERPLY_API_KEY, + "TAVILY_API_KEY": request.app.state.config.TAVILY_API_KEY, + "SEARCHAPI_API_KEY": request.app.state.config.SEARCHAPI_API_KEY, + "SEARCHAPI_ENGINE": request.app.state.config.SEARCHAPI_ENGINE, + "SERPAPI_API_KEY": request.app.state.config.SERPAPI_API_KEY, + "SERPAPI_ENGINE": request.app.state.config.SERPAPI_ENGINE, + "JINA_API_KEY": request.app.state.config.JINA_API_KEY, + "BING_SEARCH_V7_ENDPOINT": request.app.state.config.BING_SEARCH_V7_ENDPOINT, + "BING_SEARCH_V7_SUBSCRIPTION_KEY": request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY, + "EXA_API_KEY": request.app.state.config.EXA_API_KEY, + "PERPLEXITY_API_KEY": request.app.state.config.PERPLEXITY_API_KEY, + "SOUGOU_API_SID": request.app.state.config.SOUGOU_API_SID, + "SOUGOU_API_SK": request.app.state.config.SOUGOU_API_SK, + "WEB_LOADER_ENGINE": request.app.state.config.WEB_LOADER_ENGINE, + "ENABLE_WEB_LOADER_SSL_VERIFICATION": request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION, + "PLAYWRIGHT_WS_URL": request.app.state.config.PLAYWRIGHT_WS_URL, + "PLAYWRIGHT_TIMEOUT": request.app.state.config.PLAYWRIGHT_TIMEOUT, + "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, + "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, + "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, + "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, + "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, + "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, + }, + } #################################### @@ -822,6 +992,28 @@ def save_docs_to_vector_db( f"save_docs_to_vector_db: document {_get_docs_info(docs)} {collection_name}" ) + # Retrieve the knowledge base using the collection_name + knowledge_base = Knowledges.get_knowledge_base_by_collection_name(collection_name) + if not knowledge_base: + raise ValueError(f"Knowledge base not found for collection: {collection_name}") + + # Retrieve the RAG configuration + rag_config = {} + if not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + rag_config = knowledge_base.data.get("rag_config", {}) + + # Use knowledge-base-specific or default configurations + text_splitter_type = rag_config.get("text_splitter", request.app.state.config.TEXT_SPLITTER) + chunk_size = rag_config.get("chunk_size", request.app.state.config.CHUNK_SIZE) + chunk_overlap = rag_config.get("chunk_overlap", request.app.state.config.CHUNK_OVERLAP) + embedding_engine = rag_config.get("embedding_engine", request.app.state.config.RAG_EMBEDDING_ENGINE) + embedding_model = rag_config.get("embedding_model", request.app.state.config.RAG_EMBEDDING_MODEL) + embedding_batch_size = rag_config.get("embedding_batch_size", request.app.state.config.RAG_EMBEDDING_BATCH_SIZE) + openai_api_base_url = rag_config.get("openai_api_base_url", request.app.state.config.RAG_OPENAI_API_BASE_URL) + openai_api_key = rag_config.get("openai_api_key", request.app.state.config.RAG_OPENAI_API_KEY) + ollama_base_url = rag_config.get("ollama", {}).get("url", request.app.state.config.RAG_OLLAMA_BASE_URL) + ollama_api_key = rag_config.get("ollama", {}).get("key", request.app.state.config.RAG_OLLAMA_API_KEY) + # Check if entries with the same hash (metadata.hash) already exist if metadata and "hash" in metadata: result = VECTOR_DB_CLIENT.query( @@ -836,13 +1028,13 @@ def save_docs_to_vector_db( raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT) if split: - if request.app.state.config.TEXT_SPLITTER in ["", "character"]: + if text_splitter_type in ["", "character"]: text_splitter = RecursiveCharacterTextSplitter( - chunk_size=request.app.state.config.CHUNK_SIZE, - chunk_overlap=request.app.state.config.CHUNK_OVERLAP, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, add_start_index=True, ) - elif request.app.state.config.TEXT_SPLITTER == "token": + elif text_splitter_type == "token": log.info( f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}" ) @@ -850,8 +1042,8 @@ def save_docs_to_vector_db( tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME)) text_splitter = TokenTextSplitter( encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME), - chunk_size=request.app.state.config.CHUNK_SIZE, - chunk_overlap=request.app.state.config.CHUNK_OVERLAP, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, add_start_index=True, ) else: @@ -869,8 +1061,8 @@ def save_docs_to_vector_db( **(metadata if metadata else {}), "embedding_config": json.dumps( { - "engine": request.app.state.config.RAG_EMBEDDING_ENGINE, - "model": request.app.state.config.RAG_EMBEDDING_MODEL, + "engine": embedding_engine, + "model": embedding_model, } ), } @@ -903,20 +1095,20 @@ def save_docs_to_vector_db( log.info(f"adding to collection {collection_name}") embedding_function = get_embedding_function( - request.app.state.config.RAG_EMBEDDING_ENGINE, - request.app.state.config.RAG_EMBEDDING_MODEL, + embedding_engine, + embedding_model, request.app.state.ef, ( - request.app.state.config.RAG_OPENAI_API_BASE_URL - if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai" - else request.app.state.config.RAG_OLLAMA_BASE_URL + openai_api_base_url + if embedding_engine == "openai" + else ollama_base_url ), ( - request.app.state.config.RAG_OPENAI_API_KEY - if request.app.state.config.RAG_EMBEDDING_ENGINE == "openai" - else request.app.state.config.RAG_OLLAMA_API_KEY + openai_api_key + if embedding_engine == "openai" + else ollama_api_key ), - request.app.state.config.RAG_EMBEDDING_BATCH_SIZE, + embedding_batch_size, ) embeddings = embedding_function( @@ -966,6 +1158,39 @@ def process_file( if collection_name is None: collection_name = f"file-{file.id}" + # Retrieve the knowledge base using the collection name + knowledge_base = Knowledges.get_knowledge_base_by_collection_name(collection_name) + if not knowledge_base: + raise ValueError(f"Knowledge base not found for collection: {collection_name}") + + # Retrieve the RAG configuration + rag_config = {} + if not knowledge_base.data.get("DEFAULT_RAG_SETTINGS", True): + rag_config = knowledge_base.data.get("rag_config", {}) + + # Use knowledge-base-specific or default configurations + content_extraction_engine = rag_config.get( + "content_extraction_engine", request.app.state.config.CONTENT_EXTRACTION_ENGINE + ) + tika_server_url = rag_config.get( + "tika_server_url", request.app.state.config.TIKA_SERVER_URL + ) + docling_server_url = rag_config.get( + "docling_server_url", request.app.state.config.DOCLING_SERVER_URL + ) + pdf_extract_images = rag_config.get( + "pdf_extract_images", request.app.state.config.PDF_EXTRACT_IMAGES + ) + document_intelligence_endpoint = rag_config.get( + "document_intelligence_endpoint", request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT + ) + document_intelligence_key = rag_config.get( + "document_intelligence_key", request.app.state.config.DOCUMENT_INTELLIGENCE_KEY + ) + mistral_ocr_api_key = rag_config.get( + "mistral_ocr_api_key", request.app.state.config.MISTRAL_OCR_API_KEY + ) + if form_data.content: # Update the content in the file # Usage: /files/{file_id}/data/content/update, /files/ (audio file upload pipeline) @@ -1029,13 +1254,13 @@ def process_file( if file_path: file_path = Storage.get_file(file_path) loader = Loader( - engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, - TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, - DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, - PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, - DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, - DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, - MISTRAL_OCR_API_KEY=request.app.state.config.MISTRAL_OCR_API_KEY, + engine=content_extraction_engine, + TIKA_SERVER_URL=tika_server_url, + DOCLING_SERVER_URL=docling_server_url, + PDF_EXTRACT_IMAGES=pdf_extract_images, + DOCUMENT_INTELLIGENCE_ENDPOINT=document_intelligence_endpoint, + DOCUMENT_INTELLIGENCE_KEY=document_intelligence_key, + MISTRAL_OCR_API_KEY=mistral_ocr_api_key, ) docs = loader.load( file.filename, file.meta.get("content_type"), file_path @@ -1078,7 +1303,7 @@ def process_file( hash = calculate_sha256_string(text_content) Files.update_file_hash_by_id(file.id, hash) - if not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL: + if not rag_config.get("bypass_embedding_and_retrieval", request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL): try: result = save_docs_to_vector_db( request, From 4f93577d3dd21559ba828a9fb334ebfd501c5640 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 09:59:42 +0200 Subject: [PATCH 05/62] Feature: Adjusted process_chat_payload to handle individual rag config (send individual rag template) --- backend/open_webui/utils/middleware.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/backend/open_webui/utils/middleware.py b/backend/open_webui/utils/middleware.py index 4070bc697..1d514d79b 100644 --- a/backend/open_webui/utils/middleware.py +++ b/backend/open_webui/utils/middleware.py @@ -912,20 +912,24 @@ async def process_chat_payload(request, form_data, user, metadata, model): f"With a 0 relevancy threshold for RAG, the context cannot be empty" ) + # Adjusted RAG template step to use knowledge-base-specific configuration + rag_template_config = request.app.state.config.RAG_TEMPLATE + + if model_knowledge and not model_knowledge.data.get("DEFAULT_RAG_SETTINGS", True): + rag_template_config = model_knowledge.data.get("rag_config", {}).get( + "template", request.app.state.config.RAG_TEMPLATE + ) + # Workaround for Ollama 2.0+ system prompt issue # TODO: replace with add_or_update_system_message if model.get("owned_by") == "ollama": form_data["messages"] = prepend_to_first_user_message_content( - rag_template( - request.app.state.config.RAG_TEMPLATE, context_string, prompt - ), + rag_template(rag_template_config, context_string, prompt), form_data["messages"], ) else: form_data["messages"] = add_or_update_system_message( - rag_template( - request.app.state.config.RAG_TEMPLATE, context_string, prompt - ), + rag_template(rag_template_config, context_string, prompt), form_data["messages"], ) From 6e2155dd8c9fd8a1bd2f255aabc4c2bcb731f1a6 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 20:12:54 +0200 Subject: [PATCH 06/62] Feature: adjusted createNewKnowledge to pass individual rag config --- src/lib/apis/knowledge/index.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/lib/apis/knowledge/index.ts b/src/lib/apis/knowledge/index.ts index 92fda2a95..69c74b699 100644 --- a/src/lib/apis/knowledge/index.ts +++ b/src/lib/apis/knowledge/index.ts @@ -4,7 +4,8 @@ export const createNewKnowledge = async ( token: string, name: string, description: string, - accessControl: null | object + accessControl: null | object, + rag_config: null | object ) => { let error = null; @@ -16,9 +17,10 @@ export const createNewKnowledge = async ( authorization: `Bearer ${token}` }, body: JSON.stringify({ - name: name, + form_data: {name: name, description: description, - access_control: accessControl + access_control: accessControl}, + rag_data: rag_config }) }) .then(async (res) => { From 4f1c9a8c4f6de0e1e40a03eab6383d75e4c6c30d Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 20:14:11 +0200 Subject: [PATCH 07/62] Feature: adjusted retrieval restpoints to handle indivudal rag config - passing rag config and changing from get to post --- src/lib/apis/retrieval/index.ts | 36 +++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/lib/apis/retrieval/index.ts b/src/lib/apis/retrieval/index.ts index f4b937b68..b0c532536 100644 --- a/src/lib/apis/retrieval/index.ts +++ b/src/lib/apis/retrieval/index.ts @@ -1,14 +1,17 @@ import { RETRIEVAL_API_BASE_URL } from '$lib/constants'; -export const getRAGConfig = async (token: string) => { +export const getRAGConfig = async (token: string, collectionForm?: CollectionNameForm) => { let error = null; const res = await fetch(`${RETRIEVAL_API_BASE_URL}/config`, { - method: 'GET', + method: 'POST', headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` - } + }, + body: JSON.stringify( + collectionForm ? {collectionForm: collectionForm} : {} + ) }) .then(async (res) => { if (!res.ok) throw await res.json(); @@ -59,7 +62,11 @@ type RAGConfigForm = { youtube?: YoutubeConfigForm; }; -export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => { +type CollectionNameForm = { + collection_name: string; +}; + +export const updateRAGConfig = async (token: string, payload: RAGConfigForm, collectionForm?: CollectionNameForm) => { let error = null; const res = await fetch(`${RETRIEVAL_API_BASE_URL}/config/update`, { @@ -69,7 +76,8 @@ export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => Authorization: `Bearer ${token}` }, body: JSON.stringify({ - ...payload + ...payload, + ...(collectionForm ? { collectionForm: collectionForm } : {}) }) }) .then(async (res) => { @@ -152,15 +160,18 @@ export const updateQuerySettings = async (token: string, settings: QuerySettings return res; }; -export const getEmbeddingConfig = async (token: string) => { +export const getEmbeddingConfig = async (token: string, collectionForm?: CollectionNameForm) => { let error = null; const res = await fetch(`${RETRIEVAL_API_BASE_URL}/embedding`, { - method: 'GET', + method: 'POST', headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` - } + }, + body: JSON.stringify( + collectionForm ? {collectionForm: collectionForm} : {} + ) }) .then(async (res) => { if (!res.ok) throw await res.json(); @@ -221,15 +232,18 @@ export const updateEmbeddingConfig = async (token: string, payload: EmbeddingMod return res; }; -export const getRerankingConfig = async (token: string) => { +export const getRerankingConfig = async (token: string, collectionForm?: CollectionNameForm) => { let error = null; const res = await fetch(`${RETRIEVAL_API_BASE_URL}/reranking`, { - method: 'GET', + method: 'POST', headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` - } + }, + body: JSON.stringify( + collectionForm ? {collectionForm: collectionForm} : {} + ) }) .then(async (res) => { if (!res.ok) throw await res.json(); From 79ff99550c16b18ea76dd6402edbb465402646f9 Mon Sep 17 00:00:00 2001 From: Maytown Date: Fri, 2 May 2025 20:15:06 +0200 Subject: [PATCH 08/62] Feature: Added Rag Configuration to knowledge creation - if disabled falls back to default/general rag settings from documentsettings --- .../Knowledge/CreateKnowledgeBase.svelte | 1001 ++++++++++++++--- 1 file changed, 871 insertions(+), 130 deletions(-) diff --git a/src/lib/components/workspace/Knowledge/CreateKnowledgeBase.svelte b/src/lib/components/workspace/Knowledge/CreateKnowledgeBase.svelte index e7c1248f5..4a7bab5fe 100644 --- a/src/lib/components/workspace/Knowledge/CreateKnowledgeBase.svelte +++ b/src/lib/components/workspace/Knowledge/CreateKnowledgeBase.svelte @@ -1,114 +1,352 @@
- + -
{ + { submitHandler(); }} - > -
-
- {$i18n.t('Create a knowledge base')} -
+ > +
+
+ {$i18n.t('Create a knowledge base')} +
-
-
-
{$i18n.t('What are you working on?')}
+
+
+
{$i18n.t('What are you working on?')}
-
- -
-
+
+ +
+
-
-
{$i18n.t('What are you trying to achieve?')}
+
+
{$i18n.t('What are you trying to achieve?')}
-
- +
+
+
+
@@ -120,46 +358,549 @@
-
-
- +
+
+ {$i18n.t('Enable Individual RAG Configuration')} + { + RAGConfig.DEFAULT_RAG_SETTINGS = !enableIndividualRagConfig; + }} + />
-
- +
+ + {#if enableIndividualRagConfig} +
+ {#if RAGConfig.CONTENT_EXTRACTION_ENGINE === ''} +
+
+
+ {$i18n.t('PDF Extract Images (OCR)')} +
+
+ +
+
+
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'tika'} +
+
+ +
+
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling'} +
+ +
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'} +
+ + +
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mistral_ocr'} +
+ +
+ {/if} +
+ +
+
+ + {$i18n.t('Bypass Embedding and Retrieval')} + +
+
+ + + +
+
+ + {#if !RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL} +
+
{$i18n.t('Text Splitter')}
+
+ +
+
+ +
+
+
+
+ {$i18n.t('Chunk Size')} +
+
+ +
+
+ +
+
+ {$i18n.t('Chunk Overlap')} +
+ +
+ +
+
+
+
+ {/if} + + {#if !RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL} +
+
{$i18n.t('Embedding')}
+ +
+ +
+
+
+ {$i18n.t('Embedding Model Engine')} +
+
+ +
+
+ + {#if embeddingEngine === 'openai'} +
+ + + +
+ {:else if embeddingEngine === 'ollama'} +
+ + + +
+ {/if} +
+ +
+
{$i18n.t('Embedding Model')}
+ +
+ {#if embeddingEngine === 'ollama'} +
+
+ +
+
+ {:else} +
+
+ +
+ + {#if embeddingEngine === ''} + + {/if} +
+ {/if} +
+ +
+ {$i18n.t( + 'Warning: If you update or change your embedding model, you will need to re-import all documents.' + )} +
+
+ + {#if embeddingEngine === 'ollama' || embeddingEngine === 'openai'} +
+
+ {$i18n.t('Embedding Batch Size')} +
+ +
+ +
+
+ {/if} +
+ +
+
{$i18n.t('Retrieval')}
+ +
+ +
+
{$i18n.t('Full Context Mode')}
+
+ + + +
+
+ + {#if !RAGConfig.RAG_FULL_CONTEXT} +
+
{$i18n.t('Hybrid Search')}
+
+ { + submitHandler(); + }} + /> +
+
+ + {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
{$i18n.t('Reranking Model')}
+ +
+
+
+ +
+ +
+
+
+ {/if} + +
+
{$i18n.t('Top K')}
+
+ +
+
+ + {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
{$i18n.t('Top K Reranker')}
+
+ +
+
+ {/if} + + {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
+
+ {$i18n.t('Relevance Threshold')} +
+
+ +
+
+
+ {$i18n.t( + 'Note: If you set a minimum score, the search will only return documents with a score greater than or equal to the minimum score.' + )} +
+
+ {/if} + {/if} + +
+
{$i18n.t('RAG Template')}
+
+ + +
+
+
+
@@ -120,46 +310,635 @@
-
-
- +
+
+ {$i18n.t('Enable Individual RAG Configuration')} + { + RAGConfig.DEFAULT_RAG_SETTINGS = !enableIndividualRagConfig; + }} + />
-
- +
+ + {#if enableIndividualRagConfig} +
+
+ {$i18n.t('Content Extraction Engine')} +
+
+ +
+
+
+ {#if RAGConfig.CONTENT_EXTRACTION_ENGINE === ''} +
+
+
+ {$i18n.t('PDF Extract Images (OCR)')} +
+
+ +
+
+
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'external'} +
+ + +
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'tika'} +
+
+ +
+
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'docling'} +
+ +
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence'} +
+ + +
+ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'mistral_ocr'} +
+ +
+ {/if} +
+ +
+
+ + {$i18n.t('Bypass Embedding and Retrieval')} + +
+
+ + + +
+
+ + {#if !RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL} +
+
{$i18n.t('Text Splitter')}
+
+ +
+
+ +
+
+
+
+ {$i18n.t('Chunk Size')} +
+
+ +
+
+ +
+
+ {$i18n.t('Chunk Overlap')} +
+ +
+ +
+
+
+
+ {/if} + + {#if !RAGConfig.BYPASS_EMBEDDING_AND_RETRIEVAL} +
+
{$i18n.t('Embedding')}
+ +
+ +
+
+
+ {$i18n.t('Embedding Model Engine')} +
+
+ +
+
+ + {#if embeddingEngine === 'openai'} +
+ + + +
+ {:else if embeddingEngine === 'ollama'} +
+ + + +
+ {/if} +
+ +
+
{$i18n.t('Embedding Model')}
+ {#if $user?.role === 'admin'} +
+ {#if embeddingEngine === 'ollama'} +
+
+ +
+
+ {:else} +
+
+ +
+ + {#if embeddingEngine === ''} + + {/if} +
+ {/if} +
+ {/if} +
+
+ +
+
+
+ {$i18n.t( + 'Warning: If you update or change your embedding model, you will need to re-import all documents.' + )} +
+
+ + {#if embeddingEngine === 'ollama' || embeddingEngine === 'openai'} +
+
+ {$i18n.t('Embedding Batch Size')} +
+ +
+ +
+
+ {/if} +
+ +
+
{$i18n.t('Retrieval')}
+ +
+ +
+
{$i18n.t('Full Context Mode')}
+
+ + + +
+
+ + {#if !RAGConfig.RAG_FULL_CONTEXT} +
+
{$i18n.t('Hybrid Search')}
+
+ { + if (!RAGConfig.ENABLE_RAG_HYBRID_SEARCH) { + RAGConfig.RAG_RERANKING_MODEL = ""; + } + }} + /> +
+
+ + {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
+
+ {$i18n.t('Reranking Engine')} +
+
+ +
+
+ + {#if RAGConfig.RAG_RERANKING_ENGINE === 'external'} +
+ + + +
+ {/if} +
+
+
{$i18n.t('Reranking Model')}
+ {#if $user?.role === 'admin'} +
+
+
+ { + }} + /> +
+
+
+ {/if} +
+
+ +
+
+
+ {/if} + +
+
{$i18n.t('Top K')}
+
+ +
+
+ + {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
{$i18n.t('Top K Reranker')}
+
+ +
+
+ {/if} + + {#if RAGConfig.ENABLE_RAG_HYBRID_SEARCH === true} +
+
+
+ {$i18n.t('Relevance Threshold')} +
+
+ +
+
+
+ {$i18n.t( + 'Note: If you set a minimum score, the search will only return documents with a score greater than or equal to the minimum score.' + )} +
+
+ {/if} + {/if} + +
+
{$i18n.t('RAG Template')}
+
+ +