refac: web/rag config

This commit is contained in:
Timothy Jaeryang Baek
2025-04-12 16:33:36 -07:00
parent c3497da5dd
commit 48a23ce3fe
11 changed files with 1367 additions and 1530 deletions

View File

@@ -201,7 +201,10 @@ def save_config(config):
T = TypeVar("T")
ENABLE_PERSISTENT_CONFIG = os.environ.get("ENABLE_PERSISTENT_CONFIG", "True").lower() == "true"
ENABLE_PERSISTENT_CONFIG = (
os.environ.get("ENABLE_PERSISTENT_CONFIG", "True").lower() == "true"
)
class PersistentConfig(Generic[T]):
def __init__(self, env_name: str, config_path: str, env_value: T):
@@ -612,10 +615,16 @@ def load_oauth_providers():
"scope": OAUTH_SCOPES.value,
}
if OAUTH_CODE_CHALLENGE_METHOD.value and OAUTH_CODE_CHALLENGE_METHOD.value == "S256":
if (
OAUTH_CODE_CHALLENGE_METHOD.value
and OAUTH_CODE_CHALLENGE_METHOD.value == "S256"
):
client_kwargs["code_challenge_method"] = "S256"
elif OAUTH_CODE_CHALLENGE_METHOD.value:
raise Exception('Code challenge methods other than "%s" not supported. Given: "%s"' % ("S256", OAUTH_CODE_CHALLENGE_METHOD.value))
raise Exception(
'Code challenge methods other than "%s" not supported. Given: "%s"'
% ("S256", OAUTH_CODE_CHALLENGE_METHOD.value)
)
client.register(
name="oidc",
@@ -1820,12 +1829,6 @@ RAG_FILE_MAX_SIZE = PersistentConfig(
),
)
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = PersistentConfig(
"ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION",
"rag.enable_web_loader_ssl_verification",
os.environ.get("ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION", "True").lower() == "true",
)
RAG_EMBEDDING_ENGINE = PersistentConfig(
"RAG_EMBEDDING_ENGINE",
"rag.embedding_engine",
@@ -1990,16 +1993,20 @@ YOUTUBE_LOADER_PROXY_URL = PersistentConfig(
)
ENABLE_RAG_WEB_SEARCH = PersistentConfig(
"ENABLE_RAG_WEB_SEARCH",
####################################
# Web Search (RAG)
####################################
ENABLE_WEB_SEARCH = PersistentConfig(
"ENABLE_WEB_SEARCH",
"rag.web.search.enable",
os.getenv("ENABLE_RAG_WEB_SEARCH", "False").lower() == "true",
os.getenv("ENABLE_WEB_SEARCH", "False").lower() == "true",
)
RAG_WEB_SEARCH_ENGINE = PersistentConfig(
"RAG_WEB_SEARCH_ENGINE",
WEB_SEARCH_ENGINE = PersistentConfig(
"WEB_SEARCH_ENGINE",
"rag.web.search.engine",
os.getenv("RAG_WEB_SEARCH_ENGINE", ""),
os.getenv("WEB_SEARCH_ENGINE", ""),
)
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = PersistentConfig(
@@ -2008,10 +2015,18 @@ BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = PersistentConfig(
os.getenv("BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL", "False").lower() == "true",
)
WEB_SEARCH_RESULT_COUNT = PersistentConfig(
"WEB_SEARCH_RESULT_COUNT",
"rag.web.search.result_count",
int(os.getenv("WEB_SEARCH_RESULT_COUNT", "3")),
)
# You can provide a list of your own websites to filter after performing a web search.
# This ensures the highest level of safety and reliability of the information sources.
RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = PersistentConfig(
"RAG_WEB_SEARCH_DOMAIN_FILTER_LIST",
WEB_SEARCH_DOMAIN_FILTER_LIST = PersistentConfig(
"WEB_SEARCH_DOMAIN_FILTER_LIST",
"rag.web.search.domain.filter_list",
[
# "wikipedia.com",
@@ -2020,6 +2035,30 @@ RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = PersistentConfig(
],
)
WEB_SEARCH_CONCURRENT_REQUESTS = PersistentConfig(
"WEB_SEARCH_CONCURRENT_REQUESTS",
"rag.web.search.concurrent_requests",
int(os.getenv("WEB_SEARCH_CONCURRENT_REQUESTS", "10")),
)
WEB_LOADER_ENGINE = PersistentConfig(
"WEB_LOADER_ENGINE",
"rag.web.loader.engine",
os.environ.get("WEB_LOADER_ENGINE", ""),
)
ENABLE_WEB_LOADER_SSL_VERIFICATION = PersistentConfig(
"ENABLE_WEB_LOADER_SSL_VERIFICATION",
"rag.web.loader.ssl_verification",
os.environ.get("ENABLE_WEB_LOADER_SSL_VERIFICATION", "True").lower() == "true",
)
WEB_SEARCH_TRUST_ENV = PersistentConfig(
"WEB_SEARCH_TRUST_ENV",
"rag.web.search.trust_env",
os.getenv("WEB_SEARCH_TRUST_ENV", "False").lower() == "true",
)
SEARXNG_QUERY_URL = PersistentConfig(
"SEARXNG_QUERY_URL",
@@ -2155,34 +2194,22 @@ SOUGOU_API_SK = PersistentConfig(
os.getenv("SOUGOU_API_SK", ""),
)
RAG_WEB_SEARCH_RESULT_COUNT = PersistentConfig(
"RAG_WEB_SEARCH_RESULT_COUNT",
"rag.web.search.result_count",
int(os.getenv("RAG_WEB_SEARCH_RESULT_COUNT", "3")),
TAVILY_API_KEY = PersistentConfig(
"TAVILY_API_KEY",
"rag.web.search.tavily_api_key",
os.getenv("TAVILY_API_KEY", ""),
)
RAG_WEB_SEARCH_CONCURRENT_REQUESTS = PersistentConfig(
"RAG_WEB_SEARCH_CONCURRENT_REQUESTS",
"rag.web.search.concurrent_requests",
int(os.getenv("RAG_WEB_SEARCH_CONCURRENT_REQUESTS", "10")),
TAVILY_EXTRACT_DEPTH = PersistentConfig(
"TAVILY_EXTRACT_DEPTH",
"rag.web.search.tavily_extract_depth",
os.getenv("TAVILY_EXTRACT_DEPTH", "basic"),
)
RAG_WEB_LOADER_ENGINE = PersistentConfig(
"RAG_WEB_LOADER_ENGINE",
"rag.web.loader.engine",
os.environ.get("RAG_WEB_LOADER_ENGINE", "safe_web"),
)
RAG_WEB_SEARCH_TRUST_ENV = PersistentConfig(
"RAG_WEB_SEARCH_TRUST_ENV",
"rag.web.search.trust_env",
os.getenv("RAG_WEB_SEARCH_TRUST_ENV", "False").lower() == "true",
)
PLAYWRIGHT_WS_URI = PersistentConfig(
"PLAYWRIGHT_WS_URI",
"rag.web.loader.playwright_ws_uri",
os.environ.get("PLAYWRIGHT_WS_URI", ""),
PLAYWRIGHT_WS_URL = PersistentConfig(
"PLAYWRIGHT_WS_URL",
"rag.web.loader.PLAYWRIGHT_WS_URL",
os.environ.get("PLAYWRIGHT_WS_URL", ""),
)
PLAYWRIGHT_TIMEOUT = PersistentConfig(
@@ -2203,17 +2230,6 @@ FIRECRAWL_API_BASE_URL = PersistentConfig(
os.environ.get("FIRECRAWL_API_BASE_URL", "https://api.firecrawl.dev"),
)
TAVILY_API_KEY = PersistentConfig(
"TAVILY_API_KEY",
"rag.web.loader.tavily_api_key",
os.getenv("TAVILY_API_KEY", ""),
)
TAVILY_EXTRACT_DEPTH = PersistentConfig(
"TAVILY_EXTRACT_DEPTH",
"rag.web.loader.tavily_extract_depth",
os.getenv("TAVILY_EXTRACT_DEPTH", "basic"),
)
####################################
# Images

View File

@@ -160,11 +160,11 @@ from open_webui.config import (
AUDIO_TTS_VOICE,
AUDIO_TTS_AZURE_SPEECH_REGION,
AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
PLAYWRIGHT_WS_URI,
PLAYWRIGHT_WS_URL,
PLAYWRIGHT_TIMEOUT,
FIRECRAWL_API_BASE_URL,
FIRECRAWL_API_KEY,
RAG_WEB_LOADER_ENGINE,
WEB_LOADER_ENGINE,
WHISPER_MODEL,
DEEPGRAM_API_KEY,
WHISPER_MODEL_AUTO_UPDATE,
@@ -205,12 +205,13 @@ from open_webui.config import (
YOUTUBE_LOADER_LANGUAGE,
YOUTUBE_LOADER_PROXY_URL,
# Retrieval (Web Search)
RAG_WEB_SEARCH_ENGINE,
ENABLE_WEB_SEARCH,
WEB_SEARCH_ENGINE,
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
RAG_WEB_SEARCH_RESULT_COUNT,
RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
RAG_WEB_SEARCH_TRUST_ENV,
RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
WEB_SEARCH_RESULT_COUNT,
WEB_SEARCH_CONCURRENT_REQUESTS,
WEB_SEARCH_TRUST_ENV,
WEB_SEARCH_DOMAIN_FILTER_LIST,
JINA_API_KEY,
SEARCHAPI_API_KEY,
SEARCHAPI_ENGINE,
@@ -240,8 +241,7 @@ from open_webui.config import (
ONEDRIVE_CLIENT_ID,
ENABLE_RAG_HYBRID_SEARCH,
ENABLE_RAG_LOCAL_WEB_FETCH,
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
ENABLE_RAG_WEB_SEARCH,
ENABLE_WEB_LOADER_SSL_VERIFICATION,
ENABLE_GOOGLE_DRIVE_INTEGRATION,
ENABLE_ONEDRIVE_INTEGRATION,
UPLOAD_DIR,
@@ -594,9 +594,7 @@ app.state.config.FILE_MAX_COUNT = RAG_FILE_MAX_COUNT
app.state.config.RAG_FULL_CONTEXT = RAG_FULL_CONTEXT
app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = BYPASS_EMBEDDING_AND_RETRIEVAL
app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)
app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
@@ -629,12 +627,16 @@ app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
app.state.config.YOUTUBE_LOADER_PROXY_URL = YOUTUBE_LOADER_PROXY_URL
app.state.config.ENABLE_RAG_WEB_SEARCH = ENABLE_RAG_WEB_SEARCH
app.state.config.RAG_WEB_SEARCH_ENGINE = RAG_WEB_SEARCH_ENGINE
app.state.config.ENABLE_WEB_SEARCH = ENABLE_WEB_SEARCH
app.state.config.WEB_SEARCH_ENGINE = WEB_SEARCH_ENGINE
app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST = WEB_SEARCH_DOMAIN_FILTER_LIST
app.state.config.WEB_SEARCH_RESULT_COUNT = WEB_SEARCH_RESULT_COUNT
app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS = WEB_SEARCH_CONCURRENT_REQUESTS
app.state.config.WEB_LOADER_ENGINE = WEB_LOADER_ENGINE
app.state.config.WEB_SEARCH_TRUST_ENV = WEB_SEARCH_TRUST_ENV
app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = (
BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
)
app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = RAG_WEB_SEARCH_DOMAIN_FILTER_LIST
app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = ENABLE_GOOGLE_DRIVE_INTEGRATION
app.state.config.ENABLE_ONEDRIVE_INTEGRATION = ENABLE_ONEDRIVE_INTEGRATION
@@ -662,11 +664,8 @@ app.state.config.PERPLEXITY_API_KEY = PERPLEXITY_API_KEY
app.state.config.SOUGOU_API_SID = SOUGOU_API_SID
app.state.config.SOUGOU_API_SK = SOUGOU_API_SK
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = RAG_WEB_SEARCH_RESULT_COUNT
app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_REQUESTS
app.state.config.RAG_WEB_LOADER_ENGINE = RAG_WEB_LOADER_ENGINE
app.state.config.RAG_WEB_SEARCH_TRUST_ENV = RAG_WEB_SEARCH_TRUST_ENV
app.state.config.PLAYWRIGHT_WS_URI = PLAYWRIGHT_WS_URI
app.state.config.PLAYWRIGHT_WS_URL = PLAYWRIGHT_WS_URL
app.state.config.PLAYWRIGHT_TIMEOUT = PLAYWRIGHT_TIMEOUT
app.state.config.FIRECRAWL_API_BASE_URL = FIRECRAWL_API_BASE_URL
app.state.config.FIRECRAWL_API_KEY = FIRECRAWL_API_KEY
@@ -1261,7 +1260,7 @@ async def get_app_config(request: Request):
{
"enable_direct_connections": app.state.config.ENABLE_DIRECT_CONNECTIONS,
"enable_channels": app.state.config.ENABLE_CHANNELS,
"enable_web_search": app.state.config.ENABLE_RAG_WEB_SEARCH,
"enable_web_search": app.state.config.ENABLE_WEB_SEARCH,
"enable_code_execution": app.state.config.ENABLE_CODE_EXECUTION,
"enable_code_interpreter": app.state.config.ENABLE_CODE_INTERPRETER,
"enable_image_generation": app.state.config.ENABLE_IMAGE_GENERATION,

View File

@@ -28,9 +28,9 @@ from open_webui.retrieval.loaders.tavily import TavilyLoader
from open_webui.constants import ERROR_MESSAGES
from open_webui.config import (
ENABLE_RAG_LOCAL_WEB_FETCH,
PLAYWRIGHT_WS_URI,
PLAYWRIGHT_WS_URL,
PLAYWRIGHT_TIMEOUT,
RAG_WEB_LOADER_ENGINE,
WEB_LOADER_ENGINE,
FIRECRAWL_API_BASE_URL,
FIRECRAWL_API_KEY,
TAVILY_API_KEY,
@@ -584,13 +584,6 @@ class SafeWebBaseLoader(WebBaseLoader):
return [document async for document in self.alazy_load()]
RAG_WEB_LOADER_ENGINES = defaultdict(lambda: SafeWebBaseLoader)
RAG_WEB_LOADER_ENGINES["playwright"] = SafePlaywrightURLLoader
RAG_WEB_LOADER_ENGINES["safe_web"] = SafeWebBaseLoader
RAG_WEB_LOADER_ENGINES["firecrawl"] = SafeFireCrawlLoader
RAG_WEB_LOADER_ENGINES["tavily"] = SafeTavilyLoader
def get_web_loader(
urls: Union[str, Sequence[str]],
verify_ssl: bool = True,
@@ -608,27 +601,36 @@ def get_web_loader(
"trust_env": trust_env,
}
if RAG_WEB_LOADER_ENGINE.value == "playwright":
if WEB_LOADER_ENGINE.value == "" or WEB_LOADER_ENGINE.value == "safe_web":
WebLoaderClass = SafeWebBaseLoader
if WEB_LOADER_ENGINE.value == "playwright":
WebLoaderClass = SafePlaywrightURLLoader
web_loader_args["playwright_timeout"] = PLAYWRIGHT_TIMEOUT.value * 1000
if PLAYWRIGHT_WS_URI.value:
web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URI.value
if PLAYWRIGHT_WS_URL.value:
web_loader_args["playwright_ws_url"] = PLAYWRIGHT_WS_URL.value
if RAG_WEB_LOADER_ENGINE.value == "firecrawl":
if WEB_LOADER_ENGINE.value == "firecrawl":
WebLoaderClass = SafeFireCrawlLoader
web_loader_args["api_key"] = FIRECRAWL_API_KEY.value
web_loader_args["api_url"] = FIRECRAWL_API_BASE_URL.value
if RAG_WEB_LOADER_ENGINE.value == "tavily":
if WEB_LOADER_ENGINE.value == "tavily":
WebLoaderClass = SafeTavilyLoader
web_loader_args["api_key"] = TAVILY_API_KEY.value
web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value
# Create the appropriate WebLoader based on the configuration
WebLoaderClass = RAG_WEB_LOADER_ENGINES[RAG_WEB_LOADER_ENGINE.value]
web_loader = WebLoaderClass(**web_loader_args)
if WebLoaderClass:
web_loader = WebLoaderClass(**web_loader_args)
log.debug(
"Using RAG_WEB_LOADER_ENGINE %s for %s URLs",
web_loader.__class__.__name__,
len(safe_urls),
)
log.debug(
"Using WEB_LOADER_ENGINE %s for %s URLs",
web_loader.__class__.__name__,
len(safe_urls),
)
return web_loader
return web_loader
else:
raise ValueError(
f"Invalid WEB_LOADER_ENGINE: {WEB_LOADER_ENGINE.value}. "
"Please set it to 'safe_web', 'playwright', 'firecrawl', or 'tavily'."
)

File diff suppressed because it is too large Load Diff