diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index fc46d78c4..aec2a8730 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -228,7 +228,10 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin): mode=self.mode, params=self.params, ) - yield from loader.lazy_load() + for document in loader.lazy_load(): + if not document.metadata.get("source"): + document.metadata["source"] = document.metadata.get("sourceURL") + yield document except Exception as e: if self.continue_on_failure: log.exception(f"Error loading {url}: {e}") @@ -248,6 +251,8 @@ class SafeFireCrawlLoader(BaseLoader, RateLimitMixin, URLProcessingMixin): params=self.params, ) async for document in loader.alazy_load(): + if not document.metadata.get("source"): + document.metadata["source"] = document.metadata.get("sourceURL") yield document except Exception as e: if self.continue_on_failure: diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index bcb8aa413..76d7fc76c 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -1536,8 +1536,8 @@ async def process_web_search( ) docs = await loader.aload() urls = [ - doc.metadata["source"] for doc in docs - ] # only keep URLs which could be retrieved + doc.metadata.get("source") for doc in docs if doc.metadata.get("source") + ] # only keep URLs if request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: return {