Merge pull request #12507 from Ithanil/fix_web_result_collection_source_ids

fix: fix web results all getting the same source id when using embedding and retrieval
This commit is contained in:
Timothy Jaeryang Baek 2025-04-06 15:43:21 -07:00 committed by GitHub
commit 9825d03602
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 27 additions and 23 deletions

View File

@@ -1464,11 +1464,9 @@ async def process_web_search(
log.debug(f"web_results: {web_results}") log.debug(f"web_results: {web_results}")
try: try:
collection_name = form_data.collection_name collection_basename = form_data.collection_name
if collection_name == "" or collection_name is None: if collection_basename == "" or collection_basename is None:
collection_name = f"web-search-{calculate_sha256_string(form_data.query)}"[ collection_basename = "web-search"
:63
]
urls = [result.link for result in web_results] urls = [result.link for result in web_results]
loader = get_web_loader( loader = get_web_loader(
@@ -1495,18 +1493,23 @@ async def process_web_search(
"loaded_count": len(docs), "loaded_count": len(docs),
} }
else: else:
await run_in_threadpool( collection_names = []
save_docs_to_vector_db, for doc_idx, doc in enumerate(docs):
request, collection_sha = calculate_sha256_string(f"{form_data.query}-{urls[doc_idx]}")
docs, doc_collection_name = f"{collection_basename}-{collection_sha}"[:63]
collection_name, collection_names.append(doc_collection_name)
overwrite=True, await run_in_threadpool(
user=user, save_docs_to_vector_db,
) request,
[doc],
doc_collection_name,
overwrite=True,
user=user,
)
return { return {
"status": True, "status": True,
"collection_name": collection_name, "collection_names": collection_names,
"filenames": urls, "filenames": urls,
"loaded_count": len(docs), "loaded_count": len(docs),
} }

View File

@@ -399,15 +399,16 @@ async def chat_web_search_handler(
all_results.append(results) all_results.append(results)
files = form_data.get("files", []) files = form_data.get("files", [])
if results.get("collection_name"): if results.get("collection_names"):
files.append( for col_idx, collection_name in enumerate(results.get("collection_names")):
{ files.append(
"collection_name": results["collection_name"], {
"name": searchQuery, "collection_name": collection_name,
"type": "web_search", "name": searchQuery,
"urls": results["filenames"], "type": "web_search",
} "urls": [results["filenames"][col_idx]],
) }
)
elif results.get("docs"): elif results.get("docs"):
# Invoked when bypass embedding and retrieval is set to True # Invoked when bypass embedding and retrieval is set to True
docs = results["docs"] docs = results["docs"]