This commit is contained in:
Timothy Jaeryang Baek 2024-12-17 18:40:50 -08:00
parent c7e3692678
commit e500461dc0
2 changed files with 24 additions and 31 deletions

View File

@@ -82,15 +82,15 @@ class SafeWebBaseLoader(WebBaseLoader):
def get_web_loader( def get_web_loader(
url: Union[str, Sequence[str]], urls: Union[str, Sequence[str]],
verify_ssl: bool = True, verify_ssl: bool = True,
requests_per_second: int = 2, requests_per_second: int = 2,
): ):
# Check if the URL is valid # Check if the URL is valid
if not validate_url(url): if not validate_url(urls):
raise ValueError(ERROR_MESSAGES.INVALID_URL) raise ValueError(ERROR_MESSAGES.INVALID_URL)
return SafeWebBaseLoader( return SafeWebBaseLoader(
url, urls,
verify_ssl=verify_ssl, verify_ssl=verify_ssl,
requests_per_second=requests_per_second, requests_per_second=requests_per_second,
continue_on_failure=True, continue_on_failure=True,

View File

@@ -1256,7 +1256,7 @@ def process_web_search(
urls = [result.link for result in web_results] urls = [result.link for result in web_results]
loader = get_web_loader( loader = get_web_loader(
urls=urls, urls,
verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION, verify_ssl=request.app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, requests_per_second=request.app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
) )
@@ -1429,19 +1429,23 @@ if ENV == "dev":
async def get_embeddings(request: Request, text: Optional[str] = "Hello World!"): async def get_embeddings(request: Request, text: Optional[str] = "Hello World!"):
return {"result": request.app.state.EMBEDDING_FUNCTION(text)} return {"result": request.app.state.EMBEDDING_FUNCTION(text)}
class BatchProcessFilesForm(BaseModel): class BatchProcessFilesForm(BaseModel):
files: List[FileModel] files: List[FileModel]
collection_name: str collection_name: str
class BatchProcessFilesResult(BaseModel): class BatchProcessFilesResult(BaseModel):
file_id: str file_id: str
status: str status: str
error: Optional[str] = None error: Optional[str] = None
class BatchProcessFilesResponse(BaseModel): class BatchProcessFilesResponse(BaseModel):
results: List[BatchProcessFilesResult] results: List[BatchProcessFilesResult]
errors: List[BatchProcessFilesResult] errors: List[BatchProcessFilesResult]
@router.post("/process/files/batch") @router.post("/process/files/batch")
def process_files_batch( def process_files_batch(
form_data: BatchProcessFilesForm, form_data: BatchProcessFilesForm,
@@ -1478,47 +1482,36 @@ def process_files_batch(
Files.update_file_data_by_id(file.id, {"content": text_content}) Files.update_file_data_by_id(file.id, {"content": text_content})
all_docs.extend(docs) all_docs.extend(docs)
results.append(BatchProcessFilesResult( results.append(BatchProcessFilesResult(file_id=file.id, status="prepared"))
file_id=file.id,
status="prepared"
))
except Exception as e: except Exception as e:
log.error(f"process_files_batch: Error processing file {file.id}: {str(e)}") log.error(f"process_files_batch: Error processing file {file.id}: {str(e)}")
errors.append(BatchProcessFilesResult( errors.append(
file_id=file.id, BatchProcessFilesResult(file_id=file.id, status="failed", error=str(e))
status="failed", )
error=str(e)
))
# Save all documents in one batch # Save all documents in one batch
if all_docs: if all_docs:
try: try:
save_docs_to_vector_db( save_docs_to_vector_db(
docs=all_docs, docs=all_docs, collection_name=collection_name, add=True
collection_name=collection_name,
add=True
) )
# Update all files with collection name # Update all files with collection name
for result in results: for result in results:
Files.update_file_metadata_by_id( Files.update_file_metadata_by_id(
result.file_id, result.file_id, {"collection_name": collection_name}
{"collection_name": collection_name}
) )
result.status = "completed" result.status = "completed"
except Exception as e: except Exception as e:
log.error(f"process_files_batch: Error saving documents to vector DB: {str(e)}") log.error(
f"process_files_batch: Error saving documents to vector DB: {str(e)}"
)
for result in results: for result in results:
result.status = "failed" result.status = "failed"
errors.append(BatchProcessFilesResult( errors.append(
file_id=result.file_id, BatchProcessFilesResult(file_id=result.file_id, error=str(e))
error=str(e) )
))
return BatchProcessFilesResponse(
results=results,
errors=errors
)
return BatchProcessFilesResponse(results=results, errors=errors)