mirror of https://github.com/open-webui/open-webui (synced 2025-06-25 09:47:41 +00:00)
refac
This commit is contained in:
parent 2428878f42
commit 00eb022450
@@ -628,40 +628,26 @@ async def update_query_settings(


 ####################################
-def store_data_in_vector_db(
-    data, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
+def save_docs_to_vector_db(
+    docs,
+    collection_name,
+    metadata: Optional[dict] = None,
+    overwrite: bool = False,
+    split: bool = True,
 ) -> bool:
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=app.state.config.CHUNK_SIZE,
-        chunk_overlap=app.state.config.CHUNK_OVERLAP,
-        add_start_index=True,
-    )
-    docs = text_splitter.split_documents(data)
-
-    if len(docs) > 0:
-        log.info(f"store_data_in_vector_db {docs}")
-        return store_docs_in_vector_db(docs, collection_name, metadata, overwrite)
-    else:
-        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
-
-
-def store_text_in_vector_db(
-    text, metadata, collection_name, overwrite: bool = False
-) -> bool:
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=app.state.config.CHUNK_SIZE,
-        chunk_overlap=app.state.config.CHUNK_OVERLAP,
-        add_start_index=True,
-    )
-    docs = text_splitter.create_documents([text], metadatas=[metadata])
-    return store_docs_in_vector_db(docs, collection_name, overwrite=overwrite)
-
-
-def store_docs_in_vector_db(
-    docs, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
-) -> bool:
-    log.info(f"store_docs_in_vector_db {docs} {collection_name}")
+    log.info(f"save_docs_to_vector_db {docs} {collection_name}")
+
+    if split:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=app.state.config.CHUNK_SIZE,
+            chunk_overlap=app.state.config.CHUNK_OVERLAP,
+            add_start_index=True,
+        )
+        docs = text_splitter.split_documents(docs)
+
+    if len(docs) == 0:
+        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
 
     texts = [doc.page_content for doc in docs]
     metadatas = [{**doc.metadata, **(metadata if metadata else {})} for doc in docs]
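For orientation, a minimal usage sketch of the consolidated helper introduced above; the Document import path and the literal values are assumptions for illustration, not part of this commit.

from langchain_core.documents import Document  # assumed import path

# Placeholder document; real callers pass loader output or user-supplied text.
docs = [Document(page_content="some extracted text", metadata={"name": "example"})]

# Default path: chunk via RecursiveCharacterTextSplitter, then store.
save_docs_to_vector_db(docs, "example-collection")

# Pre-chunked input: skip splitting and replace any existing collection.
save_docs_to_vector_db(
    docs, "example-collection", metadata={"file_id": "example-id"}, overwrite=True, split=False
)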
@@ -728,21 +714,24 @@ def process_file(
     file = Files.get_file_by_id(form_data.file_id)
     file_path = file.meta.get("path", f"{UPLOAD_DIR}/{file.filename}")
 
-    loader = Loader(
-        engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
-        TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
-        PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
-    )
-    data = loader.load(file.filename, file.meta.get("content_type"), file_path)
-
     collection_name = form_data.collection_name
     if collection_name is None:
         with open(file_path, "rb") as f:
             collection_name = calculate_sha256(f)[:63]
 
+    loader = Loader(
+        engine=app.state.config.CONTENT_EXTRACTION_ENGINE,
+        TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
+        PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
+    )
+    docs = loader.load(file.filename, file.meta.get("content_type"), file_path)
+
+    raw_content = " ".join([doc.page_content for doc in docs])
+    print(raw_content)
+
     try:
-        result = store_data_in_vector_db(
-            data,
+        result = save_docs_to_vector_db(
+            docs,
             collection_name,
             {
                 "file_id": form_data.file_id,
@@ -790,11 +779,13 @@ def process_text(
     if collection_name is None:
         collection_name = calculate_sha256_string(form_data.content)
 
-    result = store_text_in_vector_db(
-        form_data.content,
-        metadata={"name": form_data.name, "created_by": user.id},
-        collection_name=collection_name,
-    )
+    docs = [
+        Document(
+            page_content=form_data.content,
+            metadata={"name": form_data.name, "created_by": user.id},
+        )
+    ]
+    result = save_docs_to_vector_db(docs, collection_name)
 
     if result:
         return {"status": True, "collection_name": collection_name}
@@ -822,10 +813,10 @@ def process_docs_dir(user=Depends(get_admin_user)):
                     TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL,
                     PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES,
                 )
-                data = loader.load(filename, file_content_type[0], str(path))
+                docs = loader.load(filename, file_content_type[0], str(path))
 
                 try:
-                    result = store_data_in_vector_db(data, collection_name)
+                    result = save_docs_to_vector_db(docs, collection_name)
 
                     if result:
                         sanitized_filename = sanitize_filename(filename)
@@ -870,19 +861,19 @@ def process_docs_dir(user=Depends(get_admin_user)):
 @app.post("/process/youtube")
 def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
     try:
+        collection_name = form_data.collection_name
+        if not collection_name:
+            collection_name = calculate_sha256_string(form_data.url)[:63]
+
         loader = YoutubeLoader.from_youtube_url(
             form_data.url,
             add_video_info=True,
             language=app.state.config.YOUTUBE_LOADER_LANGUAGE,
             translation=app.state.YOUTUBE_LOADER_TRANSLATION,
         )
-        data = loader.load()
-
-        collection_name = form_data.collection_name
-        if not collection_name:
-            collection_name = calculate_sha256_string(form_data.url)[:63]
+        docs = loader.load()
 
-        store_data_in_vector_db(data, collection_name, overwrite=True)
+        save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {
             "status": True,
@@ -900,18 +891,17 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
 @app.post("/process/web")
 def process_web(form_data: ProcessUrlForm, user=Depends(get_verified_user)):
     try:
+        collection_name = form_data.collection_name
+        if not collection_name:
+            collection_name = calculate_sha256_string(form_data.url)[:63]
+
         loader = get_web_loader(
             form_data.url,
             verify_ssl=app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
             requests_per_second=app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
         )
-        data = loader.load()
-
-        collection_name = form_data.collection_name
-        if not collection_name:
-            collection_name = calculate_sha256_string(form_data.url)[:63]
-
-        store_data_in_vector_db(data, collection_name, overwrite=True)
+        docs = loader.load()
+        save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {
             "status": True,
@@ -1060,15 +1050,16 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
     )
 
     try:
-        urls = [result.link for result in web_results]
-        loader = get_web_loader(urls)
-        data = loader.load()
-
         collection_name = form_data.collection_name
         if collection_name == "":
             collection_name = calculate_sha256_string(form_data.query)[:63]
 
-        store_data_in_vector_db(data, collection_name, overwrite=True)
+        urls = [result.link for result in web_results]
+
+        loader = get_web_loader(urls)
+        docs = loader.load()
+        save_docs_to_vector_db(docs, collection_name, overwrite=True)
 
         return {
             "status": True,
             "collection_name": collection_name,