diff --git a/backend/open_webui/apps/retrieval/main.py b/backend/open_webui/apps/retrieval/main.py index 87242db02..118a3e203 100644 --- a/backend/open_webui/apps/retrieval/main.py +++ b/backend/open_webui/apps/retrieval/main.py @@ -726,7 +726,6 @@ def process_file( ) docs = loader.load(file.filename, file.meta.get("content_type"), file_path) text_content = " ".join([doc.page_content for doc in docs]) - log.debug(f"text_content: {text_content}") Files.update_files_metadata_by_id( @@ -795,10 +794,17 @@ def process_text( metadata={"name": form_data.name, "created_by": user.id}, ) ] + text_content = form_data.content + log.debug(f"text_content: {text_content}") + result = save_docs_to_vector_db(docs, collection_name) if result: - return {"status": True, "collection_name": collection_name} + return { + "status": True, + "collection_name": collection_name, + "content": text_content, + } else: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, @@ -806,68 +812,6 @@ def process_text( ) -@app.get("/process/dir") -def process_docs_dir(user=Depends(get_admin_user)): - for path in Path(DOCS_DIR).rglob("./**/*"): - try: - if path.is_file() and not path.name.startswith("."): - tags = extract_folders_after_data_docs(path) - filename = path.name - file_content_type = mimetypes.guess_type(path) - - with open(path, "rb") as f: - collection_name = calculate_sha256(f)[:63] - - loader = Loader( - engine=app.state.config.CONTENT_EXTRACTION_ENGINE, - TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL, - PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES, - ) - docs = loader.load(filename, file_content_type[0], str(path)) - - try: - result = save_docs_to_vector_db(docs, collection_name) - - if result: - sanitized_filename = sanitize_filename(filename) - doc = Documents.get_doc_by_name(sanitized_filename) - - if doc is None: - doc = Documents.insert_new_doc( - user.id, - DocumentForm( - **{ - "name": sanitized_filename, - "title": filename, - "collection_name": collection_name, - "filename": filename, - "content": ( - json.dumps( - { - "tags": list( - map( - lambda name: {"name": name}, - tags, - ) - ) - } - ) - if len(tags) - else "{}" - ), - } - ), - ) - except Exception as e: - log.exception(e) - pass - - except Exception as e: - log.exception(e) - - return True - - @app.post("/process/youtube") def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_user)): try: @@ -882,12 +826,15 @@ def process_youtube_video(form_data: ProcessUrlForm, user=Depends(get_verified_u translation=app.state.YOUTUBE_LOADER_TRANSLATION, ) docs = loader.load() + text_content = " ".join([doc.page_content for doc in docs]) + log.debug(f"text_content: {text_content}") save_docs_to_vector_db(docs, collection_name, overwrite=True) return { "status": True, "collection_name": collection_name, "filename": form_data.url, + "content": text_content, } except Exception as e: log.exception(e) @@ -910,12 +857,15 @@ def process_web(form_data: ProcessUrlForm, user=Depends(get_verified_user)): requests_per_second=app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS, ) docs = loader.load() + text_content = " ".join([doc.page_content for doc in docs]) + log.debug(f"text_content: {text_content}") save_docs_to_vector_db(docs, collection_name, overwrite=True) return { "status": True, "collection_name": collection_name, "filename": form_data.url, + "content": text_content, } except Exception as e: log.exception(e) @@ -1067,6 +1017,7 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)): loader = get_web_loader(urls) docs = loader.load() + save_docs_to_vector_db(docs, collection_name, overwrite=True) return { @@ -1082,6 +1033,68 @@ def process_web_search(form_data: SearchForm, user=Depends(get_verified_user)): ) +@app.get("/process/dir") +def process_docs_dir(user=Depends(get_admin_user)): + for path in Path(DOCS_DIR).rglob("./**/*"): + try: + if path.is_file() and not path.name.startswith("."): + tags = extract_folders_after_data_docs(path) + filename = path.name + file_content_type = mimetypes.guess_type(path) + + with open(path, "rb") as f: + collection_name = calculate_sha256(f)[:63] + + loader = Loader( + engine=app.state.config.CONTENT_EXTRACTION_ENGINE, + TIKA_SERVER_URL=app.state.config.TIKA_SERVER_URL, + PDF_EXTRACT_IMAGES=app.state.config.PDF_EXTRACT_IMAGES, + ) + docs = loader.load(filename, file_content_type[0], str(path)) + + try: + result = save_docs_to_vector_db(docs, collection_name) + + if result: + sanitized_filename = sanitize_filename(filename) + doc = Documents.get_doc_by_name(sanitized_filename) + + if doc is None: + doc = Documents.insert_new_doc( + user.id, + DocumentForm( + **{ + "name": sanitized_filename, + "title": filename, + "collection_name": collection_name, + "filename": filename, + "content": ( + json.dumps( + { + "tags": list( + map( + lambda name: {"name": name}, + tags, + ) + ) + } + ) + if len(tags) + else "{}" + ), + } + ), + ) + except Exception as e: + log.exception(e) + pass + + except Exception as e: + log.exception(e) + + return True + + class QueryDocForm(BaseModel): collection_name: str query: str diff --git a/src/lib/components/chat/MessageInput/Commands.svelte b/src/lib/components/chat/MessageInput/Commands.svelte index d1f85d458..91f78866d 100644 --- a/src/lib/components/chat/MessageInput/Commands.svelte +++ b/src/lib/components/chat/MessageInput/Commands.svelte @@ -30,7 +30,7 @@ const uploadWeb = async (url) => { console.log(url); - const doc = { + const fileItem = { type: 'doc', name: url, collection_name: '', @@ -40,12 +40,14 @@ }; try { - files = [...files, doc]; + files = [...files, fileItem]; const res = await processWeb(localStorage.token, '', url); if (res) { - doc.status = 'processed'; - doc.collection_name = res.collection_name; + fileItem.status = 'processed'; + fileItem.collection_name = res.collection_name; + fileItem.content = res.content; + files = files; } } catch (e) { @@ -58,7 +60,7 @@ const uploadYoutubeTranscription = async (url) => { console.log(url); - const doc = { + const fileItem = { type: 'doc', name: url, collection_name: '', @@ -68,12 +70,13 @@ }; try { - files = [...files, doc]; + files = [...files, fileItem]; const res = await processYoutubeVideo(localStorage.token, url); if (res) { - doc.status = 'processed'; - doc.collection_name = res.collection_name; + fileItem.status = 'processed'; + fileItem.collection_name = res.collection_name; + fileItem.content = res.content; files = files; } } catch (e) { diff --git a/src/lib/components/common/FileItem.svelte b/src/lib/components/common/FileItem.svelte index 67cbdf509..b86a2d37a 100644 --- a/src/lib/components/common/FileItem.svelte +++ b/src/lib/components/common/FileItem.svelte @@ -39,6 +39,8 @@ if (url) { if (type === 'file') { window.open(`${url}/content`, '_blank').focus(); + } else { + window.open(`${url}`, '_blank').focus(); } } } diff --git a/src/lib/components/common/FileItemModal.svelte b/src/lib/components/common/FileItemModal.svelte index f590d0755..c124a45c7 100644 --- a/src/lib/components/common/FileItemModal.svelte +++ b/src/lib/components/common/FileItemModal.svelte @@ -20,8 +20,14 @@
-
- {file?.name ?? 'File'} +