diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index caabddd03..b88677bb2 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -199,173 +199,181 @@ def get_rf( class TextCleaner: """Modular text cleaning system for document processing and embedding preparation.""" - + @staticmethod def normalize_escape_sequences(text: str) -> str: """Normalize escape sequences from various document formats.""" if not text: return "" - + # Handle double-escaped sequences (common in PPTX) replacements = [ - ('\\\\n', '\n'), # Double-escaped newlines - ('\\\\t', ' '), # Double-escaped tabs - ('\\\\"', '"'), # Double-escaped quotes - ('\\\\r', ''), # Double-escaped carriage returns - ('\\\\/', '/'), # Double-escaped slashes - ('\\\\', '\\'), # Convert double backslashes to single + ("\\\\n", "\n"), # Double-escaped newlines + ("\\\\t", " "), # Double-escaped tabs + ('\\\\"', '"'), # Double-escaped quotes + ("\\\\r", ""), # Double-escaped carriage returns + ("\\\\/", "/"), # Double-escaped slashes + ("\\\\", "\\"), # Convert double backslashes to single ] - + for old, new in replacements: text = text.replace(old, new) - + # Handle single-escaped sequences single_replacements = [ - ('\\n', '\n'), # Single-escaped newlines - ('\\t', ' '), # Single-escaped tabs - ('\\"', '"'), # Single-escaped quotes - ('\\\'', "'"), # Single-escaped single quotes - ('\\r', ''), # Single-escaped carriage returns - ('\\/', '/'), # Single-escaped slashes + ("\\n", "\n"), # Single-escaped newlines + ("\\t", " "), # Single-escaped tabs + ('\\"', '"'), # Single-escaped quotes + ("\\'", "'"), # Single-escaped single quotes + ("\\r", ""), # Single-escaped carriage returns + ("\\/", "/"), # Single-escaped slashes ] - + for old, new in single_replacements: text = text.replace(old, new) - + # Remove any remaining backslash artifacts - text = re.sub(r'\\[a-zA-Z]', '', text) # Remove \letter patterns - text = re.sub(r'\\[0-9]', '', text) # Remove \number patterns - text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text) # Remove \symbol patterns - text = re.sub(r'\\+', '', text) # Remove remaining backslashes - + text = re.sub(r"\\[a-zA-Z]", "", text) # Remove \letter patterns + text = re.sub(r"\\[0-9]", "", text) # Remove \number patterns + text = re.sub(r"\\[^a-zA-Z0-9\s]", "", text) # Remove \symbol patterns + text = re.sub(r"\\+", "", text) # Remove remaining backslashes + return text - + @staticmethod def normalize_unicode(text: str) -> str: """Convert special Unicode characters to ASCII equivalents.""" if not text: return "" - + unicode_map = { - '–': '-', # En dash - '—': '-', # Em dash - '‘': "'", # Smart single quote left - '’': "'", # Smart single quote right - '“': '"', # Smart double quote left - '”': '"', # Smart double quote right - '…': '...', # Ellipsis - '™': ' TM', # Trademark - '®': ' R', # Registered - '©': ' C', # Copyright - '°': ' deg', # Degree symbol + "–": "-", # En dash + "—": "-", # Em dash + "‘": "'", # Smart single quote left + "’": "'", # Smart single quote right + "“": '"', # Smart double quote left + "”": '"', # Smart double quote right + "…": "...", # Ellipsis + "™": " TM", # Trademark + "®": " R", # Registered + "©": " C", # Copyright + "°": " deg", # Degree symbol } - + for unicode_char, ascii_char in unicode_map.items(): text = text.replace(unicode_char, ascii_char) - + return text - + @staticmethod def normalize_quotes(text: str) -> str: """Clean up quote-related artifacts and normalize quote marks.""" if not text:
return "" - + # Remove quote artifacts quote_patterns = [ - (r'\\+"', '"'), # Multiple backslashes before quotes - (r'\\"', '"'), # Escaped double quotes - (r"\\'", "'"), # Escaped single quotes - (r'\\&', '&'), # Escaped ampersands - (r'""', '"'), # Double quotes - (r"''", "'"), # Double single quotes + (r'\\+"', '"'), # Multiple backslashes before quotes + (r'\\"', '"'), # Escaped double quotes + (r"\\'", "'"), # Escaped single quotes + (r"\\&", "&"), # Escaped ampersands + (r'""', '"'), # Double quotes + (r"''", "'"), # Double single quotes ] - + for pattern, replacement in quote_patterns: text = re.sub(pattern, replacement, text) - + return text - + @staticmethod def normalize_whitespace(text: str, preserve_paragraphs: bool = True) -> str: """Normalize whitespace while optionally preserving paragraph structure.""" if not text: return "" - + if preserve_paragraphs: # Preserve paragraph breaks (double newlines) but clean up excessive spacing - text = re.sub(r'[ \t]+', ' ', text) # Multiple spaces/tabs -> single space - text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) # Multiple empty lines -> double line break - text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE) # Trim line-level whitespace + text = re.sub(r"[ \t]+", " ", text) # Multiple spaces/tabs -> single space + text = re.sub( + r"\n\s*\n\s*\n+", "\n\n", text + ) # Multiple empty lines -> double line break + text = re.sub( + r"^\s+|\s+$", "", text, flags=re.MULTILINE + ) # Trim line-level whitespace else: # Flatten all whitespace for embedding - text = re.sub(r'\n+', ' ', text) # All newlines to spaces - text = re.sub(r'\s+', ' ', text) # All whitespace to single spaces - + text = re.sub(r"\n+", " ", text) # All newlines to spaces + text = re.sub(r"\s+", " ", text) # All whitespace to single spaces + return text.strip() - + @staticmethod def remove_artifacts(text: str) -> str: """Remove document format artifacts and orphaned elements.""" if not text: return "" - + # Remove orphaned punctuation - text = re.sub(r'^\s*[)\]}]+\s*', '', text) # Orphaned closing brackets at start - text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text) # Orphaned closing brackets on own lines - + text = re.sub(r"^\s*[)\]}]+\s*", "", text) # Orphaned closing brackets at start + text = re.sub( + r"\n\s*[)\]}]+\s*\n", "\n\n", text + ) # Orphaned closing brackets on own lines + # Remove excessive punctuation - text = re.sub(r'[.]{3,}', '...', text) # Multiple dots to ellipsis - text = re.sub(r'[-]{3,}', '---', text) # Multiple dashes - + text = re.sub(r"[.]{3,}", "...", text) # Multiple dots to ellipsis + text = re.sub(r"[-]{3,}", "---", text) # Multiple dashes + # Remove empty parentheses and brackets - text = re.sub(r'\(\s*\)', '', text) # Empty parentheses - text = re.sub(r'\[\s*\]', '', text) # Empty square brackets - text = re.sub(r'\{\s*\}', '', text) # Empty curly brackets - + text = re.sub(r"\(\s*\)", "", text) # Empty parentheses + text = re.sub(r"\[\s*\]", "", text) # Empty square brackets + text = re.sub(r"\{\s*\}", "", text) # Empty curly brackets + return text - + @classmethod def clean_for_chunking(cls, text: str) -> str: """Clean text for semantic chunking - preserves structure but normalizes content.""" if not text: return "" - + # Apply all cleaning steps while preserving paragraph structure text = cls.normalize_escape_sequences(text) text = cls.normalize_unicode(text) text = cls.normalize_quotes(text) text = cls.remove_artifacts(text) text = cls.normalize_whitespace(text, preserve_paragraphs=True) - + return text - + @classmethod def 
clean_for_embedding(cls, text: str) -> str: """Clean text for embedding - flattens structure and optimizes for vector similarity.""" if not text: return "" - + # Start with chunking-level cleaning text = cls.clean_for_chunking(text) - + # Flatten for embedding text = cls.normalize_whitespace(text, preserve_paragraphs=False) - + return text - + @classmethod def clean_for_storage(cls, text: str) -> str: """Clean text for storage - most aggressive cleaning for database storage.""" if not text: return "" - + # Start with embedding-level cleaning text = cls.clean_for_embedding(text) - + # Additional aggressive cleaning for storage - text = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text) # Remove any remaining escape sequences - + text = re.sub( + r"\\([^a-zA-Z0-9\s])", r"\1", text + ) # Remove any remaining escape sequences + return text @@ -374,37 +382,43 @@ def clean_text_content(text: str) -> str: return TextCleaner.clean_for_chunking(text) -def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]: +def create_semantic_chunks( + text: str, max_chunk_size: int, overlap_size: int +) -> TypingList[str]: """Create semantically aware chunks that respect document structure""" if not text or len(text) <= max_chunk_size: return [text] if text else [] - + chunks = [] - + # Split by double line breaks (paragraphs) first - paragraphs = text.split('\n\n') - + paragraphs = text.split("\n\n") + current_chunk = "" - + for paragraph in paragraphs: paragraph = paragraph.strip() if not paragraph: continue - + # If adding this paragraph would exceed chunk size if current_chunk and len(current_chunk) + len(paragraph) + 2 > max_chunk_size: # Try to split the current chunk at sentence boundaries if it's too long if len(current_chunk) > max_chunk_size: - sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size) + sentence_chunks = split_by_sentences( + current_chunk, max_chunk_size, overlap_size + ) chunks.extend(sentence_chunks) else: chunks.append(current_chunk.strip()) - + # Start new chunk with overlap from previous chunk if applicable if chunks and overlap_size > 0: prev_chunk = chunks[-1] overlap_text = get_text_overlap(prev_chunk, overlap_size) - current_chunk = overlap_text + "\n\n" + paragraph if overlap_text else paragraph + current_chunk = ( + overlap_text + "\n\n" + paragraph if overlap_text else paragraph + ) else: current_chunk = paragraph else: @@ -413,39 +427,45 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> current_chunk += "\n\n" + paragraph else: current_chunk = paragraph - + # Add the last chunk if current_chunk: if len(current_chunk) > max_chunk_size: - sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size) + sentence_chunks = split_by_sentences( + current_chunk, max_chunk_size, overlap_size + ) chunks.extend(sentence_chunks) else: chunks.append(current_chunk.strip()) - + return [chunk for chunk in chunks if chunk.strip()] -def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]: +def split_by_sentences( + text: str, max_chunk_size: int, overlap_size: int +) -> TypingList[str]: """Split text by sentences when paragraph-level splitting isn't sufficient""" # Split by sentence endings - sentences = re.split(r'(?<=[.!?])\s+', text) - + sentences = re.split(r"(?<=[.!?])\s+", text) + chunks = [] current_chunk = "" - + for sentence in sentences: sentence = sentence.strip() if not sentence: continue - + # If adding this sentence would exceed 
chunk size if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chunk_size: chunks.append(current_chunk.strip()) - + # Start new chunk with overlap if overlap_size > 0: overlap_text = get_text_overlap(current_chunk, overlap_size) - current_chunk = overlap_text + " " + sentence if overlap_text else sentence + current_chunk = ( + overlap_text + " " + sentence if overlap_text else sentence + ) else: current_chunk = sentence else: @@ -454,11 +474,11 @@ def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> Typ current_chunk += " " + sentence else: current_chunk = sentence - + # Add the last chunk if current_chunk: chunks.append(current_chunk.strip()) - + return [chunk for chunk in chunks if chunk.strip()] @@ -466,18 +486,18 @@ def get_text_overlap(text: str, overlap_size: int) -> str: """Get the last overlap_size characters from text, preferring word boundaries""" if not text or overlap_size <= 0: return "" - + if len(text) <= overlap_size: return text - + # Try to find a good word boundary within the overlap region overlap_text = text[-overlap_size:] - + # Find the first space to avoid cutting words - space_index = overlap_text.find(' ') + space_index = overlap_text.find(" ") if space_index > 0: return overlap_text[space_index:].strip() - + return overlap_text.strip() @@ -570,7 +590,8 @@ async def update_embedding_config( request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user) ): log.info( - f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}" + f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} " + f"to {form_data.embedding_model}" ) try: request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine @@ -1396,34 +1417,33 @@ def save_docs_to_vector_db( if split: # Apply advanced content-aware splitting and text cleaning processed_docs = [] - + for doc in docs: # Clean the text content before chunking if not doc.page_content: continue - + # Apply text cleaning before chunking using new modular system cleaned_content = TextCleaner.clean_for_chunking(doc.page_content) - + # Create semantic chunks from cleaned content chunks = create_semantic_chunks( cleaned_content, request.app.state.config.CHUNK_SIZE, - request.app.state.config.CHUNK_OVERLAP + request.app.state.config.CHUNK_OVERLAP, ) - + # Create new documents for each chunk for i, chunk in enumerate(chunks): chunk_metadata = { **doc.metadata, "chunk_index": i, - "total_chunks": len(chunks) + "total_chunks": len(chunks), } - processed_docs.append(Document( - page_content=chunk, - metadata=chunk_metadata - )) - + processed_docs.append( + Document(page_content=chunk, metadata=chunk_metadata) + ) + docs = processed_docs if len(docs) == 0: @@ -1501,7 +1521,7 @@ def save_docs_to_vector_db( # Prepare texts for embedding using the new modular cleaning system cleaned_texts = [TextCleaner.clean_for_embedding(text) for text in texts] - + embeddings = embedding_function( cleaned_texts, prefix=RAG_EMBEDDING_CONTENT_PREFIX, @@ -1513,13 +1533,15 @@ def save_docs_to_vector_db( for idx in range(len(texts)): # Apply consistent storage-level cleaning text_to_store = TextCleaner.clean_for_storage(texts[idx]) - - items.append({ - "id": str(uuid.uuid4()), - "text": text_to_store, - "vector": embeddings[idx], - "metadata": metadatas[idx], - }) + + items.append( + { + "id": str(uuid.uuid4()), + "text": text_to_store, + "vector": embeddings[idx], + "metadata": metadatas[idx], + } + ) VECTOR_DB_CLIENT.insert( 
collection_name=collection_name, @@ -1565,7 +1587,9 @@ def process_file( docs = [ Document( - page_content=TextCleaner.clean_for_chunking(form_data.content.replace("
", "\n")), + page_content=TextCleaner.clean_for_chunking( + form_data.content.replace("
", "\n") + ), metadata={ **file.meta, "name": file.filename, @@ -1588,7 +1612,9 @@ def process_file( if result is not None and len(result.ids[0]) > 0: docs = [ Document( - page_content=TextCleaner.clean_for_chunking(result.documents[0][idx]), + page_content=TextCleaner.clean_for_chunking( + result.documents[0][idx] + ), metadata=result.metadatas[0][idx], ) for idx, id in enumerate(result.ids[0]) @@ -1596,7 +1622,9 @@ def process_file( else: docs = [ Document( - page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")), + page_content=TextCleaner.clean_for_chunking( + file.data.get("content", "") + ), metadata={ **file.meta, "name": file.filename, @@ -1645,22 +1673,26 @@ def process_file( cleaned_docs = [] for doc in docs: cleaned_content = TextCleaner.clean_for_chunking(doc.page_content) - - cleaned_docs.append(Document( - page_content=cleaned_content, - metadata={ - **doc.metadata, - "name": file.filename, - "created_by": file.user_id, - "file_id": file.id, - "source": file.filename, - }, - )) + + cleaned_docs.append( + Document( + page_content=cleaned_content, + metadata={ + **doc.metadata, + "name": file.filename, + "created_by": file.user_id, + "file_id": file.id, + "source": file.filename, + }, + ) + ) docs = cleaned_docs else: docs = [ Document( - page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")), + page_content=TextCleaner.clean_for_chunking( + file.data.get("content", "") + ), metadata={ **file.meta, "name": file.filename, @@ -1670,7 +1702,9 @@ def process_file( }, ) ] - text_content = " ".join([doc.page_content for doc in docs if doc.page_content]) + text_content = " ".join( + [doc.page_content for doc in docs if doc.page_content] + ) # Ensure text_content is never None or empty for hash calculation if not text_content: @@ -2449,7 +2483,9 @@ def process_files_batch( docs: List[Document] = [ Document( - page_content=TextCleaner.clean_for_chunking(text_content.replace("
", "\n")), + page_content=TextCleaner.clean_for_chunking( + text_content.replace("
", "\n") + ), metadata={ **file.meta, "name": file.filename, @@ -2509,10 +2545,10 @@ def delete_file_from_vector_db(file_id: str) -> bool: Delete all vector embeddings for a specific file from the vector database. This function works with any vector database (Pinecone, ChromaDB, etc.) and handles the cleanup when a file is deleted from the chat. - + Args: file_id (str): The ID of the file to delete from vector database - + Returns: bool: True if deletion was successful, False otherwise """ @@ -2521,30 +2557,32 @@ def delete_file_from_vector_db(file_id: str) -> bool: file = Files.get_file_by_id(file_id) if not file: return False - + # Get the file hash for vector deletion file_hash = file.hash if not file_hash: return False - + # Try to get collection name from file metadata collection_name = None - if hasattr(file, 'meta') and file.meta: - collection_name = file.meta.get('collection_name') - + if hasattr(file, "meta") and file.meta: + collection_name = file.meta.get("collection_name") + # If no collection name in metadata, try common patterns used by Open WebUI if not collection_name: # Open WebUI typically uses these patterns: possible_collections = [ f"open-webui_file-{file_id}", # Most common pattern - f"file-{file_id}", # Alternative pattern - f"open-webui_{file_id}", # Another possible pattern + f"file-{file_id}", # Alternative pattern + f"open-webui_{file_id}", # Another possible pattern ] - + # Try each possible collection name for possible_collection in possible_collections: try: - if VECTOR_DB_CLIENT.has_collection(collection_name=possible_collection): + if VECTOR_DB_CLIENT.has_collection( + collection_name=possible_collection + ): result = VECTOR_DB_CLIENT.delete( collection_name=possible_collection, filter={"hash": file_hash}, @@ -2553,19 +2591,21 @@ def delete_file_from_vector_db(file_id: str) -> bool: return True except Exception as e: continue - + # If none of the standard patterns work, try searching through all collections try: deleted_count = 0 - + # Get all collections (this method varies by vector DB implementation) - if hasattr(VECTOR_DB_CLIENT, 'list_collections'): + if hasattr(VECTOR_DB_CLIENT, "list_collections"): try: collections = VECTOR_DB_CLIENT.list_collections() - + for collection in collections: try: - if VECTOR_DB_CLIENT.has_collection(collection_name=collection): + if VECTOR_DB_CLIENT.has_collection( + collection_name=collection + ): result = VECTOR_DB_CLIENT.delete( collection_name=collection, filter={"hash": file_hash}, @@ -2576,14 +2616,16 @@ def delete_file_from_vector_db(file_id: str) -> bool: continue except Exception as e: pass - + return deleted_count > 0 - + except Exception as e: return False - + # Delete from the specific collection found in metadata - if collection_name and VECTOR_DB_CLIENT.has_collection(collection_name=collection_name): + if collection_name and VECTOR_DB_CLIENT.has_collection( + collection_name=collection_name + ): try: result = VECTOR_DB_CLIENT.delete( collection_name=collection_name, @@ -2596,6 +2638,6 @@ def delete_file_from_vector_db(file_id: str) -> bool: return False else: return False - + except Exception as e: return False