Update retrieval.py

PVBLIC Foundation, 2025-05-30 18:41:10 -07:00 (committed by GitHub)
parent 3d0a364e2b
commit ef0a724cf1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194


@@ -199,173 +199,181 @@ def get_rf(
class TextCleaner:
"""Modular text cleaning system for document processing and embedding preparation."""
@staticmethod
def normalize_escape_sequences(text: str) -> str:
"""Normalize escape sequences from various document formats."""
if not text:
return ""
# Handle double-escaped sequences (common in PPTX)
replacements = [
("\\\\n", "\n"), # Double-escaped newlines
("\\\\t", " "), # Double-escaped tabs
('\\\\"', '"'), # Double-escaped quotes
("\\\\r", ""), # Double-escaped carriage returns
("\\\\/", "/"), # Double-escaped slashes
("\\\\", "\\"), # Convert double backslashes to single
]
for old, new in replacements:
text = text.replace(old, new)
# Handle single-escaped sequences
single_replacements = [
('\\n', '\n'), # Single-escaped newlines
('\\t', ' '), # Single-escaped tabs
('\\"', '"'), # Single-escaped quotes
('\\\'', "'"), # Single-escaped single quotes
('\\r', ''), # Single-escaped carriage returns
('\\/', '/'), # Single-escaped slashes
("\\n", "\n"), # Single-escaped newlines
("\\t", " "), # Single-escaped tabs
('\\"', '"'), # Single-escaped quotes
("\\'", "'"), # Single-escaped single quotes
("\\r", ""), # Single-escaped carriage returns
("\\/", "/"), # Single-escaped slashes
]
for old, new in single_replacements:
text = text.replace(old, new)
# Remove any remaining backslash artifacts
text = re.sub(r'\\[a-zA-Z]', '', text) # Remove \letter patterns
text = re.sub(r'\\[0-9]', '', text) # Remove \number patterns
text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text) # Remove \symbol patterns
text = re.sub(r'\\+', '', text) # Remove remaining backslashes
text = re.sub(r"\\[a-zA-Z]", "", text) # Remove \letter patterns
text = re.sub(r"\\[0-9]", "", text) # Remove \number patterns
text = re.sub(r"\\[^a-zA-Z0-9\s]", "", text) # Remove \symbol patterns
text = re.sub(r"\\+", "", text) # Remove remaining backslashes
return text
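# A quick sanity example, reasoned from the replacement order above
# (not an output captured from a real run):
# normalize_escape_sequences("Agenda\\\\nQ3 goals") -> "Agenda\nQ3 goals",
# i.e. a double-escaped newline from a PPTX extraction becomes a real one.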
@staticmethod
def normalize_unicode(text: str) -> str:
"""Convert special Unicode characters to ASCII equivalents."""
if not text:
return ""
unicode_map = {
"": "-", # En dash
"": "-", # Em dash
""": "'", # Smart single quote left
""": "'", # Smart single quote right
'"': '"', # Smart double quote left
'"': '"', # Smart double quote right
"": "...", # Ellipsis
"": " TM", # Trademark
"®": " R", # Registered
"©": " C", # Copyright
"°": " deg", # Degree symbol
}
for unicode_char, ascii_char in unicode_map.items():
text = text.replace(unicode_char, ascii_char)
return text
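# Example, following the map above:
# normalize_unicode("“Q3” – up 15°™") -> '"Q3" - up 15 deg TM'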
@staticmethod
def normalize_quotes(text: str) -> str:
"""Clean up quote-related artifacts and normalize quote marks."""
if not text:
return ""
# Remove quote artifacts
quote_patterns = [
(r'\\+"', '"'), # Multiple backslashes before quotes
(r'\\"', '"'), # Escaped double quotes
(r"\\'", "'"), # Escaped single quotes
(r'\\&', '&'), # Escaped ampersands
(r'""', '"'), # Double quotes
(r"''", "'"), # Double single quotes
(r'\\+"', '"'), # Multiple backslashes before quotes
(r'\\"', '"'), # Escaped double quotes
(r"\\'", "'"), # Escaped single quotes
(r"\\&", "&"), # Escaped ampersands
(r'""', '"'), # Double quotes
(r"''", "'"), # Double single quotes
]
for pattern, replacement in quote_patterns:
text = re.sub(pattern, replacement, text)
return text
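# Example, per the patterns above: normalize_quotes('say \\"hi\\" and ""ok""')
# collapses both the escaped and the doubled quotes -> 'say "hi" and "ok"'.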
@staticmethod
def normalize_whitespace(text: str, preserve_paragraphs: bool = True) -> str:
"""Normalize whitespace while optionally preserving paragraph structure."""
if not text:
return ""
if preserve_paragraphs:
# Preserve paragraph breaks (double newlines) but clean up excessive spacing
text = re.sub(r"[ \t]+", " ", text) # Multiple spaces/tabs -> single space
text = re.sub(
r"\n\s*\n\s*\n+", "\n\n", text
) # Multiple empty lines -> double line break
text = re.sub(
r"^\s+|\s+$", "", text, flags=re.MULTILINE
) # Trim line-level whitespace
else:
# Flatten all whitespace for embedding
text = re.sub(r"\n+", " ", text) # All newlines to spaces
text = re.sub(r"\s+", " ", text) # All whitespace to single spaces
return text.strip()
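# The flag is the only difference between the two modes; given
# text = "a  b\n\n\n\nc":
#   normalize_whitespace(text, preserve_paragraphs=True)  -> "a b\n\nc"
#   normalize_whitespace(text, preserve_paragraphs=False) -> "a b c"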
@staticmethod
def remove_artifacts(text: str) -> str:
"""Remove document format artifacts and orphaned elements."""
if not text:
return ""
# Remove orphaned punctuation
text = re.sub(r"^\s*[)\]}]+\s*", "", text) # Orphaned closing brackets at start
text = re.sub(
r"\n\s*[)\]}]+\s*\n", "\n\n", text
) # Orphaned closing brackets on own lines
# Remove excessive punctuation
text = re.sub(r"[.]{3,}", "...", text) # Multiple dots to ellipsis
text = re.sub(r"[-]{3,}", "---", text) # Multiple dashes
# Remove empty parentheses and brackets
text = re.sub(r"\(\s*\)", "", text) # Empty parentheses
text = re.sub(r"\[\s*\]", "", text) # Empty square brackets
text = re.sub(r"\{\s*\}", "", text) # Empty curly brackets
return text
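# Example, per the regexes above: remove_artifacts(") Intro.... see ( )")
# strips the leading orphaned bracket and the empty parentheses and trims
# the dot run -> "Intro... see " (the leftover spacing is cleaned up later
# by normalize_whitespace).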
@classmethod
def clean_for_chunking(cls, text: str) -> str:
"""Clean text for semantic chunking - preserves structure but normalizes content."""
if not text:
return ""
# Apply all cleaning steps while preserving paragraph structure
text = cls.normalize_escape_sequences(text)
text = cls.normalize_unicode(text)
text = cls.normalize_quotes(text)
text = cls.remove_artifacts(text)
text = cls.normalize_whitespace(text, preserve_paragraphs=True)
return text
@classmethod
def clean_for_embedding(cls, text: str) -> str:
"""Clean text for embedding - flattens structure and optimizes for vector similarity."""
if not text:
return ""
# Start with chunking-level cleaning
text = cls.clean_for_chunking(text)
# Flatten for embedding
text = cls.normalize_whitespace(text, preserve_paragraphs=False)
return text
@classmethod
def clean_for_storage(cls, text: str) -> str:
"""Clean text for storage - most aggressive cleaning for database storage."""
if not text:
return ""
# Start with embedding-level cleaning
text = cls.clean_for_embedding(text)
# Additional aggressive cleaning for storage
text = re.sub(
r"\\([^a-zA-Z0-9\s])", r"\1", text
) # Remove any remaining escape sequences
return text
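# How the three entry points relate, as a minimal sketch (the expected
# outputs are reasoned from the regexes above, not captured from a run):
#
#   raw = 'Q3 “results”\\\\nup 15°'
#   TextCleaner.clean_for_chunking(raw)   # -> 'Q3 "results"\nup 15 deg'
#   TextCleaner.clean_for_embedding(raw)  # -> 'Q3 "results" up 15 deg'
#   TextCleaner.clean_for_storage(raw)    # same, plus a final unescape pass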
@@ -374,37 +382,43 @@ def clean_text_content(text: str) -> str:
return TextCleaner.clean_for_chunking(text)
def create_semantic_chunks(
text: str, max_chunk_size: int, overlap_size: int
) -> TypingList[str]:
"""Create semantically aware chunks that respect document structure"""
if not text or len(text) <= max_chunk_size:
return [text] if text else []
chunks = []
# Split by double line breaks (paragraphs) first
paragraphs = text.split("\n\n")
current_chunk = ""
for paragraph in paragraphs:
paragraph = paragraph.strip()
if not paragraph:
continue
# If adding this paragraph would exceed chunk size
if current_chunk and len(current_chunk) + len(paragraph) + 2 > max_chunk_size:
# Try to split the current chunk at sentence boundaries if it's too long
if len(current_chunk) > max_chunk_size:
sentence_chunks = split_by_sentences(
current_chunk, max_chunk_size, overlap_size
)
chunks.extend(sentence_chunks)
else:
chunks.append(current_chunk.strip())
# Start new chunk with overlap from previous chunk if applicable
if chunks and overlap_size > 0:
prev_chunk = chunks[-1]
overlap_text = get_text_overlap(prev_chunk, overlap_size)
current_chunk = overlap_text + "\n\n" + paragraph if overlap_text else paragraph
current_chunk = (
overlap_text + "\n\n" + paragraph if overlap_text else paragraph
)
else:
current_chunk = paragraph
else:
@@ -413,39 +427,45 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
current_chunk += "\n\n" + paragraph
else:
current_chunk = paragraph
# Add the last chunk
if current_chunk:
if len(current_chunk) > max_chunk_size:
sentence_chunks = split_by_sentences(
current_chunk, max_chunk_size, overlap_size
)
chunks.extend(sentence_chunks)
else:
chunks.append(current_chunk.strip())
return [chunk for chunk in chunks if chunk.strip()]
def split_by_sentences(
text: str, max_chunk_size: int, overlap_size: int
) -> TypingList[str]:
"""Split text by sentences when paragraph-level splitting isn't sufficient"""
# Split by sentence endings
sentences = re.split(r"(?<=[.!?])\s+", text)
chunks = []
current_chunk = ""
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
# If adding this sentence would exceed chunk size
if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chunk_size:
chunks.append(current_chunk.strip())
# Start new chunk with overlap
if overlap_size > 0:
overlap_text = get_text_overlap(current_chunk, overlap_size)
current_chunk = overlap_text + " " + sentence if overlap_text else sentence
current_chunk = (
overlap_text + " " + sentence if overlap_text else sentence
)
else:
current_chunk = sentence
else:
@@ -454,11 +474,11 @@ def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> Typ
current_chunk += " " + sentence
else:
current_chunk = sentence
# Add the last chunk
if current_chunk:
chunks.append(current_chunk.strip())
return [chunk for chunk in chunks if chunk.strip()]
@@ -466,18 +486,18 @@ def get_text_overlap(text: str, overlap_size: int) -> str:
"""Get the last overlap_size characters from text, preferring word boundaries"""
if not text or overlap_size <= 0:
return ""
if len(text) <= overlap_size:
return text
# Try to find a good word boundary within the overlap region
overlap_text = text[-overlap_size:]
# Find the first space to avoid cutting words
space_index = overlap_text.find(" ")
if space_index > 0:
return overlap_text[space_index:].strip()
return overlap_text.strip()
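# Hypothetical smoke test for the chunking helpers above; the sizes are
# illustrative and not the configured CHUNK_SIZE/CHUNK_OVERLAP values:
if __name__ == "__main__":
    sample = "\n\n".join(
        f"Paragraph {i}. " + "Short sentence. " * 12 for i in range(4)
    )
    for chunk in create_semantic_chunks(sample, max_chunk_size=300, overlap_size=40):
        print(len(chunk), repr(get_text_overlap(chunk, 20)))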
@@ -570,7 +590,8 @@ async def update_embedding_config(
request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
):
log.info(
f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} "
f"to {form_data.embedding_model}"
)
try:
request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
@@ -1396,34 +1417,33 @@ def save_docs_to_vector_db(
if split:
# Apply advanced content-aware splitting and text cleaning
processed_docs = []
for doc in docs:
# Clean the text content before chunking
if not doc.page_content:
continue
# Apply text cleaning before chunking using new modular system
cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)
# Create semantic chunks from cleaned content
chunks = create_semantic_chunks(
cleaned_content,
request.app.state.config.CHUNK_SIZE,
request.app.state.config.CHUNK_OVERLAP,
)
# Create new documents for each chunk
for i, chunk in enumerate(chunks):
chunk_metadata = {
**doc.metadata,
"chunk_index": i,
"total_chunks": len(chunks)
"total_chunks": len(chunks),
}
processed_docs.append(
Document(page_content=chunk, metadata=chunk_metadata)
)
docs = processed_docs
if len(docs) == 0:
@@ -1501,7 +1521,7 @@ def save_docs_to_vector_db(
# Prepare texts for embedding using the new modular cleaning system
cleaned_texts = [TextCleaner.clean_for_embedding(text) for text in texts]
embeddings = embedding_function(
cleaned_texts,
prefix=RAG_EMBEDDING_CONTENT_PREFIX,
@@ -1513,13 +1533,15 @@ def save_docs_to_vector_db(
for idx in range(len(texts)):
# Apply consistent storage-level cleaning
text_to_store = TextCleaner.clean_for_storage(texts[idx])
items.append(
{
"id": str(uuid.uuid4()),
"text": text_to_store,
"vector": embeddings[idx],
"metadata": metadatas[idx],
}
)
VECTOR_DB_CLIENT.insert(
collection_name=collection_name,
@@ -1565,7 +1587,9 @@ def process_file(
docs = [
Document(
page_content=TextCleaner.clean_for_chunking(
form_data.content.replace("<br/>", "\n")
),
metadata={
**file.meta,
"name": file.filename,
@@ -1588,7 +1612,9 @@ def process_file(
if result is not None and len(result.ids[0]) > 0:
docs = [
Document(
page_content=TextCleaner.clean_for_chunking(
result.documents[0][idx]
),
metadata=result.metadatas[0][idx],
)
for idx, id in enumerate(result.ids[0])
@@ -1596,7 +1622,9 @@ def process_file(
else:
docs = [
Document(
page_content=TextCleaner.clean_for_chunking(
file.data.get("content", "")
),
metadata={
**file.meta,
"name": file.filename,
@@ -1645,22 +1673,26 @@ def process_file(
cleaned_docs = []
for doc in docs:
cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)
cleaned_docs.append(
Document(
page_content=cleaned_content,
metadata={
**doc.metadata,
"name": file.filename,
"created_by": file.user_id,
"file_id": file.id,
"source": file.filename,
},
)
)
docs = cleaned_docs
else:
docs = [
Document(
page_content=TextCleaner.clean_for_chunking(
file.data.get("content", "")
),
metadata={
**file.meta,
"name": file.filename,
@@ -1670,7 +1702,9 @@ def process_file(
},
)
]
text_content = " ".join([doc.page_content for doc in docs if doc.page_content])
text_content = " ".join(
[doc.page_content for doc in docs if doc.page_content]
)
# Ensure text_content is never None or empty for hash calculation
if not text_content:
@@ -2449,7 +2483,9 @@ def process_files_batch(
docs: List[Document] = [
Document(
page_content=TextCleaner.clean_for_chunking(
text_content.replace("<br/>", "\n")
),
metadata={
**file.meta,
"name": file.filename,
@@ -2509,10 +2545,10 @@ def delete_file_from_vector_db(file_id: str) -> bool:
Delete all vector embeddings for a specific file from the vector database.
This function works with any vector database (Pinecone, ChromaDB, etc.) and
handles the cleanup when a file is deleted from the chat.
Args:
file_id (str): The ID of the file to delete from vector database
Returns:
bool: True if deletion was successful, False otherwise
"""
@@ -2521,30 +2557,32 @@ def delete_file_from_vector_db(file_id: str) -> bool:
file = Files.get_file_by_id(file_id)
if not file:
return False
# Get the file hash for vector deletion
file_hash = file.hash
if not file_hash:
return False
# Try to get collection name from file metadata
collection_name = None
if hasattr(file, "meta") and file.meta:
collection_name = file.meta.get("collection_name")
# If no collection name in metadata, try common patterns used by Open WebUI
if not collection_name:
# Open WebUI typically uses these patterns:
possible_collections = [
f"open-webui_file-{file_id}", # Most common pattern
f"file-{file_id}", # Alternative pattern
f"open-webui_{file_id}", # Another possible pattern
f"file-{file_id}", # Alternative pattern
f"open-webui_{file_id}", # Another possible pattern
]
# Try each possible collection name
for possible_collection in possible_collections:
try:
if VECTOR_DB_CLIENT.has_collection(
collection_name=possible_collection
):
result = VECTOR_DB_CLIENT.delete(
collection_name=possible_collection,
filter={"hash": file_hash},
@@ -2553,19 +2591,21 @@ def delete_file_from_vector_db(file_id: str) -> bool:
return True
except Exception as e:
continue
# If none of the standard patterns work, try searching through all collections
try:
deleted_count = 0
# Get all collections (this method varies by vector DB implementation)
if hasattr(VECTOR_DB_CLIENT, "list_collections"):
try:
collections = VECTOR_DB_CLIENT.list_collections()
for collection in collections:
try:
if VECTOR_DB_CLIENT.has_collection(
collection_name=collection
):
result = VECTOR_DB_CLIENT.delete(
collection_name=collection,
filter={"hash": file_hash},
@@ -2576,14 +2616,16 @@ def delete_file_from_vector_db(file_id: str) -> bool:
continue
except Exception as e:
pass
return deleted_count > 0
except Exception as e:
return False
# Delete from the specific collection found in metadata
if collection_name and VECTOR_DB_CLIENT.has_collection(
collection_name=collection_name
):
try:
result = VECTOR_DB_CLIENT.delete(
collection_name=collection_name,
@@ -2596,6 +2638,6 @@ def delete_file_from_vector_db(file_id: str) -> bool:
return False
else:
return False
except Exception as e:
return False
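# Hypothetical call site (assumes a valid file id from the Files table;
# the helper swallows vector-DB errors and only reports success/failure):
#
#   if delete_file_from_vector_db(file_id):
#       log.debug(f"Removed embeddings for file {file_id}")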