Update retrieval.py

2025-06-04 03:37:35 +00:00 · 2025-05-30 18:41:10 -07:00 · 2025-05-30 18:41:10 -07:00 · ef0a724cf1
commit ef0a724cf1
parent 3d0a364e2b
1 changed files with 207 additions and 165 deletions
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@ -208,12 +208,12 @@ class TextCleaner:

        # Handle double-escaped sequences (common in PPTX)
        replacements = [
-            ('\\\\n', '\n'),     # Double-escaped newlines
-            ('\\\\t', ' '),      # Double-escaped tabs
+            ("\\\\n", "\n"),  # Double-escaped newlines
+            ("\\\\t", " "),  # Double-escaped tabs
            ('\\\\"', '"'),  # Double-escaped quotes
-            ('\\\\r', ''),       # Double-escaped carriage returns
-            ('\\\\/', '/'),      # Double-escaped slashes
-            ('\\\\', '\\'),      # Convert double backslashes to single
+            ("\\\\r", ""),  # Double-escaped carriage returns
+            ("\\\\/", "/"),  # Double-escaped slashes
+            ("\\\\", "\\"),  # Convert double backslashes to single
        ]

        for old, new in replacements:
@ -221,22 +221,22 @@ class TextCleaner:

        # Handle single-escaped sequences
        single_replacements = [
-            ('\\n', '\n'),       # Single-escaped newlines
-            ('\\t', ' '),        # Single-escaped tabs
+            ("\\n", "\n"),  # Single-escaped newlines
+            ("\\t", " "),  # Single-escaped tabs
            ('\\"', '"'),  # Single-escaped quotes
-            ('\\\'', "'"),       # Single-escaped single quotes
-            ('\\r', ''),         # Single-escaped carriage returns
-            ('\\/', '/'),        # Single-escaped slashes
+            ("\\'", "'"),  # Single-escaped single quotes
+            ("\\r", ""),  # Single-escaped carriage returns
+            ("\\/", "/"),  # Single-escaped slashes
        ]

        for old, new in single_replacements:
            text = text.replace(old, new)

        # Remove any remaining backslash artifacts
-        text = re.sub(r'\\[a-zA-Z]', '', text)       # Remove \letter patterns
-        text = re.sub(r'\\[0-9]', '', text)          # Remove \number patterns
-        text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text) # Remove \symbol patterns
-        text = re.sub(r'\\+', '', text)              # Remove remaining backslashes
+        text = re.sub(r"\\[a-zA-Z]", "", text)  # Remove \letter patterns
+        text = re.sub(r"\\[0-9]", "", text)  # Remove \number patterns
+        text = re.sub(r"\\[^a-zA-Z0-9\s]", "", text)  # Remove \symbol patterns
+        text = re.sub(r"\\+", "", text)  # Remove remaining backslashes

        return text

@ -247,17 +247,17 @@ class TextCleaner:
            return ""

        unicode_map = {
-            '–': '-',     # En dash
-            '—': '-',     # Em dash
-            ''': "'",     # Smart single quote left
-            ''': "'",     # Smart single quote right
+            "–": "-",  # En dash
+            "—": "-",  # Em dash
+            "‘": "'",  # Smart single quote left
+            "’": "'",  # Smart single quote right
            '"': '"',  # Smart double quote left
            '"': '"',  # Smart double quote right
-            '…': '...',   # Ellipsis
-            '™': ' TM',   # Trademark
-            '®': ' R',    # Registered
-            '©': ' C',    # Copyright
-            '°': ' deg',  # Degree symbol
+            "…": "...",  # Ellipsis
+            "™": " TM",  # Trademark
+            "®": " R",  # Registered
+            "©": " C",  # Copyright
+            "°": " deg",  # Degree symbol
        }

        for unicode_char, ascii_char in unicode_map.items():
@ -276,7 +276,7 @@ class TextCleaner:
            (r'\\+"', '"'),  # Multiple backslashes before quotes
            (r'\\"', '"'),  # Escaped double quotes
            (r"\\'", "'"),  # Escaped single quotes
-            (r'\\&', '&'),            # Escaped ampersands
+            (r"\\&", "&"),  # Escaped ampersands
            (r'""', '"'),  # Double quotes
            (r"''", "'"),  # Double single quotes
        ]
@ -294,13 +294,17 @@ class TextCleaner:

        if preserve_paragraphs:
            # Preserve paragraph breaks (double newlines) but clean up excessive spacing
-            text = re.sub(r'[ \t]+', ' ', text)                    # Multiple spaces/tabs -> single space
-            text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)        # Multiple empty lines -> double line break
-            text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim line-level whitespace
+            text = re.sub(r"[ \t]+", " ", text)  # Multiple spaces/tabs -> single space
+            text = re.sub(
+                r"\n\s*\n\s*\n+", "\n\n", text
+            )  # Multiple empty lines -> double line break
+            text = re.sub(
+                r"^\s+|\s+$", "", text, flags=re.MULTILINE
+            )  # Trim line-level whitespace
        else:
            # Flatten all whitespace for embedding
-            text = re.sub(r'\n+', ' ', text)                      # All newlines to spaces
-            text = re.sub(r'\s+', ' ', text)                      # All whitespace to single spaces
+            text = re.sub(r"\n+", " ", text)  # All newlines to spaces
+            text = re.sub(r"\s+", " ", text)  # All whitespace to single spaces

        return text.strip()

@ -311,17 +315,19 @@ class TextCleaner:
            return ""

        # Remove orphaned punctuation
-        text = re.sub(r'^\s*[)\]}]+\s*', '', text)               # Orphaned closing brackets at start
-        text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text)       # Orphaned closing brackets on own lines
+        text = re.sub(r"^\s*[)\]}]+\s*", "", text)  # Orphaned closing brackets at start
+        text = re.sub(
+            r"\n\s*[)\]}]+\s*\n", "\n\n", text
+        )  # Orphaned closing brackets on own lines

        # Remove excessive punctuation
-        text = re.sub(r'[.]{3,}', '...', text)                   # Multiple dots to ellipsis
-        text = re.sub(r'[-]{3,}', '---', text)                   # Multiple dashes
+        text = re.sub(r"[.]{3,}", "...", text)  # Multiple dots to ellipsis
+        text = re.sub(r"[-]{3,}", "---", text)  # Multiple dashes

        # Remove empty parentheses and brackets
-        text = re.sub(r'\(\s*\)', '', text)                      # Empty parentheses
-        text = re.sub(r'\[\s*\]', '', text)                      # Empty square brackets
-        text = re.sub(r'\{\s*\}', '', text)                      # Empty curly brackets
+        text = re.sub(r"\(\s*\)", "", text)  # Empty parentheses
+        text = re.sub(r"\[\s*\]", "", text)  # Empty square brackets
+        text = re.sub(r"\{\s*\}", "", text)  # Empty curly brackets

        return text

@ -364,7 +370,9 @@ class TextCleaner:
        text = cls.clean_for_embedding(text)

        # Additional aggressive cleaning for storage
-        text = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text)       # Remove any remaining escape sequences
+        text = re.sub(
+            r"\\([^a-zA-Z0-9\s])", r"\1", text
+        )  # Remove any remaining escape sequences

        return text

@ -374,7 +382,9 @@ def clean_text_content(text: str) -> str:
    return TextCleaner.clean_for_chunking(text)


-def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+def create_semantic_chunks(
+    text: str, max_chunk_size: int, overlap_size: int
+) -> TypingList[str]:
    """Create semantically aware chunks that respect document structure"""
    if not text or len(text) <= max_chunk_size:
        return [text] if text else []
@ -382,7 +392,7 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
    chunks = []

    # Split by double line breaks (paragraphs) first
-    paragraphs = text.split('\n\n')
+    paragraphs = text.split("\n\n")

    current_chunk = ""

@ -395,7 +405,9 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
        if current_chunk and len(current_chunk) + len(paragraph) + 2 > max_chunk_size:
            # Try to split the current chunk at sentence boundaries if it's too long
            if len(current_chunk) > max_chunk_size:
-                sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+                sentence_chunks = split_by_sentences(
+                    current_chunk, max_chunk_size, overlap_size
+                )
                chunks.extend(sentence_chunks)
            else:
                chunks.append(current_chunk.strip())
@ -404,7 +416,9 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
            if chunks and overlap_size > 0:
                prev_chunk = chunks[-1]
                overlap_text = get_text_overlap(prev_chunk, overlap_size)
-                current_chunk = overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+                current_chunk = (
+                    overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+                )
            else:
                current_chunk = paragraph
        else:
@ -417,7 +431,9 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
    # Add the last chunk
    if current_chunk:
        if len(current_chunk) > max_chunk_size:
-            sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+            sentence_chunks = split_by_sentences(
+                current_chunk, max_chunk_size, overlap_size
+            )
            chunks.extend(sentence_chunks)
        else:
            chunks.append(current_chunk.strip())
@ -425,10 +441,12 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
    return [chunk for chunk in chunks if chunk.strip()]


-def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+def split_by_sentences(
+    text: str, max_chunk_size: int, overlap_size: int
+) -> TypingList[str]:
    """Split text by sentences when paragraph-level splitting isn't sufficient"""
    # Split by sentence endings
-    sentences = re.split(r'(?<=[.!?])\s+', text)
+    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current_chunk = ""
@ -445,7 +463,9 @@ def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> Typ
            # Start new chunk with overlap
            if overlap_size > 0:
                overlap_text = get_text_overlap(current_chunk, overlap_size)
-                current_chunk = overlap_text + " " + sentence if overlap_text else sentence
+                current_chunk = (
+                    overlap_text + " " + sentence if overlap_text else sentence
+                )
            else:
                current_chunk = sentence
        else:
@ -474,7 +494,7 @@ def get_text_overlap(text: str, overlap_size: int) -> str:
    overlap_text = text[-overlap_size:]

    # Find the first space to avoid cutting words
-    space_index = overlap_text.find(' ')
+    space_index = overlap_text.find(" ")
    if space_index > 0:
        return overlap_text[space_index:].strip()

@ -570,7 +590,8 @@ async def update_embedding_config(
    request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
 ):
    log.info(
-        f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
+        f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} "
+        f"to {form_data.embedding_model}"
    )
    try:
        request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
@ -1409,7 +1430,7 @@ def save_docs_to_vector_db(
            chunks = create_semantic_chunks(
                cleaned_content,
                request.app.state.config.CHUNK_SIZE,
-                request.app.state.config.CHUNK_OVERLAP
+                request.app.state.config.CHUNK_OVERLAP,
            )

            # Create new documents for each chunk
@ -1417,12 +1438,11 @@ def save_docs_to_vector_db(
                chunk_metadata = {
                    **doc.metadata,
                    "chunk_index": i,
-                    "total_chunks": len(chunks)
+                    "total_chunks": len(chunks),
                }
-                processed_docs.append(Document(
-                    page_content=chunk,
-                    metadata=chunk_metadata
-                ))
+                processed_docs.append(
+                    Document(page_content=chunk, metadata=chunk_metadata)
+                )

        docs = processed_docs

@ -1514,12 +1534,14 @@ def save_docs_to_vector_db(
            # Apply consistent storage-level cleaning
            text_to_store = TextCleaner.clean_for_storage(texts[idx])

-            items.append({
+            items.append(
+                {
                    "id": str(uuid.uuid4()),
                    "text": text_to_store,
                    "vector": embeddings[idx],
                    "metadata": metadatas[idx],
-            })
+                }
+            )

        VECTOR_DB_CLIENT.insert(
            collection_name=collection_name,
@ -1565,7 +1587,9 @@ def process_file(

            docs = [
                Document(
-                    page_content=TextCleaner.clean_for_chunking(form_data.content.replace("<br/>", "\n")),
+                    page_content=TextCleaner.clean_for_chunking(
+                        form_data.content.replace("<br/>", "\n")
+                    ),
                    metadata={
                        **file.meta,
                        "name": file.filename,
@ -1588,7 +1612,9 @@ def process_file(
            if result is not None and len(result.ids[0]) > 0:
                docs = [
                    Document(
-                        page_content=TextCleaner.clean_for_chunking(result.documents[0][idx]),
+                        page_content=TextCleaner.clean_for_chunking(
+                            result.documents[0][idx]
+                        ),
                        metadata=result.metadatas[0][idx],
                    )
                    for idx, id in enumerate(result.ids[0])
@ -1596,7 +1622,9 @@ def process_file(
            else:
                docs = [
                    Document(
-                        page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
+                        page_content=TextCleaner.clean_for_chunking(
+                            file.data.get("content", "")
+                        ),
                        metadata={
                            **file.meta,
                            "name": file.filename,
@ -1646,7 +1674,8 @@ def process_file(
                for doc in docs:
                    cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)

-                    cleaned_docs.append(Document(
+                    cleaned_docs.append(
+                        Document(
                            page_content=cleaned_content,
                            metadata={
                                **doc.metadata,
@ -1655,12 +1684,15 @@ def process_file(
                                "file_id": file.id,
                                "source": file.filename,
                            },
-                    ))
+                        )
+                    )
                docs = cleaned_docs
            else:
                docs = [
                    Document(
-                        page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
+                        page_content=TextCleaner.clean_for_chunking(
+                            file.data.get("content", "")
+                        ),
                        metadata={
                            **file.meta,
                            "name": file.filename,
@ -1670,7 +1702,9 @@ def process_file(
                        },
                    )
                ]
-            text_content = " ".join([doc.page_content for doc in docs if doc.page_content])
+            text_content = " ".join(
+                [doc.page_content for doc in docs if doc.page_content]
+            )

        # Ensure text_content is never None or empty for hash calculation
        if not text_content:
@ -2449,7 +2483,9 @@ def process_files_batch(

            docs: List[Document] = [
                Document(
-                    page_content=TextCleaner.clean_for_chunking(text_content.replace("<br/>", "\n")),
+                    page_content=TextCleaner.clean_for_chunking(
+                        text_content.replace("<br/>", "\n")
+                    ),
                    metadata={
                        **file.meta,
                        "name": file.filename,
@ -2529,8 +2565,8 @@ def delete_file_from_vector_db(file_id: str) -> bool:

        # Try to get collection name from file metadata
        collection_name = None
-        if hasattr(file, 'meta') and file.meta:
-            collection_name = file.meta.get('collection_name')
+        if hasattr(file, "meta") and file.meta:
+            collection_name = file.meta.get("collection_name")

        # If no collection name in metadata, try common patterns used by Open WebUI
        if not collection_name:
@ -2544,7 +2580,9 @@ def delete_file_from_vector_db(file_id: str) -> bool:
            # Try each possible collection name
            for possible_collection in possible_collections:
                try:
-                    if VECTOR_DB_CLIENT.has_collection(collection_name=possible_collection):
+                    if VECTOR_DB_CLIENT.has_collection(
+                        collection_name=possible_collection
+                    ):
                        result = VECTOR_DB_CLIENT.delete(
                            collection_name=possible_collection,
                            filter={"hash": file_hash},
@ -2559,13 +2597,15 @@ def delete_file_from_vector_db(file_id: str) -> bool:
                deleted_count = 0

                # Get all collections (this method varies by vector DB implementation)
-                if hasattr(VECTOR_DB_CLIENT, 'list_collections'):
+                if hasattr(VECTOR_DB_CLIENT, "list_collections"):
                    try:
                        collections = VECTOR_DB_CLIENT.list_collections()

                        for collection in collections:
                            try:
-                                if VECTOR_DB_CLIENT.has_collection(collection_name=collection):
+                                if VECTOR_DB_CLIENT.has_collection(
+                                    collection_name=collection
+                                ):
                                    result = VECTOR_DB_CLIENT.delete(
                                        collection_name=collection,
                                        filter={"hash": file_hash},
@ -2583,7 +2623,9 @@ def delete_file_from_vector_db(file_id: str) -> bool:
                return False

        # Delete from the specific collection found in metadata
-        if collection_name and VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
+        if collection_name and VECTOR_DB_CLIENT.has_collection(
+            collection_name=collection_name
+        ):
            try:
                result = VECTOR_DB_CLIENT.delete(
                    collection_name=collection_name,