Mirror of https://github.com/open-webui/open-webui (synced 2025-06-04 03:37:35 +00:00)

Update retrieval.py

This commit is contained in:
    parent 3d0a364e2b
    commit ef0a724cf1
@@ -208,12 +208,12 @@ class TextCleaner:
        # Handle double-escaped sequences (common in PPTX)
        replacements = [
-            ('\\\\n', '\n'),  # Double-escaped newlines
+            ("\\\\n", "\n"),  # Double-escaped newlines
-            ('\\\\t', ' '),  # Double-escaped tabs
+            ("\\\\t", " "),  # Double-escaped tabs
            ('\\\\"', '"'),  # Double-escaped quotes
-            ('\\\\r', ''),  # Double-escaped carriage returns
+            ("\\\\r", ""),  # Double-escaped carriage returns
-            ('\\\\/', '/'),  # Double-escaped slashes
+            ("\\\\/", "/"),  # Double-escaped slashes
-            ('\\\\', '\\'),  # Convert double backslashes to single
+            ("\\\\", "\\"),  # Convert double backslashes to single
        ]

        for old, new in replacements:
@@ -221,22 +221,22 @@ class TextCleaner:

        # Handle single-escaped sequences
        single_replacements = [
-            ('\\n', '\n'),  # Single-escaped newlines
+            ("\\n", "\n"),  # Single-escaped newlines
-            ('\\t', ' '),  # Single-escaped tabs
+            ("\\t", " "),  # Single-escaped tabs
            ('\\"', '"'),  # Single-escaped quotes
-            ('\\\'', "'"),  # Single-escaped single quotes
+            ("\\'", "'"),  # Single-escaped single quotes
-            ('\\r', ''),  # Single-escaped carriage returns
+            ("\\r", ""),  # Single-escaped carriage returns
-            ('\\/', '/'),  # Single-escaped slashes
+            ("\\/", "/"),  # Single-escaped slashes
        ]

        for old, new in single_replacements:
            text = text.replace(old, new)

        # Remove any remaining backslash artifacts
-        text = re.sub(r'\\[a-zA-Z]', '', text)  # Remove \letter patterns
+        text = re.sub(r"\\[a-zA-Z]", "", text)  # Remove \letter patterns
-        text = re.sub(r'\\[0-9]', '', text)  # Remove \number patterns
+        text = re.sub(r"\\[0-9]", "", text)  # Remove \number patterns
-        text = re.sub(r'\\[^a-zA-Z0-9\s]', '', text)  # Remove \symbol patterns
+        text = re.sub(r"\\[^a-zA-Z0-9\s]", "", text)  # Remove \symbol patterns
-        text = re.sub(r'\\+', '', text)  # Remove remaining backslashes
+        text = re.sub(r"\\+", "", text)  # Remove remaining backslashes

        return text
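The two replacement tables run in sequence: double-escaped sequences (the kind PPTX extraction tends to produce) are resolved first, then single-escaped ones, and the trailing re.sub passes strip whatever backslash artifacts remain. A self-contained sketch of that pass, using only the pairs visible in these hunks (the function name and the sample input are illustrative, not part of retrieval.py):

import re

def strip_escape_artifacts(text: str) -> str:
    # Double-escaped sequences first (common in PPTX extractions), then single-escaped ones,
    # mirroring the two tables in the hunks above.
    replacements = [
        ("\\\\n", "\n"), ("\\\\t", " "), ('\\\\"', '"'),
        ("\\\\r", ""), ("\\\\/", "/"), ("\\\\", "\\"),
        ("\\n", "\n"), ("\\t", " "), ('\\"', '"'),
        ("\\'", "'"), ("\\r", ""), ("\\/", "/"),
    ]
    for old, new in replacements:
        text = text.replace(old, new)
    # Final regex passes: drop any remaining backslash artifacts.
    text = re.sub(r"\\[a-zA-Z]", "", text)
    text = re.sub(r"\\[0-9]", "", text)
    text = re.sub(r"\\[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\\+", "", text)
    return text

print(strip_escape_artifacts('Line one\\\\nLine two\\\\t\\\\"quoted\\\\"'))
# Line one
# Line two "quoted"

Order matters here: if the single-escaped pairs ran first, a double-escaped \\n would be left as a stray backslash followed by a newline, and only the final regex sweep would catch it.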
@@ -247,17 +247,17 @@ class TextCleaner:
            return ""

        unicode_map = {
-            '–': '-',  # En dash
+            "–": "-",  # En dash
-            '—': '-',  # Em dash
+            "—": "-",  # Em dash
-            '‘': "'",  # Smart single quote left
+            "‘": "'",  # Smart single quote left
-            '’': "'",  # Smart single quote right
+            "’": "'",  # Smart single quote right
-            '“': '"',  # Smart double quote left
+            "“": '"',  # Smart double quote left
-            '”': '"',  # Smart double quote right
+            "”": '"',  # Smart double quote right
-            '…': '...',  # Ellipsis
+            "…": "...",  # Ellipsis
-            '™': ' TM',  # Trademark
+            "™": " TM",  # Trademark
-            '®': ' R',  # Registered
+            "®": " R",  # Registered
-            '©': ' C',  # Copyright
+            "©": " C",  # Copyright
-            '°': ' deg',  # Degree symbol
+            "°": " deg",  # Degree symbol
        }

        for unicode_char, ascii_char in unicode_map.items():
@@ -276,7 +276,7 @@ class TextCleaner:
            (r'\\+"', '"'),  # Multiple backslashes before quotes
            (r'\\"', '"'),  # Escaped double quotes
            (r"\\'", "'"),  # Escaped single quotes
-            (r'\\&', '&'),  # Escaped ampersands
+            (r"\\&", "&"),  # Escaped ampersands
            (r'""', '"'),  # Double quotes
            (r"''", "'"),  # Double single quotes
        ]
@@ -294,13 +294,17 @@ class TextCleaner:

        if preserve_paragraphs:
            # Preserve paragraph breaks (double newlines) but clean up excessive spacing
-            text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
+            text = re.sub(r"[ \t]+", " ", text)  # Multiple spaces/tabs -> single space
-            text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple empty lines -> double line break
+            text = re.sub(
+                r"\n\s*\n\s*\n+", "\n\n", text
+            )  # Multiple empty lines -> double line break
-            text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim line-level whitespace
+            text = re.sub(
+                r"^\s+|\s+$", "", text, flags=re.MULTILINE
+            )  # Trim line-level whitespace
        else:
            # Flatten all whitespace for embedding
-            text = re.sub(r'\n+', ' ', text)  # All newlines to spaces
+            text = re.sub(r"\n+", " ", text)  # All newlines to spaces
-            text = re.sub(r'\s+', ' ', text)  # All whitespace to single spaces
+            text = re.sub(r"\s+", " ", text)  # All whitespace to single spaces

        return text.strip()
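The preserve_paragraphs branch keeps blank-line paragraph boundaries (which the paragraph-based chunker further down relies on), while the else branch flattens everything to single spaces for embedding. A runnable sketch of the same two regex policies (the function name and sample string are illustrative):

import re

def normalize_whitespace(text: str, preserve_paragraphs: bool = True) -> str:
    if preserve_paragraphs:
        text = re.sub(r"[ \t]+", " ", text)                        # spaces/tabs -> single space
        text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)              # runs of blank lines -> one break
        text = re.sub(r"^\s+|\s+$", "", text, flags=re.MULTILINE)  # trim each line
    else:
        text = re.sub(r"\n+", " ", text)  # newlines -> spaces
        text = re.sub(r"\s+", " ", text)  # collapse all whitespace
    return text.strip()

sample = "Title  \n\n\n\n  Body line one\t\tcontinued\n\nNext paragraph"
print(normalize_whitespace(sample, preserve_paragraphs=True))
# Title
#
# Body line one continued
#
# Next paragraph
print(normalize_whitespace(sample, preserve_paragraphs=False))
# Title Body line one continued Next paragraph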
@@ -311,17 +315,19 @@ class TextCleaner:
            return ""

        # Remove orphaned punctuation
-        text = re.sub(r'^\s*[)\]}]+\s*', '', text)  # Orphaned closing brackets at start
+        text = re.sub(r"^\s*[)\]}]+\s*", "", text)  # Orphaned closing brackets at start
-        text = re.sub(r'\n\s*[)\]}]+\s*\n', '\n\n', text)  # Orphaned closing brackets on own lines
+        text = re.sub(
+            r"\n\s*[)\]}]+\s*\n", "\n\n", text
+        )  # Orphaned closing brackets on own lines

        # Remove excessive punctuation
-        text = re.sub(r'[.]{3,}', '...', text)  # Multiple dots to ellipsis
+        text = re.sub(r"[.]{3,}", "...", text)  # Multiple dots to ellipsis
-        text = re.sub(r'[-]{3,}', '---', text)  # Multiple dashes
+        text = re.sub(r"[-]{3,}", "---", text)  # Multiple dashes

        # Remove empty parentheses and brackets
-        text = re.sub(r'\(\s*\)', '', text)  # Empty parentheses
+        text = re.sub(r"\(\s*\)", "", text)  # Empty parentheses
-        text = re.sub(r'\[\s*\]', '', text)  # Empty square brackets
+        text = re.sub(r"\[\s*\]", "", text)  # Empty square brackets
-        text = re.sub(r'\{\s*\}', '', text)  # Empty curly brackets
+        text = re.sub(r"\{\s*\}", "", text)  # Empty curly brackets

        return text
@@ -364,7 +370,9 @@ class TextCleaner:
        text = cls.clean_for_embedding(text)

        # Additional aggressive cleaning for storage
-        text = re.sub(r'\\([^a-zA-Z0-9\s])', r'\1', text)  # Remove any remaining escape sequences
+        text = re.sub(
+            r"\\([^a-zA-Z0-9\s])", r"\1", text
+        )  # Remove any remaining escape sequences

        return text
@@ -374,7 +382,9 @@ def clean_text_content(text: str) -> str:
    return TextCleaner.clean_for_chunking(text)


-def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+def create_semantic_chunks(
+    text: str, max_chunk_size: int, overlap_size: int
+) -> TypingList[str]:
    """Create semantically aware chunks that respect document structure"""
    if not text or len(text) <= max_chunk_size:
        return [text] if text else []
@@ -382,7 +392,7 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
    chunks = []

    # Split by double line breaks (paragraphs) first
-    paragraphs = text.split('\n\n')
+    paragraphs = text.split("\n\n")

    current_chunk = ""
@@ -395,7 +405,9 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
        if current_chunk and len(current_chunk) + len(paragraph) + 2 > max_chunk_size:
            # Try to split the current chunk at sentence boundaries if it's too long
            if len(current_chunk) > max_chunk_size:
-                sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+                sentence_chunks = split_by_sentences(
+                    current_chunk, max_chunk_size, overlap_size
+                )
                chunks.extend(sentence_chunks)
            else:
                chunks.append(current_chunk.strip())
@@ -404,7 +416,9 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
            if chunks and overlap_size > 0:
                prev_chunk = chunks[-1]
                overlap_text = get_text_overlap(prev_chunk, overlap_size)
-                current_chunk = overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+                current_chunk = (
+                    overlap_text + "\n\n" + paragraph if overlap_text else paragraph
+                )
            else:
                current_chunk = paragraph
        else:
@@ -417,7 +431,9 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
    # Add the last chunk
    if current_chunk:
        if len(current_chunk) > max_chunk_size:
-            sentence_chunks = split_by_sentences(current_chunk, max_chunk_size, overlap_size)
+            sentence_chunks = split_by_sentences(
+                current_chunk, max_chunk_size, overlap_size
+            )
            chunks.extend(sentence_chunks)
        else:
            chunks.append(current_chunk.strip())
@@ -425,10 +441,12 @@ def create_semantic_chunks(text: str, max_chunk_size: int, overlap_size: int) ->
    return [chunk for chunk in chunks if chunk.strip()]


-def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> TypingList[str]:
+def split_by_sentences(
+    text: str, max_chunk_size: int, overlap_size: int
+) -> TypingList[str]:
    """Split text by sentences when paragraph-level splitting isn't sufficient"""
    # Split by sentence endings
-    sentences = re.split(r'(?<=[.!?])\s+', text)
+    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current_chunk = ""
@@ -445,7 +463,9 @@ def split_by_sentences(text: str, max_chunk_size: int, overlap_size: int) -> Typ
            # Start new chunk with overlap
            if overlap_size > 0:
                overlap_text = get_text_overlap(current_chunk, overlap_size)
-                current_chunk = overlap_text + " " + sentence if overlap_text else sentence
+                current_chunk = (
+                    overlap_text + " " + sentence if overlap_text else sentence
+                )
            else:
                current_chunk = sentence
        else:
@@ -474,7 +494,7 @@ def get_text_overlap(text: str, overlap_size: int) -> str:
    overlap_text = text[-overlap_size:]

    # Find the first space to avoid cutting words
-    space_index = overlap_text.find(' ')
+    space_index = overlap_text.find(" ")
    if space_index > 0:
        return overlap_text[space_index:].strip()
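Together, these hunks reformat the chunking helpers: create_semantic_chunks splits on blank-line paragraph boundaries, hands oversized pieces to split_by_sentences, and both use get_text_overlap to carry a word-aligned tail of the previous chunk into the next one. A standalone sketch of the overlap helper follows; the middle of the function is reproduced from the hunk, while the guard clause and the final fallback return are assumptions, since they fall outside the diff context:

def get_text_overlap(text: str, overlap_size: int) -> str:
    # Guard clause assumed; only the body below appears in the hunk above.
    if not text or overlap_size <= 0:
        return ""

    overlap_text = text[-overlap_size:]

    # Find the first space to avoid cutting words
    space_index = overlap_text.find(" ")
    if space_index > 0:
        return overlap_text[space_index:].strip()
    return overlap_text.strip()  # assumed fallback when the tail contains no space

prev_chunk = "The quick brown fox jumps over the lazy dog"
print(get_text_overlap(prev_chunk, 15))  # -> "the lazy dog"

At the call site shown later in this diff, save_docs_to_vector_db passes request.app.state.config.CHUNK_SIZE and CHUNK_OVERLAP into create_semantic_chunks, so the overlap length is the configured chunk overlap in characters.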
@@ -570,7 +590,8 @@ async def update_embedding_config(
    request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
):
    log.info(
-        f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
+        f"Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} "
+        f"to {form_data.embedding_model}"
    )
    try:
        request.app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
@@ -1409,7 +1430,7 @@ def save_docs_to_vector_db(
        chunks = create_semantic_chunks(
            cleaned_content,
            request.app.state.config.CHUNK_SIZE,
-            request.app.state.config.CHUNK_OVERLAP
+            request.app.state.config.CHUNK_OVERLAP,
        )

        # Create new documents for each chunk
@@ -1417,12 +1438,11 @@ def save_docs_to_vector_db(
            chunk_metadata = {
                **doc.metadata,
                "chunk_index": i,
-                "total_chunks": len(chunks)
+                "total_chunks": len(chunks),
            }
-            processed_docs.append(Document(
-                page_content=chunk,
-                metadata=chunk_metadata
-            ))
+            processed_docs.append(
+                Document(page_content=chunk, metadata=chunk_metadata)
+            )

        docs = processed_docs
@@ -1514,12 +1534,14 @@ def save_docs_to_vector_db(
            # Apply consistent storage-level cleaning
            text_to_store = TextCleaner.clean_for_storage(texts[idx])

-            items.append({
+            items.append(
+                {
                    "id": str(uuid.uuid4()),
                    "text": text_to_store,
                    "vector": embeddings[idx],
                    "metadata": metadatas[idx],
-            })
+                }
+            )

        VECTOR_DB_CLIENT.insert(
            collection_name=collection_name,
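The loop above builds one item per embedded chunk, after text_to_store has gone through TextCleaner.clean_for_storage, and hands the batch to VECTOR_DB_CLIENT.insert. A minimal sketch of that shape using a stand-in client; FakeVectorDB, the sample texts, vectors, and collection name are illustrative, while the item keys and the insert(collection_name=..., items=...) call mirror the hunk:

import uuid

class FakeVectorDB:
    """Stand-in for VECTOR_DB_CLIENT, only here to make the sketch runnable."""

    def __init__(self):
        self.collections = {}

    def insert(self, collection_name: str, items: list) -> None:
        self.collections.setdefault(collection_name, []).extend(items)

texts = ["first cleaned chunk", "second cleaned chunk"]  # assume storage-level cleaning already applied
embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]          # toy vectors
metadatas = [{"chunk_index": 0}, {"chunk_index": 1}]

items = []
for idx, _ in enumerate(texts):
    items.append(
        {
            "id": str(uuid.uuid4()),
            "text": texts[idx],
            "vector": embeddings[idx],
            "metadata": metadatas[idx],
        }
    )

client = FakeVectorDB()
client.insert(collection_name="file-abc123", items=items)
print(len(client.collections["file-abc123"]))  # -> 2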
@@ -1565,7 +1587,9 @@ def process_file(

            docs = [
                Document(
-                    page_content=TextCleaner.clean_for_chunking(form_data.content.replace("<br/>", "\n")),
+                    page_content=TextCleaner.clean_for_chunking(
+                        form_data.content.replace("<br/>", "\n")
+                    ),
                    metadata={
                        **file.meta,
                        "name": file.filename,
@@ -1588,7 +1612,9 @@ def process_file(
            if result is not None and len(result.ids[0]) > 0:
                docs = [
                    Document(
-                        page_content=TextCleaner.clean_for_chunking(result.documents[0][idx]),
+                        page_content=TextCleaner.clean_for_chunking(
+                            result.documents[0][idx]
+                        ),
                        metadata=result.metadatas[0][idx],
                    )
                    for idx, id in enumerate(result.ids[0])
@@ -1596,7 +1622,9 @@ def process_file(
            else:
                docs = [
                    Document(
-                        page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
+                        page_content=TextCleaner.clean_for_chunking(
+                            file.data.get("content", "")
+                        ),
                        metadata={
                            **file.meta,
                            "name": file.filename,
@@ -1646,7 +1674,8 @@ def process_file(
            for doc in docs:
                cleaned_content = TextCleaner.clean_for_chunking(doc.page_content)

-                cleaned_docs.append(Document(
+                cleaned_docs.append(
+                    Document(
                        page_content=cleaned_content,
                        metadata={
                            **doc.metadata,
@@ -1655,12 +1684,15 @@ def process_file(
                            "file_id": file.id,
                            "source": file.filename,
                        },
-                ))
+                    )
+                )
            docs = cleaned_docs
        else:
            docs = [
                Document(
-                    page_content=TextCleaner.clean_for_chunking(file.data.get("content", "")),
+                    page_content=TextCleaner.clean_for_chunking(
+                        file.data.get("content", "")
+                    ),
                    metadata={
                        **file.meta,
                        "name": file.filename,
@@ -1670,7 +1702,9 @@ def process_file(
                    },
                )
            ]
-        text_content = " ".join([doc.page_content for doc in docs if doc.page_content])
+        text_content = " ".join(
+            [doc.page_content for doc in docs if doc.page_content]
+        )

        # Ensure text_content is never None or empty for hash calculation
        if not text_content:
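Every branch of process_file builds its docs list the same way: page_content goes through TextCleaner.clean_for_chunking, metadata carries file.meta plus the filename and source, and the cleaned pages are then joined into text_content for hashing. A reduced sketch of that pattern; Doc and clean_for_chunking are stand-ins so the snippet runs on its own (the real code uses langchain Document objects and the TextCleaner class in retrieval.py):

from dataclasses import dataclass, field

@dataclass
class Doc:
    """Stand-in for the Document class used in retrieval.py."""

    page_content: str
    metadata: dict = field(default_factory=dict)

def clean_for_chunking(text: str) -> str:
    # Placeholder for TextCleaner.clean_for_chunking; here it only collapses whitespace.
    return " ".join(text.split())

raw_content = "First line<br/>Second line"      # illustrative file content
file_meta = {"collection_name": "file-abc123"}  # illustrative metadata

docs = [
    Doc(
        page_content=clean_for_chunking(raw_content.replace("<br/>", "\n")),
        metadata={**file_meta, "name": "example.pptx", "source": "example.pptx"},
    )
]
text_content = " ".join(doc.page_content for doc in docs if doc.page_content)
print(text_content)  # -> First line Second line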
@@ -2449,7 +2483,9 @@ def process_files_batch(

        docs: List[Document] = [
            Document(
-                page_content=TextCleaner.clean_for_chunking(text_content.replace("<br/>", "\n")),
+                page_content=TextCleaner.clean_for_chunking(
+                    text_content.replace("<br/>", "\n")
+                ),
                metadata={
                    **file.meta,
                    "name": file.filename,
@@ -2529,8 +2565,8 @@ def delete_file_from_vector_db(file_id: str) -> bool:

        # Try to get collection name from file metadata
        collection_name = None
-        if hasattr(file, 'meta') and file.meta:
+        if hasattr(file, "meta") and file.meta:
-            collection_name = file.meta.get('collection_name')
+            collection_name = file.meta.get("collection_name")

        # If no collection name in metadata, try common patterns used by Open WebUI
        if not collection_name:
@@ -2544,7 +2580,9 @@ def delete_file_from_vector_db(file_id: str) -> bool:
            # Try each possible collection name
            for possible_collection in possible_collections:
                try:
-                    if VECTOR_DB_CLIENT.has_collection(collection_name=possible_collection):
+                    if VECTOR_DB_CLIENT.has_collection(
+                        collection_name=possible_collection
+                    ):
                        result = VECTOR_DB_CLIENT.delete(
                            collection_name=possible_collection,
                            filter={"hash": file_hash},
@@ -2559,13 +2597,15 @@ def delete_file_from_vector_db(file_id: str) -> bool:
            deleted_count = 0

            # Get all collections (this method varies by vector DB implementation)
-            if hasattr(VECTOR_DB_CLIENT, 'list_collections'):
+            if hasattr(VECTOR_DB_CLIENT, "list_collections"):
                try:
                    collections = VECTOR_DB_CLIENT.list_collections()

                    for collection in collections:
                        try:
-                            if VECTOR_DB_CLIENT.has_collection(collection_name=collection):
+                            if VECTOR_DB_CLIENT.has_collection(
+                                collection_name=collection
+                            ):
                                result = VECTOR_DB_CLIENT.delete(
                                    collection_name=collection,
                                    filter={"hash": file_hash},
@@ -2583,7 +2623,9 @@ def delete_file_from_vector_db(file_id: str) -> bool:
            return False

        # Delete from the specific collection found in metadata
-        if collection_name and VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
+        if collection_name and VECTOR_DB_CLIENT.has_collection(
+            collection_name=collection_name
+        ):
            try:
                result = VECTOR_DB_CLIENT.delete(
                    collection_name=collection_name,
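delete_file_from_vector_db first honors a collection_name stored in file.meta, then tries likely collection-name patterns, and finally, when the client exposes list_collections, sweeps every collection and deletes entries whose hash matches the file. A compact sketch of that fallback sweep against a stand-in client; FakeVectorDB, its return value, the sample collection names, and the hash are all illustrative:

class FakeVectorDB:
    """Stand-in for VECTOR_DB_CLIENT; only the methods used in these hunks are modeled."""

    def __init__(self, data):
        self.data = data  # collection name -> list of stored items

    def list_collections(self):
        return list(self.data)

    def has_collection(self, collection_name: str) -> bool:
        return collection_name in self.data

    def delete(self, collection_name: str, filter: dict) -> int:
        before = len(self.data[collection_name])
        self.data[collection_name] = [
            item
            for item in self.data[collection_name]
            if item.get("metadata", {}).get("hash") != filter.get("hash")
        ]
        return before - len(self.data[collection_name])

client = FakeVectorDB(
    {
        "file-abc123": [{"metadata": {"hash": "deadbeef"}}, {"metadata": {"hash": "cafe"}}],
        "docs-collection": [{"metadata": {"hash": "deadbeef"}}],
    }
)

file_hash = "deadbeef"
deleted_count = 0
if hasattr(client, "list_collections"):
    for collection in client.list_collections():
        if client.has_collection(collection_name=collection):
            deleted_count += client.delete(
                collection_name=collection, filter={"hash": file_hash}
            )
print(deleted_count)  # -> 2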