From 443c6cdb9f385ee455e26cb674f77005d37b45f1 Mon Sep 17 00:00:00 2001 From: Daniel Downs Date: Thu, 6 Mar 2025 11:19:59 -0600 Subject: [PATCH] added Parser Function documentation --- docs/features/plugin/functions/index.mdx | 33 ++++- docs/features/plugin/functions/parser.mdx | 172 ++++++++++++++++++++++ 2 files changed, 201 insertions(+), 4 deletions(-) create mode 100644 docs/features/plugin/functions/parser.mdx diff --git a/docs/features/plugin/functions/index.mdx b/docs/features/plugin/functions/index.mdx index 8c2feec..76dfe4f 100644 --- a/docs/features/plugin/functions/index.mdx +++ b/docs/features/plugin/functions/index.mdx @@ -15,7 +15,7 @@ Think of Functions as **modular building blocks** that let you enhance how the W ## 🏗️ Types of Functions -There are **three types of Functions** in Open WebUI, each with a specific purpose. Let’s break them down and explain exactly what they do: +There are **four types of Functions** in Open WebUI, each with a specific purpose. Let’s break them down and explain exactly what they do: --- @@ -77,6 +77,28 @@ Learn how to set them up in the [**Action Functions Guide**](./action.mdx). --- +### 4. [**Parser Function** – Knowledge splitting/embedding/uploading](./parser.mdx) + +A **Parser Function** is used to handle how Knowledge is parsed and uploaded. + +**What does it do?** +Parsers allow you to define new workflows when a user adds new Knowledge. By default, +Open WebUI uses LangChain's [RecursiveCharacterTextSplitter](https://api.python.langchain.com/en/latest/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html). + +If this default splitter does not split your documents properly, or you need to store additional information alongside them, +Parsers allow you to define a new workflow that handles things according to your use case. Multiple Parsers can be used +if needed. 
+ +**Use case example:** + +If you want to use [LayoutParser](https://layout-parser.github.io/) to handle more complex document structures, you can create a +Parser that uses a pretrained Detectron2 model and OCR to better handle documents that RecursiveCharacterTextSplitter splits +poorly. + +For a detailed guide, see [**Parser Functions**](./parser.mdx). + +--- + ## 🛠️ How to Use Functions Here's how to put Functions to work in Open WebUI: @@ -92,6 +114,7 @@ You can install Functions via the Open WebUI interface or by importing them manu Functions must be explicitly enabled after installation: - When you enable a **Pipe Function**, it becomes available as its own **model** in the interface. - For **Filter** and **Action Functions**, enabling them isn’t enough—you also need to assign them to specific models or enable them globally for all models. +- When you enable a **Parser Function**, it is automatically applied whenever newly uploaded Knowledge matches the `PARSER_TYPE` set in the Parser's constructor. --- @@ -104,7 +127,8 @@ Functions must be explicitly enabled after installation: ### Quick Summary - **Pipes** appear as standalone models you can interact with. - **Filters** modify inputs/outputs for smoother AI interactions. -- **Actions** add clickable buttons to individual chat messages. +- **Actions** add clickable buttons to individual chat messages. +- **Parsers** handle splitting/embedding/uploading of new Knowledge. Once you’ve followed the setup process, Functions will seamlessly enhance your workflows. @@ -124,10 +148,11 @@ Whether you’re customizing workflows for specific projects, integrating extern ### 📝 Final Notes: 1. Always install Functions from **trusted sources only**. -2. Make sure you understand the difference between Pipe, Filter, and Action Functions to use them effectively. +2. Make sure you understand the difference between Pipe, Filter, Action, and Parser Functions to use them effectively. 3. 
Explore the official guides: - [Pipe Functions Guide](./pipe.mdx) - [Filter Functions Guide](./filter.mdx) - - [Action Functions Guide](./action.mdx) + - [Action Functions Guide](./action.mdx) + - [Parser Functions Guide](./parser.mdx) By leveraging Functions, you’ll bring entirely new capabilities to your Open WebUI setup. Start experimenting today! 🚀 \ No newline at end of file diff --git a/docs/features/plugin/functions/parser.mdx b/docs/features/plugin/functions/parser.mdx new file mode 100644 index 0000000..4ec7185 --- /dev/null +++ b/docs/features/plugin/functions/parser.mdx @@ -0,0 +1,172 @@ +--- +sidebar_position: 4 +title: "Parser Function" +--- + +# Parser Function: Handling custom Knowledge splitting/embedding/uploading + +## Introduction + +Knowledge allows AI Models to reference specific, known information. This is known as Retrieval-Augmented Generation (RAG) + and is extremely useful for increasing answer accuracy. + +When a user uploads new Knowledge, that text must be split into chunks, turned into embeddings, and +then uploaded to a vector database for later use. How the text is split and which embeddings are used can +drastically affect the performance of the model. + +By default, Open WebUI handles this process for you. However, it may not handle things well enough for your use case. +In this situation, you can write your own Parser that will handle the process in whatever way best suits your needs. + + +### 🦴 Basic Skeleton of a Parser + +The basics of a Parser are extremely simple. 
Only two attributes and a single method are required: + +```python +from open_webui.utils.parser import PARSER_TYPE + +class Parser: + def __init__(self): + self.name = "Custom Parser" + self.parser_type = PARSER_TYPE.ALL + + def save_docs_to_vector_db(self, + request: Request, + docs, + collection_name, + metadata: Optional[dict] = None, + overwrite: bool = False, + split: bool = True, + add: bool = False, + user=None, + ) -> bool: + # This is where you handle splitting docs and putting their embeddings into the vdb collection + + print("Handling splitting, embedding, and uploading") + + return True # to mark that things completed successfully +``` + + +### PARSER_TYPE + +Because different types of knowledge can be uploaded and different types of content need to be handled in different ways, +`open_webui.utils.parser.PARSER_TYPE` defines what types of content the Parser will be used for. The currently available +choices are: + +- TEXT +- FILE +- YOUTUBE +- WEB_CONTENT +- WEB_SEARCH +- ALL + +Any combination of these values (or a single one) is viable, though ALL supersedes other choices. + + +### Inheriting from DefaultParser + +You may only care about changing one part of the custom Knowledge process (for example, you may want to +split the document in a better way but don't want to handle embedding or uploading). + +In this case, you can inherit from `open_webui.utils.parser.DefaultParser`, which handles all needed steps. Specific steps +can then be overridden. 
+ +```python +import logging + +import tiktoken + +from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter + +from open_webui.env import SRC_LOG_LEVELS +from open_webui.constants import ERROR_MESSAGES +from open_webui.utils.parser import PARSER_TYPE, DefaultParser + +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["RAG"]) + + +class Parser(DefaultParser): + def __init__(self): + self.name = "Splitting Parser" + self.parser_type = PARSER_TYPE.ALL + + def split(self, request, docs): + log.info("This is a custom override of the default splitting behavior") + + if request.app.state.config.TEXT_SPLITTER in ["", "character"]: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=request.app.state.config.CHUNK_SIZE, + chunk_overlap=request.app.state.config.CHUNK_OVERLAP, + add_start_index=True, + ) + elif request.app.state.config.TEXT_SPLITTER == "token": + log.info( + f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}" + ) + + tiktoken.get_encoding(str(request.app.state.config.TIKTOKEN_ENCODING_NAME)) + text_splitter = TokenTextSplitter( + encoding_name=str(request.app.state.config.TIKTOKEN_ENCODING_NAME), + chunk_size=request.app.state.config.CHUNK_SIZE, + chunk_overlap=request.app.state.config.CHUNK_OVERLAP, + add_start_index=True, + ) + else: + raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter")) + + docs = text_splitter.split_documents(docs) + + if len(docs) == 0: + raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) + + return docs + +``` + +### Valves + +Just like other functions, Parsers can have Valves. 
More detailed information on Valves can be found in the +[Pipe documentation](./pipe.mdx). + +```python +from open_webui.utils.parser import PARSER_TYPE + +class Parser: + class Valves(BaseModel): + MODEL_ID: str = Field(default="") + + def __init__(self): + super().__init__() + self.name = "Splitting Parser" + self.parser_type = PARSER_TYPE.ALL + self.valves = self.Valves() + + def save_docs_to_vector_db(self, + request: Request, + docs, + collection_name, + metadata: Optional[dict] = None, + overwrite: bool = False, + split: bool = True, + add: bool = False, + user=None, + ) -> bool: + # This is where you handle splitting docs and putting their embeddings into the vdb collection + + print(f"valve value: {self.valves.MODEL_ID}") + print("Handling splitting, embedding, and uploading") + + return True # to mark that things completed successfully +``` + + + + +## FAQ 🛑 + +### **Q: How Are Parsers Different From Other Functions?** + +A: Parsers are the only Functions that run when Knowledge is uploaded. Other Functions can interact with Knowledge, +but none have control over how that Knowledge is handled by Open WebUI. \ No newline at end of file