feat: llama cpp pipeline

2025-06-26 18:15:58 +00:00 · 2024-05-21 22:43:43 -07:00
parent b3bb653f46
commit 62630de0f9
2 changed files with 55 additions and 2 deletions
--- a/main.py
+++ b/main.py
@@ -146,6 +146,10 @@ async def generate_openai_chat_completion(form_data: OpenAIChatCompletionForm):

                if isinstance(res, Iterator):
                    for line in res:
+                        if isinstance(line, BaseModel):
+                            line = line.model_dump_json()
+                            line = f"data: {line}"
+
                        try:
                            line = line.decode("utf-8")
                        except:
@@ -189,18 +193,20 @@ async def generate_openai_chat_completion(form_data: OpenAIChatCompletionForm):

            if isinstance(res, dict):
                return res
+            elif isinstance(res, BaseModel):
+                return res.model_dump()
            else:
+
                message = ""

                if isinstance(res, str):
                    message = res

-                elif isinstance(res, Generator):
+                if isinstance(res, Generator):
                    for stream in res:
                        message = f"{message}{stream}"

                logging.info(f"stream:false:{message}")
-
                return {
                    "id": f"{form_data.model}-{str(uuid.uuid4())}",
                    "object": "chat.completion",
--- a/pipelines/examples/llama_cpp_pipeline.py
+++ b/pipelines/examples/llama_cpp_pipeline.py
@@ -0,0 +1,47 @@
+from typing import List, Union, Generator, Iterator
+from schemas import OpenAIChatMessage
+
+
+class Pipeline:
+    def __init__(self):
+        # Optionally, you can set the id and name of the pipeline.
+        self.id = "llama_cpp_pipeline"
+        self.name = "Llama C++ Pipeline"
+        self.llm = None
+        pass
+
+    async def on_startup(self):
+        # This function is called when the server is started.
+        print(f"on_startup:{__name__}")
+        from llama_cpp import Llama
+
+        self.llm = Llama(
+            model_path="./models/llama3.gguf",
+            # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+            # seed=1337, # Uncomment to set a specific seed
+            # n_ctx=2048, # Uncomment to increase the context window
+        )
+
+        pass
+
+    async def on_shutdown(self):
+        # This function is called when the server is stopped.
+        print(f"on_shutdown:{__name__}")
+        pass
+
+    def get_response(
+        self, user_message: str, messages: List[OpenAIChatMessage], body: dict
+    ) -> Union[str, Generator, Iterator]:
+        # This is where you can add your custom pipelines like RAG.
+        print(f"get_response:{__name__}")
+
+        print(messages)
+        print(user_message)
+        print(body)
+
+        response = self.llm.create_chat_completion_openai_v1(
+            messages=[message.model_dump() for message in messages],
+            stream=body["stream"],
+        )
+
+        return response