From 62630de0f94076d93881e9ace393145c58673400 Mon Sep 17 00:00:00 2001
From: "Timothy J. Baek"
Date: Tue, 21 May 2024 22:43:43 -0700
Subject: [PATCH] feat: llama cpp pipeline

---
 main.py                                  | 10 ++++-
 pipelines/examples/llama_cpp_pipeline.py | 47 ++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 pipelines/examples/llama_cpp_pipeline.py

diff --git a/main.py b/main.py
index f735241..63234e7 100644
--- a/main.py
+++ b/main.py
@@ -146,6 +146,10 @@ async def generate_openai_chat_completion(form_data: OpenAIChatCompletionForm):
 
         if isinstance(res, Iterator):
             for line in res:
+                if isinstance(line, BaseModel):
+                    line = line.model_dump_json()
+                    line = f"data: {line}"
+
                 try:
                     line = line.decode("utf-8")
                 except:
@@ -189,18 +193,20 @@ async def generate_openai_chat_completion(form_data: OpenAIChatCompletionForm):
 
         if isinstance(res, dict):
             return res
+        elif isinstance(res, BaseModel):
+            return res.model_dump()
         else:
+            message = ""
 
             if isinstance(res, str):
                 message = res
 
-            elif isinstance(res, Generator):
+            if isinstance(res, Generator):
                 for stream in res:
                     message = f"{message}{stream}"
 
             logging.info(f"stream:false:{message}")
-
             return {
                 "id": f"{form_data.model}-{str(uuid.uuid4())}",
                 "object": "chat.completion",
diff --git a/pipelines/examples/llama_cpp_pipeline.py b/pipelines/examples/llama_cpp_pipeline.py
new file mode 100644
index 0000000..2032558
--- /dev/null
+++ b/pipelines/examples/llama_cpp_pipeline.py
@@ -0,0 +1,47 @@
+from typing import List, Union, Generator, Iterator
+from schemas import OpenAIChatMessage
+
+
+class Pipeline:
+    def __init__(self):
+        # Optionally, you can set the id and name of the pipeline.
+        self.id = "llama_cpp_pipeline"
+        self.name = "Llama C++ Pipeline"
+        self.llm = None
+        pass
+
+    async def on_startup(self):
+        # This function is called when the server is started.
+        print(f"on_startup:{__name__}")
+        from llama_cpp import Llama
+
+        self.llm = Llama(
+            model_path="./models/llama3.gguf",
+            # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+            # seed=1337, # Uncomment to set a specific seed
+            # n_ctx=2048, # Uncomment to increase the context window
+        )
+
+        pass
+
+    async def on_shutdown(self):
+        # This function is called when the server is stopped.
+        print(f"on_shutdown:{__name__}")
+        pass
+
+    def get_response(
+        self, user_message: str, messages: List[OpenAIChatMessage], body: dict
+    ) -> Union[str, Generator, Iterator]:
+        # This is where you can add your custom pipelines like RAG.
+        print(f"get_response:{__name__}")
+
+        print(messages)
+        print(user_message)
+        print(body)
+
+        response = self.llm.create_chat_completion_openai_v1(
+            messages=[message.model_dump() for message in messages],
+            stream=body["stream"],
+        )
+
+        return response
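
For review context only, here is a minimal standalone sketch (not part of the patch) of what the new BaseModel branch in main.py does when a pipeline streams pydantic chunk objects: each chunk is serialized with model_dump_json() and wrapped in an SSE "data:" frame, while plain str/bytes chunks fall through to the existing decode path. FakeChunk below is a hypothetical stand-in for the OpenAI-style chunk objects that llama-cpp-python yields when streaming.

# Minimal sketch (not part of the patch) of the SSE framing added in main.py.
# FakeChunk is a hypothetical stand-in for the pydantic chunk objects a
# pipeline yields when body["stream"] is true.
from typing import Iterator, Union

from pydantic import BaseModel


class FakeChunk(BaseModel):
    content: str


def to_sse_frames(res: Iterator[Union[BaseModel, bytes, str]]) -> Iterator[str]:
    for line in res:
        if isinstance(line, BaseModel):
            # Same conversion the patch adds: pydantic chunk -> JSON -> SSE frame.
            line = line.model_dump_json()
            line = f"data: {line}"

        try:
            # Mirrors the existing decode path for byte chunks.
            line = line.decode("utf-8")
        except (UnicodeDecodeError, AttributeError):
            pass

        yield line


if __name__ == "__main__":
    chunks = iter([FakeChunk(content="Hello"), b"data: [DONE]"])
    for frame in to_sse_frames(chunks):
        print(frame)
    # data: {"content":"Hello"}
    # data: [DONE]

Framing the chunks server-side keeps the /v1/chat/completions endpoint emitting OpenAI-style SSE lines regardless of whether a pipeline yields strings, bytes, or pydantic models.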
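
Similarly, a hedged usage sketch for the new example pipeline: it assumes llama-cpp-python is installed, that ./models/llama3.gguf exists, and that schemas.OpenAIChatMessage exposes role/content fields; the import path is illustrative and depends on how pipelines/examples sits on sys.path.

# Usage sketch for the new example pipeline (assumptions: llama-cpp-python is
# installed, ./models/llama3.gguf exists, and OpenAIChatMessage has
# role/content fields; the import path below is illustrative).
import asyncio

from schemas import OpenAIChatMessage
from llama_cpp_pipeline import Pipeline  # pipelines/examples/llama_cpp_pipeline.py


async def main():
    pipeline = Pipeline()
    await pipeline.on_startup()  # loads the GGUF model via llama_cpp.Llama

    messages = [OpenAIChatMessage(role="user", content="Hello!")]
    response = pipeline.get_response(
        user_message="Hello!",
        messages=messages,
        body={"stream": False},  # True would return an iterator of chunk objects
    )
    print(response)

    await pipeline.on_shutdown()


asyncio.run(main())

As the comments in the new file note, on_startup/on_shutdown are called by the server itself when it starts and stops; the sketch only mirrors that lifecycle for local experimentation.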