From f7ecd272b2db2b3f7b40d97f694976a0464de059 Mon Sep 17 00:00:00 2001
From: pkeffect
Date: Fri, 18 Apr 2025 16:03:20 -0400
Subject: [PATCH] New Deployment

Added Dockerfile and compose.yaml as an option for deployment.
Updated README.md
---
 Dockerfile   | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++
 README.md    |  14 +++++++
 compose.yaml |  15 +++++++
 3 files changed, 144 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 compose.yaml

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..107c304
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,115 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install essential packages
+RUN apt-get update && apt-get install -y \
+    curl \
+    wget \
+    git \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy the project files
+COPY . /app/
+
+# Install the package
+RUN pip install --no-cache-dir -e .
+RUN pip install requests
+
+# Create a volume mount point for models and cache
+VOLUME /models
+VOLUME /cache
+
+# Create a proxy server script
+RUN echo 'import os\n\
+import uvicorn\n\
+from fastapi import FastAPI, Request\n\
+from fastapi.responses import StreamingResponse, JSONResponse\n\
+from llama_cpp_runner.main import LlamaCpp\n\
+\n\
+app = FastAPI(title="LlamaCpp Proxy")\n\
+\n\
+# Initialize the LlamaCpp class\n\
+models_dir = os.environ.get("MODELS_DIR", "/models")\n\
+cache_dir = os.environ.get("CACHE_DIR", "/cache")\n\
+verbose = os.environ.get("VERBOSE", "true").lower() == "true"\n\
+timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))\n\
+\n\
+print(f"Models directory: {models_dir}")\n\
+print(f"Cache directory: {cache_dir}")\n\
+\n\
+# Create the LlamaCpp instance\n\
+llama_runner = LlamaCpp(\n\
+    models_dir=models_dir,\n\
+    cache_dir=cache_dir, \n\
+    verbose=verbose, \n\
+    timeout_minutes=timeout\n\
+)\n\
+\n\
+@app.get("/")\n\
+def read_root():\n\
+    """Get server status and list of available models."""\n\
+    return {"status": "running", "models": llama_runner.list_models()}\n\
+\n\
+@app.post("/v1/chat/completions")\n\
+async def chat_completions(request: Request):\n\
+    """Forward chat completion requests to the LlamaCpp server."""\n\
+    try:\n\
+        body = await request.json()\n\
+        \n\
+        if "model" not in body:\n\
+            return JSONResponse(\n\
+                status_code=400,\n\
+                content={"error": "Model not specified in request"}\n\
+            )\n\
+        \n\
+        try:\n\
+            result = llama_runner.chat_completion(body)\n\
+            \n\
+            # Handle streaming responses\n\
+            if body.get("stream", False):\n\
+                async def generate():\n\
+                    for line in result:\n\
+                        if line:\n\
+                            yield f"data: {line}\\n\\n"\n\
+                    yield "data: [DONE]\\n\\n"\n\
+                \n\
+                return StreamingResponse(generate(), media_type="text/event-stream")\n\
+            else:\n\
+                return result\n\
+        except Exception as e:\n\
+            return JSONResponse(\n\
+                status_code=500,\n\
+                content={"error": str(e)}\n\
+            )\n\
+    except Exception as e:\n\
+        return JSONResponse(\n\
+            status_code=400,\n\
+            content={"error": f"Invalid request: {str(e)}"}\n\
+        )\n\
+\n\
+@app.get("/models")\n\
+def list_models():\n\
+    """List all available models."""\n\
+    return {"models": llama_runner.list_models()}\n\
+\n\
+if __name__ == "__main__":\n\
+    print("Starting LlamaCpp Proxy Server on port 3636")\n\
+    models = llama_runner.list_models()\n\
+    print(f"Available models: {models}")\n\
+    if not models:\n\
+        print("WARNING: No models found in the models directory.")\n\
+    uvicorn.run(app, host="0.0.0.0", port=3636)\n' > /app/proxy_server.py
+
+# Install FastAPI and Uvicorn
+RUN pip install --no-cache-dir fastapi uvicorn
+
+# Expose the proxy server port
+EXPOSE 3636
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+
+# Command to run when the container starts
+CMD ["python", "/app/proxy_server.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index f9e6173..88f425f 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,20 @@ Installing `llama-cpp-runner` is quick and easy! Just use pip:
 pip install llama-cpp-runner
 ```
 
+## Optional Installation (Docker)
+
+Clone the repository
+
+```bash
+git clone https://github.com/open-webui/llama-cpp-runner
+```
+
+Build and run
+
+```bash
+docker compose up -d
+```
+
 ## Usage 📖
 
 ### Initialize the Runner
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 0000000..8a553ff
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,15 @@
+services:
+  llama-cpp-runner:
+    build: .
+    container_name: owui-llama-cpp-runner
+    ports:
+      - "3636:3636"
+    volumes:
+      - ./models:/models
+      - ./cache:/cache
+    environment:
+      - MODELS_DIR=/models
+      - CACHE_DIR=/cache
+      - VERBOSE=true
+      - TIMEOUT_MINUTES=30
+    restart: unless-stopped
\ No newline at end of file
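Once `docker compose up -d` has the container running, the proxy added by this patch can be smoke-tested against the endpoints defined in `proxy_server.py` (`/`, `/models`, and `/v1/chat/completions` on port 3636). A minimal sketch using the `requests` package that the Dockerfile installs; it assumes at least one GGUF model has been placed in `./models`, and `your-model.gguf` is a placeholder for a name actually returned by `GET /models`:

```python
# Minimal smoke test for the proxy exposed by compose.yaml (port 3636).
# Assumes the container is running and ./models contains at least one model;
# "your-model.gguf" is a placeholder -- use a name returned by GET /models.
import requests

base = "http://localhost:3636"

# Server status and the list of available models
print(requests.get(f"{base}/").json())
print(requests.get(f"{base}/models").json())

# Non-streaming chat completion forwarded to the llama.cpp server
resp = requests.post(
    f"{base}/v1/chat/completions",
    json={
        "model": "your-model.gguf",  # placeholder; must match a listed model
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
)
print(resp.json())
```

With `"stream": True` the proxy instead returns Server-Sent Events and terminates the stream with `data: [DONE]`, as implemented in the `generate()` helper above.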