From fd8f43a156a6b9fbf50d7128c1d58a0d16e152f0 Mon Sep 17 00:00:00 2001
From: pkeffect
Date: Fri, 18 Apr 2025 18:39:29 -0400
Subject: [PATCH] refactor and update

---
 .gitignore      |  7 +++--
 Dockerfile      | 28 ++++++++++--------
 compose.yaml    | 16 +++++++---
 proxy_server.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 20 deletions(-)
 create mode 100644 proxy_server.py

diff --git a/.gitignore b/.gitignore
index 823e384..631947d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
-llama_cpp_cache
-cache
-dist
\ No newline at end of file
+llama_cpp_cache/
+cache/
+dist/
+models/*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 107c304..1f3d580 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,26 +2,27 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-# Install essential packages
-RUN apt-get update && apt-get install -y \
+# Install only essential packages and clean up in one layer to reduce image size
+RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
     wget \
     git \
    build-essential \
+    && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy the project files
-COPY . /app/
+# Copy only necessary files
+COPY pyproject.toml README.md LICENSE /app/
+COPY src/ /app/src/
 
-# Install the package
-RUN pip install --no-cache-dir -e .
-RUN pip install requests
+# Install the package in development mode and required dependencies
+RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn
 
-# Create a volume mount point for models and cache
+# Create volume mount points
 VOLUME /models
 VOLUME /cache
 
-# Create a proxy server script
+# Create proxy server script directly in the Dockerfile
 RUN echo 'import os\n\
 import uvicorn\n\
 from fastapi import FastAPI, Request\n\
@@ -100,16 +101,17 @@ if __name__ == "__main__":\n\
     print(f"Available models: {models}")\n\
     if not models:\n\
         print("WARNING: No models found in the models directory.")\n\
-    uvicorn.run(app, host="0.0.0.0", port=3636)\n' > /app/proxy_server.py
-
-# Install FastAPI and Uvicorn
-RUN pip install --no-cache-dir fastapi uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py
 
 # Expose the proxy server port
 EXPOSE 3636
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
+ENV MODELS_DIR=/models
+ENV CACHE_DIR=/cache
+ENV VERBOSE=true
+ENV TIMEOUT_MINUTES=30
 
 # Command to run when the container starts
 CMD ["python", "/app/proxy_server.py"]
\ No newline at end of file
diff --git a/compose.yaml b/compose.yaml
index 8a553ff..bf2ae02 100644
--- a/compose.yaml
+++ b/compose.yaml
@@ -1,15 +1,23 @@
 services:
-  llama-cpp-runner:
+  owui-llama-cpp-runner:
     build: .
     container_name: owui-llama-cpp-runner
     ports:
       - "3636:3636"
     volumes:
-      - ./models:/models
-      - ./cache:/cache
+      - ./models:/models # local mount
+      - ./cache:/cache # local mount
+      # Remove ./ from the paths above to use named Docker volumes
     environment:
       - MODELS_DIR=/models
       - CACHE_DIR=/cache
       - VERBOSE=true
       - TIMEOUT_MINUTES=30
-    restart: unless-stopped
\ No newline at end of file
+      - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:3636/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
\ No newline at end of file
diff --git a/proxy_server.py b/proxy_server.py
new file mode 100644
index 0000000..63685b5
--- /dev/null
+++ b/proxy_server.py
@@ -0,0 +1,79 @@
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse, JSONResponse
+from llama_cpp_runner.main import LlamaCpp
+
+app = FastAPI(title="LlamaCpp Proxy")
+
+# Initialize the LlamaCpp class
+models_dir = os.environ.get("MODELS_DIR", "/models")
+cache_dir = os.environ.get("CACHE_DIR", "/cache")
+verbose = os.environ.get("VERBOSE", "true").lower() == "true"
+timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))
+
+print(f"Models directory: {models_dir}")
+print(f"Cache directory: {cache_dir}")
+
+# Create the LlamaCpp instance
+llama_runner = LlamaCpp(
+    models_dir=models_dir,
+    cache_dir=cache_dir,
+    verbose=verbose,
+    timeout_minutes=timeout
+)
+
+@app.get("/")
+def read_root():
+    """Get server status and list of available models."""
+    return {"status": "running", "models": llama_runner.list_models()}
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Forward chat completion requests to the LlamaCpp server."""
+    try:
+        body = await request.json()
+
+        if "model" not in body:
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Model not specified in request"}
+            )
+
+        try:
+            result = llama_runner.chat_completion(body)
+
+            # Handle streaming responses
+            if body.get("stream", False):
+                async def generate():
+                    for line in result:
+                        if line:
+                            yield f"data: {line}\n\n"
+                    yield "data: [DONE]\n\n"
+
+                return StreamingResponse(generate(), media_type="text/event-stream")
+            else:
+                return result
+        except Exception as e:
+            return JSONResponse(
+                status_code=500,
+                content={"error": str(e)}
+            )
+    except Exception as e:
+        return JSONResponse(
+            status_code=400,
+            content={"error": f"Invalid request: {str(e)}"}
+        )
+
+@app.get("/models")
+def list_models():
+    """List all available models."""
+    return {"models": llama_runner.list_models()}
+
+if __name__ == "__main__":
+    print("Starting LlamaCpp Proxy Server on port 3636")
+    models = llama_runner.list_models()
+    print(f"Available models: {models}")
+    if not models:
+        print("WARNING: No models found in the models directory.")
+    uvicorn.run(app, host="0.0.0.0", port=3636)
\ No newline at end of file
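
A quick smoke test for the proxy once the container is up, as a minimal sketch: it assumes at least one GGUF file has been dropped into ./models, the model filename below is a placeholder, and the non-streaming branch simply returns whatever llama_runner.chat_completion produces.

import requests

BASE_URL = "http://localhost:3636"

# The root endpoint reports server status and the model files discovered in /models.
print(requests.get(f"{BASE_URL}/", timeout=10).json())

# Non-streaming chat completion; "model" must name one of the files listed above.
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "example-model.gguf",  # placeholder, replace with a real filename
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "stream": False,
    },
    timeout=300,
)
print(resp.status_code)
print(resp.json())

For streaming, set "stream": true and read the text/event-stream response line by line; each event is prefixed with "data: " and the stream ends with "data: [DONE]".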