Mirror of https://github.com/open-webui/llama-cpp-runner, synced 2025-05-24 21:24:31 +00:00

refactor and update

parent: f7ecd272b2
commit: fd8f43a156
.gitignore (vendored): 7 changed lines

@@ -1,3 +1,4 @@
-llama_cpp_cache
+llama_cpp_cache/
-cache
+cache/
-dist
+dist/
+models/*
Dockerfile: 28 changed lines

@@ -2,26 +2,27 @@ FROM python:3.11-slim

 WORKDIR /app

-# Install essential packages
-RUN apt-get update && apt-get install -y \
+# Install only essential packages and clean up in one layer to reduce image size
+RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
     wget \
     git \
     build-essential \
+    && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

-# Copy the project files
-COPY . /app/
+# Copy only necessary files
+COPY pyproject.toml README.md LICENSE /app/
+COPY src/ /app/src/

-# Install the package
-RUN pip install --no-cache-dir -e .
-RUN pip install requests
+# Install the package in development mode and required dependencies
+RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn

-# Create a volume mount point for models and cache
+# Create volume mount points
 VOLUME /models
 VOLUME /cache

-# Create a proxy server script
+# Create proxy server script directly in the Dockerfile
 RUN echo 'import os\n\
 import uvicorn\n\
 from fastapi import FastAPI, Request\n\
@@ -100,16 +101,17 @@ if __name__ == "__main__":\n\
     print(f"Available models: {models}")\n\
     if not models:\n\
         print("WARNING: No models found in the models directory.")\n\
-    uvicorn.run(app, host="0.0.0.0", port=3636)\n' > /app/proxy_server.py
+    uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py

-# Install FastAPI and Uvicorn
-RUN pip install --no-cache-dir fastapi uvicorn
-
 # Expose the proxy server port
 EXPOSE 3636

 # Set environment variables
 ENV PYTHONUNBUFFERED=1
+ENV MODELS_DIR=/models
+ENV CACHE_DIR=/cache
+ENV VERBOSE=true
+ENV TIMEOUT_MINUTES=30

 # Command to run when the container starts
 CMD ["python", "/app/proxy_server.py"]
compose.yaml: 14 changed lines

@@ -1,15 +1,23 @@
 services:
-  llama-cpp-runner:
+  owui-llama-cpp-runner:
     build: .
     container_name: owui-llama-cpp-runner
     ports:
       - "3636:3636"
     volumes:
-      - ./models:/models
-      - ./cache:/cache
+      - ./models:/models # local mount
+      - ./cache:/cache # local mount
+      # Remove . from the paths above to use native docker volumes
     environment:
       - MODELS_DIR=/models
       - CACHE_DIR=/cache
       - VERBOSE=true
       - TIMEOUT_MINUTES=30
+      - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin
    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:3636/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
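The healthcheck above simply curls the proxy's root endpoint. As an illustration only (not part of this commit), a minimal host-side sketch of the same check in Python, assuming the compose stack is running, port 3636 is published as configured, and the requests library is available on the host:

# Host-side sketch of the check the compose healthcheck performs.
# Assumes the stack from compose.yaml is up and port 3636 is reachable on localhost.
import requests

resp = requests.get("http://localhost:3636/", timeout=10)
resp.raise_for_status()
payload = resp.json()
print(payload["status"])   # "running" once the proxy is up
print(payload["models"])   # model files found under the mounted ./models directory

The expected response shape comes from the proxy's read_root handler in proxy_server.py below.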
proxy_server.py (new file): 79 added lines

@@ -0,0 +1,79 @@
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse, JSONResponse
+from llama_cpp_runner.main import LlamaCpp
+
+app = FastAPI(title="LlamaCpp Proxy")
+
+# Initialize the LlamaCpp class
+models_dir = os.environ.get("MODELS_DIR", "/models")
+cache_dir = os.environ.get("CACHE_DIR", "/cache")
+verbose = os.environ.get("VERBOSE", "true").lower() == "true"
+timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))
+
+print(f"Models directory: {models_dir}")
+print(f"Cache directory: {cache_dir}")
+
+# Create the LlamaCpp instance
+llama_runner = LlamaCpp(
+    models_dir=models_dir,
+    cache_dir=cache_dir,
+    verbose=verbose,
+    timeout_minutes=timeout
+)
+
+@app.get("/")
+def read_root():
+    """Get server status and list of available models."""
+    return {"status": "running", "models": llama_runner.list_models()}
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Forward chat completion requests to the LlamaCpp server."""
+    try:
+        body = await request.json()
+
+        if "model" not in body:
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Model not specified in request"}
+            )
+
+        try:
+            result = llama_runner.chat_completion(body)
+
+            # Handle streaming responses
+            if body.get("stream", False):
+                async def generate():
+                    for line in result:
+                        if line:
+                            yield f"data: {line}\n\n"
+                    yield "data: [DONE]\n\n"
+
+                return StreamingResponse(generate(), media_type="text/event-stream")
+            else:
+                return result
+        except Exception as e:
+            return JSONResponse(
+                status_code=500,
+                content={"error": str(e)}
+            )
+    except Exception as e:
+        return JSONResponse(
+            status_code=400,
+            content={"error": f"Invalid request: {str(e)}"}
+        )
+
+@app.get("/models")
+def list_models():
+    """List all available models."""
+    return {"models": llama_runner.list_models()}
+
+if __name__ == "__main__":
+    print("Starting LlamaCpp Proxy Server on port 3636")
+    models = llama_runner.list_models()
+    print(f"Available models: {models}")
+    if not models:
+        print("WARNING: No models found in the models directory.")
+    uvicorn.run(app, host="0.0.0.0", port=3636)
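For illustration, a hypothetical streaming client for the /v1/chat/completions route above, using the requests library. The model name is a placeholder, and the OpenAI-style "messages" body is an assumption about what the underlying llama.cpp server accepts; chunks are printed verbatim as the proxy forwards them, terminated by the "data: [DONE]" marker.

# Hypothetical client sketch (not part of this commit).
# "my-model.gguf" is a placeholder; pick a name returned by GET /models.
import requests

body = {
    "model": "my-model.gguf",
    "messages": [{"role": "user", "content": "Hello!"}],  # assumed OpenAI-style chat body
    "stream": True,
}

with requests.post("http://localhost:3636/v1/chat/completions", json=body, stream=True) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data: "):
            continue
        data = raw[len("data: "):]
        if data == "[DONE]":
            break
        print(data)  # each chunk is forwarded verbatim from the llama.cpp server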