diff --git a/.gitignore b/.gitignore
index 823e384..631947d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
-llama_cpp_cache
-cache
-dist
\ No newline at end of file
+llama_cpp_cache/
+cache/
+dist/
+models/*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1f3d580
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,117 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install only essential packages and clean up in one layer to reduce image size
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    wget \
+    git \
+    build-essential \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy only necessary files
+COPY pyproject.toml README.md LICENSE /app/
+COPY src/ /app/src/
+
+# Install the package in development mode and required dependencies
+RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn
+
+# Create volume mount points
+VOLUME /models
+VOLUME /cache
+
+# Create proxy server script directly in the Dockerfile
+RUN echo 'import os\n\
+import uvicorn\n\
+from fastapi import FastAPI, Request\n\
+from fastapi.responses import StreamingResponse, JSONResponse\n\
+from llama_cpp_runner.main import LlamaCpp\n\
+\n\
+app = FastAPI(title="LlamaCpp Proxy")\n\
+\n\
+# Initialize the LlamaCpp class\n\
+models_dir = os.environ.get("MODELS_DIR", "/models")\n\
+cache_dir = os.environ.get("CACHE_DIR", "/cache")\n\
+verbose = os.environ.get("VERBOSE", "true").lower() == "true"\n\
+timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))\n\
+\n\
+print(f"Models directory: {models_dir}")\n\
+print(f"Cache directory: {cache_dir}")\n\
+\n\
+# Create the LlamaCpp instance\n\
+llama_runner = LlamaCpp(\n\
+    models_dir=models_dir,\n\
+    cache_dir=cache_dir,\n\
+    verbose=verbose,\n\
+    timeout_minutes=timeout\n\
+)\n\
+\n\
+@app.get("/")\n\
+def read_root():\n\
+    """Get server status and list of available models."""\n\
+    return {"status": "running", "models": llama_runner.list_models()}\n\
+\n\
+@app.post("/v1/chat/completions")\n\
+async def chat_completions(request: Request):\n\
+    """Forward chat completion requests to the LlamaCpp server."""\n\
+    try:\n\
+        body = await request.json()\n\
+\n\
+        if "model" not in body:\n\
+            return JSONResponse(\n\
+                status_code=400,\n\
+                content={"error": "Model not specified in request"}\n\
+            )\n\
+\n\
+        try:\n\
+            result = llama_runner.chat_completion(body)\n\
+\n\
+            # Handle streaming responses\n\
+            if body.get("stream", False):\n\
+                async def generate():\n\
+                    for line in result:\n\
+                        if line:\n\
+                            yield f"data: {line}\\n\\n"\n\
+                    yield "data: [DONE]\\n\\n"\n\
+\n\
+                return StreamingResponse(generate(), media_type="text/event-stream")\n\
+            else:\n\
+                return result\n\
+        except Exception as e:\n\
+            return JSONResponse(\n\
+                status_code=500,\n\
+                content={"error": str(e)}\n\
+            )\n\
+    except Exception as e:\n\
+        return JSONResponse(\n\
+            status_code=400,\n\
+            content={"error": f"Invalid request: {str(e)}"}\n\
+        )\n\
+\n\
+@app.get("/models")\n\
+def list_models():\n\
+    """List all available models."""\n\
+    return {"models": llama_runner.list_models()}\n\
+\n\
+if __name__ == "__main__":\n\
+    print("Starting LlamaCpp Proxy Server on port 3636")\n\
+    models = llama_runner.list_models()\n\
+    print(f"Available models: {models}")\n\
+    if not models:\n\
+        print("WARNING: No models found in the models directory.")\n\
+    uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py
+
+# Expose the proxy server port
+EXPOSE 3636
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV MODELS_DIR=/models
+ENV CACHE_DIR=/cache
+ENV VERBOSE=true
+ENV TIMEOUT_MINUTES=30
+
+# Command to run when the container starts
+CMD ["python", "/app/proxy_server.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index f9e6173..88f425f 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,20 @@ Installing `llama-cpp-runner` is quick and easy! Just use pip:
 pip install llama-cpp-runner
 ```
 
+## Optional Installation (Docker)
+
+Clone the repository:
+
+```bash
+git clone https://github.com/open-webui/llama-cpp-runner
+```
+
+Build and run the container:
+
+```bash
+docker compose up -d
+```
+
 ## Usage 📖
 
 ### Initialize the Runner
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 0000000..bf2ae02
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,23 @@
+services:
+  owui-llama-cpp-runner:
+    build: .
+    container_name: owui-llama-cpp-runner
+    ports:
+      - "3636:3636"
+    volumes:
+      - ./models:/models # local mount
+      - ./cache:/cache # local mount
+      # Replace the ./ bind mounts above with named volumes to use native Docker volumes
+    environment:
+      - MODELS_DIR=/models
+      - CACHE_DIR=/cache
+      - VERBOSE=true
+      - TIMEOUT_MINUTES=30
+      - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:3636/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
\ No newline at end of file
diff --git a/proxy_server.py b/proxy_server.py
new file mode 100644
index 0000000..63685b5
--- /dev/null
+++ b/proxy_server.py
@@ -0,0 +1,79 @@
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse, JSONResponse
+from llama_cpp_runner.main import LlamaCpp
+
+app = FastAPI(title="LlamaCpp Proxy")
+
+# Initialize the LlamaCpp class
+models_dir = os.environ.get("MODELS_DIR", "/models")
+cache_dir = os.environ.get("CACHE_DIR", "/cache")
+verbose = os.environ.get("VERBOSE", "true").lower() == "true"
+timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))
+
+print(f"Models directory: {models_dir}")
+print(f"Cache directory: {cache_dir}")
+
+# Create the LlamaCpp instance
+llama_runner = LlamaCpp(
+    models_dir=models_dir,
+    cache_dir=cache_dir,
+    verbose=verbose,
+    timeout_minutes=timeout
+)
+
+@app.get("/")
+def read_root():
+    """Get server status and list of available models."""
+    return {"status": "running", "models": llama_runner.list_models()}
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Forward chat completion requests to the LlamaCpp server."""
+    try:
+        body = await request.json()
+
+        if "model" not in body:
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Model not specified in request"}
+            )
+
+        try:
+            result = llama_runner.chat_completion(body)
+
+            # Handle streaming responses
+            if body.get("stream", False):
+                async def generate():
+                    for line in result:
+                        if line:
+                            yield f"data: {line}\n\n"
+                    yield "data: [DONE]\n\n"
+
+                return StreamingResponse(generate(), media_type="text/event-stream")
+            else:
+                return result
+        except Exception as e:
+            return JSONResponse(
+                status_code=500,
+                content={"error": str(e)}
+            )
+    except Exception as e:
+        return JSONResponse(
+            status_code=400,
+            content={"error": f"Invalid request: {str(e)}"}
+        )
+
+@app.get("/models")
+def list_models():
+    """List all available models."""
+    return {"models": llama_runner.list_models()}
+
+if __name__ == "__main__":
+    print("Starting LlamaCpp Proxy Server on port 3636")
+    models = llama_runner.list_models()
+    print(f"Available models: {models}")
{models}") + if not models: + print("WARNING: No models found in the models directory.") + uvicorn.run(app, host="0.0.0.0", port=3636) \ No newline at end of file