mirror of https://github.com/open-webui/llama-cpp-runner (synced 2025-05-14 08:40:55 +00:00)
New Deployment
Added Dockerfile and compose.yaml as an option for deployment. Updated README.md
This commit is contained in:
parent 40c5cc4b0f
commit f7ecd272b2
115 Dockerfile Normal file
@@ -0,0 +1,115 @@
FROM python:3.11-slim

WORKDIR /app

# Install essential packages
RUN apt-get update && apt-get install -y \
    curl \
    wget \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy the project files
COPY . /app/

# Install the package
RUN pip install --no-cache-dir -e .
RUN pip install requests

# Create a volume mount point for models and cache
VOLUME /models
VOLUME /cache

# Create a proxy server script
RUN echo 'import os\n\
import uvicorn\n\
from fastapi import FastAPI, Request\n\
from fastapi.responses import StreamingResponse, JSONResponse\n\
from llama_cpp_runner.main import LlamaCpp\n\
\n\
app = FastAPI(title="LlamaCpp Proxy")\n\
\n\
# Initialize the LlamaCpp class\n\
models_dir = os.environ.get("MODELS_DIR", "/models")\n\
cache_dir = os.environ.get("CACHE_DIR", "/cache")\n\
verbose = os.environ.get("VERBOSE", "true").lower() == "true"\n\
timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))\n\
\n\
print(f"Models directory: {models_dir}")\n\
print(f"Cache directory: {cache_dir}")\n\
\n\
# Create the LlamaCpp instance\n\
llama_runner = LlamaCpp(\n\
    models_dir=models_dir,\n\
    cache_dir=cache_dir,\n\
    verbose=verbose,\n\
    timeout_minutes=timeout\n\
)\n\
\n\
@app.get("/")\n\
def read_root():\n\
    """Get server status and list of available models."""\n\
    return {"status": "running", "models": llama_runner.list_models()}\n\
\n\
@app.post("/v1/chat/completions")\n\
async def chat_completions(request: Request):\n\
    """Forward chat completion requests to the LlamaCpp server."""\n\
    try:\n\
        body = await request.json()\n\
\n\
        if "model" not in body:\n\
            return JSONResponse(\n\
                status_code=400,\n\
                content={"error": "Model not specified in request"}\n\
            )\n\
\n\
        try:\n\
            result = llama_runner.chat_completion(body)\n\
\n\
            # Handle streaming responses\n\
            if body.get("stream", False):\n\
                async def generate():\n\
                    for line in result:\n\
                        if line:\n\
                            yield f"data: {line}\\n\\n"\n\
                    yield "data: [DONE]\\n\\n"\n\
\n\
                return StreamingResponse(generate(), media_type="text/event-stream")\n\
            else:\n\
                return result\n\
        except Exception as e:\n\
            return JSONResponse(\n\
                status_code=500,\n\
                content={"error": str(e)}\n\
            )\n\
    except Exception as e:\n\
        return JSONResponse(\n\
            status_code=400,\n\
            content={"error": f"Invalid request: {str(e)}"}\n\
        )\n\
\n\
@app.get("/models")\n\
def list_models():\n\
    """List all available models."""\n\
    return {"models": llama_runner.list_models()}\n\
\n\
if __name__ == "__main__":\n\
    print("Starting LlamaCpp Proxy Server on port 3636")\n\
    models = llama_runner.list_models()\n\
    print(f"Available models: {models}")\n\
    if not models:\n\
        print("WARNING: No models found in the models directory.")\n\
    uvicorn.run(app, host="0.0.0.0", port=3636)\n' > /app/proxy_server.py

# Install FastAPI and Uvicorn
RUN pip install --no-cache-dir fastapi uvicorn

# Expose the proxy server port
EXPOSE 3636

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Command to run when the container starts
CMD ["python", "/app/proxy_server.py"]
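Once a container built from this Dockerfile is running, the embedded proxy listens on port 3636 and accepts OpenAI-style chat completion requests. Below is a minimal sketch of a non-streaming call with `requests`, assuming a hypothetical model file named `MyModel.gguf` has been placed in the mounted models directory:

```python
import requests

# "MyModel.gguf" is a hypothetical filename; use a GGUF file that actually
# exists in the mounted /models volume (the /models endpoint lists them).
payload = {
    "model": "MyModel.gguf",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": False,
}

# proxy_server.py serves /v1/chat/completions on port 3636.
response = requests.post("http://localhost:3636/v1/chat/completions", json=payload)
response.raise_for_status()
print(response.json())
```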
14 README.md
@@ -25,6 +25,20 @@ Installing `llama-cpp-runner` is quick and easy! Just use pip:
pip install llama-cpp-runner
```

## Optional Installation (Docker)

Clone the repository

```bash
git clone https://github.com/open-webui/llama-cpp-runner
```

Build and run

```bash
docker compose up -d
```
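After `docker compose up -d`, the proxy's root endpoint can serve as a quick sanity check. A small sketch, assuming the container is reachable on localhost:

```python
import requests

# GET / returns {"status": "running", "models": [...]} as defined in proxy_server.py.
info = requests.get("http://localhost:3636/").json()
print("Status:", info["status"])
print("Available models:", info["models"])
```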
## Usage 📖

### Initialize the Runner
15 compose.yaml Normal file
@@ -0,0 +1,15 @@
services:
  llama-cpp-runner:
    build: .
    container_name: owui-llama-cpp-runner
    ports:
      - "3636:3636"
    volumes:
      - ./models:/models
      - ./cache:/cache
    environment:
      - MODELS_DIR=/models
      - CACHE_DIR=/cache
      - VERBOSE=true
      - TIMEOUT_MINUTES=30
    restart: unless-stopped
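Because the proxy forwards streaming responses as server-sent events, a client can consume tokens as they arrive. A minimal sketch with `requests`, again assuming a hypothetical `MyModel.gguf` inside the `./models` volume mounted by this compose file:

```python
import requests

payload = {
    "model": "MyModel.gguf",  # hypothetical filename; replace with a real GGUF in ./models
    "messages": [{"role": "user", "content": "Tell me a joke."}],
    "stream": True,
}

# The proxy emits "data: ..." lines and a final "data: [DONE]" sentinel.
with requests.post("http://localhost:3636/v1/chat/completions", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue
        if line == "data: [DONE]":
            break
        print(line.removeprefix("data: "))
```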