Mirror of https://github.com/open-webui/llama-cpp-runner, synced 2025-05-24 21:24:31 +00:00

refactor and update

parent: f7ecd272b2
commit: fd8f43a156
.gitignore (vendored): 7 changed lines

@@ -1,3 +1,4 @@
-llama_cpp_cache
+llama_cpp_cache/
-cache
+cache/
-dist
+dist/
+models/*
Dockerfile: 28 changed lines

@@ -2,26 +2,27 @@ FROM python:3.11-slim

 WORKDIR /app

-# Install essential packages
-RUN apt-get update && apt-get install -y \
+# Install only essential packages and clean up in one layer to reduce image size
+RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
     wget \
     git \
     build-essential \
+    && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

-# Copy the project files
-COPY . /app/
+# Copy only necessary files
+COPY pyproject.toml README.md LICENSE /app/
+COPY src/ /app/src/

-# Install the package
-RUN pip install --no-cache-dir -e .
-RUN pip install requests
+# Install the package in development mode and required dependencies
+RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn

-# Create a volume mount point for models and cache
+# Create volume mount points
 VOLUME /models
 VOLUME /cache

-# Create a proxy server script
+# Create proxy server script directly in the Dockerfile
 RUN echo 'import os\n\
 import uvicorn\n\
 from fastapi import FastAPI, Request\n\
@@ -100,16 +101,17 @@ if __name__ == "__main__":\n\
     print(f"Available models: {models}")\n\
     if not models:\n\
         print("WARNING: No models found in the models directory.")\n\
-    uvicorn.run(app, host="0.0.0.0", port=3636)\n' > /app/proxy_server.py
+    uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py

-# Install FastAPI and Uvicorn
-RUN pip install --no-cache-dir fastapi uvicorn
-
 # Expose the proxy server port
 EXPOSE 3636

 # Set environment variables
 ENV PYTHONUNBUFFERED=1
+ENV MODELS_DIR=/models
+ENV CACHE_DIR=/cache
+ENV VERBOSE=true
+ENV TIMEOUT_MINUTES=30

 # Command to run when the container starts
 CMD ["python", "/app/proxy_server.py"]
compose.yaml: 14 changed lines

@@ -1,15 +1,23 @@
 services:
-  llama-cpp-runner:
+  owui-llama-cpp-runner:
     build: .
     container_name: owui-llama-cpp-runner
     ports:
       - "3636:3636"
     volumes:
-      - ./models:/models
-      - ./cache:/cache
+      - ./models:/models # local mount
+      - ./cache:/cache # local mount
+      # Remove . from the paths above to use native docker volumes
     environment:
       - MODELS_DIR=/models
       - CACHE_DIR=/cache
       - VERBOSE=true
       - TIMEOUT_MINUTES=30
+      - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin
    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:3636/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
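The healthcheck above simply curls the proxy's root endpoint. As an illustration only (not part of this commit), a minimal host-side sketch of the same check in Python, assuming the compose stack is running, port 3636 is published as configured, and the requests library is available on the host:

# Host-side sketch of the check the compose healthcheck performs.
# Assumes the stack from compose.yaml is up and port 3636 is reachable on localhost.
import requests

resp = requests.get("http://localhost:3636/", timeout=10)
resp.raise_for_status()
payload = resp.json()
print(payload["status"])   # "running" once the proxy is up
print(payload["models"])   # model files found under the mounted ./models directory

The expected response shape comes from the proxy's read_root handler in proxy_server.py below.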
proxy_server.py (new file): 79 added lines

@@ -0,0 +1,79 @@
+import os
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse, JSONResponse
+from llama_cpp_runner.main import LlamaCpp
+
+app = FastAPI(title="LlamaCpp Proxy")
+
+# Initialize the LlamaCpp class
+models_dir = os.environ.get("MODELS_DIR", "/models")
+cache_dir = os.environ.get("CACHE_DIR", "/cache")
+verbose = os.environ.get("VERBOSE", "true").lower() == "true"
+timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))
+
+print(f"Models directory: {models_dir}")
+print(f"Cache directory: {cache_dir}")
+
+# Create the LlamaCpp instance
+llama_runner = LlamaCpp(
+    models_dir=models_dir,
+    cache_dir=cache_dir,
+    verbose=verbose,
+    timeout_minutes=timeout
+)
+
+@app.get("/")
+def read_root():
+    """Get server status and list of available models."""
+    return {"status": "running", "models": llama_runner.list_models()}
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Forward chat completion requests to the LlamaCpp server."""
+    try:
+        body = await request.json()
+
+        if "model" not in body:
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Model not specified in request"}
+            )
+
+        try:
+            result = llama_runner.chat_completion(body)
+
+            # Handle streaming responses
+            if body.get("stream", False):
+                async def generate():
+                    for line in result:
+                        if line:
+                            yield f"data: {line}\n\n"
+                    yield "data: [DONE]\n\n"
+
+                return StreamingResponse(generate(), media_type="text/event-stream")
+            else:
+                return result
+        except Exception as e:
+            return JSONResponse(
+                status_code=500,
+                content={"error": str(e)}
+            )
+    except Exception as e:
+        return JSONResponse(
+            status_code=400,
+            content={"error": f"Invalid request: {str(e)}"}
+        )
+
+@app.get("/models")
+def list_models():
+    """List all available models."""
+    return {"models": llama_runner.list_models()}
+
+if __name__ == "__main__":
+    print("Starting LlamaCpp Proxy Server on port 3636")
+    models = llama_runner.list_models()
+    print(f"Available models: {models}")
+    if not models:
+        print("WARNING: No models found in the models directory.")
+    uvicorn.run(app, host="0.0.0.0", port=3636)
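For illustration, a hypothetical streaming client for the /v1/chat/completions route above, using the requests library. The model name is a placeholder, and the OpenAI-style "messages" body is an assumption about what the underlying llama.cpp server accepts; chunks are printed verbatim as the proxy forwards them, terminated by the "data: [DONE]" marker.

# Hypothetical client sketch (not part of this commit).
# "my-model.gguf" is a placeholder; pick a name returned by GET /models.
import requests

body = {
    "model": "my-model.gguf",
    "messages": [{"role": "user", "content": "Hello!"}],  # assumed OpenAI-style chat body
    "stream": True,
}

with requests.post("http://localhost:3636/v1/chat/completions", json=body, stream=True) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data: "):
            continue
        data = raw[len("data: "):]
        if data == "[DONE]":
            break
        print(data)  # each chunk is forwarded verbatim from the llama.cpp server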