refactor and update

pkeffect 2025-04-18 18:39:29 -04:00
parent f7ecd272b2
commit fd8f43a156
4 changed files with 110 additions and 20 deletions

.gitignore

@@ -1,3 +1,4 @@
-llama_cpp_cache
-cache
-dist
+llama_cpp_cache/
+cache/
+dist/
+models/*

Dockerfile

@@ -2,26 +2,27 @@ FROM python:3.11-slim
WORKDIR /app
-# Install essential packages
-RUN apt-get update && apt-get install -y \
+# Install only essential packages and clean up in one layer to reduce image size
+RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    wget \
    git \
    build-essential \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
-# Copy the project files
-COPY . /app/
+# Copy only necessary files
+COPY pyproject.toml README.md LICENSE /app/
+COPY src/ /app/src/
-# Install the package
-RUN pip install --no-cache-dir -e .
-RUN pip install requests
+# Install the package in development mode and required dependencies
+RUN pip install --no-cache-dir -e . && pip install --no-cache-dir requests fastapi uvicorn
-# Create a volume mount point for models and cache
+# Create volume mount points
VOLUME /models
VOLUME /cache
-# Create a proxy server script
+# Create proxy server script directly in the Dockerfile
RUN echo 'import os\n\
import uvicorn\n\
from fastapi import FastAPI, Request\n\
@@ -100,16 +101,17 @@ if __name__ == "__main__":\n\
print(f"Available models: {models}")\n\
if not models:\n\
print("WARNING: No models found in the models directory.")\n\
-uvicorn.run(app, host="0.0.0.0", port=3636)\n' > /app/proxy_server.py
-# Install FastAPI and Uvicorn
-RUN pip install --no-cache-dir fastapi uvicorn
+uvicorn.run(app, host="0.0.0.0", port=3636)' > /app/proxy_server.py
# Expose the proxy server port
EXPOSE 3636
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV MODELS_DIR=/models
ENV CACHE_DIR=/cache
ENV VERBOSE=true
ENV TIMEOUT_MINUTES=30
# Command to run when the container starts
CMD ["python", "/app/proxy_server.py"]

docker-compose.yml

@@ -1,15 +1,23 @@
services:
-  llama-cpp-runner:
+  owui-llama-cpp-runner:
    build: .
    container_name: owui-llama-cpp-runner
    ports:
      - "3636:3636"
    volumes:
-      - ./models:/models
-      - ./cache:/cache
+      - ./models:/models # local mount
+      - ./cache:/cache # local mount
+      # Remove . from the paths above to use native docker volumes
    environment:
      - MODELS_DIR=/models
      - CACHE_DIR=/cache
      - VERBOSE=true
      - TIMEOUT_MINUTES=30
+      - LD_LIBRARY_PATH=/cache/llama_cpp/build/bin
    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:3636/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
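
The healthcheck added above simply curls the proxy's root endpoint from inside the container. When the service keeps flipping to unhealthy, the same probe can be run from the host for debugging; a rough Python equivalent, assuming the published 3636 port and `requests` available locally:

```python
import sys
import requests

# Mirrors the compose healthcheck: healthy only if GET / returns HTTP 200.
try:
    resp = requests.get("http://localhost:3636/", timeout=10)
    resp.raise_for_status()
    print("healthy:", resp.json())
except Exception as exc:
    print("unhealthy:", exc)
    sys.exit(1)
```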

proxy_server.py (new file)

@@ -0,0 +1,79 @@
import os
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse
from llama_cpp_runner.main import LlamaCpp

app = FastAPI(title="LlamaCpp Proxy")

# Initialize the LlamaCpp class
models_dir = os.environ.get("MODELS_DIR", "/models")
cache_dir = os.environ.get("CACHE_DIR", "/cache")
verbose = os.environ.get("VERBOSE", "true").lower() == "true"
timeout = int(os.environ.get("TIMEOUT_MINUTES", "30"))

print(f"Models directory: {models_dir}")
print(f"Cache directory: {cache_dir}")

# Create the LlamaCpp instance
llama_runner = LlamaCpp(
    models_dir=models_dir,
    cache_dir=cache_dir,
    verbose=verbose,
    timeout_minutes=timeout
)


@app.get("/")
def read_root():
    """Get server status and list of available models."""
    return {"status": "running", "models": llama_runner.list_models()}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Forward chat completion requests to the LlamaCpp server."""
    try:
        body = await request.json()

        if "model" not in body:
            return JSONResponse(
                status_code=400,
                content={"error": "Model not specified in request"}
            )

        try:
            result = llama_runner.chat_completion(body)

            # Handle streaming responses
            if body.get("stream", False):
                async def generate():
                    for line in result:
                        if line:
                            yield f"data: {line}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(generate(), media_type="text/event-stream")
            else:
                return result
        except Exception as e:
            return JSONResponse(
                status_code=500,
                content={"error": str(e)}
            )
    except Exception as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Invalid request: {str(e)}"}
        )


@app.get("/models")
def list_models():
    """List all available models."""
    return {"models": llama_runner.list_models()}


if __name__ == "__main__":
    print("Starting LlamaCpp Proxy Server on port 3636")
    models = llama_runner.list_models()
    print(f"Available models: {models}")
    if not models:
        print("WARNING: No models found in the models directory.")
    uvicorn.run(app, host="0.0.0.0", port=3636)
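
For completeness, a sketch of how a client might call this proxy, exercising both the plain and the streaming paths of `/v1/chat/completions`. The model name is a placeholder for a file that actually exists in the models directory, `requests` is assumed on the client side, and the streaming loop parses the `data: ...` / `[DONE]` framing emitted by `generate()` above:

```python
import requests

BASE_URL = "http://localhost:3636"
MODEL = "example-model.gguf"  # placeholder: use a filename reported by GET /models

payload = {
    "model": MODEL,
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
}

# Non-streaming: the proxy returns the upstream JSON response as-is.
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
print(resp.json())

# Streaming: consume the text/event-stream response line by line.
stream_payload = dict(payload, stream=True)
with requests.post(
    f"{BASE_URL}/v1/chat/completions", json=stream_payload, stream=True, timeout=120
) as resp:
    for raw in resp.iter_lines():
        if not raw:
            continue
        line = raw.decode("utf-8")
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        print(data)  # format of each chunk depends on llama_cpp_runner's output
```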