# llama-cpp-runner/proxy_server.py

import os
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse
from llama_cpp_runner.main import LlamaCpp

# HTTP application object that the route decorators below attach to.
app = FastAPI(title="LlamaCpp Proxy")

# Runtime settings are read from the environment, each with a fallback value.
models_dir = os.getenv("MODELS_DIR", "/models")
cache_dir = os.getenv("CACHE_DIR", "/cache")
# Only the literal "true" (in any letter case) enables verbose mode.
verbose = os.getenv("VERBOSE", "true").lower() == "true"
# int() raises ValueError at import time if TIMEOUT_MINUTES is not an integer.
timeout = int(os.getenv("TIMEOUT_MINUTES", "30"))

print(f"Models directory: {models_dir}")
print(f"Cache directory: {cache_dir}")

# Single module-level runner shared by the endpoints in this file.
llama_runner = LlamaCpp(
    models_dir=models_dir,
    cache_dir=cache_dir,
    verbose=verbose,
    timeout_minutes=timeout,
)

@app.get("/")
def read_root():
    """Report that the server is running, plus the runner's model listing."""
    available = llama_runner.list_models()
    return {"status": "running", "models": available}

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Forward a chat completion request to the LlamaCpp runner.

    The parsed JSON body is passed to ``llama_runner.chat_completion``
    unchanged. When the body sets ``"stream"`` to a truthy value, the
    runner's result is iterated and every non-empty item is relayed as a
    server-sent event, followed by a final ``data: [DONE]`` event.
    Otherwise the runner's result is returned as the response body.

    Args:
        request: Incoming request; its body must be a JSON object that
            contains a ``"model"`` key.

    Returns:
        The runner's result (non-streaming), a ``StreamingResponse`` with
        media type ``text/event-stream`` (streaming), or a ``JSONResponse``
        with an ``"error"`` message: status 400 for a malformed body, a
        non-object body or a missing ``"model"`` key, and status 500 when
        the runner call raises.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    # Only the JSON parsing step can legitimately yield "Invalid request".
    try:
        body = await request.json()
    except ValueError as e:
        return JSONResponse(
            status_code=400,
            content={"error": f"Invalid request: {str(e)}"}
        )

    # Reject JSON arrays, strings and scalars up front: `"model" in body`
    # would otherwise do a substring/element test or raise TypeError.
    if not isinstance(body, dict):
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid request: JSON body must be an object"}
        )

    if "model" not in body:
        return JSONResponse(
            status_code=400,
            content={"error": "Model not specified in request"}
        )

    try:
        # chat_completion is a synchronous call; running it on a worker
        # thread keeps the event loop free to serve other requests.
        # NOTE(review): assumes LlamaCpp.chat_completion may be called from a
        # worker thread and concurrently with other calls; confirm.
        result = await run_in_threadpool(llama_runner.chat_completion, body)
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )

    if not body.get("stream", False):
        return result

    # Deliberately a plain (non-async) generator: StreamingResponse iterates
    # synchronous iterators in a thread pool, so waiting for the next chunk
    # does not stall the event loop.
    def generate():
        for line in result:
            if line:
                yield f"data: {line}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")

@app.get("/models")
def list_models():
    """Return the runner's model listing under the "models" key."""
    available = llama_runner.list_models()
    return {"models": available}

if __name__ == "__main__":
    # Script entry point: announce startup, report what the runner lists,
    # then serve the app with uvicorn on all interfaces, port 3636.
    print("Starting LlamaCpp Proxy Server on port 3636")

    available_models = llama_runner.list_models()
    print(f"Available models: {available_models}")

    # An empty listing is only warned about; the server still starts.
    if not available_models:
        print("WARNING: No models found in the models directory.")

    uvicorn.run(app, host="0.0.0.0", port=3636)