Timothy Jaeryang Baek 2025-01-28 13:38:49 -08:00
parent 3d15266ea6
commit ef461ac9e1
2 changed files with 68 additions and 43 deletions

pyproject.toml

@@ -1,7 +1,7 @@
 [project]
 name = "llama-cpp-runner"
 version = "0.0.1"
-description = "A runner for llama-cpp"
+description = "Quick and easy way to run large language models (LLMs) with llama.cpp"
 readme = "README.md"
 authors = [
     { name = "Timothy Jaeryang Baek", email = "tim@openwebui.com" }

llama_cpp_runner Python module (defines LlamaCpp)

@@ -9,27 +9,41 @@ import stat
 import time
 import socket
+import os
+import platform
+import requests
+import zipfile
+import json


 class LlamaCpp:
     def __init__(
-        self, models_dir, cache_dir="./cache", verbose=False, timeout_minutes=5
+        self,
+        models_dir,
+        cache_dir="~/.llama_cpp_runner",
+        verbose=False,
+        timeout_minutes=5,
+        pinned_version=None,
     ):
         """
         Initialize the LlamaCpp class.

         Args:
             models_dir (str): Directory where GGUF models are stored.
-            cache_dir (str): Directory to store llama.cpp binaries and related assets.
+            cache_dir (str): Directory to store llama.cpp binaries and related assets. Defaults to '~/.llama_cpp_runner'.
             verbose (bool): Whether to enable verbose logging.
             timeout_minutes (int): Timeout for shutting down idle servers.
+            pinned_version (str or None): Pinned release version of llama.cpp binaries.
         """
         self.models_dir = models_dir
-        self.cache_dir = cache_dir
+        self.cache_dir = os.path.expanduser(
+            cache_dir
+        )  # Ensure cache is in a fixed location
         self.verbose = verbose
         self.timeout_minutes = timeout_minutes
+        self.pinned_version = pinned_version  # Optional pinned version
         self.llama_cpp_path = (
             self._install_llama_cpp_binaries()
-        )  # Handle binaries installation
+        )  # Install the required binaries
         self.servers = (
             {}
         )  # Maintain a mapping of model names to LlamaCppServer instances
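For context, a minimal usage sketch of the revised constructor (the import path and model directory are assumptions, and "b4600" is only an illustrative llama.cpp release tag):

from llama_cpp_runner.main import LlamaCpp  # import path is an assumption

llama = LlamaCpp(
    models_dir="/path/to/gguf/models",  # hypothetical directory of .gguf files
    cache_dir="~/.llama_cpp_runner",    # tilde is expanded via os.path.expanduser
    verbose=True,
    pinned_version="b4600",             # illustrative tag; None pulls the latest release
)
print(llama.list_models())

Pinning a version keeps the cached binaries stable across runs instead of tracking whichever release happens to be newest.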
@@ -37,7 +51,6 @@ class LlamaCpp:
     def list_models(self):
         """
         List all GGUF models available in the `models_dir`.
-
         Returns:
             list: A list of model names (files ending in ".gguf").
         """
@@ -51,37 +64,29 @@ class LlamaCpp:
     def chat_completion(self, body):
         """
         Handle chat completion requests.
-
         Args:
             body (dict): The payload for the chat completion request. It must contain the "model" key.
-
         Returns:
             dict or generator: Response from the server (non-streaming or streaming mode).
         """
         if "model" not in body:
             raise ValueError("The request body must contain a 'model' key.")
-
         model_name = body["model"]
         gguf_path = os.path.join(self.models_dir, model_name)
-
         if not os.path.exists(gguf_path):
             raise FileNotFoundError(f"Model file not found: {gguf_path}")
-
         # Check if the server for this model is already running
         if model_name not in self.servers or not self.servers[model_name]._server_url:
             self._log(f"Initializing a new server for model: {model_name}")
             self.servers[model_name] = self._create_server(gguf_path)
-
         server = self.servers[model_name]
         return server.chat_completion(body)

     def _create_server(self, gguf_path):
         """
         Create a new LlamaCppServer instance for the given model.
-
         Args:
             gguf_path (str): Path to the GGUF model file.
-
         Returns:
             LlamaCppServer: A new server instance.
         """
@@ -96,49 +101,63 @@ class LlamaCpp:
     def _install_llama_cpp_binaries(self):
         """
         Download and install llama.cpp binaries.
-
         Returns:
             str: Path to the installed llama.cpp binaries.
         """
         self._log("Installing llama.cpp binaries...")
-        release_info = self._get_latest_release()
-        assets = release_info["assets"]
-        asset = self._get_appropriate_asset(assets)
-        if not asset:
-            raise RuntimeError("No appropriate binary found for your system.")
-
-        asset_name = asset["name"]
-        if self._check_cache(release_info, asset):
-            self._log("Using cached llama.cpp binaries.")
-        else:
-            self._download_and_unzip(asset["browser_download_url"], asset_name)
-            self._update_cache_info(release_info, asset)
-
+        try:
+            # Use pinned version if provided, otherwise fetch the latest release
+            release_info = self._get_release_info()
+            assets = release_info["assets"]
+            asset = self._get_appropriate_asset(assets)
+            if not asset:
+                raise RuntimeError("No appropriate binary found for your system.")
+            asset_name = asset["name"]
+
+            # Check if cached binaries match the required version
+            if self._check_cache(release_info, asset):
+                self._log("Using cached llama.cpp binaries.")
+            else:
+                if not self._internet_available():
+                    raise RuntimeError(
+                        "No cached binary available and unable to fetch from the internet."
+                    )
+                self._download_and_unzip(asset["browser_download_url"], asset_name)
+                self._update_cache_info(release_info, asset)
+        except Exception as e:
+            self._log(f"Error during binary installation: {e}")
+            raise
         return os.path.join(self.cache_dir, "llama_cpp")

-    def _get_latest_release(self):
+    def _get_release_info(self):
         """
-        Fetch the latest release of llama.cpp from GitHub.
-
+        Fetch metadata of the specified release (pinned or latest) from GitHub.
         Returns:
             dict: Release information.
         """
-        api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+        if self.pinned_version:
+            api_url = f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{self.pinned_version}"
+        else:
+            api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+
+        if not self._internet_available():
+            # Fall back to cache if no internet access
+            raise RuntimeError("No internet access and no cached version available.")
+
         response = requests.get(api_url)
         if response.status_code == 200:
             return response.json()
         else:
-            raise RuntimeError(
-                f"Failed to fetch release info. Status code: {response.status_code}"
-            )
+            error_reason = f"Failed to fetch release info: HTTP {response.status_code}"
+            raise RuntimeError(error_reason)

     def _get_appropriate_asset(self, assets):
         """
         Select the appropriate binary asset for the current system.
-
         Args:
             assets (list): List of asset metadata from the release.
-
         Returns:
             dict or None: Matching asset metadata, or None if no match found.
         """
@@ -168,13 +187,11 @@ class LlamaCpp:
     def _check_cache(self, release_info, asset):
         """
         Check whether the latest binaries are already cached.
-
         Args:
             release_info (dict): Metadata of the latest release.
             asset (dict): Metadata of the selected asset.
-
         Returns:
-            bool: True if the cached binary matches the latest release, False otherwise.
+            bool: True if the cached binary matches the required release, False otherwise.
         """
         cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
         if os.path.exists(cache_info_path):
@@ -190,7 +207,6 @@ class LlamaCpp:
     def _download_and_unzip(self, url, asset_name):
         """
         Download and extract llama.cpp binaries.
-
         Args:
             url (str): URL of the asset to download.
             asset_name (str): Name of the asset file.
@@ -205,7 +221,6 @@ class LlamaCpp:
             self._log(f"Successfully downloaded: {asset_name}")
         else:
             raise RuntimeError(f"Failed to download binary: {url}")
-
         extract_dir = os.path.join(self.cache_dir, "llama_cpp")
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
             zip_ref.extractall(extract_dir)
@@ -214,7 +229,6 @@ class LlamaCpp:
     def _update_cache_info(self, release_info, asset):
         """
         Update cache metadata with the downloaded release info.
-
         Args:
             release_info (dict): Metadata of the latest release.
             asset (dict): Metadata of the downloaded asset.
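The diff never shows which fields _update_cache_info writes; a plausible round-trip, assuming cache_info.json records the release tag and asset name that _check_cache compares against (both key names are hypothetical):

import json
import os

cache_info_path = os.path.join(
    os.path.expanduser("~/.llama_cpp_runner"), "cache_info.json"
)
with open(cache_info_path) as f:
    cache_info = json.load(f)

# Hypothetical keys -- the real ones are defined elsewhere in the module.
print(cache_info.get("tag_name"), cache_info.get("asset_name"))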
@@ -224,10 +238,21 @@ class LlamaCpp:
         with open(cache_info_path, "w") as f:
             json.dump(cache_info, f)

+    def _internet_available(self):
+        """
+        Check for internet connectivity.
+        Returns:
+            bool: True if the internet is accessible, False otherwise.
+        """
+        try:
+            requests.get("https://api.github.com", timeout=3)
+            return True
+        except requests.ConnectionError:
+            return False
+
     def _log(self, message):
         """
         Print a log message if verbosity is enabled.
-
         Args:
             message (str): Log message to print.
         """