mirror of https://github.com/open-webui/llama-cpp-runner (synced 2025-05-10 06:41:26 +00:00)

Commit ef461ac9e1 ("refac"), parent 3d15266ea6
@@ -1,7 +1,7 @@
[project]
name = "llama-cpp-runner"
version = "0.0.1"
description = "A runner for llama-cpp"
description = "Quick and easy way to run large language models (LLMs) with llama.cpp"
readme = "README.md"
authors = [
    { name = "Timothy Jaeryang Baek", email = "tim@openwebui.com" }
@@ -9,27 +9,41 @@ import stat
import time
import socket

import os
import platform
import requests
import zipfile
import json


class LlamaCpp:
    def __init__(
        self, models_dir, cache_dir="./cache", verbose=False, timeout_minutes=5
        self,
        models_dir,
        cache_dir="~/.llama_cpp_runner",
        verbose=False,
        timeout_minutes=5,
        pinned_version=None,
    ):
        """
        Initialize the LlamaCpp class.

        Args:
            models_dir (str): Directory where GGUF models are stored.
            cache_dir (str): Directory to store llama.cpp binaries and related assets.
            cache_dir (str): Directory to store llama.cpp binaries and related assets. Defaults to '~/.llama_cpp_runner'.
            verbose (bool): Whether to enable verbose logging.
            timeout_minutes (int): Timeout for shutting down idle servers.
            pinned_version (str or None): Pinned release version of llama.cpp binaries.
        """
        self.models_dir = models_dir
        self.cache_dir = cache_dir
        self.cache_dir = os.path.expanduser(
            cache_dir
        )  # Ensure cache is in a fixed location
        self.verbose = verbose
        self.timeout_minutes = timeout_minutes
        self.pinned_version = pinned_version  # Optional pinned version
        self.llama_cpp_path = (
            self._install_llama_cpp_binaries()
        )  # Handle binaries installation
        )  # Install the required binaries
        self.servers = (
            {}
        )  # Maintain a mapping of model names to LlamaCppServer instances
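For orientation, a minimal usage sketch of the refactored constructor. The import path, model directory, and release tag are assumptions for illustration; only the parameter names and defaults come from the diff.

# Minimal sketch, not part of the commit; import path and values are assumed.
from llama_cpp_runner.main import LlamaCpp  # assumed module path

runner = LlamaCpp(
    models_dir="./models",            # directory holding *.gguf files
    cache_dir="~/.llama_cpp_runner",  # new default; expanded via os.path.expanduser
    verbose=True,
    timeout_minutes=5,
    pinned_version=None,              # or a specific llama.cpp release tag (placeholder)
)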
@@ -37,7 +51,6 @@ class LlamaCpp:
    def list_models(self):
        """
        List all GGUF models available in the `models_dir`.

        Returns:
            list: A list of model names (files ending in ".gguf").
        """
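Continuing the constructor sketch above, list_models could be exercised like this; the printed file name is a hypothetical example.

for name in runner.list_models():
    print(name)  # e.g. "qwen2.5-0.5b-instruct-q4_k_m.gguf" (placeholder name)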
@@ -51,37 +64,29 @@ class LlamaCpp:
    def chat_completion(self, body):
        """
        Handle chat completion requests.

        Args:
            body (dict): The payload for the chat completion request. It must contain the "model" key.

        Returns:
            dict or generator: Response from the server (non-streaming or streaming mode).
        """
        if "model" not in body:
            raise ValueError("The request body must contain a 'model' key.")

        model_name = body["model"]
        gguf_path = os.path.join(self.models_dir, model_name)

        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"Model file not found: {gguf_path}")

        # Check if the server for this model is already running
        if model_name not in self.servers or not self.servers[model_name]._server_url:
            self._log(f"Initializing a new server for model: {model_name}")
            self.servers[model_name] = self._create_server(gguf_path)

        server = self.servers[model_name]
        return server.chat_completion(body)

    def _create_server(self, gguf_path):
        """
        Create a new LlamaCppServer instance for the given model.

        Args:
            gguf_path (str): Path to the GGUF model file.

        Returns:
            LlamaCppServer: A new server instance.
        """
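Based on the docstring above, a request sketch for chat_completion. The diff only requires the "model" key; the model file name and the OpenAI-style "messages"/"stream" fields are placeholders and assumptions.

body = {
    "model": "qwen2.5-0.5b-instruct-q4_k_m.gguf",         # must exist under models_dir (placeholder)
    "messages": [{"role": "user", "content": "Hello!"}],  # assumed OpenAI-style payload
    "stream": False,
}
result = runner.chat_completion(body)  # dict (non-streaming) or generator (streaming)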
@@ -96,49 +101,63 @@ class LlamaCpp:
    def _install_llama_cpp_binaries(self):
        """
        Download and install llama.cpp binaries.

        Returns:
            str: Path to the installed llama.cpp binaries.
        """
        self._log("Installing llama.cpp binaries...")
        release_info = self._get_latest_release()
        try:
            # Use pinned version if provided, otherwise fetch the latest release
            release_info = self._get_release_info()
            assets = release_info["assets"]
            asset = self._get_appropriate_asset(assets)
            if not asset:
                raise RuntimeError("No appropriate binary found for your system.")

            asset_name = asset["name"]

            # Check if cached binaries match the required version
            if self._check_cache(release_info, asset):
                self._log("Using cached llama.cpp binaries.")
            else:
                if not self._internet_available():
                    raise RuntimeError(
                        "No cached binary available and unable to fetch from the internet."
                    )
                self._download_and_unzip(asset["browser_download_url"], asset_name)
                self._update_cache_info(release_info, asset)

        except Exception as e:
            self._log(f"Error during binary installation: {e}")
            raise

        return os.path.join(self.cache_dir, "llama_cpp")

    def _get_latest_release(self):
    def _get_release_info(self):
        """
        Fetch the latest release of llama.cpp from GitHub.

        Fetch metadata of the specified release (pinned or latest) from GitHub.
        Returns:
            dict: Release information.
        """
        if self.pinned_version:
            api_url = f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{self.pinned_version}"
        else:
            api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"

        if not self._internet_available():
            # Fall back to cache if no internet access
            raise RuntimeError("No internet access and no cached version available.")

        response = requests.get(api_url)
        if response.status_code == 200:
            return response.json()
        else:
            raise RuntimeError(
                f"Failed to fetch release info. Status code: {response.status_code}"
            )
            error_reason = f"Failed to fetch release info: HTTP {response.status_code}"
            raise RuntimeError(error_reason)

    def _get_appropriate_asset(self, assets):
        """
        Select the appropriate binary asset for the current system.

        Args:
            assets (list): List of asset metadata from the release.

        Returns:
            dict or None: Matching asset metadata, or None if no match found.
        """
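The pinned-vs-latest selection in _get_release_info condenses to the following standalone sketch; it simply restates the branch from the diff with pinned_version passed in as an argument.

def release_api_url(pinned_version=None):
    # Pinned tag if given, otherwise the latest llama.cpp release.
    base = "https://api.github.com/repos/ggerganov/llama.cpp/releases"
    return f"{base}/tags/{pinned_version}" if pinned_version else f"{base}/latest"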
@@ -168,13 +187,11 @@ class LlamaCpp:
    def _check_cache(self, release_info, asset):
        """
        Check whether the latest binaries are already cached.

        Args:
            release_info (dict): Metadata of the latest release.
            asset (dict): Metadata of the selected asset.

        Returns:
            bool: True if the cached binary matches the latest release, False otherwise.
            bool: True if the cached binary matches the required release, False otherwise.
        """
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        if os.path.exists(cache_info_path):
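A sketch of what the cache_info.json comparison likely amounts to; the hunk ends before the comparison itself, so the stored field names ("tag_name", "asset_name") are assumptions, not taken from the diff.

import json
import os

def binaries_cached(cache_dir, release_info, asset):
    # Hypothetical field names; the diff only shows the existence check.
    path = os.path.join(cache_dir, "cache_info.json")
    if not os.path.exists(path):
        return False
    with open(path) as f:
        cached = json.load(f)
    return (
        cached.get("tag_name") == release_info.get("tag_name")
        and cached.get("asset_name") == asset.get("name")
    )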
@@ -190,7 +207,6 @@ class LlamaCpp:
    def _download_and_unzip(self, url, asset_name):
        """
        Download and extract llama.cpp binaries.

        Args:
            url (str): URL of the asset to download.
            asset_name (str): Name of the asset file.
@@ -205,7 +221,6 @@ class LlamaCpp:
            self._log(f"Successfully downloaded: {asset_name}")
        else:
            raise RuntimeError(f"Failed to download binary: {url}")

        extract_dir = os.path.join(self.cache_dir, "llama_cpp")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
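For reference, the download-and-extract flow shown in this hunk corresponds roughly to the standalone sketch below; the zip_path construction and streaming download are assumptions, since that part of the method is outside the hunk.

import os
import zipfile

import requests

def download_and_unzip(url, asset_name, cache_dir):
    # Stream the release asset to disk, then extract it into cache_dir/llama_cpp.
    zip_path = os.path.join(cache_dir, asset_name)
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download binary: {url}")
    with open(zip_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    extract_dir = os.path.join(cache_dir, "llama_cpp")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    return extract_dir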
@@ -214,7 +229,6 @@ class LlamaCpp:
    def _update_cache_info(self, release_info, asset):
        """
        Update cache metadata with the downloaded release info.

        Args:
            release_info (dict): Metadata of the latest release.
            asset (dict): Metadata of the downloaded asset.
@@ -224,10 +238,21 @@ class LlamaCpp:
        with open(cache_info_path, "w") as f:
            json.dump(cache_info, f)

    def _internet_available(self):
        """
        Check for internet connectivity.
        Returns:
            bool: True if the internet is accessible, False otherwise.
        """
        try:
            requests.get("https://api.github.com", timeout=3)
            return True
        except requests.ConnectionError:
            return False

    def _log(self, message):
        """
        Print a log message if verbosity is enabled.

        Args:
            message (str): Log message to print.
        """