mirror of https://github.com/open-webui/llama-cpp-runner
synced 2025-05-10 14:51:23 +00:00
refac
This commit is contained in:
parent 3d15266ea6
commit ef461ac9e1
@@ -1,7 +1,7 @@
 [project]
 name = "llama-cpp-runner"
 version = "0.0.1"
-description = "A runner for llama-cpp"
+description = "Quick and easy way to run large language models (LLMs) with llama.cpp"
 readme = "README.md"
 authors = [
     { name = "Timothy Jaeryang Baek", email = "tim@openwebui.com" }
@@ -9,27 +9,41 @@ import stat
 import time
 import socket
 
+import os
+import platform
+import requests
+import zipfile
+import json
 
 
 class LlamaCpp:
     def __init__(
-        self, models_dir, cache_dir="./cache", verbose=False, timeout_minutes=5
+        self,
+        models_dir,
+        cache_dir="~/.llama_cpp_runner",
+        verbose=False,
+        timeout_minutes=5,
+        pinned_version=None,
     ):
         """
         Initialize the LlamaCpp class.
 
         Args:
             models_dir (str): Directory where GGUF models are stored.
-            cache_dir (str): Directory to store llama.cpp binaries and related assets.
+            cache_dir (str): Directory to store llama.cpp binaries and related assets. Defaults to '~/.llama_cpp_runner'.
             verbose (bool): Whether to enable verbose logging.
             timeout_minutes (int): Timeout for shutting down idle servers.
+            pinned_version (str or None): Pinned release version of llama.cpp binaries.
         """
         self.models_dir = models_dir
-        self.cache_dir = cache_dir
+        self.cache_dir = os.path.expanduser(
+            cache_dir
+        )  # Ensure cache is in a fixed location
         self.verbose = verbose
         self.timeout_minutes = timeout_minutes
+        self.pinned_version = pinned_version  # Optional pinned version
         self.llama_cpp_path = (
             self._install_llama_cpp_binaries()
-        )  # Handle binaries installation
+        )  # Install the required binaries
         self.servers = (
             {}
         )  # Maintain a mapping of model names to LlamaCppServer instances
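For context, a minimal usage sketch of the revised constructor (not part of the commit). The import path and the example values are assumptions; only the parameter names and defaults come from the diff above.

    # Sketch only; the "llama_cpp_runner" import path is an assumption.
    from llama_cpp_runner import LlamaCpp

    runner = LlamaCpp(
        models_dir="/path/to/gguf/models",  # directory containing .gguf files
        cache_dir="~/.llama_cpp_runner",    # expanded with os.path.expanduser
        verbose=True,
        timeout_minutes=5,
        pinned_version=None,                # or a llama.cpp release tag, e.g. "b4500" (hypothetical)
    )
    print(runner.list_models())             # lists files ending in ".gguf"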
@@ -37,7 +51,6 @@ class LlamaCpp:
     def list_models(self):
         """
         List all GGUF models available in the `models_dir`.
 
         Returns:
             list: A list of model names (files ending in ".gguf").
         """
@@ -51,37 +64,29 @@
     def chat_completion(self, body):
         """
         Handle chat completion requests.
 
         Args:
             body (dict): The payload for the chat completion request. It must contain the "model" key.
 
         Returns:
             dict or generator: Response from the server (non-streaming or streaming mode).
         """
         if "model" not in body:
             raise ValueError("The request body must contain a 'model' key.")
 
         model_name = body["model"]
         gguf_path = os.path.join(self.models_dir, model_name)
 
         if not os.path.exists(gguf_path):
             raise FileNotFoundError(f"Model file not found: {gguf_path}")
 
         # Check if the server for this model is already running
         if model_name not in self.servers or not self.servers[model_name]._server_url:
             self._log(f"Initializing a new server for model: {model_name}")
             self.servers[model_name] = self._create_server(gguf_path)
 
         server = self.servers[model_name]
         return server.chat_completion(body)
 
     def _create_server(self, gguf_path):
         """
         Create a new LlamaCppServer instance for the given model.
 
         Args:
             gguf_path (str): Path to the GGUF model file.
 
         Returns:
             LlamaCppServer: A new server instance.
         """
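Continuing the constructor sketch above, a hedged example of a chat_completion call. The code in this hunk only requires the "model" key; the "messages" field below is an assumption based on the usual OpenAI-style chat payload.

    # Hypothetical payload; only the "model" key is required by the code above.
    body = {
        "model": "example-model.gguf",  # must match a file in models_dir (hypothetical name)
        "messages": [                   # assumed OpenAI-style field
            {"role": "user", "content": "Hello!"}
        ],
    }
    result = runner.chat_completion(body)  # dict (non-streaming) or generator (streaming)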
@@ -96,49 +101,63 @@ class LlamaCpp:
     def _install_llama_cpp_binaries(self):
         """
         Download and install llama.cpp binaries.
 
         Returns:
             str: Path to the installed llama.cpp binaries.
         """
         self._log("Installing llama.cpp binaries...")
-        release_info = self._get_latest_release()
-        assets = release_info["assets"]
-        asset = self._get_appropriate_asset(assets)
-        if not asset:
-            raise RuntimeError("No appropriate binary found for your system.")
-
-        asset_name = asset["name"]
-        if self._check_cache(release_info, asset):
-            self._log("Using cached llama.cpp binaries.")
-        else:
-            self._download_and_unzip(asset["browser_download_url"], asset_name)
-            self._update_cache_info(release_info, asset)
+        try:
+            # Use pinned version if provided, otherwise fetch the latest release
+            release_info = self._get_release_info()
+            assets = release_info["assets"]
+            asset = self._get_appropriate_asset(assets)
+            if not asset:
+                raise RuntimeError("No appropriate binary found for your system.")
+            asset_name = asset["name"]
+
+            # Check if cached binaries match the required version
+            if self._check_cache(release_info, asset):
+                self._log("Using cached llama.cpp binaries.")
+            else:
+                if not self._internet_available():
+                    raise RuntimeError(
+                        "No cached binary available and unable to fetch from the internet."
+                    )
+                self._download_and_unzip(asset["browser_download_url"], asset_name)
+                self._update_cache_info(release_info, asset)
+        except Exception as e:
+            self._log(f"Error during binary installation: {e}")
+            raise
 
         return os.path.join(self.cache_dir, "llama_cpp")
 
-    def _get_latest_release(self):
+    def _get_release_info(self):
         """
-        Fetch the latest release of llama.cpp from GitHub.
+        Fetch metadata of the specified release (pinned or latest) from GitHub.
 
         Returns:
             dict: Release information.
         """
-        api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+        if self.pinned_version:
+            api_url = f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{self.pinned_version}"
+        else:
+            api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+
+        if not self._internet_available():
+            # Fall back to cache if no internet access
+            raise RuntimeError("No internet access and no cached version available.")
+
         response = requests.get(api_url)
         if response.status_code == 200:
             return response.json()
         else:
-            raise RuntimeError(
-                f"Failed to fetch release info. Status code: {response.status_code}"
-            )
+            error_reason = f"Failed to fetch release info: HTTP {response.status_code}"
+            raise RuntimeError(error_reason)
 
     def _get_appropriate_asset(self, assets):
         """
         Select the appropriate binary asset for the current system.
 
         Args:
             assets (list): List of asset metadata from the release.
 
         Returns:
             dict or None: Matching asset metadata, or None if no match found.
         """
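To choose a value for pinned_version, the available llama.cpp release tags can be listed from the same GitHub API this hunk queries. A small standalone sketch, not part of the commit; the tag format mentioned in the comment is illustrative.

    import requests

    # List recent llama.cpp releases and print their tag names (e.g. "b4567", illustrative).
    resp = requests.get(
        "https://api.github.com/repos/ggerganov/llama.cpp/releases", timeout=10
    )
    resp.raise_for_status()
    for release in resp.json():
        print(release["tag_name"])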
@@ -168,13 +187,11 @@ class LlamaCpp:
     def _check_cache(self, release_info, asset):
         """
         Check whether the latest binaries are already cached.
 
         Args:
             release_info (dict): Metadata of the latest release.
             asset (dict): Metadata of the selected asset.
 
         Returns:
-            bool: True if the cached binary matches the latest release, False otherwise.
+            bool: True if the cached binary matches the required release, False otherwise.
         """
         cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
         if os.path.exists(cache_info_path):
@@ -190,7 +207,6 @@ class LlamaCpp:
     def _download_and_unzip(self, url, asset_name):
         """
         Download and extract llama.cpp binaries.
 
         Args:
             url (str): URL of the asset to download.
             asset_name (str): Name of the asset file.
@@ -205,7 +221,6 @@ class LlamaCpp:
             self._log(f"Successfully downloaded: {asset_name}")
         else:
             raise RuntimeError(f"Failed to download binary: {url}")
 
         extract_dir = os.path.join(self.cache_dir, "llama_cpp")
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
             zip_ref.extractall(extract_dir)
@@ -214,7 +229,6 @@ class LlamaCpp:
     def _update_cache_info(self, release_info, asset):
         """
         Update cache metadata with the downloaded release info.
 
         Args:
             release_info (dict): Metadata of the latest release.
             asset (dict): Metadata of the downloaded asset.
@@ -224,10 +238,21 @@ class LlamaCpp:
         with open(cache_info_path, "w") as f:
             json.dump(cache_info, f)
 
+    def _internet_available(self):
+        """
+        Check for internet connectivity.
+        Returns:
+            bool: True if the internet is accessible, False otherwise.
+        """
+        try:
+            requests.get("https://api.github.com", timeout=3)
+            return True
+        except requests.ConnectionError:
+            return False
+
     def _log(self, message):
         """
         Print a log message if verbosity is enabled.
 
         Args:
             message (str): Log message to print.
         """