Timothy Jaeryang Baek 2025-01-28 13:38:49 -08:00
parent 3d15266ea6
commit ef461ac9e1
2 changed files with 68 additions and 43 deletions

View File

@@ -1,7 +1,7 @@
[project]
name = "llama-cpp-runner"
version = "0.0.1"
description = "A runner for llama-cpp"
description = "Quick and easy way to run large language models (LLMs) with llama.cpp"
readme = "README.md"
authors = [
{ name = "Timothy Jaeryang Baek", email = "tim@openwebui.com" }

View File

@@ -9,27 +9,41 @@ import stat
import time
import socket
import os
import platform
import requests
import zipfile
import json
class LlamaCpp:
def __init__(
self, models_dir, cache_dir="./cache", verbose=False, timeout_minutes=5
self,
models_dir,
cache_dir="~/.llama_cpp_runner",
verbose=False,
timeout_minutes=5,
pinned_version=None,
):
"""
Initialize the LlamaCpp class.
Args:
models_dir (str): Directory where GGUF models are stored.
-cache_dir (str): Directory to store llama.cpp binaries and related assets.
+cache_dir (str): Directory to store llama.cpp binaries and related assets. Defaults to '~/.llama_cpp_runner'.
verbose (bool): Whether to enable verbose logging.
timeout_minutes (int): Timeout for shutting down idle servers.
+pinned_version (str or None): Release tag of the llama.cpp binaries to pin; the latest release is used when None.
"""
self.models_dir = models_dir
-self.cache_dir = cache_dir
+self.cache_dir = os.path.expanduser(
+cache_dir
+) # Ensure cache is in a fixed location
self.verbose = verbose
self.timeout_minutes = timeout_minutes
+self.pinned_version = pinned_version # Optional pinned version
self.llama_cpp_path = (
self._install_llama_cpp_binaries()
-) # Handle binaries installation
+) # Install the required binaries
self.servers = (
{}
) # Maintain a mapping of model names to LlamaCppServer instances
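As a rough usage sketch of the new constructor signature (the import path, directories, and release tag below are placeholders, not values taken from this commit):

from llama_cpp_runner.main import LlamaCpp  # assumed import path

llama = LlamaCpp(
    models_dir="./models",            # directory holding *.gguf files (placeholder)
    cache_dir="~/.llama_cpp_runner",  # expanded via os.path.expanduser in __init__
    verbose=True,
    timeout_minutes=5,
    pinned_version="b4570",           # hypothetical llama.cpp release tag; None uses the latest
)
print(llama.list_models())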
@@ -37,7 +51,6 @@ class LlamaCpp:
def list_models(self):
"""
List all GGUF models available in the `models_dir`.
Returns:
list: A list of model names (files ending in ".gguf").
"""
@@ -51,37 +64,29 @@ class LlamaCpp:
def chat_completion(self, body):
"""
Handle chat completion requests.
Args:
body (dict): The payload for the chat completion request. It must contain the "model" key.
Returns:
dict or generator: Response from the server (non-streaming or streaming mode).
"""
if "model" not in body:
raise ValueError("The request body must contain a 'model' key.")
model_name = body["model"]
gguf_path = os.path.join(self.models_dir, model_name)
if not os.path.exists(gguf_path):
raise FileNotFoundError(f"Model file not found: {gguf_path}")
# Check if the server for this model is already running
if model_name not in self.servers or not self.servers[model_name]._server_url:
self._log(f"Initializing a new server for model: {model_name}")
self.servers[model_name] = self._create_server(gguf_path)
server = self.servers[model_name]
return server.chat_completion(body)
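# Illustrative usage sketch (not part of this diff): reusing the `llama` instance from the
# earlier sketch, and assuming the body is forwarded to llama.cpp's OpenAI-compatible chat
# endpoint. The GGUF filename is a placeholder that must exist inside models_dir.
body = {
    "model": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": False,
}
response = llama.chat_completion(body)
print(response)
# With "stream": True, the method is documented to return a generator instead:
# for chunk in llama.chat_completion({**body, "stream": True}):
#     print(chunk)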
def _create_server(self, gguf_path):
"""
Create a new LlamaCppServer instance for the given model.
Args:
gguf_path (str): Path to the GGUF model file.
Returns:
LlamaCppServer: A new server instance.
"""
@@ -96,49 +101,63 @@ class LlamaCpp:
def _install_llama_cpp_binaries(self):
"""
Download and install llama.cpp binaries.
Returns:
str: Path to the installed llama.cpp binaries.
"""
self._log("Installing llama.cpp binaries...")
-release_info = self._get_latest_release()
+try:
+# Use pinned version if provided, otherwise fetch the latest release
+release_info = self._get_release_info()
assets = release_info["assets"]
asset = self._get_appropriate_asset(assets)
if not asset:
raise RuntimeError("No appropriate binary found for your system.")
asset_name = asset["name"]
# Check if cached binaries match the required version
if self._check_cache(release_info, asset):
self._log("Using cached llama.cpp binaries.")
else:
+if not self._internet_available():
+raise RuntimeError(
+"No cached binary available and unable to fetch from the internet."
+)
self._download_and_unzip(asset["browser_download_url"], asset_name)
self._update_cache_info(release_info, asset)
+except Exception as e:
+self._log(f"Error during binary installation: {e}")
+raise
return os.path.join(self.cache_dir, "llama_cpp")
-def _get_latest_release(self):
+def _get_release_info(self):
"""
-Fetch the latest release of llama.cpp from GitHub.
+Fetch metadata of the specified release (pinned or latest) from GitHub.
Returns:
dict: Release information.
"""
+if self.pinned_version:
+api_url = f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{self.pinned_version}"
+else:
+api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+if not self._internet_available():
+# Release metadata cannot be fetched without internet access
+raise RuntimeError("No internet access and no cached version available.")
response = requests.get(api_url)
if response.status_code == 200:
return response.json()
else:
-raise RuntimeError(
-f"Failed to fetch release info. Status code: {response.status_code}"
-)
+error_reason = f"Failed to fetch release info: HTTP {response.status_code}"
+raise RuntimeError(error_reason)
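# Illustrative standalone sketch (not part of this diff) of the two endpoint forms chosen
# above; "b4570" is a hypothetical pinned tag, not one referenced by this commit.
import requests

pinned = "b4570"  # set to None to target the latest release
url = (
    f"https://api.github.com/repos/ggerganov/llama.cpp/releases/tags/{pinned}"
    if pinned
    else "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
)
info = requests.get(url, timeout=10).json()
print(info.get("tag_name"), [a["name"] for a in info.get("assets", [])][:3])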
def _get_appropriate_asset(self, assets):
"""
Select the appropriate binary asset for the current system.
Args:
assets (list): List of asset metadata from the release.
Returns:
dict or None: Matching asset metadata, or None if no match found.
"""
@@ -168,13 +187,11 @@ class LlamaCpp:
def _check_cache(self, release_info, asset):
"""
Check whether the latest binaries are already cached.
Args:
release_info (dict): Metadata of the latest release.
asset (dict): Metadata of the selected asset.
Returns:
-bool: True if the cached binary matches the latest release, False otherwise.
+bool: True if the cached binary matches the required release, False otherwise.
"""
cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
if os.path.exists(cache_info_path):
@@ -190,7 +207,6 @@ class LlamaCpp:
def _download_and_unzip(self, url, asset_name):
"""
Download and extract llama.cpp binaries.
Args:
url (str): URL of the asset to download.
asset_name (str): Name of the asset file.
@@ -205,7 +221,6 @@ class LlamaCpp:
self._log(f"Successfully downloaded: {asset_name}")
else:
raise RuntimeError(f"Failed to download binary: {url}")
extract_dir = os.path.join(self.cache_dir, "llama_cpp")
with zipfile.ZipFile(zip_path, "r") as zip_ref:
zip_ref.extractall(extract_dir)
@@ -214,7 +229,6 @@ class LlamaCpp:
def _update_cache_info(self, release_info, asset):
"""
Update cache metadata with the downloaded release info.
Args:
release_info (dict): Metadata of the latest release.
asset (dict): Metadata of the downloaded asset.
@@ -224,10 +238,21 @@ class LlamaCpp:
with open(cache_info_path, "w") as f:
json.dump(cache_info, f)
+def _internet_available(self):
+"""
+Check for internet connectivity.
+Returns:
+bool: True if the internet is accessible, False otherwise.
+"""
+try:
+requests.get("https://api.github.com", timeout=3)
+return True
+except requests.RequestException: # covers timeouts as well as connection errors
+return False
def _log(self, message):
"""
Print a log message if verbosity is enabled.
Args:
message (str): Log message to print.
"""