Timothy Jaeryang Baek 2025-01-27 18:21:36 -08:00
parent 55199b2827
commit 79e2046a28
5 changed files with 215 additions and 147 deletions

.python-version (Normal file, 1 change)

@@ -0,0 +1 @@
3.11

README.md (Normal file, 0 changes)

hello.py (Normal file, 6 changes)

@@ -0,0 +1,6 @@
def main():
    print("Hello from llama-cpp-runner!")


if __name__ == "__main__":
    main()

main.py (304 changes; shown below as the file after this commit)

import os
import platform
import requests
import zipfile
import json
import subprocess
import threading
import stat
import time
import socket


class LlamaCppServer:
    def __init__(
        self, llama_cpp_path=None, gguf_path=None, cache_dir="./cache", verbose=False
    ):
        self.verbose = verbose
        self.cache_dir = cache_dir
        self.llama_cpp_path = llama_cpp_path
        self.gguf_path = gguf_path
        self.server_process = None
        self._server_url = None
        self._server_thread = None
        self.port = None

        # Fetch or validate llama path
        if llama_cpp_path is None:
            self.llama_cpp_path = self._install_llama_cpp_binaries()
        elif not os.path.exists(llama_cpp_path):
            raise FileNotFoundError(
                f"Specified llama_cpp_path not found: {llama_cpp_path}"
            )

        # Start the server if gguf_path is provided
        if gguf_path:
            self._start_server_in_thread()

    @property
    def url(self):
        """Return the URL where the server is running."""
        if self._server_url is None:
            raise ValueError(
                "Server is not running. Start the server with a valid GGUF path."
            )
        return self._server_url

    def kill(self):
        """Kill the server process and clean up."""
        if self.server_process and self.server_process.poll() is None:
            self.server_process.terminate()
            self.server_process.wait()
            self.server_process = None
            self._server_url = None
            self.port = None
            self._log("Llama server successfully killed.")
        if self._server_thread and self._server_thread.is_alive():
            self._server_thread.join()

    def _start_server_in_thread(self):
        """Start the server in a separate thread."""

        def target():
            try:
                self._start_server()
            except Exception as e:
                self._log(f"Failed to start server: {e}")

        self._server_thread = threading.Thread(target=target, daemon=True)
        self._server_thread.start()

    def _start_server(self):
        """Start the llama-server."""
        if not self.gguf_path or not os.path.exists(self.gguf_path):
            raise ValueError(
                f"GGUF model path is not specified or invalid: {self.gguf_path}"
            )

        server_binary = os.path.join(
            self.llama_cpp_path, "build", "bin", "llama-server"
        )
        if not os.path.exists(server_binary):
            raise FileNotFoundError(f"Server binary not found: {server_binary}")

        # Ensure the binary is executable
        self._set_executable(server_binary)

        # Find an available port
        self.port = self._find_available_port(start_port=10000)
        if self.port is None:
            raise RuntimeError("No available port found between 10000 and 11000.")

        self._log(f"Starting server with binary: {server_binary}")
        self._log(f"Using GGUF path: {self.gguf_path}")
        self._log(f"Using port: {self.port}")

        self.server_process = subprocess.Popen(
            [server_binary, "-m", self.gguf_path, "--port", str(self.port)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )

        # Wait for the server to confirm it is ready by monitoring its output
        self._server_url = None
        for line in iter(self.server_process.stdout.readline, ""):
            self._log(line.strip())
            if "Listening on" in line:
                self._server_url = f"http://localhost:{self.port}"
                self._log(f"Server is now accessible at {self._server_url}")
                break

        if not self._server_url:
            raise RuntimeError("Failed to confirm server is running.")

    def _find_available_port(self, start_port=10000, end_port=11000):
        """Find an available port between `start_port` and `end_port`."""
        for port in range(start_port, end_port):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                if sock.connect_ex(("localhost", port)) != 0:
                    return port
        return None

    def _set_executable(self, file_path):
        """Ensure the file at `file_path` is executable."""
        if platform.system() != "Windows":
            current_mode = os.stat(file_path).st_mode
            os.chmod(
                file_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
            )

    def _log(self, message):
        """Print a log message if verbosity is enabled."""
        if self.verbose:
            print(f"[LlamaCppServer] {message}")

    def _install_llama_cpp_binaries(self):
        """Download and install llama.cpp binaries."""
        self._log("Installing llama.cpp binaries...")
        release_info = self._get_latest_release()
        assets = release_info["assets"]
        asset = self._get_appropriate_asset(assets)
        if not asset:
            raise RuntimeError("No appropriate binary found for your system.")
        asset_name = asset["name"]

        if self._check_cache(release_info, asset):
            self._log("Using cached llama.cpp binaries.")
        else:
            self._download_and_unzip(asset["browser_download_url"], asset_name)
            self._update_cache_info(release_info, asset)

        return os.path.join(self.cache_dir, "llama_cpp")

    def _get_latest_release(self):
        """Fetch the latest release of llama.cpp from GitHub."""
        api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
        response = requests.get(api_url)
        if response.status_code == 200:
            return response.json()
        else:
            raise RuntimeError(
                f"Failed to fetch release info. Status code: {response.status_code}"
            )

    def _get_appropriate_asset(self, assets):
        """Select the appropriate binary asset for the current system."""
        system = platform.system().lower()
        machine = platform.machine().lower()
        processor = platform.processor()

        if system == "windows":
            if "arm" in machine:
                return next((a for a in assets if "win-arm64" in a["name"]), None)
            elif "avx512" in processor:
                return next((a for a in assets if "win-avx512-x64" in a["name"]), None)
            elif "avx2" in processor:
                return next((a for a in assets if "win-avx2-x64" in a["name"]), None)
            elif "avx" in processor:
                return next((a for a in assets if "win-avx-x64" in a["name"]), None)
            else:
                return next((a for a in assets if "win-noavx-x64" in a["name"]), None)
        elif system == "darwin":
            if "arm" in machine:
                return next((a for a in assets if "macos-arm64" in a["name"]), None)
            else:
                return next((a for a in assets if "macos-x64" in a["name"]), None)
        elif system == "linux":
            return next((a for a in assets if "ubuntu-x64" in a["name"]), None)
        return None

    def _check_cache(self, release_info, asset):
        """Check whether the latest binaries are already cached."""
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        if os.path.exists(cache_info_path):
            with open(cache_info_path, "r") as f:
                cache_info = json.load(f)
            if (
                cache_info.get("tag_name") == release_info["tag_name"]
                and cache_info.get("asset_name") == asset["name"]
            ):
                return True
        return False

    def _download_and_unzip(self, url, asset_name):
        """Download and extract llama.cpp binaries."""
        os.makedirs(self.cache_dir, exist_ok=True)
        zip_path = os.path.join(self.cache_dir, asset_name)
        self._log(f"Downloading binary from: {url}")
        response = requests.get(url)
        if response.status_code == 200:
            with open(zip_path, "wb") as file:
                file.write(response.content)
            self._log(f"Successfully downloaded: {asset_name}")
        else:
            raise RuntimeError(f"Failed to download binary: {url}")

        extract_dir = os.path.join(self.cache_dir, "llama_cpp")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        self._log(f"Extracted binaries to: {extract_dir}")

    def _update_cache_info(self, release_info, asset):
        """Update cache metadata with the downloaded release info."""
        cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]}
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        with open(cache_info_path, "w") as f:
            json.dump(cache_info, f)
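Not part of the commit: a minimal usage sketch for the class above. It assumes a local model file at ./models/model.gguf (a hypothetical path) and that the downloaded llama-server build exposes the OpenAI-compatible /v1/chat/completions endpoint, which recent llama.cpp releases do; worth verifying against whatever release this actually fetches.

import time
import requests
from main import LlamaCppServer  # assumes main.py is importable as-is

# Spins up llama-server in a background thread; binaries are fetched
# from the latest llama.cpp release and cached on first use.
server = LlamaCppServer(gguf_path="./models/model.gguf", verbose=True)

# The url property raises ValueError until the server has logged
# "Listening on", so poll briefly before sending requests.
for _ in range(60):
    try:
        base_url = server.url
        break
    except ValueError:
        time.sleep(1)
else:
    raise RuntimeError("llama-server did not come up in time")

resp = requests.post(
    f"{base_url}/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "Hello!"}]},
)
print(resp.json()["choices"][0]["message"]["content"])

server.kill()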

pyproject.toml (Normal file, 7 changes)

@@ -0,0 +1,7 @@
[project]
name = "llama-cpp-runner"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
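The dependencies list above is empty even though the rewritten main.py imports requests at runtime. A sketch of the entry the [project] table would likely need; the version bound is an assumption, not something this commit specifies:

dependencies = [
    "requests>=2.31",  # assumed lower bound; main.py only needs requests.get
]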