Timothy Jaeryang Baek 2025-01-27 18:21:36 -08:00
parent 55199b2827
commit 79e2046a28
5 changed files with 215 additions and 147 deletions

.python-version  (new file, 1 line)

@@ -0,0 +1 @@
3.11

README.md  (new file, empty)

hello.py  (new file, 6 lines)

@@ -0,0 +1,6 @@
def main():
    print("Hello from llama-cpp-runner!")


if __name__ == "__main__":
    main()

main.py  (348 lines changed)

@@ -2,168 +2,222 @@
The previous procedural downloader script — log(), get_latest_release(), get_appropriate_asset(), set_executable(), download_and_unzip(), check_cache(), update_cache_info(), unzip_asset(), main(), and the argparse / packaging imports — is removed. The new main.py reads:

import os
import platform
import requests
import zipfile
import json
import subprocess
import threading
import stat
import time
import socket


class LlamaCppServer:
    def __init__(
        self, llama_cpp_path=None, gguf_path=None, cache_dir="./cache", verbose=False
    ):
        self.verbose = verbose
        self.cache_dir = cache_dir
        self.llama_cpp_path = llama_cpp_path
        self.gguf_path = gguf_path
        self.server_process = None
        self._server_url = None
        self._server_thread = None
        self.port = None

        # Fetch or validate llama path
        if llama_cpp_path is None:
            self.llama_cpp_path = self._install_llama_cpp_binaries()
        elif not os.path.exists(llama_cpp_path):
            raise FileNotFoundError(
                f"Specified llama_cpp_path not found: {llama_cpp_path}"
            )

        # Start the server if gguf_path is provided
        if gguf_path:
            self._start_server_in_thread()

    @property
    def url(self):
        """Return the URL where the server is running."""
        if self._server_url is None:
            raise ValueError(
                "Server is not running. Start the server with a valid GGUF path."
            )
        return self._server_url

    def kill(self):
        """Kill the server process and clean up."""
        if self.server_process and self.server_process.poll() is None:
            self.server_process.terminate()
            self.server_process.wait()
            self.server_process = None
            self._server_url = None
            self.port = None
            self._log("Llama server successfully killed.")
        if self._server_thread and self._server_thread.is_alive():
            self._server_thread.join()

    def _start_server_in_thread(self):
        """Start the server in a separate thread."""

        def target():
            try:
                self._start_server()
            except Exception as e:
                self._log(f"Failed to start server: {e}")

        self._server_thread = threading.Thread(target=target, daemon=True)
        self._server_thread.start()

    def _start_server(self):
        """Start the llama-server."""
        if not self.gguf_path or not os.path.exists(self.gguf_path):
            raise ValueError(
                f"GGUF model path is not specified or invalid: {self.gguf_path}"
            )

        server_binary = os.path.join(
            self.llama_cpp_path, "build", "bin", "llama-server"
        )
        if not os.path.exists(server_binary):
            raise FileNotFoundError(f"Server binary not found: {server_binary}")

        # Ensure the binary is executable
        self._set_executable(server_binary)

        # Find an available port
        self.port = self._find_available_port(start_port=10000)
        if self.port is None:
            raise RuntimeError("No available port found between 10000 and 11000.")

        self._log(f"Starting server with binary: {server_binary}")
        self._log(f"Using GGUF path: {self.gguf_path}")
        self._log(f"Using port: {self.port}")

        self.server_process = subprocess.Popen(
            [server_binary, "-m", self.gguf_path, "--port", str(self.port)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )

        # Wait for the server to confirm it is ready by monitoring its output
        self._server_url = None
        for line in iter(self.server_process.stdout.readline, ""):
            self._log(line.strip())
            if "Listening on" in line:
                self._server_url = f"http://localhost:{self.port}"
                self._log(f"Server is now accessible at {self._server_url}")
                break

        if not self._server_url:
            raise RuntimeError("Failed to confirm server is running.")

    def _find_available_port(self, start_port=10000, end_port=11000):
        """Find an available port between `start_port` and `end_port`."""
        for port in range(start_port, end_port):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                if sock.connect_ex(("localhost", port)) != 0:
                    return port
        return None

    def _set_executable(self, file_path):
        """Ensure the file at `file_path` is executable."""
        if platform.system() != "Windows":
            current_mode = os.stat(file_path).st_mode
            os.chmod(
                file_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
            )

    def _log(self, message):
        """Print a log message if verbosity is enabled."""
        if self.verbose:
            print(f"[LlamaCppServer] {message}")

    def _install_llama_cpp_binaries(self):
        """Download and install llama.cpp binaries."""
        self._log("Installing llama.cpp binaries...")
        release_info = self._get_latest_release()
        assets = release_info["assets"]
        asset = self._get_appropriate_asset(assets)
        if not asset:
            raise RuntimeError("No appropriate binary found for your system.")
        asset_name = asset["name"]
        if self._check_cache(release_info, asset):
            self._log("Using cached llama.cpp binaries.")
        else:
            self._download_and_unzip(asset["browser_download_url"], asset_name)
            self._update_cache_info(release_info, asset)
        return os.path.join(self.cache_dir, "llama_cpp")

    def _get_latest_release(self):
        """Fetch the latest release of llama.cpp from GitHub."""
        api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
        response = requests.get(api_url)
        if response.status_code == 200:
            return response.json()
        else:
            raise RuntimeError(
                f"Failed to fetch release info. Status code: {response.status_code}"
            )

    def _get_appropriate_asset(self, assets):
        """Select the appropriate binary asset for the current system."""
        system = platform.system().lower()
        machine = platform.machine().lower()
        processor = platform.processor()
        if system == "windows":
            if "arm" in machine:
                return next((a for a in assets if "win-arm64" in a["name"]), None)
            elif "avx512" in processor:
                return next((a for a in assets if "win-avx512-x64" in a["name"]), None)
            elif "avx2" in processor:
                return next((a for a in assets if "win-avx2-x64" in a["name"]), None)
            elif "avx" in processor:
                return next((a for a in assets if "win-avx-x64" in a["name"]), None)
            else:
                return next((a for a in assets if "win-noavx-x64" in a["name"]), None)
        elif system == "darwin":
            if "arm" in machine:
                return next((a for a in assets if "macos-arm64" in a["name"]), None)
            else:
                return next((a for a in assets if "macos-x64" in a["name"]), None)
        elif system == "linux":
            return next((a for a in assets if "ubuntu-x64" in a["name"]), None)
        return None

    def _check_cache(self, release_info, asset):
        """Check whether the latest binaries are already cached."""
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        if os.path.exists(cache_info_path):
            with open(cache_info_path, "r") as f:
                cache_info = json.load(f)
            if (
                cache_info.get("tag_name") == release_info["tag_name"]
                and cache_info.get("asset_name") == asset["name"]
            ):
                return True
        return False

    def _download_and_unzip(self, url, asset_name):
        """Download and extract llama.cpp binaries."""
        os.makedirs(self.cache_dir, exist_ok=True)
        zip_path = os.path.join(self.cache_dir, asset_name)
        self._log(f"Downloading binary from: {url}")
        response = requests.get(url)
        if response.status_code == 200:
            with open(zip_path, "wb") as file:
                file.write(response.content)
            self._log(f"Successfully downloaded: {asset_name}")
        else:
            raise RuntimeError(f"Failed to download binary: {url}")
        extract_dir = os.path.join(self.cache_dir, "llama_cpp")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        self._log(f"Extracted binaries to: {extract_dir}")

    def _update_cache_info(self, release_info, asset):
        """Update cache metadata with the downloaded release info."""
        cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]}
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        with open(cache_info_path, "w") as f:
            json.dump(cache_info, f)
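For reference, a minimal usage sketch of the class above; the import path (main) and the model filename are assumptions, not part of this commit, and the class downloads the llama.cpp binaries on first use when llama_cpp_path is not given:

import time
from main import LlamaCppServer  # assumes main.py is importable as `main`

# "model.gguf" is a placeholder path; point it at a real GGUF file.
server = LlamaCppServer(gguf_path="model.gguf", verbose=True)

# The server boots in a background thread, so poll until .url stops raising.
for _ in range(60):
    try:
        print("llama-server is up at", server.url)
        break
    except ValueError:
        time.sleep(1)
else:
    raise SystemExit("server did not come up in time")

# ... talk to server.url over HTTP here ...

server.kill()  # terminate the subprocess and join the background thread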

pyproject.toml  (new file, 7 lines)

@@ -0,0 +1,7 @@
[project]
name = "llama-cpp-runner"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
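Note that main.py imports requests, a third-party package, while dependencies is still empty, so installation from this pyproject.toml alone would presumably fail at runtime. A sketch of the entry a later commit might add (the version bound is an assumption):

dependencies = [
    "requests>=2.31",
]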