mirror of https://github.com/open-webui/llama-cpp-runner (synced 2025-05-13 08:11:01 +00:00)

uv init

parent 55199b2827
commit 79e2046a28

.python-version (new file)
@@ -0,0 +1 @@
3.11

hello.py (new file)
@@ -0,0 +1,6 @@
def main():
    print("Hello from llama-cpp-runner!")


if __name__ == "__main__":
    main()

main.py (modified, 304 lines changed)
Hunks: @@ -2,97 +2,192 @@, @@ -100,70 +195,29 @@

Removed: the previous script-style implementation, built from standalone helper functions and an argparse entry point.

import os
import platform
import requests
import zipfile
import argparse
import json
from packaging import version
import stat


def log(message, verbose=False):
    if verbose:
        print(message)


def get_latest_release(verbose=False):
    api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
    log(f"Fetching latest release info from: {api_url}", verbose)
    response = requests.get(api_url)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(
            f"Failed to fetch release info. Status code: {response.status_code}"
        )


def get_appropriate_asset(assets, verbose=False):
    system = platform.system().lower()
    machine = platform.machine().lower()
    processor = platform.processor()
    log(f"System: {system}", verbose)
    log(f"Machine: {machine}", verbose)
    log(f"Processor: {processor}", verbose)
    if system == "windows":
        if "arm" in machine:
            asset = next((a for a in assets if "win-arm64" in a["name"]), None)
        else:
            if "avx512" in processor:
                asset = next((a for a in assets if "win-avx512-x64" in a["name"]), None)
            elif "avx2" in processor:
                asset = next((a for a in assets if "win-avx2-x64" in a["name"]), None)
            elif "avx" in processor:
                asset = next((a for a in assets if "win-avx-x64" in a["name"]), None)
            else:
                asset = next((a for a in assets if "win-noavx-x64" in a["name"]), None)
    elif system == "darwin":
        if "arm" in machine:
            asset = next((a for a in assets if "macos-arm64" in a["name"]), None)
        else:
            asset = next((a for a in assets if "macos-x64" in a["name"]), None)
    elif system == "linux":
        asset = next((a for a in assets if "ubuntu-x64" in a["name"]), None)
    else:
        asset = None
    log(f"Selected asset: {asset['name'] if asset else None}", verbose)
    return asset


def set_executable(file_path):
    current_mode = os.stat(file_path).st_mode
    os.chmod(file_path, current_mode | stat.S_IEXEC)


def download_and_unzip(url, asset_name, cache_dir, verbose=False):
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, asset_name)
    log(f"Downloading from: {url}", verbose)
    response = requests.get(url)
    if response.status_code == 200:
        with open(zip_path, "wb") as file:
            file.write(response.content)
        log(f"Downloaded: {asset_name}", verbose)
        extract_dir = os.path.join(cache_dir, "llama_cpp")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        log(f"Extracted to: {extract_dir}", verbose)
        # Set execute permissions for all extracted files
        if platform.system() != "Windows":
            for root, dirs, files in os.walk(extract_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    set_executable(file_path)
            log("Set execute permissions for extracted files", verbose)
        else:
            log("Skipping permission setting on Windows", verbose)
        print(f"Successfully downloaded and extracted {asset_name}")
        return True
    else:
        print(f"Failed to download {asset_name}")
        return False


def check_cache(release_info, asset, cache_dir, verbose=False):
    cache_info_path = os.path.join(cache_dir, "cache_info.json")
    if os.path.exists(cache_info_path):
        with open(cache_info_path, "r") as f:
            cache_info = json.load(f)
        if (
            cache_info.get("tag_name") == release_info["tag_name"]
            and cache_info.get("asset_name") == asset["name"]
        ):
            log("Latest version already downloaded.", verbose)
            return True
    return False


def update_cache_info(release_info, asset, cache_dir):
    cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]}
    cache_info_path = os.path.join(cache_dir, "cache_info.json")
    with open(cache_info_path, "w") as f:
        json.dump(cache_info, f)


def unzip_asset(asset_name, cache_dir, verbose=False):
    zip_path = os.path.join(cache_dir, asset_name)
    extract_dir = os.path.join(cache_dir, "llama_cpp")
    if os.path.exists(zip_path):
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        log(f"Extracted to: {extract_dir}", verbose)
        # Set execute permissions for all extracted files
        if platform.system() != "Windows":
            for root, dirs, files in os.walk(extract_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    set_executable(file_path)
            log("Set execute permissions for extracted files", verbose)
        else:
            log("Skipping permission setting on Windows", verbose)
        print(f"Successfully extracted {asset_name}")
        return True
    else:
        print(f"Zip file not found: {asset_name}")
        return False


def main():
    parser = argparse.ArgumentParser(
        description="Download and extract llama.cpp binaries"
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Increase output verbosity"
    )
    parser.add_argument("-c", "--cache", default="./cache", help="Cache directory")
    args = parser.parse_args()
    try:
        release_info = get_latest_release(args.verbose)
        assets = release_info["assets"]
        appropriate_asset = get_appropriate_asset(assets, args.verbose)
        if appropriate_asset:
            asset_name = appropriate_asset["name"]
            if check_cache(release_info, appropriate_asset, args.cache, args.verbose):
                print("Latest version already downloaded. Extracting cached version.")
                unzip_asset(asset_name, args.cache, args.verbose)
            else:
                download_url = appropriate_asset["browser_download_url"]
                if download_and_unzip(
                    download_url, asset_name, args.cache, args.verbose
                ):
                    update_cache_info(release_info, appropriate_asset, args.cache)
        else:
            print("No appropriate binary found for your system.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()

Added: a LlamaCppServer class that keeps the same download-and-cache logic but also launches and manages a llama-server subprocess.

import os
import platform
import requests
import zipfile
import json
import subprocess
import threading
import stat
import time
import socket


class LlamaCppServer:
    def __init__(
        self, llama_cpp_path=None, gguf_path=None, cache_dir="./cache", verbose=False
    ):
        self.verbose = verbose
        self.cache_dir = cache_dir
        self.llama_cpp_path = llama_cpp_path
        self.gguf_path = gguf_path
        self.server_process = None
        self._server_url = None
        self._server_thread = None
        self.port = None

        # Fetch or validate llama path
        if llama_cpp_path is None:
            self.llama_cpp_path = self._install_llama_cpp_binaries()
        elif not os.path.exists(llama_cpp_path):
            raise FileNotFoundError(
                f"Specified llama_cpp_path not found: {llama_cpp_path}"
            )

        # Start the server if gguf_path is provided
        if gguf_path:
            self._start_server_in_thread()

    @property
    def url(self):
        """Return the URL where the server is running."""
        if self._server_url is None:
            raise ValueError(
                "Server is not running. Start the server with a valid GGUF path."
            )
        return self._server_url

    def kill(self):
        """Kill the server process and clean up."""
        if self.server_process and self.server_process.poll() is None:
            self.server_process.terminate()
            self.server_process.wait()
            self.server_process = None
            self._server_url = None
            self.port = None
            self._log("Llama server successfully killed.")
        if self._server_thread and self._server_thread.is_alive():
            self._server_thread.join()

    def _start_server_in_thread(self):
        """Start the server in a separate thread."""

        def target():
            try:
                self._start_server()
            except Exception as e:
                self._log(f"Failed to start server: {e}")

        self._server_thread = threading.Thread(target=target, daemon=True)
        self._server_thread.start()

    def _start_server(self):
        """Start the llama-server."""
        if not self.gguf_path or not os.path.exists(self.gguf_path):
            raise ValueError(
                f"GGUF model path is not specified or invalid: {self.gguf_path}"
            )

        server_binary = os.path.join(
            self.llama_cpp_path, "build", "bin", "llama-server"
        )
        if not os.path.exists(server_binary):
            raise FileNotFoundError(f"Server binary not found: {server_binary}")

        # Ensure the binary is executable
        self._set_executable(server_binary)

        # Find an available port
        self.port = self._find_available_port(start_port=10000)
        if self.port is None:
            raise RuntimeError("No available port found between 10000 and 11000.")

        self._log(f"Starting server with binary: {server_binary}")
        self._log(f"Using GGUF path: {self.gguf_path}")
        self._log(f"Using port: {self.port}")

        self.server_process = subprocess.Popen(
            [server_binary, "-m", self.gguf_path, "--port", str(self.port)],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )

        # Wait for the server to confirm it is ready by monitoring its output
        self._server_url = None
        for line in iter(self.server_process.stdout.readline, ""):
            self._log(line.strip())
            if "Listening on" in line:
                self._server_url = f"http://localhost:{self.port}"
                self._log(f"Server is now accessible at {self._server_url}")
                break

        if not self._server_url:
            raise RuntimeError("Failed to confirm server is running.")

    def _find_available_port(self, start_port=10000, end_port=11000):
        """Find an available port between `start_port` and `end_port`."""
        for port in range(start_port, end_port):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                if sock.connect_ex(("localhost", port)) != 0:
                    return port
        return None

    def _set_executable(self, file_path):
        """Ensure the file at `file_path` is executable."""
        if platform.system() != "Windows":
            current_mode = os.stat(file_path).st_mode
            os.chmod(
                file_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
            )

    def _log(self, message):
        """Print a log message if verbosity is enabled."""
        if self.verbose:
            print(f"[LlamaCppServer] {message}")

    def _install_llama_cpp_binaries(self):
        """Download and install llama.cpp binaries."""
        self._log("Installing llama.cpp binaries...")
        release_info = self._get_latest_release()
        assets = release_info["assets"]
        asset = self._get_appropriate_asset(assets)
        if not asset:
            raise RuntimeError("No appropriate binary found for your system.")
        asset_name = asset["name"]
        if self._check_cache(release_info, asset):
            self._log("Using cached llama.cpp binaries.")
        else:
            self._download_and_unzip(asset["browser_download_url"], asset_name)
            self._update_cache_info(release_info, asset)
        return os.path.join(self.cache_dir, "llama_cpp")

    def _get_latest_release(self):
        """Fetch the latest release of llama.cpp from GitHub."""
        api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
        response = requests.get(api_url)
        if response.status_code == 200:
            return response.json()
        else:
            raise RuntimeError(
                f"Failed to fetch release info. Status code: {response.status_code}"
            )

    def _get_appropriate_asset(self, assets):
        """Select the appropriate binary asset for the current system."""
        system = platform.system().lower()
        machine = platform.machine().lower()
        processor = platform.processor()
        if system == "windows":
            if "arm" in machine:
                return next((a for a in assets if "win-arm64" in a["name"]), None)
            elif "avx512" in processor:
                return next((a for a in assets if "win-avx512-x64" in a["name"]), None)
            elif "avx2" in processor:
                return next((a for a in assets if "win-avx2-x64" in a["name"]), None)
            elif "avx" in processor:
                return next((a for a in assets if "win-avx-x64" in a["name"]), None)
            else:
                return next((a for a in assets if "win-noavx-x64" in a["name"]), None)
        elif system == "darwin":
            if "arm" in machine:
                return next((a for a in assets if "macos-arm64" in a["name"]), None)
            else:
                return next((a for a in assets if "macos-x64" in a["name"]), None)
        elif system == "linux":
            return next((a for a in assets if "ubuntu-x64" in a["name"]), None)
        return None

    def _check_cache(self, release_info, asset):
        """Check whether the latest binaries are already cached."""
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        if os.path.exists(cache_info_path):
            with open(cache_info_path, "r") as f:
                cache_info = json.load(f)
            if (
                cache_info.get("tag_name") == release_info["tag_name"]
                and cache_info.get("asset_name") == asset["name"]
            ):
                return True
        return False

    def _download_and_unzip(self, url, asset_name):
        """Download and extract llama.cpp binaries."""
        os.makedirs(self.cache_dir, exist_ok=True)
        zip_path = os.path.join(self.cache_dir, asset_name)
        self._log(f"Downloading binary from: {url}")
        response = requests.get(url)
        if response.status_code == 200:
            with open(zip_path, "wb") as file:
                file.write(response.content)
            self._log(f"Successfully downloaded: {asset_name}")
        else:
            raise RuntimeError(f"Failed to download binary: {url}")
        extract_dir = os.path.join(self.cache_dir, "llama_cpp")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        self._log(f"Extracted binaries to: {extract_dir}")

    def _update_cache_info(self, release_info, asset):
        """Update cache metadata with the downloaded release info."""
        cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]}
        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
        with open(cache_info_path, "w") as f:
            json.dump(cache_info, f)
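
For orientation, a minimal usage sketch of the new class, assuming main.py is importable from the working directory and that a GGUF model exists at the hypothetical path below; the constructor downloads the llama.cpp binaries on first use and starts llama-server in a background thread:

import time

from main import LlamaCppServer  # assumption: main.py is on the import path

# Hypothetical model path; substitute a real GGUF file.
server = LlamaCppServer(gguf_path="./models/example.gguf", verbose=True)

# The server starts in a background thread, so poll until the URL is ready.
for _ in range(120):
    try:
        print(f"llama-server is listening at {server.url}")
        break
    except ValueError:
        time.sleep(1)

server.kill()  # terminate the llama-server subprocess and join its thread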

pyproject.toml (new file)
@@ -0,0 +1,7 @@
[project]
name = "llama-cpp-runner"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = []
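
Note that dependencies is still empty even though main.py imports requests, so that package must be installed in the environment, or declared (e.g. dependencies = ["requests"]), before main.py can run.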