diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..2c07333
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/hello.py b/hello.py
new file mode 100644
index 0000000..f19c9e2
--- /dev/null
+++ b/hello.py
@@ -0,0 +1,6 @@
+def main():
+    print("Hello from llama-cpp-runner!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/main.py b/main.py
index 78cec40..8e03a7d 100644
--- a/main.py
+++ b/main.py
@@ -2,168 +2,243 @@ import os
 import platform
 import requests
 import zipfile
-import argparse
 import json
-from packaging import version
+import subprocess
+import threading
 import stat
+import socket
 
 
-def log(message, verbose=False):
-    if verbose:
-        print(message)
+class LlamaCppServer:
+    def __init__(
+        self, llama_cpp_path=None, gguf_path=None, cache_dir="./cache", verbose=False
+    ):
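+        """Locate or download llama.cpp binaries and, if `gguf_path` is
+        provided, start llama-server in a background thread."""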
+        self.verbose = verbose
+        self.cache_dir = cache_dir
+        self.llama_cpp_path = llama_cpp_path
+        self.gguf_path = gguf_path
+        self.server_process = None
+        self._server_url = None
+        self._server_thread = None
+        self.port = None
 
+        # Fetch or validate llama path
+        if llama_cpp_path is None:
+            self.llama_cpp_path = self._install_llama_cpp_binaries()
+        elif not os.path.exists(llama_cpp_path):
+            raise FileNotFoundError(
+                f"Specified llama_cpp_path not found: {llama_cpp_path}"
+            )
 
-def get_latest_release(verbose=False):
-    api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
-    log(f"Fetching latest release info from: {api_url}", verbose)
-    response = requests.get(api_url)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        raise Exception(
-            f"Failed to fetch release info. Status code: {response.status_code}"
+        # Start the server if gguf_path is provided
+        if gguf_path:
+            self._start_server_in_thread()
+
+    @property
+    def url(self):
+        """Return the URL where the server is running."""
+        if self._server_url is None:
+            raise ValueError(
+                "Server is not running. Start the server with a valid GGUF path."
+            )
+        return self._server_url
+
+    def kill(self):
+        """Kill the server process and clean up."""
+        if self.server_process and self.server_process.poll() is None:
+            self.server_process.terminate()
+            self.server_process.wait()
+            self.server_process = None
+            self._server_url = None
+            self.port = None
+            self._log("Llama server successfully killed.")
+        if self._server_thread and self._server_thread.is_alive():
+            self._server_thread.join()
+
+    def _start_server_in_thread(self):
+        """Start the server in a separate thread."""
+
+        def target():
+            try:
+                self._start_server()
+            except Exception as e:
+                self._log(f"Failed to start server: {e}")
+
+        self._server_thread = threading.Thread(target=target, daemon=True)
+        self._server_thread.start()
+
+    def _start_server(self):
+        """Start the llama-server."""
+        if not self.gguf_path or not os.path.exists(self.gguf_path):
+            raise ValueError(
+                f"GGUF model path is not specified or invalid: {self.gguf_path}"
+            )
+
+        # NOTE: assumes the binary lives under build/bin; Windows release
+        # zips ship it with an .exe suffix.
+        exe_suffix = ".exe" if platform.system() == "Windows" else ""
+        server_binary = os.path.join(
+            self.llama_cpp_path, "build", "bin", f"llama-server{exe_suffix}"
+        )
+        if not os.path.exists(server_binary):
+            raise FileNotFoundError(f"Server binary not found: {server_binary}")
+
+        # Ensure the binary is executable
+        self._set_executable(server_binary)
+
+        # Find an available port
+        self.port = self._find_available_port(start_port=10000)
+        if self.port is None:
+            raise RuntimeError("No available port found between 10000 and 11000.")
+
+        self._log(f"Starting server with binary: {server_binary}")
+        self._log(f"Using GGUF path: {self.gguf_path}")
+        self._log(f"Using port: {self.port}")
+
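+        # Merge stderr into stdout so the readiness scan below sees every log line.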
+        self.server_process = subprocess.Popen(
+            [server_binary, "-m", self.gguf_path, "--port", str(self.port)],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            universal_newlines=True,
         )
 
+        # Wait for the server to confirm it is ready by monitoring its output
+        self._server_url = None
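+        # NOTE: the "Listening on" marker is an assumption about llama-server's
+        # log format; other builds may phrase the ready message differently.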
+        for line in iter(self.server_process.stdout.readline, ""):
+            self._log(line.strip())
+            if "Listening on" in line:
+                self._server_url = f"http://localhost:{self.port}"
+                self._log(f"Server is now accessible at {self._server_url}")
+                break
 
-def get_appropriate_asset(assets, verbose=False):
-    system = platform.system().lower()
-    machine = platform.machine().lower()
-    processor = platform.processor()
-    log(f"System: {system}", verbose)
-    log(f"Machine: {machine}", verbose)
-    log(f"Processor: {processor}", verbose)
-    if system == "windows":
-        if "arm" in machine:
-            asset = next((a for a in assets if "win-arm64" in a["name"]), None)
-        else:
-            if "avx512" in processor:
-                asset = next((a for a in assets if "win-avx512-x64" in a["name"]), None)
-            elif "avx2" in processor:
-                asset = next((a for a in assets if "win-avx2-x64" in a["name"]), None)
-            elif "avx" in processor:
-                asset = next((a for a in assets if "win-avx-x64" in a["name"]), None)
-            else:
-                asset = next((a for a in assets if "win-noavx-x64" in a["name"]), None)
-    elif system == "darwin":
-        if "arm" in machine:
-            asset = next((a for a in assets if "macos-arm64" in a["name"]), None)
-        else:
-            asset = next((a for a in assets if "macos-x64" in a["name"]), None)
-    elif system == "linux":
-        asset = next((a for a in assets if "ubuntu-x64" in a["name"]), None)
-    else:
-        asset = None
-    log(f"Selected asset: {asset['name'] if asset else None}", verbose)
-    return asset
+        if not self._server_url:
+            raise RuntimeError("Failed to confirm server is running.")
 
+    def _find_available_port(self, start_port=10000, end_port=11000):
+        """Find an available port between `start_port` and `end_port`."""
+        for port in range(start_port, end_port):
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
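+                # connect_ex returns 0 when something already listens on the
+                # port; a nonzero result means the port is free to use.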
+                if sock.connect_ex(("localhost", port)) != 0:
+                    return port
+        return None
 
-def set_executable(file_path):
-    current_mode = os.stat(file_path).st_mode
-    os.chmod(file_path, current_mode | stat.S_IEXEC)
-
-
-def download_and_unzip(url, asset_name, cache_dir, verbose=False):
-    os.makedirs(cache_dir, exist_ok=True)
-    zip_path = os.path.join(cache_dir, asset_name)
-    log(f"Downloading from: {url}", verbose)
-    response = requests.get(url)
-    if response.status_code == 200:
-        with open(zip_path, "wb") as file:
-            file.write(response.content)
-        log(f"Downloaded: {asset_name}", verbose)
-        extract_dir = os.path.join(cache_dir, "llama_cpp")
-        with zipfile.ZipFile(zip_path, "r") as zip_ref:
-            zip_ref.extractall(extract_dir)
-        log(f"Extracted to: {extract_dir}", verbose)
-        # Set execute permissions for all extracted files
+    def _set_executable(self, file_path):
+        """Ensure the file at `file_path` is executable."""
         if platform.system() != "Windows":
-            for root, dirs, files in os.walk(extract_dir):
-                for file in files:
-                    file_path = os.path.join(root, file)
-                    set_executable(file_path)
-            log("Set execute permissions for extracted files", verbose)
-        else:
-            log("Skipping permission setting on Windows", verbose)
-        print(f"Successfully downloaded and extracted {asset_name}")
-        return True
-    else:
-        print(f"Failed to download {asset_name}")
-        return False
+            current_mode = os.stat(file_path).st_mode
+            os.chmod(
+                file_path, current_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
+            )
 
+    def _log(self, message):
+        """Print a log message if verbosity is enabled."""
+        if self.verbose:
+            print(f"[LlamaCppServer] {message}")
 
-def check_cache(release_info, asset, cache_dir, verbose=False):
-    cache_info_path = os.path.join(cache_dir, "cache_info.json")
-    if os.path.exists(cache_info_path):
-        with open(cache_info_path, "r") as f:
-            cache_info = json.load(f)
-        if (
-            cache_info.get("tag_name") == release_info["tag_name"]
-            and cache_info.get("asset_name") == asset["name"]
-        ):
-            log("Latest version already downloaded.", verbose)
-            return True
-    return False
-
-
-def update_cache_info(release_info, asset, cache_dir):
-    cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]}
-    cache_info_path = os.path.join(cache_dir, "cache_info.json")
-    with open(cache_info_path, "w") as f:
-        json.dump(cache_info, f)
-
-
-def unzip_asset(asset_name, cache_dir, verbose=False):
-    zip_path = os.path.join(cache_dir, asset_name)
-    extract_dir = os.path.join(cache_dir, "llama_cpp")
-    if os.path.exists(zip_path):
-        with zipfile.ZipFile(zip_path, "r") as zip_ref:
-            zip_ref.extractall(extract_dir)
-        log(f"Extracted to: {extract_dir}", verbose)
-        # Set execute permissions for all extracted files
-        if platform.system() != "Windows":
-            for root, dirs, files in os.walk(extract_dir):
-                for file in files:
-                    file_path = os.path.join(root, file)
-                    set_executable(file_path)
-            log("Set execute permissions for extracted files", verbose)
-        else:
-            log("Skipping permission setting on Windows", verbose)
-        print(f"Successfully extracted {asset_name}")
-        return True
-    else:
-        print(f"Zip file not found: {asset_name}")
-        return False
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Download and extract llama.cpp binaries"
-    )
-    parser.add_argument(
-        "-v", "--verbose", action="store_true", help="Increase output verbosity"
-    )
-    parser.add_argument("-c", "--cache", default="./cache", help="Cache directory")
-    args = parser.parse_args()
-    try:
-        release_info = get_latest_release(args.verbose)
+    def _install_llama_cpp_binaries(self):
+        """Download and install llama.cpp binaries."""
+        self._log("Installing llama.cpp binaries...")
+        release_info = self._get_latest_release()
         assets = release_info["assets"]
-        appropriate_asset = get_appropriate_asset(assets, args.verbose)
-        if appropriate_asset:
-            asset_name = appropriate_asset["name"]
-            if check_cache(release_info, appropriate_asset, args.cache, args.verbose):
-                print("Latest version already downloaded. Extracting cached version.")
-                unzip_asset(asset_name, args.cache, args.verbose)
-            else:
-                download_url = appropriate_asset["browser_download_url"]
-                if download_and_unzip(
-                    download_url, asset_name, args.cache, args.verbose
-                ):
-                    update_cache_info(release_info, appropriate_asset, args.cache)
+        asset = self._get_appropriate_asset(assets)
+        if not asset:
+            raise RuntimeError("No appropriate binary found for your system.")
+        asset_name = asset["name"]
+        if self._check_cache(release_info, asset):
+            self._log("Using cached llama.cpp binaries.")
         else:
-            print("No appropriate binary found for your system.")
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
+            self._download_and_unzip(asset["browser_download_url"], asset_name)
+            self._update_cache_info(release_info, asset)
+        return os.path.join(self.cache_dir, "llama_cpp")
 
+    def _get_latest_release(self):
+        """Fetch the latest release of llama.cpp from GitHub."""
+        api_url = "https://api.github.com/repos/ggerganov/llama.cpp/releases/latest"
+        response = requests.get(api_url, timeout=30)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            raise RuntimeError(
+                f"Failed to fetch release info. Status code: {response.status_code}"
+            )
 
-if __name__ == "__main__":
-    main()
+    def _get_appropriate_asset(self, assets):
+        """Select the appropriate binary asset for the current system."""
+        system = platform.system().lower()
+        machine = platform.machine().lower()
+        processor = platform.processor()
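+        # NOTE: platform.processor() rarely reports instruction-set flags such
+        # as "avx2", so on many Windows machines this falls through to noavx.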
+        if system == "windows":
+            if "arm" in machine:
+                return next((a for a in assets if "win-arm64" in a["name"]), None)
+            elif "avx512" in processor:
+                return next((a for a in assets if "win-avx512-x64" in a["name"]), None)
+            elif "avx2" in processor:
+                return next((a for a in assets if "win-avx2-x64" in a["name"]), None)
+            elif "avx" in processor:
+                return next((a for a in assets if "win-avx-x64" in a["name"]), None)
+            else:
+                return next((a for a in assets if "win-noavx-x64" in a["name"]), None)
+        elif system == "darwin":
+            if "arm" in machine:
+                return next((a for a in assets if "macos-arm64" in a["name"]), None)
+            else:
+                return next((a for a in assets if "macos-x64" in a["name"]), None)
+        elif system == "linux":
+            return next((a for a in assets if "ubuntu-x64" in a["name"]), None)
+        return None
+
+    def _check_cache(self, release_info, asset):
+        """Check whether the latest binaries are already cached."""
+        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
+        if os.path.exists(cache_info_path):
+            with open(cache_info_path, "r") as f:
+                cache_info = json.load(f)
+            if (
+                cache_info.get("tag_name") == release_info["tag_name"]
+                and cache_info.get("asset_name") == asset["name"]
+            ):
+                return True
+        return False
+
+    def _download_and_unzip(self, url, asset_name):
+        """Download and extract llama.cpp binaries."""
+        os.makedirs(self.cache_dir, exist_ok=True)
+        zip_path = os.path.join(self.cache_dir, asset_name)
+        self._log(f"Downloading binary from: {url}")
+        response = requests.get(url, timeout=30)
+        if response.status_code == 200:
+            with open(zip_path, "wb") as file:
+                file.write(response.content)
+            self._log(f"Successfully downloaded: {asset_name}")
+        else:
+            raise RuntimeError(f"Failed to download binary: {url}")
+        extract_dir = os.path.join(self.cache_dir, "llama_cpp")
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            zip_ref.extractall(extract_dir)
+        self._log(f"Extracted binaries to: {extract_dir}")
+
+    def _update_cache_info(self, release_info, asset):
+        """Update cache metadata with the downloaded release info."""
+        cache_info = {"tag_name": release_info["tag_name"], "asset_name": asset["name"]}
+        cache_info_path = os.path.join(self.cache_dir, "cache_info.json")
+        with open(cache_info_path, "w") as f:
+            json.dump(cache_info, f)
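+
+
+# Example usage (a sketch; the model path below is hypothetical):
+#
+#   server = LlamaCppServer(gguf_path="./models/model.gguf", verbose=True)
+#   print(server.url)  # e.g. http://localhost:10000 once the server is ready
+#   server.kill()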
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..ac98692
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "llama-cpp-runner"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = ["requests"]