diff --git a/utils/backup_tool.py b/utils/backup_tool.py
new file mode 100755
index 0000000..70c06bd
--- /dev/null
+++ b/utils/backup_tool.py
@@ -0,0 +1,699 @@
+#########################################################################################
+# A self-installing script for local ClearML server instance backups.
+# This tool provides functionality to create and restore ClearML snapshots.
+# It supports backing up Elasticsearch, MongoDB, Redis, and fileserver data.
+# It also allows scheduling backups using cron jobs.
+# Usage:
+# - Display help and available commands: `uv run backup_tool.py --help`
+# - Create a snapshot: `uv run backup_tool.py create-snapshot --help`
+# - Restore a snapshot: `uv run backup_tool.py restore-snapshot --help`
+# - Set up a cron job for automatic backups: `uv run backup_tool.py setup-schedule --help`
+# - Clear existing cron jobs: `uv run backup_tool.py clear-schedule --help`
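+#
+# Example (paths are illustrative):
+#   uv run backup_tool.py create-snapshot \
+#       --backup-root /mnt/backups \
+#       --docker-compose-file /opt/clearml/docker-compose.yml \
+#       --retention 2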
+#########################################################################################
+
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "docker",
+# "loguru",
+# "python-crontab",
+# "typer",
+# "pyyaml",
+# ]
+# ///
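+# (The block above is PEP 723 inline script metadata; `uv run` resolves and installs
+# the listed dependencies in an ephemeral environment before executing the script.)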
+
+
+import os
+import pwd
+import shutil
+import subprocess
+import sys
+import re
+import time
+import json
+from pathlib import Path
+from datetime import datetime, timedelta
+
+import yaml
+import docker
+import typer
+from crontab import CronTab
+from loguru import logger
+from pprint import pformat
+
+
+app = typer.Typer(add_completion=False)
+
+
+log_format = (
+ "{time:YYYY-MM-DD at HH:mm:ss} | "
+ "{level: <7} | "
+ "{extra[name]} - "
+ "{message}"
+ )
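+# Example of a formatted record (illustrative):
+#   2025-06-05 at 10:30:00 | INFO    | MAIN - Starting ClearML snapshot creation...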
+
+
+class ClearMLBackupManager:
+ def __init__(self, docker_compose_file: str):
+ self.timestamp = datetime.now().strftime("%Y-%m-%d-%H%M")
+ self.docker_compose_file = docker_compose_file
+
+ # setup logging
+ self.logger = logger
+ self.logger.remove(0)
+ self.logger.add(
+ sys.stdout,
+ format=log_format,
+ colorize=True
+ )
+ self.logger = self.logger.bind(name="MAIN")
+
+ # parse docker compose file
+ if not os.path.exists(docker_compose_file):
+ self.logger.error(f"Docker Compose file not found at {docker_compose_file}.")
+ raise FileNotFoundError(f"Docker Compose file not found at {docker_compose_file}.")
+        with open(docker_compose_file, "r") as f:
+            self.compose_dict = yaml.safe_load(f)
+
+ # setup containers
+ self.containers = self.setup_containers()
+ if self.containers is None:
+ self.logger.error("Failed to identify containers. Exiting backup process.")
+ raise RuntimeError("Failed to identify containers. Exiting backup process.")
+
+ def cleanup_old_backups(self, backup_root: str, keep_last: int = 2) -> int:
+ """
+ Removes old ClearML snapshot backups, keeping only the most recent `keep_last` snapshots.
+
+        Args:
+            backup_root (str): The root directory where backups are stored.
+            keep_last (int): Number of most recent snapshots to keep.
+
+        Returns:
+            int: 0 on success, 1 on failure.
+        """
+ backup_root_path = Path(backup_root)
+ if not backup_root_path.exists() or not backup_root_path.is_dir():
+ self.logger.error(f"Backup root directory does not exist or is not a directory: {backup_root}")
+ return 1
+
+ # Match folders like: clearml_snapshot_2025-06-05-1030
+ snapshot_dirs = sorted(
+ [p for p in backup_root_path.iterdir() if p.is_dir() and re.match(r"clearml_snapshot_\d{4}-\d{2}-\d{2}-\d{4}", p.name)],
+ key=lambda p: p.name,
+ reverse=True # most recent first
+ )
+
+ if len(snapshot_dirs) <= keep_last:
+ self.logger.info(f"Only {len(snapshot_dirs)} snapshots found. Nothing to clean.")
+ return 0
+
+ to_delete = snapshot_dirs[keep_last:]
+ for folder in to_delete:
+ try:
+ self.logger.info(f"Removing old snapshot: {folder}")
+ subprocess.run(["rm", "-rf", str(folder)], check=True)
+ except Exception as e:
+ self.logger.error(f"Failed to delete {folder}: {e}")
+ return 1
+
+ return 0
+
+ def create_snapshot(self, backup_root: str) -> tuple[int, str]:
+ """ Main method to create a ClearML snapshot. It will backup Elasticsearch, MongoDB, Redis and fileserver data."""
+
+ backup_path = os.path.join(backup_root, f"clearml_snapshot_{self.timestamp}")
+ os.makedirs(backup_path, exist_ok=True)
+
+ # Route logger to the snapshot directory
+ self.logger.add(
+ os.path.join(backup_path, "clearml_backup.log"),
+ format=log_format
+ )
+ self.logger.info("Starting ClearML snapshot creation...")
+
+ # Copy Docker Compose file to backup directory
+ compose_backup_path = os.path.join(backup_path, "docker-compose.yml")
+ response = subprocess.run(["cp", self.docker_compose_file, compose_backup_path], check=True)
+ if response.returncode != 0:
+ self.logger.error(f"Failed to copy Docker Compose file to {compose_backup_path}.")
+ return 1, backup_path
+ self.logger.info(f"Copied Docker Compose file to {compose_backup_path}.")
+
+ # Copy config directory to backup directory
+ config_dir_path = None
+ for volume in self.compose_dict["services"]["apiserver"]["volumes"]:
+ if "/opt/clearml/config" in volume:
+ config_dir_path = volume.split(":")[0]
+ break
+ response = subprocess.run(["cp", "-r", config_dir_path, os.path.join(backup_path, "config")], check=True)
+ if not config_dir_path or not os.path.exists(config_dir_path):
+ self.logger.error(f"Config directory not found in Docker Compose file or does not exist: {config_dir_path}")
+ return 1, backup_path
+ self.logger.info(f"Copied config directory from {config_dir_path} to {os.path.join(backup_path, 'config')}.")
+
+ # Backup Elasticsearch
+ self.logger = self.logger.bind(name="ELASTICSEARCH")
+ status = self.backup_elasticsearch(backup_path)
+ if status != 0:
+ self.logger.error("Elasticsearch backup failed. Exiting backup process.")
+ return status, backup_path
+
+ # Backup MongoDB
+ self.logger = self.logger.bind(name="MONGODB")
+        status = self.backup_mongodb(backup_path)
+ if status != 0:
+ self.logger.error("MongoDB backup failed. Exiting backup process.")
+ return status, backup_path
+
+ # Backup Redis
+ self.logger = self.logger.bind(name="REDIS")
+ status = self.backup_redis(backup_path)
+ if status != 0:
+ self.logger.error("Redis backup failed. Exiting backup process.")
+ return status, backup_path
+
+ # Backup fileserver
+ self.logger = self.logger.bind(name="FILESERVER")
+ status = self.backup_fileserver(backup_path)
+ if status != 0:
+ self.logger.error("Fileserver backup failed. Exiting backup process.")
+ return status, backup_path
+
+ self.logger = self.logger.bind(name="MAIN")
+ self.logger.info("ClearML snapshot created successfully.")
+ return 0, backup_path
+
+ def restore_snapshot(self, backup_path: str) -> int:
+ """ Main method to restore a ClearML snapshot. It will restore Elasticsearch, MongoDB, Redis and fileserver data."""
+
+ if not os.path.exists(backup_path):
+ self.logger.error(f"Backup path does not exist: {backup_path}")
+ return 1
+
+ self.logger.info("Starting ClearML snapshot restoration...")
+
+ # Restore Elasticsearch
+ self.logger = self.logger.bind(name="ELASTICSEARCH")
+ status = self.restore_elasticsearch(backup_path)
+ if status != 0:
+ self.logger.error("Elasticsearch restoration failed. Exiting restore process.")
+ return status
+
+ # Restore MongoDB
+ self.logger = self.logger.bind(name="MONGODB")
+ status = self.restore_mongodb(backup_path)
+ if status != 0:
+ self.logger.error("MongoDB restoration failed. Exiting restore process.")
+ return status
+
+        # Restore Redis
+ self.logger = self.logger.bind(name="REDIS")
+ status = self.restore_redis(backup_path)
+ if status != 0:
+ self.logger.error("Redis restoration failed. Exiting restore process.")
+ return status
+
+        # Restore fileserver
+ self.logger = self.logger.bind(name="FILESERVER")
+ status = self.restore_fileserver(backup_path)
+ if status != 0:
+ self.logger.error("Fileserver restoration failed. Exiting restore process.")
+ return status
+
+ self.logger = self.logger.bind(name="MAIN")
+ self.logger.info("ClearML snapshot restored successfully.")
+ return 0
+
+ def setup_containers(self) -> dict | None:
+ """ Identifies ClearML containers and returns them in a dictionary."""
+
+ containers = {}
+ docker_client = docker.from_env()
+ for container in docker_client.containers.list():
+ if "clearml-elastic" in container.name:
+ if "elastic" in containers.keys():
+ self.logger.error(f"Multiple Elasticsearch containers found: {containers['elastic'].id} and {container.id}. Using the first one.")
+ containers["elastic"] = container
+ self.logger.info(f"Found Elasticsearch container: {container.name} ({container}, {container.image})")
+ elif "clearml-mongo" in container.name:
+ if "mongo" in containers.keys():
+ self.logger.error(f"Multiple MongoDB containers found: {containers['mongo'].id} and {container.id}. Using the first one.")
+ containers["mongo"] = container
+ self.logger.info(f"Found MongoDB container: {container.name} ({container}, {container.image})")
+ elif "clearml-redis" in container.name:
+ if "redis" in containers.keys():
+ self.logger.error(f"Multiple Redis containers found: {containers['redis'].id} and {container.id}. Using the first one.")
+ containers["redis"] = container
+ self.logger.info(f"Found Redis container: {container.name} ({container}, {container.image})")
+
+ if not "elastic" in containers:
+ self.logger.error("No Elasticsearch container.")
+ return
+ if not "mongo" in containers:
+ self.logger.error("No MongoDB container found.")
+ return
+ if not "redis" in containers:
+ self.logger.error("No Redis container found.")
+ return
+
+ return containers
+
+ def backup_elasticsearch(self, backup_path: str) -> int:
+ """ Backs up Elasticsearch data by creating a snapshot and copying it to the host."""
+ if not "path.repo" in self.compose_dict["services"]["elasticsearch"]["environment"]:
+ self.logger.error("Elasticsearch path.repo environment variable not found in Docker Compose file.")
+ return 1
+
+ es_container_backup_dir = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
+ es_local_backup_dir = os.path.join(backup_path, os.path.basename(os.path.normpath(es_container_backup_dir)))
+ repo_name = "backup"
+ snapshot_name = f"snapshot_{self.timestamp}"
+
+ # Register snapshot repo
+ self.logger.info(f"Registering Elasticsearch snapshot repository '{repo_name}' at {es_container_backup_dir}...")
+ response = self.containers["elastic"].exec_run(
+ f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
+ f"-H 'Content-Type: application/json' "
+ f"-d '{{\"type\": \"fs\", \"settings\": {{\"location\": \"{es_container_backup_dir}\"}}}}'"
+ )
+ response = response.output.decode()
+ response = json.loads(response) if response else {}
+ if "error" in response:
+ self.logger.error(f"Failed to register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
+ return 1
+ else:
+ self.logger.info(f"Elasticsearch snapshot repository registered: \n{pformat(response)}")
+
+ # Trigger snapshot
+ self.logger.info(f"Elasticsearch snapshot creation started...")
+ response = self.containers["elastic"].exec_run(
+ f"curl -s -X PUT localhost:9200/_snapshot/{repo_name}/{snapshot_name}?wait_for_completion=true"
+ )
+ response = response.output.decode()
+ response = json.loads(response) if response else {}
+ if "error" in response:
+ self.logger.error(f"Failed to create Elasticsearch snapshot: \n{pformat(response['error'])}")
+ return 1
+ else:
+ self.logger.info(f"Elasticsearch snapshot created: \n{pformat(response)}")
+
+ # Copy snapshot data from container
+ self.logger.info(f"Copying Elasticsearch snapshot data from container to local directory: {es_local_backup_dir}")
+ response = subprocess.run([
+ "docker",
+ "cp",
+ f"{self.containers['elastic'].id}:{es_container_backup_dir}",
+ backup_path,
+ "-q"
+ ])
+ # check files got copied
+ if not os.path.exists(es_local_backup_dir) or not os.listdir(es_local_backup_dir):
+ self.logger.error("Elasticsearch backup directory is empty. Backup failed.")
+ return 1
+ else:
+ self.logger.info(f"Elasticsearch snapshot data copied to: {es_local_backup_dir}")
+
+ return 0
+
+ def restore_elasticsearch(self, backup_path: str) -> int:
+ """ Restores Elasticsearch data from a snapshot by copying it to the container's backup directory."""
+ # Copy the snapshot files back into the container's repo path
+ es_repo = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
+ es_repo_root = os.path.dirname(es_repo)
+ host_snapshot_dir = os.path.join(backup_path, os.path.basename(es_repo))
+ self.logger.info(f"Copying Elasticsearch snapshot files from {host_snapshot_dir} to container at {es_repo_root}")
+        response = subprocess.run([
+            "docker", "cp",
+            host_snapshot_dir,
+            f"{self.containers['elastic'].id}:{es_repo_root}"
+        ])
+ if response.returncode != 0:
+ self.logger.error(f"Failed to copy Elasticsearch snapshot files from {host_snapshot_dir} to container.")
+ return 1
+ else:
+ self.logger.info(f"Copied Elasticsearch snapshot into container at {es_repo}")
+
+ # Re-register the repo
+ self.logger.info("Re-registering Elasticsearch snapshot repository...")
+ repo_name = "backup"
+ response = self.containers["elastic"].exec_run(
+ f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
+ f"-H 'Content-Type: application/json' "
+ f"-d '{{\"type\":\"fs\",\"settings\":{{\"location\":\"{es_repo}\"}}}}'"
+ )
+ response = response.output.decode()
+ response = json.loads(response) if response else {}
+ self.logger.info(f"Elasticsearch snapshot repository re-registration response: \n{pformat(response)}")
+ if "error" in response:
+ self.logger.error(f"Failed to re-register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
+ return 1
+ else:
+ self.logger.info("Elasticsearch snapshot repository re-registered successfully.")
+
+ # Close any existing indices
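+        # Elasticsearch cannot restore into indices that are currently open, so all
+        # existing indices are closed first; the restore re-opens the restored ones.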
+ self.logger.info("Closing all Elasticsearch indices to avoid conflicts during restore...")
+ indices = self.containers["elastic"].exec_run(
+ "curl -s localhost:9200/_cat/indices?h=index"
+ ).output.decode().strip().splitlines()
+ if indices:
+ index_list = ",".join(indices)
+ response = self.containers["elastic"].exec_run(
+ f"curl -s -X POST localhost:9200/{index_list}/_close"
+ )
+ response = response.output.decode()
+ response = json.loads(response) if response else {}
+ self.logger.info(f"Close indices response: \n{pformat(response)}")
+ if "error" in response:
+ self.logger.error(f"Failed to close Elasticsearch indices: \n{pformat(response['error'])}")
+ return 1
+ else:
+ self.logger.info("Closed all Elasticsearch indices.")
+ else:
+ self.logger.info("No Elasticsearch indices found to close.")
+
+ # Trigger the restore
+ snap_timestamp = backup_path.split("_")[-1]
+ snap_name = f"snapshot_{snap_timestamp}"
+ self.logger.info(f"Restoring Elasticsearch snapshot: {snap_name} from repository: {repo_name}...")
+ response = self.containers["elastic"].exec_run(
+ f"curl -s -X POST localhost:9200/_snapshot/{repo_name}/{snap_name}/_restore?wait_for_completion=true "
+ f"-H 'Content-Type: application/json' -d '{{\"include_global_state\":true}}'"
+ )
+ response = response.output.decode()
+ response = json.loads(response) if response else {}
+ if "error" in response:
+ self.logger.error(f"Failed to restore Elasticsearch snapshot: {pformat(response['error'])}")
+ return 1
+ else:
+ self.logger.info(f"Elasticsearch snapshot restored: {pformat(response)}")
+
+ self.logger.info("Elasticsearch snapshot restored.")
+ return 0
+
+ def backup_mongodb(self, backup_path: str) -> int:
+ """ Backs up MongoDB data by creating a dump and copying it to the host."""
+ mongo_container_backup_dir = "/tmp/mongodump"
+ mongo_backup_dir = os.path.join(backup_path, "mongo_backup")
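+        # mongodump writes one subdirectory of BSON and metadata files per database,
+        # which mongorestore can replay later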
+
+ # clean up old backup directory if exists
+ self.logger.info(f"Cleaning up old MongoDB backup directory: {mongo_container_backup_dir}")
+ self.containers["mongo"].exec_run(f"rm -rf {mongo_container_backup_dir}")
+ # create backup directory on host
+ self.logger.info(f"Creating MongoDB backup directory on host: {mongo_container_backup_dir}")
+ response = self.containers["mongo"].exec_run(f"mongodump --out {mongo_container_backup_dir}")
+ if response.exit_code != 0:
+ self.logger.error(f"Failed to create MongoDB dump: {response.output.decode()}")
+ return 1
+ self.logger.info(f"MongoDB dumped: {response.output.decode()}")
+ # copy backup from container to host
+ self.logger.info(f"Copying MongoDB backup data from container to local directory: {mongo_backup_dir}")
+ response = subprocess.run([
+ "docker",
+ "cp",
+ f"{self.containers['mongo'].id}:{mongo_container_backup_dir}",
+ mongo_backup_dir,
+ "-q"
+ ])
+ # check files got copied
+ if not os.path.exists(mongo_backup_dir) or not os.listdir(mongo_backup_dir):
+ self.logger.error("MongoDB backup directory is empty. Backup failed.")
+ return 1
+
+ self.logger.info(f"MongoDB backup data copied to: {mongo_backup_dir}")
+ return 0
+
+ def restore_mongodb(self, backup_path: str) -> int:
+ """ Restores MongoDB data from a snapshot by copying the dump back into the container and restoring it."""
+ # Copy dump back into container
+ container_target = "/tmp/mongodump_restore"
+ host_dump_dir = os.path.join(backup_path, "mongo_backup")
+ self.logger.info(f"Copying MongoDB dump from {host_dump_dir} to container at {container_target}")
+        response = subprocess.run([
+            "docker", "cp",
+            host_dump_dir,
+            f"{self.containers['mongo'].id}:{container_target}"
+        ])
+ if response.returncode != 0:
+ self.logger.error(f"Failed to copy MongoDB dump from {host_dump_dir} to container.")
+ return 1
+ self.logger.info(f"Copied Mongo dump into container at {container_target}")
+
+ # Restore to overwrite existing data
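+        # --drop removes each existing collection before restoring its dump, so the
+        # live databases are fully replaced by the snapshot contents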
+ self.logger.info("Restoring MongoDB data from dump...")
+ response = self.containers["mongo"].exec_run(
+ f"mongorestore --drop {container_target}",
+ user="mongodb" # same user as backup
+ )
+ if response.exit_code != 0:
+ self.logger.error(f"Failed to restore MongoDB data: {response.output.decode()}")
+ return 1
+
+ self.logger.info("MongoDB data restored successfully.")
+ return 0
+
+ def backup_redis(self, backup_path: str) -> int:
+ """ Backs up Redis data by triggering a SAVE command and copying the dump.rdb file to the host."""
+ redis_local_backup_file = os.path.join(backup_path, "dump.rdb")
+
+ # trigger redis backup
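+        # SAVE writes dump.rdb synchronously and blocks the server until it finishes,
+        # guaranteeing a complete file before the copy below (BGSAVE would instead
+        # snapshot in a background fork)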
+ self.logger.info("Triggering Redis SAVE to create a snapshot...")
+ response = self.containers["redis"].exec_run("redis-cli SAVE")
+ if not response.output.decode().startswith("OK"):
+ self.logger.error(f"Failed to trigger Redis SAVE command: {response.output.decode()}")
+ return 1
+ self.logger.info(f"Redis SAVE command response: {response.output.decode()}")
+
+ # Copy dump.rdb to host
+ self.logger.info(f"Copying Redis dump.rdb from container to local file: {redis_local_backup_file}")
+ response = subprocess.run([
+ "docker",
+ "cp",
+ f"{self.containers['redis'].id}:/data/dump.rdb",
+ redis_local_backup_file,
+ "-q"
+ ])
+ if response.returncode != 0:
+ self.logger.error(f"Failed to copy Redis dump.rdb from container to {redis_local_backup_file}.")
+ return 1
+
+ self.logger.info(f"Redis backup file copied to: {redis_local_backup_file}")
+ return 0
+
+ def restore_redis(self, backup_path: str) -> int:
+ """ Restores Redis data from a snapshot by copying the dump.rdb file back into the container and restarting it."""
+ # Stop Redis to avoid racing writes
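+        # Redis only loads dump.rdb at startup (and would prefer an AOF file if AOF
+        # persistence were enabled), hence the stop / copy / start sequence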
+ self.containers["redis"].stop()
+ self.logger.info("Redis container stopped for restore.")
+
+ # Copy dump.rdb back into container
+ host_rdb = os.path.join(backup_path, "dump.rdb")
+        response = subprocess.run([
+            "docker", "cp",
+            host_rdb,
+            f"{self.containers['redis'].id}:/data/dump.rdb"
+        ])
+        if response.returncode != 0:
+            self.logger.error(f"Failed to copy Redis dump.rdb from {host_rdb} to container.")
+            return 1
+        self.logger.info("Copied dump.rdb into Redis container.")
+
+ # Restart Redis
+ self.containers["redis"].start()
+ self.logger.info("Redis container restarted.")
+ return 0
+
+ def backup_fileserver(self, backup_path: str) -> int:
+ """ Backs up fileserver data by copying the fileserver path to a backup directory."""
+ fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
+ fileserver_path = None
+ for volume in fileserver_volumes:
+ if "/mnt/fileserver" in volume:
+ fileserver_path = volume.split(":")[0]
+ self.logger.info(f"Fileserver path: {fileserver_path}")
+ break
+
+ # Ensure fileserver path exists
+ if not fileserver_path:
+ self.logger.error("Fileserver path not found in Docker Compose file.")
+ return 1
+ if not os.path.exists(fileserver_path):
+ self.logger.error(f"Fileserver path does not exist: {fileserver_path}")
+ return 1
+ else:
+ self.logger.info(f"Copying fileserver from {fileserver_path} with rsync...")
+ # Copy fileserver data
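+        # The source path has no trailing slash, so rsync copies the directory itself,
+        # producing <backup_path>/fileserver (verified below)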
+        response = subprocess.run([
+            "rsync", "-av", "--delete", str(fileserver_path), str(backup_path)
+        ])
+        if response.returncode != 0:
+            self.logger.error(f"Rsync failed: {response}.")
+            return 1
+        else:
+            self.logger.info("Rsync successful.")
+ # Check files got copied
+ fileserver_backup_dir = os.path.join(backup_path, "fileserver")
+ if not os.path.exists(fileserver_backup_dir) or not os.listdir(fileserver_backup_dir):
+ self.logger.error("Fileserver backup directory is empty. Backup failed.")
+ return 1
+ else:
+ self.logger.info(f"Fileserver data copied to: {fileserver_backup_dir}")
+ return 0
+
+ def restore_fileserver(self, backup_path: str) -> int:
+ """ Restores fileserver data from a snapshot by rsyncing it back to the live volume."""
+ # Read original volume mount from compose
+ fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
+ fileserver_path = None
+        for volume in fileserver_volumes:
+            if "/mnt/fileserver" in volume:
+                # restore into the parent of the live fileserver directory
+                fileserver_path = os.path.dirname(volume.split(":")[0])
+                break
+        if not fileserver_path:
+            self.logger.error("Fileserver path not found in Docker Compose file.")
+            return 1
+        self.logger.info(f"Restoring fileserver to {fileserver_path}")
+
+ # Rsync backup back into the live volume
+ src = os.path.join(backup_path, "fileserver")
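+        # Copying "<backup>/fileserver" into its parent directory recreates the live
+        # tree in place; note --delete also removes any files created after the snapshot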
+        response = subprocess.run([
+            "rsync", "-av", "--delete",
+            src, fileserver_path
+        ])
+        if response.returncode != 0:
+            self.logger.error(f"Rsync failed: {response}.")
+            return 1
+
+ self.logger.info("Fileserver data restored successfully.")
+ return 0
+
+
+@app.command()
+def create_snapshot(
+ backup_root: str = typer.Option(
+ help="Root directory where ClearML backups will be stored."
+ ),
+ docker_compose_file: str = typer.Option(
+ help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml')."
+ ),
+ retention: int = typer.Option(
+ 0,
+ help="Number of most recent snapshots to keep. Older snapshots will be deleted. Default is 0 (no clean up).",
+ )
+):
+ """Create a timestamped ClearML snapshot."""
+ tic = time.time()
+ backup_manager = ClearMLBackupManager(docker_compose_file=docker_compose_file)
+ status, backup_path = backup_manager.create_snapshot(backup_root)
+ if status == 0 and retention > 0:
+ backup_manager.cleanup_old_backups(backup_root, keep_last=retention)
+
+ if status != 0:
+ typer.secho(f"{datetime.now()} | Backup failed. Check snapshot logs for details: {backup_path}", fg=typer.colors.RED)
+ else:
+ typer.secho(
+ f"{datetime.now()} | Backup completed in {str(timedelta(seconds=int(time.time() - tic)))}. Snapshot located in {backup_path}.",
+ fg=typer.colors.GREEN
+ )
+
+
+@app.command()
+def restore_snapshot(
+ snapshot_path: str = typer.Option(
+ help="Path to the ClearML snapshot directory to restore from."
+ )
+):
+ """Restore a ClearML snapshot."""
+ typer.secho(f"WARNING! This will overwrite existing ClearML data. Proceed with caution.", fg=typer.colors.YELLOW)
+ typer.secho(f"Before you proceed, make sure that:", fg=typer.colors.YELLOW)
+ typer.secho(f"- You have a manual backup of your current ClearML data (in case there are any on the current server instance).", fg=typer.colors.YELLOW)
+ typer.secho(f"- The data subfolders are created with correct permissions (see https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server_linux_mac).", fg=typer.colors.YELLOW)
+ typer.secho(f"- You are using a docker-compose.yml and config/ copy from {snapshot_path}.", fg=typer.colors.YELLOW)
+ typer.secho(f"- The target ClearML server instance is up and running.", fg=typer.colors.YELLOW)
+ typer.confirm("Do you want to proceed with the restoration?", abort=True)
+
+ if snapshot_path.endswith("/"):
+ snapshot_path = snapshot_path[:-1]
+
+ backup_manager = ClearMLBackupManager(docker_compose_file=os.path.join(snapshot_path, "docker-compose.yml"))
+ status = backup_manager.restore_snapshot(snapshot_path)
+    if status != 0:
+        typer.secho("Snapshot restoration failed. Check logs for details.", fg=typer.colors.RED)
+    else:
+        typer.secho("Snapshot restored successfully.", fg=typer.colors.GREEN)
+
+
+@app.command()
+def clear_schedule():
+ """Clear the existing ClearML backup cron job."""
+ user = pwd.getpwuid(os.getuid())
+ cron = CronTab(user=user.pw_name)
+ for job in cron:
+ if job.comment == "clearml-backup-tool":
+ typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
+ cron.remove(job)
+ cron.write()
+ typer.secho("Cleared all existing ClearML backup cron jobs.", fg=typer.colors.GREEN)
+
+
+@app.command()
+def setup_schedule(
+ backup_root: str = typer.Option(
+ "./clearml_backup",
+ help="Root directory where ClearML backups will be stored. Default is './clearml_backup'.",
+ prompt="Enter the backup root directory",
+ ),
+ docker_compose_file: str = typer.Option(
+ "/opt/clearml/docker-compose.yml",
+ help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml').",
+ prompt="Enter the path to the Docker Compose file"
+ ),
+ retention: int = typer.Option(
+ 2,
+ help="Number of most recent snapshots to keep. Older snapshots will be deleted. (0 = no cleanup).",
+ prompt="Enter the number of most recent snapshots to keep (0 = no cleanup)"
+ ),
+ backup_period: str = typer.Option(
+ "7d",
+ help="Backup period for the cron job in the format '{number}{unit}} where unit is one of 'm' (minutes), 'h' (hours), 'd' (days)'.",
+ prompt="Enter the backup period for the cron job (format: '{number}{unit}').",
+ )
+):
+ """Set up a cron job to automatically create ClearML snapshots. You can run this without any arguments to go through an interactive setup."""
+ assert re.match(r'^\d+[mhd]$', backup_period), "Backup period must be in the format '{number}{unit}' where unit is one of 'm', 'h', 'd'."
+
+ user = pwd.getpwuid(os.getuid())
+ cron = CronTab(user=user.pw_name)
+ abs_backup_root = os.path.abspath(backup_root)
+ abs_docker_compose_file = os.path.abspath(docker_compose_file)
+
+ for job in cron:
+ if job.comment == "clearml-backup-tool":
+ typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
+ cron.remove(job)
+ cron.write()
+
+ uv_path = subprocess.run(["which", "uv"], capture_output=True, text=True, check=True).stdout.strip()
+ command = (f"{uv_path} run {os.path.abspath(__file__)} create-snapshot "
+ f"--backup-root {abs_backup_root} "
+ f"--docker-compose-file {abs_docker_compose_file} "
+ f"--retention {retention} "
+ f"| tail -n 1 >> {abs_backup_root}/autobackup.log 2>&1"
+ )
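+    # With the default "7d" period the resulting entry looks roughly like (illustrative):
+    #   0 0 */7 * * /usr/bin/uv run /path/to/backup_tool.py create-snapshot ... # clearml-backup-tool
+    # Note that "*/7" in the day-of-month field fires on days 1, 8, 15, 22, 29 and
+    # resets each month, rather than strictly every 7 days.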
+ job = cron.new(command=command, comment="clearml-backup-tool")
+ num, unit = int(backup_period[:-1]), backup_period[-1]
+    match unit:
+        case 'm':
+            job.minute.every(num)
+        case 'h':
+            # pin the minute so the job fires once per interval, not every minute of it
+            job.minute.on(0)
+            job.hour.every(num)
+        case 'd':
+            # pin minute and hour for the same reason
+            job.minute.on(0)
+            job.hour.on(0)
+            job.day.every(num)
+        case _:
+            raise ValueError(f"Invalid backup period unit: {unit}. Must be one of 'm', 'h', 'd'.")
+ cron.write()
+
+ for job in cron:
+ if job.comment == "clearml-backup-tool":
+ break
+ typer.secho(f"Set up cron job: {job}", fg=typer.colors.BLUE)
+ typer.secho(f"Scheduled ClearML backup every {num}{unit}. Job will log to {abs_backup_root}/autobackup.log.", fg=typer.colors.GREEN)
+
+
+if __name__ == "__main__":
+ app()