self-installing script for creating, scheduling and restoring local ClearML server instance backups

kodym@t3d.team 2025-06-17 09:18:06 +02:00
parent d998b46cb2
commit fa48caaa49

utils/backup_tool.py (executable file, 699 additions)

@@ -0,0 +1,699 @@
#########################################################################################
# A self-installing script for ClearML local server instance backups.
# This tool provides functionality to create and restore ClearML snapshots.
# It supports backing up Elasticsearch, MongoDB, Redis, and fileserver data.
# It also allows scheduling backups using cron jobs.
# Usage:
# - Display help and available commands: `uv run backup_tool.py --help`
# - Create a snapshot: `uv run backup_tool.py create-snapshot --help`
# - Restore a snapshot: `uv run backup_tool.py restore-snapshot --help`
# - Setup cron job for automatic backups: `uv run backup_tool.py setup-schedule --help`
# - Clear existing cron jobs: `uv run backup_tool.py clear-schedule --help`
#########################################################################################
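# The `# /// script` block below is PEP 723 inline script metadata; running the
# file with `uv run` provisions the listed dependencies automatically.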
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "docker",
# "loguru",
# "python-crontab",
# "typer",
# "pyyaml",
# ]
# ///
import os
import pwd
import subprocess
import sys
import re
import time
import json
from pathlib import Path
from datetime import datetime, timedelta
import yaml
import docker
import typer
from crontab import CronTab
from loguru import logger
from pprint import pformat
app = typer.Typer(add_completion=False)
log_format = (
"<green>{time:YYYY-MM-DD at HH:mm:ss}</green> | "
"<level>{level: <7}</level> | "
"<bold><magenta>{extra[name]}</magenta></bold> - "
"{message}"
)
class ClearMLBackupManager:
def __init__(self, docker_compose_file: str):
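        # This timestamp names both the snapshot directory and the Elasticsearch
        # snapshot, so restore can recover the snapshot name from the directory path.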
self.timestamp = datetime.now().strftime("%Y-%m-%d-%H%M")
self.docker_compose_file = docker_compose_file
# setup logging
self.logger = logger
self.logger.remove(0)
self.logger.add(
sys.stdout,
format=log_format,
colorize=True
)
self.logger = self.logger.bind(name="MAIN")
# parse docker compose file
if not os.path.exists(docker_compose_file):
self.logger.error(f"Docker Compose file not found at {docker_compose_file}.")
raise FileNotFoundError(f"Docker Compose file not found at {docker_compose_file}.")
        with open(docker_compose_file, "r") as f:
            self.compose_dict = yaml.safe_load(f)
# setup containers
self.containers = self.setup_containers()
if self.containers is None:
self.logger.error("Failed to identify containers. Exiting backup process.")
raise RuntimeError("Failed to identify containers. Exiting backup process.")
def cleanup_old_backups(self, backup_root: str, keep_last: int = 2) -> int:
"""
Removes old ClearML snapshot backups, keeping only the most recent `keep_last` snapshots.
Args:
backup_root (str): The root directory where backups are stored.
keep_last (int): Number of most recent snapshots to keep.
"""
backup_root_path = Path(backup_root)
if not backup_root_path.exists() or not backup_root_path.is_dir():
self.logger.error(f"Backup root directory does not exist or is not a directory: {backup_root}")
return 1
# Match folders like: clearml_snapshot_2025-06-05-1030
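        # Lexicographic sort on the name is chronological because the timestamp
        # is zero-padded (YYYY-MM-DD-HHMM).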
snapshot_dirs = sorted(
[p for p in backup_root_path.iterdir() if p.is_dir() and re.match(r"clearml_snapshot_\d{4}-\d{2}-\d{2}-\d{4}", p.name)],
key=lambda p: p.name,
reverse=True # most recent first
)
if len(snapshot_dirs) <= keep_last:
self.logger.info(f"Only {len(snapshot_dirs)} snapshots found. Nothing to clean.")
return 0
to_delete = snapshot_dirs[keep_last:]
for folder in to_delete:
try:
self.logger.info(f"Removing old snapshot: {folder}")
subprocess.run(["rm", "-rf", str(folder)], check=True)
except Exception as e:
self.logger.error(f"Failed to delete {folder}: {e}")
return 1
return 0
def create_snapshot(self, backup_root: str) -> tuple[int, str]:
""" Main method to create a ClearML snapshot. It will backup Elasticsearch, MongoDB, Redis and fileserver data."""
backup_path = os.path.join(backup_root, f"clearml_snapshot_{self.timestamp}")
os.makedirs(backup_path, exist_ok=True)
# Route logger to the snapshot directory
self.logger.add(
os.path.join(backup_path, "clearml_backup.log"),
format=log_format
)
self.logger.info("Starting ClearML snapshot creation...")
# Copy Docker Compose file to backup directory
compose_backup_path = os.path.join(backup_path, "docker-compose.yml")
        response = subprocess.run(["cp", self.docker_compose_file, compose_backup_path])
if response.returncode != 0:
self.logger.error(f"Failed to copy Docker Compose file to {compose_backup_path}.")
return 1, backup_path
self.logger.info(f"Copied Docker Compose file to {compose_backup_path}.")
# Copy config directory to backup directory
config_dir_path = None
for volume in self.compose_dict["services"]["apiserver"]["volumes"]:
if "/opt/clearml/config" in volume:
config_dir_path = volume.split(":")[0]
break
        if not config_dir_path or not os.path.exists(config_dir_path):
            self.logger.error(f"Config directory not found in Docker Compose file or does not exist: {config_dir_path}")
            return 1, backup_path
        response = subprocess.run(["cp", "-r", config_dir_path, os.path.join(backup_path, "config")])
        if response.returncode != 0:
            self.logger.error(f"Failed to copy config directory to {os.path.join(backup_path, 'config')}.")
            return 1, backup_path
self.logger.info(f"Copied config directory from {config_dir_path} to {os.path.join(backup_path, 'config')}.")
# Backup Elasticsearch
self.logger = self.logger.bind(name="ELASTICSEARCH")
status = self.backup_elasticsearch(backup_path)
if status != 0:
self.logger.error("Elasticsearch backup failed. Exiting backup process.")
return status, backup_path
# Backup MongoDB
self.logger = self.logger.bind(name="MONGODB")
        status = self.backup_mongodb(backup_path)
if status != 0:
self.logger.error("MongoDB backup failed. Exiting backup process.")
return status, backup_path
# Backup Redis
self.logger = self.logger.bind(name="REDIS")
status = self.backup_redis(backup_path)
if status != 0:
self.logger.error("Redis backup failed. Exiting backup process.")
return status, backup_path
# Backup fileserver
self.logger = self.logger.bind(name="FILESERVER")
status = self.backup_fileserver(backup_path)
if status != 0:
self.logger.error("Fileserver backup failed. Exiting backup process.")
return status, backup_path
self.logger = self.logger.bind(name="MAIN")
self.logger.info("ClearML snapshot created successfully.")
return 0, backup_path
def restore_snapshot(self, backup_path: str) -> int:
""" Main method to restore a ClearML snapshot. It will restore Elasticsearch, MongoDB, Redis and fileserver data."""
if not os.path.exists(backup_path):
self.logger.error(f"Backup path does not exist: {backup_path}")
return 1
self.logger.info("Starting ClearML snapshot restoration...")
# Restore Elasticsearch
self.logger = self.logger.bind(name="ELASTICSEARCH")
status = self.restore_elasticsearch(backup_path)
if status != 0:
self.logger.error("Elasticsearch restoration failed. Exiting restore process.")
return status
# Restore MongoDB
self.logger = self.logger.bind(name="MONGODB")
status = self.restore_mongodb(backup_path)
if status != 0:
self.logger.error("MongoDB restoration failed. Exiting restore process.")
return status
        # Restore Redis
self.logger = self.logger.bind(name="REDIS")
status = self.restore_redis(backup_path)
if status != 0:
self.logger.error("Redis restoration failed. Exiting restore process.")
return status
        # Restore fileserver
self.logger = self.logger.bind(name="FILESERVER")
status = self.restore_fileserver(backup_path)
if status != 0:
self.logger.error("Fileserver restoration failed. Exiting restore process.")
return status
self.logger = self.logger.bind(name="MAIN")
self.logger.info("ClearML snapshot restored successfully.")
return 0
def setup_containers(self) -> dict | None:
""" Identifies ClearML containers and returns them in a dictionary."""
containers = {}
docker_client = docker.from_env()
for container in docker_client.containers.list():
if "clearml-elastic" in container.name:
if "elastic" in containers.keys():
self.logger.error(f"Multiple Elasticsearch containers found: {containers['elastic'].id} and {container.id}. Using the first one.")
containers["elastic"] = container
self.logger.info(f"Found Elasticsearch container: {container.name} ({container}, {container.image})")
elif "clearml-mongo" in container.name:
if "mongo" in containers.keys():
self.logger.error(f"Multiple MongoDB containers found: {containers['mongo'].id} and {container.id}. Using the first one.")
containers["mongo"] = container
self.logger.info(f"Found MongoDB container: {container.name} ({container}, {container.image})")
elif "clearml-redis" in container.name:
if "redis" in containers.keys():
self.logger.error(f"Multiple Redis containers found: {containers['redis'].id} and {container.id}. Using the first one.")
containers["redis"] = container
self.logger.info(f"Found Redis container: {container.name} ({container}, {container.image})")
        if "elastic" not in containers:
            self.logger.error("No Elasticsearch container found.")
            return None
        if "mongo" not in containers:
            self.logger.error("No MongoDB container found.")
            return None
        if "redis" not in containers:
            self.logger.error("No Redis container found.")
            return None
return containers
def backup_elasticsearch(self, backup_path: str) -> int:
""" Backs up Elasticsearch data by creating a snapshot and copying it to the host."""
if not "path.repo" in self.compose_dict["services"]["elasticsearch"]["environment"]:
self.logger.error("Elasticsearch path.repo environment variable not found in Docker Compose file.")
return 1
es_container_backup_dir = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
es_local_backup_dir = os.path.join(backup_path, os.path.basename(os.path.normpath(es_container_backup_dir)))
repo_name = "backup"
snapshot_name = f"snapshot_{self.timestamp}"
# Register snapshot repo
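        # An 'fs' repository is only accepted if its location lies under a path
        # listed in the node's path.repo setting, which the compose file provides.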
self.logger.info(f"Registering Elasticsearch snapshot repository '{repo_name}' at {es_container_backup_dir}...")
response = self.containers["elastic"].exec_run(
f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
f"-H 'Content-Type: application/json' "
f"-d '{{\"type\": \"fs\", \"settings\": {{\"location\": \"{es_container_backup_dir}\"}}}}'"
)
response = response.output.decode()
response = json.loads(response) if response else {}
if "error" in response:
self.logger.error(f"Failed to register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
return 1
else:
self.logger.info(f"Elasticsearch snapshot repository registered: \n{pformat(response)}")
# Trigger snapshot
        self.logger.info("Elasticsearch snapshot creation started...")
response = self.containers["elastic"].exec_run(
f"curl -s -X PUT localhost:9200/_snapshot/{repo_name}/{snapshot_name}?wait_for_completion=true"
)
response = response.output.decode()
response = json.loads(response) if response else {}
if "error" in response:
self.logger.error(f"Failed to create Elasticsearch snapshot: \n{pformat(response['error'])}")
return 1
else:
self.logger.info(f"Elasticsearch snapshot created: \n{pformat(response)}")
# Copy snapshot data from container
self.logger.info(f"Copying Elasticsearch snapshot data from container to local directory: {es_local_backup_dir}")
response = subprocess.run([
"docker",
"cp",
f"{self.containers['elastic'].id}:{es_container_backup_dir}",
backup_path,
"-q"
])
# check files got copied
if not os.path.exists(es_local_backup_dir) or not os.listdir(es_local_backup_dir):
self.logger.error("Elasticsearch backup directory is empty. Backup failed.")
return 1
else:
self.logger.info(f"Elasticsearch snapshot data copied to: {es_local_backup_dir}")
return 0
def restore_elasticsearch(self, backup_path: str) -> int:
""" Restores Elasticsearch data from a snapshot by copying it to the container's backup directory."""
# Copy the snapshot files back into the container's repo path
es_repo = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
es_repo_root = os.path.dirname(es_repo)
host_snapshot_dir = os.path.join(backup_path, os.path.basename(es_repo))
self.logger.info(f"Copying Elasticsearch snapshot files from {host_snapshot_dir} to container at {es_repo_root}")
response = subprocess.run([
"docker", "cp",
host_snapshot_dir,
f"{self.containers['elastic'].id}:{es_repo_root}"
        ])
if response.returncode != 0:
self.logger.error(f"Failed to copy Elasticsearch snapshot files from {host_snapshot_dir} to container.")
return 1
else:
self.logger.info(f"Copied Elasticsearch snapshot into container at {es_repo}")
# Re-register the repo
self.logger.info("Re-registering Elasticsearch snapshot repository...")
repo_name = "backup"
response = self.containers["elastic"].exec_run(
f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
f"-H 'Content-Type: application/json' "
f"-d '{{\"type\":\"fs\",\"settings\":{{\"location\":\"{es_repo}\"}}}}'"
)
response = response.output.decode()
response = json.loads(response) if response else {}
self.logger.info(f"Elasticsearch snapshot repository re-registration response: \n{pformat(response)}")
if "error" in response:
self.logger.error(f"Failed to re-register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
return 1
else:
self.logger.info("Elasticsearch snapshot repository re-registered successfully.")
# Close any existing indices
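        # Elasticsearch cannot restore over open indices, so every index is
        # closed first; the restore reopens the indices contained in the snapshot.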
self.logger.info("Closing all Elasticsearch indices to avoid conflicts during restore...")
indices = self.containers["elastic"].exec_run(
"curl -s localhost:9200/_cat/indices?h=index"
).output.decode().strip().splitlines()
if indices:
index_list = ",".join(indices)
response = self.containers["elastic"].exec_run(
f"curl -s -X POST localhost:9200/{index_list}/_close"
)
response = response.output.decode()
response = json.loads(response) if response else {}
self.logger.info(f"Close indices response: \n{pformat(response)}")
if "error" in response:
self.logger.error(f"Failed to close Elasticsearch indices: \n{pformat(response['error'])}")
return 1
else:
self.logger.info("Closed all Elasticsearch indices.")
else:
self.logger.info("No Elasticsearch indices found to close.")
# Trigger the restore
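        # Reconstruct the snapshot name from the backup directory suffix:
        # clearml_snapshot_<timestamp> pairs with snapshot_<timestamp>.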
snap_timestamp = backup_path.split("_")[-1]
snap_name = f"snapshot_{snap_timestamp}"
self.logger.info(f"Restoring Elasticsearch snapshot: {snap_name} from repository: {repo_name}...")
response = self.containers["elastic"].exec_run(
f"curl -s -X POST localhost:9200/_snapshot/{repo_name}/{snap_name}/_restore?wait_for_completion=true "
f"-H 'Content-Type: application/json' -d '{{\"include_global_state\":true}}'"
)
response = response.output.decode()
response = json.loads(response) if response else {}
if "error" in response:
self.logger.error(f"Failed to restore Elasticsearch snapshot: {pformat(response['error'])}")
return 1
else:
self.logger.info(f"Elasticsearch snapshot restored: {pformat(response)}")
return 0
def backup_mongodb(self, backup_path: str) -> int:
""" Backs up MongoDB data by creating a dump and copying it to the host."""
mongo_container_backup_dir = "/tmp/mongodump"
mongo_backup_dir = os.path.join(backup_path, "mongo_backup")
# clean up old backup directory if exists
self.logger.info(f"Cleaning up old MongoDB backup directory: {mongo_container_backup_dir}")
self.containers["mongo"].exec_run(f"rm -rf {mongo_container_backup_dir}")
# create backup directory on host
        self.logger.info(f"Creating MongoDB dump inside the container at {mongo_container_backup_dir}")
response = self.containers["mongo"].exec_run(f"mongodump --out {mongo_container_backup_dir}")
if response.exit_code != 0:
self.logger.error(f"Failed to create MongoDB dump: {response.output.decode()}")
return 1
self.logger.info(f"MongoDB dumped: {response.output.decode()}")
# copy backup from container to host
self.logger.info(f"Copying MongoDB backup data from container to local directory: {mongo_backup_dir}")
response = subprocess.run([
"docker",
"cp",
f"{self.containers['mongo'].id}:{mongo_container_backup_dir}",
mongo_backup_dir,
"-q"
])
# check files got copied
if not os.path.exists(mongo_backup_dir) or not os.listdir(mongo_backup_dir):
self.logger.error("MongoDB backup directory is empty. Backup failed.")
return 1
self.logger.info(f"MongoDB backup data copied to: {mongo_backup_dir}")
return 0
def restore_mongodb(self, backup_path: str) -> int:
""" Restores MongoDB data from a snapshot by copying the dump back into the container and restoring it."""
# Copy dump back into container
container_target = "/tmp/mongodump_restore"
host_dump_dir = os.path.join(backup_path, "mongo_backup")
self.logger.info(f"Copying MongoDB dump from {host_dump_dir} to container at {container_target}")
response = subprocess.run([
"docker", "cp",
host_dump_dir,
f"{self.containers['mongo'].id}:{container_target}"
        ])
if response.returncode != 0:
self.logger.error(f"Failed to copy MongoDB dump from {host_dump_dir} to container.")
return 1
self.logger.info(f"Copied Mongo dump into container at {container_target}")
# Restore to overwrite existing data
self.logger.info("Restoring MongoDB data from dump...")
response = self.containers["mongo"].exec_run(
f"mongorestore --drop {container_target}",
user="mongodb" # same user as backup
)
if response.exit_code != 0:
self.logger.error(f"Failed to restore MongoDB data: {response.output.decode()}")
return 1
self.logger.info("MongoDB data restored successfully.")
return 0
def backup_redis(self, backup_path: str) -> int:
""" Backs up Redis data by triggering a SAVE command and copying the dump.rdb file to the host."""
redis_local_backup_file = os.path.join(backup_path, "dump.rdb")
# trigger redis backup
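        # SAVE is synchronous: Redis blocks until dump.rdb is fully written,
        # so the file copied below is a complete, consistent snapshot.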
self.logger.info("Triggering Redis SAVE to create a snapshot...")
response = self.containers["redis"].exec_run("redis-cli SAVE")
if not response.output.decode().startswith("OK"):
self.logger.error(f"Failed to trigger Redis SAVE command: {response.output.decode()}")
return 1
self.logger.info(f"Redis SAVE command response: {response.output.decode()}")
# Copy dump.rdb to host
self.logger.info(f"Copying Redis dump.rdb from container to local file: {redis_local_backup_file}")
response = subprocess.run([
"docker",
"cp",
f"{self.containers['redis'].id}:/data/dump.rdb",
redis_local_backup_file,
"-q"
])
if response.returncode != 0:
self.logger.error(f"Failed to copy Redis dump.rdb from container to {redis_local_backup_file}.")
return 1
self.logger.info(f"Redis backup file copied to: {redis_local_backup_file}")
return 0
def restore_redis(self, backup_path: str) -> int:
""" Restores Redis data from a snapshot by copying the dump.rdb file back into the container and restarting it."""
# Stop Redis to avoid racing writes
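        # Redis loads dump.rdb only at startup, so the file is swapped while
        # the container is down and picked up on the restart below.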
self.containers["redis"].stop()
self.logger.info("Redis container stopped for restore.")
# Copy dump.rdb back into container
host_rdb = os.path.join(backup_path, "dump.rdb")
response = subprocess.run([
"docker", "cp",
host_rdb,
f"{self.containers['redis'].id}:/data/dump.rdb"
        ])
if response.returncode != 0:
self.logger.error(f"Failed to copy Redis dump.rdb from {host_rdb} to container.")
return 1
self.logger.info(f"Copied dump.rdb into Redis container.")
# Restart Redis
self.containers["redis"].start()
self.logger.info("Redis container restarted.")
return 0
def backup_fileserver(self, backup_path: str) -> int:
""" Backs up fileserver data by copying the fileserver path to a backup directory."""
fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
fileserver_path = None
for volume in fileserver_volumes:
if "/mnt/fileserver" in volume:
fileserver_path = volume.split(":")[0]
self.logger.info(f"Fileserver path: {fileserver_path}")
break
# Ensure fileserver path exists
if not fileserver_path:
self.logger.error("Fileserver path not found in Docker Compose file.")
return 1
if not os.path.exists(fileserver_path):
self.logger.error(f"Fileserver path does not exist: {fileserver_path}")
return 1
else:
self.logger.info(f"Copying fileserver from {fileserver_path} with rsync...")
# Copy fileserver data
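        # No trailing slash on the rsync source, so the directory itself is
        # copied, producing <backup_path>/fileserver (verified below).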
response = subprocess.run([
"rsync", "-av", "--delete", str(fileserver_path), str(backup_path)
        ])
if response.returncode != 0:
self.logger.error(f"Rsync failed: {response}.")
return 1
else:
self.logger.info(f"Rsync successful.")
# Check files got copied
fileserver_backup_dir = os.path.join(backup_path, "fileserver")
if not os.path.exists(fileserver_backup_dir) or not os.listdir(fileserver_backup_dir):
self.logger.error("Fileserver backup directory is empty. Backup failed.")
return 1
else:
self.logger.info(f"Fileserver data copied to: {fileserver_backup_dir}")
return 0
def restore_fileserver(self, backup_path: str) -> int:
""" Restores fileserver data from a snapshot by rsyncing it back to the live volume."""
# Read original volume mount from compose
fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
fileserver_path = None
for volume in fileserver_volumes:
if "/mnt/fileserver" in volume:
fileserver_path = os.path.dirname(volume.split(":")[0])
self.logger.info(f"Fileserver path: {fileserver_path}")
break
self.logger.info(f"Restoring fileserver to {fileserver_path}")
# Rsync backup back into the live volume
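        # The destination is the parent of the original mount, so syncing the
        # backed-up 'fileserver' directory recreates the original path;
        # --delete removes files created after the snapshot was taken.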
src = os.path.join(backup_path, "fileserver")
response = subprocess.run([
"rsync", "-av", "--delete",
src, fileserver_path
        ])
if response.returncode != 0:
self.logger.error(f"Rsync failed: {response}.")
return 1
self.logger.info("Fileserver data restored successfully.")
return 0
@app.command()
def create_snapshot(
backup_root: str = typer.Option(
help="Root directory where ClearML backups will be stored."
),
docker_compose_file: str = typer.Option(
help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml')."
),
retention: int = typer.Option(
0,
help="Number of most recent snapshots to keep. Older snapshots will be deleted. Default is 0 (no clean up).",
)
):
"""Create a timestamped ClearML snapshot."""
tic = time.time()
backup_manager = ClearMLBackupManager(docker_compose_file=docker_compose_file)
status, backup_path = backup_manager.create_snapshot(backup_root)
if status == 0 and retention > 0:
backup_manager.cleanup_old_backups(backup_root, keep_last=retention)
if status != 0:
typer.secho(f"{datetime.now()} | Backup failed. Check snapshot logs for details: {backup_path}", fg=typer.colors.RED)
else:
typer.secho(
f"{datetime.now()} | Backup completed in {str(timedelta(seconds=int(time.time() - tic)))}. Snapshot located in {backup_path}.",
fg=typer.colors.GREEN
)
@app.command()
def restore_snapshot(
snapshot_path: str = typer.Option(
help="Path to the ClearML snapshot directory to restore from."
)
):
"""Restore a ClearML snapshot."""
typer.secho(f"WARNING! This will overwrite existing ClearML data. Proceed with caution.", fg=typer.colors.YELLOW)
typer.secho(f"Before you proceed, make sure that:", fg=typer.colors.YELLOW)
typer.secho(f"- You have a manual backup of your current ClearML data (in case there are any on the current server instance).", fg=typer.colors.YELLOW)
typer.secho(f"- The data subfolders are created with correct permissions (see https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server_linux_mac).", fg=typer.colors.YELLOW)
typer.secho(f"- You are using a docker-compose.yml and config/ copy from {snapshot_path}.", fg=typer.colors.YELLOW)
typer.secho(f"- The target ClearML server instance is up and running.", fg=typer.colors.YELLOW)
typer.confirm("Do you want to proceed with the restoration?", abort=True)
if snapshot_path.endswith("/"):
snapshot_path = snapshot_path[:-1]
backup_manager = ClearMLBackupManager(docker_compose_file=os.path.join(snapshot_path, "docker-compose.yml"))
status = backup_manager.restore_snapshot(snapshot_path)
if status != 0:
typer.secho(f"Snapshot restoration failed. Check logs for details", fg=typer.colors.RED)
else:
typer.secho(f"Snapshot restored successfully.", fg=typer.colors.GREEN)
@app.command()
def clear_schedule():
"""Clear the existing ClearML backup cron job."""
user = pwd.getpwuid(os.getuid())
cron = CronTab(user=user.pw_name)
for job in cron:
if job.comment == "clearml-backup-tool":
typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
cron.remove(job)
cron.write()
typer.secho("Cleared all existing ClearML backup cron jobs.", fg=typer.colors.GREEN)
@app.command()
def setup_schedule(
backup_root: str = typer.Option(
"./clearml_backup",
help="Root directory where ClearML backups will be stored. Default is './clearml_backup'.",
prompt="Enter the backup root directory",
),
docker_compose_file: str = typer.Option(
"/opt/clearml/docker-compose.yml",
help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml').",
prompt="Enter the path to the Docker Compose file"
),
retention: int = typer.Option(
2,
help="Number of most recent snapshots to keep. Older snapshots will be deleted. (0 = no cleanup).",
prompt="Enter the number of most recent snapshots to keep (0 = no cleanup)"
),
backup_period: str = typer.Option(
"7d",
        help="Backup period for the cron job in the format '{number}{unit}' where unit is one of 'm' (minutes), 'h' (hours), 'd' (days).",
prompt="Enter the backup period for the cron job (format: '{number}{unit}').",
)
):
"""Set up a cron job to automatically create ClearML snapshots. You can run this without any arguments to go through an interactive setup."""
assert re.match(r'^\d+[mhd]$', backup_period), "Backup period must be in the format '{number}{unit}' where unit is one of 'm', 'h', 'd'."
user = pwd.getpwuid(os.getuid())
cron = CronTab(user=user.pw_name)
abs_backup_root = os.path.abspath(backup_root)
abs_docker_compose_file = os.path.abspath(docker_compose_file)
for job in cron:
if job.comment == "clearml-backup-tool":
typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
cron.remove(job)
cron.write()
uv_path = subprocess.run(["which", "uv"], capture_output=True, text=True, check=True).stdout.strip()
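    # Pipe the output through tail so that only the final status line printed
    # by create-snapshot is appended to autobackup.log.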
command = (f"{uv_path} run {os.path.abspath(__file__)} create-snapshot "
f"--backup-root {abs_backup_root} "
f"--docker-compose-file {abs_docker_compose_file} "
f"--retention {retention} "
f"| tail -n 1 >> {abs_backup_root}/autobackup.log 2>&1"
)
job = cron.new(command=command, comment="clearml-backup-tool")
num, unit = int(backup_period[:-1]), backup_period[-1]
match unit:
case 'm':
job.minute.every(num)
case 'h':
job.hour.every(num)
case 'd':
job.day.every(num)
case _:
raise ValueError(f"Invalid backup period unit: {unit}. Must be one of 'm', 'h', 'd'.")
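    # Note: every(n) writes a '*/n' cron field; for days this fires on fixed
    # days of the month (1, 8, 15, ...), not a strict rolling n-day interval.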
cron.write()
for job in cron:
if job.comment == "clearml-backup-tool":
break
typer.secho(f"Set up cron job: {job}", fg=typer.colors.BLUE)
typer.secho(f"Scheduled ClearML backup every {num}{unit}. Job will log to {abs_backup_root}/autobackup.log.", fg=typer.colors.GREEN)
if __name__ == "__main__":
app()