self-installing script for running, scheduling and restoring local clearml server instances
This commit is contained in:
parent d998b46cb2
commit fa48caaa49

utils/backup_tool.py (new executable file, 699 lines added)

@@ -0,0 +1,699 @@
#########################################################################################
# A self-installing script for ClearML local server instance backups.
# This tool provides functionality to create and restore ClearML snapshots.
# It supports backing up Elasticsearch, MongoDB, Redis, and fileserver data.
# It also allows scheduling backups using cron jobs.
# Usage:
# - Display help and available commands: `uv run backup_tool.py --help`
# - Create a snapshot: `uv run backup_tool.py create-snapshot --help`
# - Restore a snapshot: `uv run backup_tool.py restore-snapshot --help`
# - Setup cron job for automatic backups: `uv run backup_tool.py setup-schedule --help`
# - Clear existing cron jobs: `uv run backup_tool.py clear-schedule --help`
#########################################################################################
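# Example end-to-end run (illustrative only; the paths below are placeholders, adjust them
# to your deployment):
#   uv run backup_tool.py create-snapshot \
#       --backup-root /mnt/backups/clearml \
#       --docker-compose-file /opt/clearml/docker-compose.yml \
#       --retention 3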


# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "docker",
#     "loguru",
#     "python-crontab",
#     "typer",
#     "pyyaml",
# ]
# ///
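# The block above is inline script metadata (PEP 723): `uv run` reads it and makes the
# listed dependencies available without a separate virtual environment setup.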


import os
import pwd
import subprocess
import sys
import re
import time
import json
from pathlib import Path
from datetime import datetime, timedelta
from pprint import pformat

import yaml
import docker
import typer
from crontab import CronTab
from loguru import logger


app = typer.Typer(add_completion=False)


log_format = (
    "<green>{time:YYYY-MM-DD at HH:mm:ss}</green> | "
    "<level>{level: <7}</level> | "
    "<bold><magenta>{extra[name]}</magenta></bold> - "
    "{message}"
)
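# The "{extra[name]}" field above is populated via logger.bind(name=...), so every log line
# is tagged with the subsystem that emitted it (MAIN, ELASTICSEARCH, MONGODB, REDIS, FILESERVER).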


class ClearMLBackupManager:
    def __init__(self, docker_compose_file: str):
        self.timestamp = datetime.now().strftime("%Y-%m-%d-%H%M")
        self.docker_compose_file = docker_compose_file

        # setup logging
        self.logger = logger
        self.logger.remove(0)
        self.logger.add(
            sys.stdout,
            format=log_format,
            colorize=True
        )
        self.logger = self.logger.bind(name="MAIN")

        # parse docker compose file
        if not os.path.exists(docker_compose_file):
            self.logger.error(f"Docker Compose file not found at {docker_compose_file}.")
            raise FileNotFoundError(f"Docker Compose file not found at {docker_compose_file}.")
        with open(docker_compose_file, "r") as f:
            self.compose_dict = yaml.safe_load(f)

        # setup containers
        self.containers = self.setup_containers()
        if self.containers is None:
            self.logger.error("Failed to identify containers. Exiting backup process.")
            raise RuntimeError("Failed to identify containers. Exiting backup process.")

    def cleanup_old_backups(self, backup_root: str, keep_last: int = 2) -> int:
        """
        Removes old ClearML snapshot backups, keeping only the most recent `keep_last` snapshots.

        Args:
            backup_root (str): The root directory where backups are stored.
            keep_last (int): Number of most recent snapshots to keep.
        """
        backup_root_path = Path(backup_root)
        if not backup_root_path.exists() or not backup_root_path.is_dir():
            self.logger.error(f"Backup root directory does not exist or is not a directory: {backup_root}")
            return 1

        # Match folders like: clearml_snapshot_2025-06-05-1030
        snapshot_dirs = sorted(
            [p for p in backup_root_path.iterdir() if p.is_dir() and re.match(r"clearml_snapshot_\d{4}-\d{2}-\d{2}-\d{4}", p.name)],
            key=lambda p: p.name,
            reverse=True  # most recent first
        )

        if len(snapshot_dirs) <= keep_last:
            self.logger.info(f"Only {len(snapshot_dirs)} snapshots found. Nothing to clean.")
            return 0

        to_delete = snapshot_dirs[keep_last:]
        for folder in to_delete:
            try:
                self.logger.info(f"Removing old snapshot: {folder}")
                subprocess.run(["rm", "-rf", str(folder)], check=True)
            except Exception as e:
                self.logger.error(f"Failed to delete {folder}: {e}")
                return 1

        return 0

    def create_snapshot(self, backup_root: str) -> tuple[int, str]:
        """ Main method to create a ClearML snapshot. It will backup Elasticsearch, MongoDB, Redis and fileserver data."""

        backup_path = os.path.join(backup_root, f"clearml_snapshot_{self.timestamp}")
        os.makedirs(backup_path, exist_ok=True)
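        # The snapshot directory will end up containing: docker-compose.yml, config/,
        # the Elasticsearch snapshot repository (named after the basename of path.repo),
        # mongo_backup/, dump.rdb, fileserver/ and clearml_backup.log.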

        # Route logger to the snapshot directory
        self.logger.add(
            os.path.join(backup_path, "clearml_backup.log"),
            format=log_format
        )
        self.logger.info("Starting ClearML snapshot creation...")

        # Copy Docker Compose file to backup directory
        compose_backup_path = os.path.join(backup_path, "docker-compose.yml")
        response = subprocess.run(["cp", self.docker_compose_file, compose_backup_path], check=True)
        if response.returncode != 0:
            self.logger.error(f"Failed to copy Docker Compose file to {compose_backup_path}.")
            return 1, backup_path
        self.logger.info(f"Copied Docker Compose file to {compose_backup_path}.")

        # Copy config directory to backup directory
        config_dir_path = None
        for volume in self.compose_dict["services"]["apiserver"]["volumes"]:
            if "/opt/clearml/config" in volume:
                config_dir_path = volume.split(":")[0]
                break
        if not config_dir_path or not os.path.exists(config_dir_path):
            self.logger.error(f"Config directory not found in Docker Compose file or does not exist: {config_dir_path}")
            return 1, backup_path
        subprocess.run(["cp", "-r", config_dir_path, os.path.join(backup_path, "config")], check=True)
        self.logger.info(f"Copied config directory from {config_dir_path} to {os.path.join(backup_path, 'config')}.")

        # Backup Elasticsearch
        self.logger = self.logger.bind(name="ELASTICSEARCH")
        status = self.backup_elasticsearch(backup_path)
        if status != 0:
            self.logger.error("Elasticsearch backup failed. Exiting backup process.")
            return status, backup_path

        # Backup MongoDB
        self.logger = self.logger.bind(name="MONGODB")
        status = self.backup_mongodb(backup_path)
        if status != 0:
            self.logger.error("MongoDB backup failed. Exiting backup process.")
            return status, backup_path

        # Backup Redis
        self.logger = self.logger.bind(name="REDIS")
        status = self.backup_redis(backup_path)
        if status != 0:
            self.logger.error("Redis backup failed. Exiting backup process.")
            return status, backup_path

        # Backup fileserver
        self.logger = self.logger.bind(name="FILESERVER")
        status = self.backup_fileserver(backup_path)
        if status != 0:
            self.logger.error("Fileserver backup failed. Exiting backup process.")
            return status, backup_path

        self.logger = self.logger.bind(name="MAIN")
        self.logger.info("ClearML snapshot created successfully.")
        return 0, backup_path

    def restore_snapshot(self, backup_path: str) -> int:
        """ Main method to restore a ClearML snapshot. It will restore Elasticsearch, MongoDB, Redis and fileserver data."""

        if not os.path.exists(backup_path):
            self.logger.error(f"Backup path does not exist: {backup_path}")
            return 1

        self.logger.info("Starting ClearML snapshot restoration...")

        # Restore Elasticsearch
        self.logger = self.logger.bind(name="ELASTICSEARCH")
        status = self.restore_elasticsearch(backup_path)
        if status != 0:
            self.logger.error("Elasticsearch restoration failed. Exiting restore process.")
            return status

        # Restore MongoDB
        self.logger = self.logger.bind(name="MONGODB")
        status = self.restore_mongodb(backup_path)
        if status != 0:
            self.logger.error("MongoDB restoration failed. Exiting restore process.")
            return status

        # Restore Redis
        self.logger = self.logger.bind(name="REDIS")
        status = self.restore_redis(backup_path)
        if status != 0:
            self.logger.error("Redis restoration failed. Exiting restore process.")
            return status

        # Restore fileserver
        self.logger = self.logger.bind(name="FILESERVER")
        status = self.restore_fileserver(backup_path)
        if status != 0:
            self.logger.error("Fileserver restoration failed. Exiting restore process.")
            return status

        self.logger = self.logger.bind(name="MAIN")
        self.logger.info("ClearML snapshot restored successfully.")
        return 0

    def setup_containers(self) -> dict | None:
        """ Identifies ClearML containers and returns them in a dictionary."""

        containers = {}
        docker_client = docker.from_env()
        for container in docker_client.containers.list():
            if "clearml-elastic" in container.name:
                if "elastic" in containers:
                    self.logger.error(f"Multiple Elasticsearch containers found: {containers['elastic'].id} and {container.id}. Using the first one.")
                    continue
                containers["elastic"] = container
                self.logger.info(f"Found Elasticsearch container: {container.name} ({container}, {container.image})")
            elif "clearml-mongo" in container.name:
                if "mongo" in containers:
                    self.logger.error(f"Multiple MongoDB containers found: {containers['mongo'].id} and {container.id}. Using the first one.")
                    continue
                containers["mongo"] = container
                self.logger.info(f"Found MongoDB container: {container.name} ({container}, {container.image})")
            elif "clearml-redis" in container.name:
                if "redis" in containers:
                    self.logger.error(f"Multiple Redis containers found: {containers['redis'].id} and {container.id}. Using the first one.")
                    continue
                containers["redis"] = container
                self.logger.info(f"Found Redis container: {container.name} ({container}, {container.image})")

        if "elastic" not in containers:
            self.logger.error("No Elasticsearch container found.")
            return
        if "mongo" not in containers:
            self.logger.error("No MongoDB container found.")
            return
        if "redis" not in containers:
            self.logger.error("No Redis container found.")
            return

        return containers

    def backup_elasticsearch(self, backup_path: str) -> int:
        """ Backs up Elasticsearch data by creating a snapshot and copying it to the host."""
        if "path.repo" not in self.compose_dict["services"]["elasticsearch"]["environment"]:
            self.logger.error("Elasticsearch path.repo environment variable not found in Docker Compose file.")
            return 1

        es_container_backup_dir = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
        es_local_backup_dir = os.path.join(backup_path, os.path.basename(os.path.normpath(es_container_backup_dir)))
        repo_name = "backup"
        snapshot_name = f"snapshot_{self.timestamp}"
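        # Flow: register an 'fs' snapshot repository at path.repo inside the container,
        # take a named snapshot via the _snapshot API, then docker-cp the repository
        # directory to the host snapshot folder.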

        # Register snapshot repo
        self.logger.info(f"Registering Elasticsearch snapshot repository '{repo_name}' at {es_container_backup_dir}...")
        response = self.containers["elastic"].exec_run(
            f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
            f"-H 'Content-Type: application/json' "
            f"-d '{{\"type\": \"fs\", \"settings\": {{\"location\": \"{es_container_backup_dir}\"}}}}'"
        )
        response = response.output.decode()
        response = json.loads(response) if response else {}
        if "error" in response:
            self.logger.error(f"Failed to register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
            return 1
        else:
            self.logger.info(f"Elasticsearch snapshot repository registered: \n{pformat(response)}")

        # Trigger snapshot
        self.logger.info("Elasticsearch snapshot creation started...")
        response = self.containers["elastic"].exec_run(
            f"curl -s -X PUT localhost:9200/_snapshot/{repo_name}/{snapshot_name}?wait_for_completion=true"
        )
        response = response.output.decode()
        response = json.loads(response) if response else {}
        if "error" in response:
            self.logger.error(f"Failed to create Elasticsearch snapshot: \n{pformat(response['error'])}")
            return 1
        else:
            self.logger.info(f"Elasticsearch snapshot created: \n{pformat(response)}")

        # Copy snapshot data from container
        self.logger.info(f"Copying Elasticsearch snapshot data from container to local directory: {es_local_backup_dir}")
        response = subprocess.run([
            "docker",
            "cp",
            f"{self.containers['elastic'].id}:{es_container_backup_dir}",
            backup_path,
            "-q"
        ])
        # check files got copied
        if not os.path.exists(es_local_backup_dir) or not os.listdir(es_local_backup_dir):
            self.logger.error("Elasticsearch backup directory is empty. Backup failed.")
            return 1
        else:
            self.logger.info(f"Elasticsearch snapshot data copied to: {es_local_backup_dir}")

        return 0

    def restore_elasticsearch(self, backup_path: str) -> int:
        """ Restores Elasticsearch data from a snapshot by copying it to the container's backup directory."""
        # Copy the snapshot files back into the container's repo path
        es_repo = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
        es_repo_root = os.path.dirname(es_repo)
        host_snapshot_dir = os.path.join(backup_path, os.path.basename(es_repo))
        self.logger.info(f"Copying Elasticsearch snapshot files from {host_snapshot_dir} to container at {es_repo_root}")
        response = subprocess.run([
            "docker", "cp",
            host_snapshot_dir,
            f"{self.containers['elastic'].id}:{es_repo_root}"
        ], check=True)
        if response.returncode != 0:
            self.logger.error(f"Failed to copy Elasticsearch snapshot files from {host_snapshot_dir} to container.")
            return 1
        else:
            self.logger.info(f"Copied Elasticsearch snapshot into container at {es_repo}")

        # Re-register the repo
        self.logger.info("Re-registering Elasticsearch snapshot repository...")
        repo_name = "backup"
        response = self.containers["elastic"].exec_run(
            f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
            f"-H 'Content-Type: application/json' "
            f"-d '{{\"type\":\"fs\",\"settings\":{{\"location\":\"{es_repo}\"}}}}'"
        )
        response = response.output.decode()
        response = json.loads(response) if response else {}
        self.logger.info(f"Elasticsearch snapshot repository re-registration response: \n{pformat(response)}")
        if "error" in response:
            self.logger.error(f"Failed to re-register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
            return 1
        else:
            self.logger.info("Elasticsearch snapshot repository re-registered successfully.")

        # Close any existing indices
        self.logger.info("Closing all Elasticsearch indices to avoid conflicts during restore...")
        indices = self.containers["elastic"].exec_run(
            "curl -s localhost:9200/_cat/indices?h=index"
        ).output.decode().strip().splitlines()
        if indices:
            index_list = ",".join(indices)
            response = self.containers["elastic"].exec_run(
                f"curl -s -X POST localhost:9200/{index_list}/_close"
            )
            response = response.output.decode()
            response = json.loads(response) if response else {}
            self.logger.info(f"Close indices response: \n{pformat(response)}")
            if "error" in response:
                self.logger.error(f"Failed to close Elasticsearch indices: \n{pformat(response['error'])}")
                return 1
            else:
                self.logger.info("Closed all Elasticsearch indices.")
        else:
            self.logger.info("No Elasticsearch indices found to close.")

        # Trigger the restore
        snap_timestamp = backup_path.split("_")[-1]
        snap_name = f"snapshot_{snap_timestamp}"
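        # backup_path is expected to be named clearml_snapshot_<timestamp> (as produced by
        # create_snapshot), so the field after the last underscore is the snapshot timestamp.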
        self.logger.info(f"Restoring Elasticsearch snapshot: {snap_name} from repository: {repo_name}...")
        response = self.containers["elastic"].exec_run(
            f"curl -s -X POST localhost:9200/_snapshot/{repo_name}/{snap_name}/_restore?wait_for_completion=true "
            f"-H 'Content-Type: application/json' -d '{{\"include_global_state\":true}}'"
        )
        response = response.output.decode()
        response = json.loads(response) if response else {}
        if "error" in response:
            self.logger.error(f"Failed to restore Elasticsearch snapshot: {pformat(response['error'])}")
            return 1
        else:
            self.logger.info(f"Elasticsearch snapshot restored: {pformat(response)}")

        self.logger.info("Elasticsearch snapshot restored.")
        return 0

    def backup_mongodb(self, backup_path: str) -> int:
        """ Backs up MongoDB data by creating a dump and copying it to the host."""
        mongo_container_backup_dir = "/tmp/mongodump"
        mongo_backup_dir = os.path.join(backup_path, "mongo_backup")
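        # mongodump (run with no --db filter) writes a BSON dump of every database to
        # /tmp/mongodump inside the container; it is then docker-cp'd to the host below.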

        # clean up the old backup directory if it exists
        self.logger.info(f"Cleaning up old MongoDB backup directory: {mongo_container_backup_dir}")
        self.containers["mongo"].exec_run(f"rm -rf {mongo_container_backup_dir}")
        # create the dump inside the container
        self.logger.info(f"Creating MongoDB dump inside the container at: {mongo_container_backup_dir}")
        response = self.containers["mongo"].exec_run(f"mongodump --out {mongo_container_backup_dir}")
        if response.exit_code != 0:
            self.logger.error(f"Failed to create MongoDB dump: {response.output.decode()}")
            return 1
        self.logger.info(f"MongoDB dumped: {response.output.decode()}")
        # copy backup from container to host
        self.logger.info(f"Copying MongoDB backup data from container to local directory: {mongo_backup_dir}")
        response = subprocess.run([
            "docker",
            "cp",
            f"{self.containers['mongo'].id}:{mongo_container_backup_dir}",
            mongo_backup_dir,
            "-q"
        ])
        # check files got copied
        if not os.path.exists(mongo_backup_dir) or not os.listdir(mongo_backup_dir):
            self.logger.error("MongoDB backup directory is empty. Backup failed.")
            return 1

        self.logger.info(f"MongoDB backup data copied to: {mongo_backup_dir}")
        return 0

    def restore_mongodb(self, backup_path: str) -> int:
        """ Restores MongoDB data from a snapshot by copying the dump back into the container and restoring it."""
        # Copy dump back into container
        container_target = "/tmp/mongodump_restore"
        host_dump_dir = os.path.join(backup_path, "mongo_backup")
        self.logger.info(f"Copying MongoDB dump from {host_dump_dir} to container at {container_target}")
        response = subprocess.run([
            "docker", "cp",
            host_dump_dir,
            f"{self.containers['mongo'].id}:{container_target}"
        ], check=True)
        if response.returncode != 0:
            self.logger.error(f"Failed to copy MongoDB dump from {host_dump_dir} to container.")
            return 1
        self.logger.info(f"Copied Mongo dump into container at {container_target}")

        # Restore to overwrite existing data
        self.logger.info("Restoring MongoDB data from dump...")
        response = self.containers["mongo"].exec_run(
            f"mongorestore --drop {container_target}",
            user="mongodb"  # same user as backup
        )
        if response.exit_code != 0:
            self.logger.error(f"Failed to restore MongoDB data: {response.output.decode()}")
            return 1

        self.logger.info("MongoDB data restored successfully.")
        return 0

    def backup_redis(self, backup_path: str) -> int:
        """ Backs up Redis data by triggering a SAVE command and copying the dump.rdb file to the host."""
        redis_local_backup_file = os.path.join(backup_path, "dump.rdb")

        # trigger redis backup
        self.logger.info("Triggering Redis SAVE to create a snapshot...")
        response = self.containers["redis"].exec_run("redis-cli SAVE")
        if not response.output.decode().startswith("OK"):
            self.logger.error(f"Failed to trigger Redis SAVE command: {response.output.decode()}")
            return 1
        self.logger.info(f"Redis SAVE command response: {response.output.decode()}")

        # Copy dump.rdb to host
        self.logger.info(f"Copying Redis dump.rdb from container to local file: {redis_local_backup_file}")
        response = subprocess.run([
            "docker",
            "cp",
            f"{self.containers['redis'].id}:/data/dump.rdb",
            redis_local_backup_file,
            "-q"
        ])
        if response.returncode != 0:
            self.logger.error(f"Failed to copy Redis dump.rdb from container to {redis_local_backup_file}.")
            return 1

        self.logger.info(f"Redis backup file copied to: {redis_local_backup_file}")
        return 0

    def restore_redis(self, backup_path: str) -> int:
        """ Restores Redis data from a snapshot by copying the dump.rdb file back into the container and restarting it."""
        # Stop Redis to avoid racing writes
        self.containers["redis"].stop()
        self.logger.info("Redis container stopped for restore.")

        # Copy dump.rdb back into container
        host_rdb = os.path.join(backup_path, "dump.rdb")
        response = subprocess.run([
            "docker", "cp",
            host_rdb,
            f"{self.containers['redis'].id}:/data/dump.rdb"
        ], check=True)
        if response.returncode != 0:
            self.logger.error(f"Failed to copy Redis dump.rdb from {host_rdb} to container.")
            return 1
        self.logger.info("Copied dump.rdb into Redis container.")

        # Restart Redis
        self.containers["redis"].start()
        self.logger.info("Redis container restarted.")
        return 0

    def backup_fileserver(self, backup_path: str) -> int:
        """ Backs up fileserver data by copying the fileserver path to a backup directory."""
        fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
        fileserver_path = None
        for volume in fileserver_volumes:
            if "/mnt/fileserver" in volume:
                fileserver_path = volume.split(":")[0]
                self.logger.info(f"Fileserver path: {fileserver_path}")
                break

        # Ensure fileserver path exists
        if not fileserver_path:
            self.logger.error("Fileserver path not found in Docker Compose file.")
            return 1
        if not os.path.exists(fileserver_path):
            self.logger.error(f"Fileserver path does not exist: {fileserver_path}")
            return 1
        else:
            self.logger.info(f"Copying fileserver from {fileserver_path} with rsync...")
        # Copy fileserver data
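        # rsync -a preserves permissions and timestamps, -v is verbose, and --delete removes
        # files from the backup that no longer exist at the source, so the copy mirrors the
        # live fileserver. The source path has no trailing slash, so rsync creates a directory
        # named after the source folder (expected to be 'fileserver') inside backup_path.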
        response = subprocess.run([
            "rsync", "-av", "--delete", str(fileserver_path), str(backup_path)
        ], check=True)
        if response.returncode != 0:
            self.logger.error(f"Rsync failed: {response}.")
            return 1
        else:
            self.logger.info("Rsync successful.")
        # Check files got copied
        fileserver_backup_dir = os.path.join(backup_path, "fileserver")
        if not os.path.exists(fileserver_backup_dir) or not os.listdir(fileserver_backup_dir):
            self.logger.error("Fileserver backup directory is empty. Backup failed.")
            return 1
        else:
            self.logger.info(f"Fileserver data copied to: {fileserver_backup_dir}")
        return 0

    def restore_fileserver(self, backup_path: str) -> int:
        """ Restores fileserver data from a snapshot by rsyncing it back to the live volume."""
        # Read original volume mount from compose
        fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
        fileserver_path = None
        for volume in fileserver_volumes:
            if "/mnt/fileserver" in volume:
                fileserver_path = os.path.dirname(volume.split(":")[0])
                self.logger.info(f"Fileserver path: {fileserver_path}")
                break
        if not fileserver_path:
            self.logger.error("Fileserver path not found in Docker Compose file.")
            return 1
        self.logger.info(f"Restoring fileserver to {fileserver_path}")

        # Rsync backup back into the live volume
        src = os.path.join(backup_path, "fileserver")
        response = subprocess.run([
            "rsync", "-av", "--delete",
            src, fileserver_path
        ], check=True)
        if response.returncode != 0:
            self.logger.error(f"Rsync failed: {response}.")
            return 1

        self.logger.info("Fileserver data restored successfully.")
        return 0


@app.command()
def create_snapshot(
    backup_root: str = typer.Option(
        help="Root directory where ClearML backups will be stored."
    ),
    docker_compose_file: str = typer.Option(
        help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml')."
    ),
    retention: int = typer.Option(
        0,
        help="Number of most recent snapshots to keep. Older snapshots will be deleted. Default is 0 (no cleanup).",
    )
):
    """Create a timestamped ClearML snapshot."""
    tic = time.time()
    backup_manager = ClearMLBackupManager(docker_compose_file=docker_compose_file)
    status, backup_path = backup_manager.create_snapshot(backup_root)
    if status == 0 and retention > 0:
        backup_manager.cleanup_old_backups(backup_root, keep_last=retention)

    if status != 0:
        typer.secho(f"{datetime.now()} | Backup failed. Check snapshot logs for details: {backup_path}", fg=typer.colors.RED)
    else:
        typer.secho(
            f"{datetime.now()} | Backup completed in {str(timedelta(seconds=int(time.time() - tic)))}. Snapshot located in {backup_path}.",
            fg=typer.colors.GREEN
        )


@app.command()
def restore_snapshot(
    snapshot_path: str = typer.Option(
        help="Path to the ClearML snapshot directory to restore from."
    )
):
    """Restore a ClearML snapshot."""
    typer.secho("WARNING! This will overwrite existing ClearML data. Proceed with caution.", fg=typer.colors.YELLOW)
    typer.secho("Before you proceed, make sure that:", fg=typer.colors.YELLOW)
    typer.secho("- You have a manual backup of your current ClearML data (in case there is any on the current server instance).", fg=typer.colors.YELLOW)
    typer.secho("- The data subfolders are created with correct permissions (see https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server_linux_mac).", fg=typer.colors.YELLOW)
    typer.secho(f"- You are using a docker-compose.yml and config/ copy from {snapshot_path}.", fg=typer.colors.YELLOW)
    typer.secho("- The target ClearML server instance is up and running.", fg=typer.colors.YELLOW)
    typer.confirm("Do you want to proceed with the restoration?", abort=True)

    if snapshot_path.endswith("/"):
        snapshot_path = snapshot_path[:-1]

    backup_manager = ClearMLBackupManager(docker_compose_file=os.path.join(snapshot_path, "docker-compose.yml"))
    status = backup_manager.restore_snapshot(snapshot_path)
    if status != 0:
        typer.secho("Snapshot restoration failed. Check logs for details.", fg=typer.colors.RED)
    else:
        typer.secho("Snapshot restored successfully.", fg=typer.colors.GREEN)


@app.command()
def clear_schedule():
    """Clear the existing ClearML backup cron job."""
    user = pwd.getpwuid(os.getuid())
    cron = CronTab(user=user.pw_name)
    for job in cron:
        if job.comment == "clearml-backup-tool":
            typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
            cron.remove(job)
    cron.write()
    typer.secho("Cleared all existing ClearML backup cron jobs.", fg=typer.colors.GREEN)


@app.command()
def setup_schedule(
    backup_root: str = typer.Option(
        "./clearml_backup",
        help="Root directory where ClearML backups will be stored. Default is './clearml_backup'.",
        prompt="Enter the backup root directory",
    ),
    docker_compose_file: str = typer.Option(
        "/opt/clearml/docker-compose.yml",
        help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml').",
        prompt="Enter the path to the Docker Compose file"
    ),
    retention: int = typer.Option(
        2,
        help="Number of most recent snapshots to keep. Older snapshots will be deleted. (0 = no cleanup).",
        prompt="Enter the number of most recent snapshots to keep (0 = no cleanup)"
    ),
    backup_period: str = typer.Option(
        "7d",
        help="Backup period for the cron job in the format '{number}{unit}' where unit is one of 'm' (minutes), 'h' (hours), 'd' (days).",
        prompt="Enter the backup period for the cron job (format: '{number}{unit}').",
    )
):
    """Set up a cron job to automatically create ClearML snapshots. You can run this without any arguments to go through an interactive setup."""
    assert re.match(r'^\d+[mhd]$', backup_period), "Backup period must be in the format '{number}{unit}' where unit is one of 'm', 'h', 'd'."

    user = pwd.getpwuid(os.getuid())
    cron = CronTab(user=user.pw_name)
    abs_backup_root = os.path.abspath(backup_root)
    abs_docker_compose_file = os.path.abspath(docker_compose_file)

    for job in cron:
        if job.comment == "clearml-backup-tool":
            typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
            cron.remove(job)
    cron.write()

    uv_path = subprocess.run(["which", "uv"], capture_output=True, text=True, check=True).stdout.strip()
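    # cron jobs run with a minimal environment and PATH, so the command is built from the
    # absolute paths of uv, this script, and the backup arguments resolved above.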
    command = (f"{uv_path} run {os.path.abspath(__file__)} create-snapshot "
               f"--backup-root {abs_backup_root} "
               f"--docker-compose-file {abs_docker_compose_file} "
               f"--retention {retention} "
               f"| tail -n 1 >> {abs_backup_root}/autobackup.log 2>&1"
               )
    job = cron.new(command=command, comment="clearml-backup-tool")
    num, unit = int(backup_period[:-1]), backup_period[-1]
    match unit:
        case 'm':
            job.minute.every(num)
        case 'h':
            job.hour.every(num)
        case 'd':
            job.day.every(num)
        case _:
            raise ValueError(f"Invalid backup period unit: {unit}. Must be one of 'm', 'h', 'd'.")
    cron.write()

    for job in cron:
        if job.comment == "clearml-backup-tool":
            break
    typer.secho(f"Set up cron job: {job}", fg=typer.colors.BLUE)
    typer.secho(f"Scheduled ClearML backup every {num}{unit}. Job will log to {abs_backup_root}/autobackup.log.", fg=typer.colors.GREEN)


if __name__ == "__main__":
    app()