mirror of
				https://github.com/clearml/clearml-server
				synced 2025-06-26 23:15:47 +00:00 
			
		
		
		
	self-installing script for running, scheduling and restoring local clearml server instances
This commit is contained in:
		
							parent
							
								
									d998b46cb2
								
							
						
					
					
						commit
						fa48caaa49
					
				
							
								
								
									
										699
									
								
								utils/backup_tool.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										699
									
								
								utils/backup_tool.py
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,699 @@
 | 
			
		||||
#########################################################################################
 | 
			
		||||
# A self-installing script for ClearML local server instance backups.
 | 
			
		||||
# This tool provides functionality to create and restore ClearML snapshots.
 | 
			
		||||
# It supports backing up Elasticsearch, MongoDB, Redis, and fileserver data.
 | 
			
		||||
# It also allows scheduling backups using cron jobs.
 | 
			
		||||
# Usage:
 | 
			
		||||
#   - Display help and available commands: `uv run backup_tool.py --help`
 | 
			
		||||
#   - Create a snapshot: `uv run backup_tool.py create-snapshot --help`
 | 
			
		||||
#   - Restore a snapshot: `uv run backup_tool.py restore-snapshot --help`
 | 
			
		||||
#   - Setup cron job for automatic backups: `uv run backup_tool.py setup-schedule --help`
 | 
			
		||||
#   - Clear existing cron jobs: `uv run backup_tool.py clear-schedule --help`
 | 
			
		||||
#########################################################################################
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# /// script
 | 
			
		||||
# requires-python = ">=3.10"
 | 
			
		||||
# dependencies = [
 | 
			
		||||
#     "docker",
 | 
			
		||||
#     "loguru",
 | 
			
		||||
#     "python-crontab",
 | 
			
		||||
#     "typer",
 | 
			
		||||
#     "pyyaml",
 | 
			
		||||
# ]
 | 
			
		||||
# ///
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
import os
 | 
			
		||||
import pwd
 | 
			
		||||
import subprocess
 | 
			
		||||
import sys
 | 
			
		||||
import re
 | 
			
		||||
import time
 | 
			
		||||
import json
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
 | 
			
		||||
import yaml
 | 
			
		||||
import docker
 | 
			
		||||
import typer
 | 
			
		||||
from crontab import CronTab
 | 
			
		||||
from loguru import logger
 | 
			
		||||
from pprint import pformat
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# CLI application object; the commands defined below register on it via @app.command().
app = typer.Typer(add_completion=False)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Loguru format shared by every sink: timestamp | level | bound component name - message.
# The `extra[name]` field is filled in through `logger.bind(name=...)`.
log_format = "".join(
    (
        "<green>{time:YYYY-MM-DD at HH:mm:ss}</green> | ",
        "<level>{level: <7}</level> | ",
        "<bold><magenta>{extra[name]}</magenta></bold> - ",
        "{message}",
    )
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ClearMLBackupManager:
 | 
			
		||||
    def __init__(self, docker_compose_file: str):
        """
        Set up logging, load the Docker Compose file and locate the ClearML containers.

        Args:
            docker_compose_file (str): Path to the ClearML server Docker Compose file.

        Raises:
            FileNotFoundError: If `docker_compose_file` does not exist.
            RuntimeError: If the Elasticsearch, MongoDB or Redis container cannot be identified.
        """
        # Minute-resolution timestamp, fixed for the lifetime of this manager;
        # it names both the snapshot folder and the Elasticsearch snapshot.
        self.timestamp = datetime.now().strftime("%Y-%m-%d-%H%M")
        self.docker_compose_file = docker_compose_file

        # setup logging
        # remove() without an id drops every registered sink. remove(0) raises
        # ValueError once sink 0 is gone, i.e. on a second manager in the same process.
        self.logger = logger
        self.logger.remove()
        self.logger.add(
            sys.stdout,
            format=log_format,
            colorize=True,
        )
        self.logger = self.logger.bind(name="MAIN")

        # parse docker compose file
        if not os.path.exists(docker_compose_file):
            self.logger.error(f"Docker Compose file not found at {docker_compose_file}.")
            raise FileNotFoundError(f"Docker Compose file not found at {docker_compose_file}.")
        # Context manager so the file handle is closed even if parsing fails
        with open(docker_compose_file, "r") as compose_file:
            self.compose_dict = yaml.safe_load(compose_file)

        # setup containers
        self.containers = self.setup_containers()
        if self.containers is None:
            self.logger.error("Failed to identify containers. Exiting backup process.")
            raise RuntimeError("Failed to identify containers. Exiting backup process.")
 | 
			
		||||
        
 | 
			
		||||
    def cleanup_old_backups(self, backup_root: str, keep_last: int = 2) -> int:
        """
        Removes old ClearML snapshot backups, keeping only the most recent `keep_last` snapshots.

        Only directories named exactly `clearml_snapshot_YYYY-MM-DD-HHMM` are considered;
        anything else in `backup_root` (including renamed snapshots) is left untouched.

        Args:
            backup_root (str): The root directory where backups are stored.
            keep_last (int): Number of most recent snapshots to keep. Must not be negative.

        Returns:
            int: 0 on success (including when there is nothing to clean), 1 on failure.
        """
        import shutil  # local import: only this method needs it

        # A negative value would turn the slice below into "delete the N oldest"
        if keep_last < 0:
            self.logger.error(f"keep_last must not be negative, got {keep_last}")
            return 1

        backup_root_path = Path(backup_root)
        if not backup_root_path.is_dir():
            self.logger.error(f"Backup root directory does not exist or is not a directory: {backup_root}")
            return 1

        # Match folders like: clearml_snapshot_2025-06-05-1030
        # fullmatch (not match) so suffixed names such as "..._1030_manual" are never deleted
        snapshot_pattern = re.compile(r"clearml_snapshot_\d{4}-\d{2}-\d{2}-\d{4}")
        snapshot_dirs = sorted(
            (p for p in backup_root_path.iterdir() if p.is_dir() and snapshot_pattern.fullmatch(p.name)),
            key=lambda p: p.name,  # the timestamp format sorts chronologically as text
            reverse=True,  # most recent first
        )

        if len(snapshot_dirs) <= keep_last:
            self.logger.info(f"Only {len(snapshot_dirs)} snapshots found. Nothing to clean.")
            return 0

        for folder in snapshot_dirs[keep_last:]:
            self.logger.info(f"Removing old snapshot: {folder}")
            try:
                if folder.is_symlink():
                    # rmtree refuses symlinks; drop the link itself, as `rm -rf` would
                    folder.unlink()
                else:
                    shutil.rmtree(folder)
            except OSError as e:
                self.logger.error(f"Failed to delete {folder}: {e}")
                return 1

        return 0
 | 
			
		||||
 | 
			
		||||
    def create_snapshot(self, backup_root: str) -> tuple[int, str]:
        """
        Main method to create a ClearML snapshot. It will backup Elasticsearch, MongoDB, Redis and fileserver data.

        The snapshot is written to `<backup_root>/clearml_snapshot_<timestamp>`, together with a copy
        of the Docker Compose file, the ClearML config directory and a `clearml_backup.log` file.

        Args:
            backup_root (str): The root directory where backups are stored.

        Returns:
            tuple[int, str]: Status (0 on success, non-zero on failure) and the snapshot directory path.
        """
        backup_path = os.path.join(backup_root, f"clearml_snapshot_{self.timestamp}")
        os.makedirs(backup_path, exist_ok=True)

        # Route logger to the snapshot directory
        self.logger.add(
            os.path.join(backup_path, "clearml_backup.log"),
            format=log_format,
        )
        self.logger.info("Starting ClearML snapshot creation...")

        # Copy Docker Compose file to backup directory
        # (no check=True: a failed copy is reported through the returned status, not an exception)
        compose_backup_path = os.path.join(backup_path, "docker-compose.yml")
        response = subprocess.run(["cp", self.docker_compose_file, compose_backup_path])
        if response.returncode != 0:
            self.logger.error(f"Failed to copy Docker Compose file to {compose_backup_path}.")
            return 1, backup_path
        self.logger.info(f"Copied Docker Compose file to {compose_backup_path}.")

        # Copy config directory to backup directory
        config_dir_path = None
        for volume in self.compose_dict["services"]["apiserver"]["volumes"]:
            if "/opt/clearml/config" in volume:
                # host side of the "host:container" mapping
                config_dir_path = volume.split(":")[0]
                break
        # Validate BEFORE copying, otherwise `cp` is invoked with None or a missing path
        if not config_dir_path or not os.path.exists(config_dir_path):
            self.logger.error(f"Config directory not found in Docker Compose file or does not exist: {config_dir_path}")
            return 1, backup_path
        config_backup_path = os.path.join(backup_path, "config")
        response = subprocess.run(["cp", "-r", config_dir_path, config_backup_path])
        if response.returncode != 0:
            self.logger.error(f"Failed to copy config directory from {config_dir_path} to {config_backup_path}.")
            return 1, backup_path
        self.logger.info(f"Copied config directory from {config_dir_path} to {config_backup_path}.")

        # Run every component backup in order and stop at the first failure.
        # Each status is taken from the step that just ran (the MongoDB result used to be dropped).
        backup_steps = (
            ("ELASTICSEARCH", "Elasticsearch", self.backup_elasticsearch),
            ("MONGODB", "MongoDB", self.backup_mongodb),
            ("REDIS", "Redis", self.backup_redis),
            ("FILESERVER", "Fileserver", self.backup_fileserver),
        )
        for log_name, label, backup_step in backup_steps:
            self.logger = self.logger.bind(name=log_name)
            status = backup_step(backup_path)
            if status != 0:
                self.logger.error(f"{label} backup failed. Exiting backup process.")
                return status, backup_path

        self.logger = self.logger.bind(name="MAIN")
        self.logger.info("ClearML snapshot created successfully.")
        return 0, backup_path
 | 
			
		||||
        
 | 
			
		||||
    def restore_snapshot(self, backup_path: str) -> int:
        """
        Restore a ClearML snapshot: Elasticsearch, MongoDB, Redis and fileserver, in that order.

        Args:
            backup_path (str): Snapshot directory to restore from.

        Returns:
            int: 0 on success, 1 if `backup_path` does not exist, otherwise the
            status of the first component restore that failed.
        """
        if not os.path.exists(backup_path):
            self.logger.error(f"Backup path does not exist: {backup_path}")
            return 1

        self.logger.info("Starting ClearML snapshot restoration...")

        # (logger name, restore method name, message logged when that step fails)
        restore_plan = (
            ("ELASTICSEARCH", "restore_elasticsearch", "Elasticsearch restoration failed. Exiting restore process."),
            ("MONGODB", "restore_mongodb", "MongoDB restoration failed. Exiting restore process."),
            ("REDIS", "restore_redis", "Redis restoration failed. Exiting restore process."),
            ("FILESERVER", "restore_fileserver", "Fileserver restoration failed. Exiting restore process."),
        )
        for component, method_name, failure_message in restore_plan:
            self.logger = self.logger.bind(name=component)
            outcome = getattr(self, method_name)(backup_path)
            if outcome != 0:
                self.logger.error(failure_message)
                return outcome

        self.logger = self.logger.bind(name="MAIN")
        self.logger.info("ClearML snapshot restored successfully.")
        return 0
 | 
			
		||||
 | 
			
		||||
    def setup_containers(self) -> dict | None:
        """
        Identifies ClearML containers and returns them in a dictionary.

        Containers returned by the Docker client's `containers.list()` are matched by name
        fragment (`clearml-elastic`, `clearml-mongo`, `clearml-redis`). If several containers
        match the same service, the first one found is kept and an error is logged.

        Returns:
            dict | None: Mapping with the keys `elastic`, `mongo` and `redis`, or None
            if any of the three containers is missing.
        """
        # service key -> (container name fragment, display name used in log messages)
        known_services = {
            "elastic": ("clearml-elastic", "Elasticsearch"),
            "mongo": ("clearml-mongo", "MongoDB"),
            "redis": ("clearml-redis", "Redis"),
        }

        containers = {}
        docker_client = docker.from_env()
        for container in docker_client.containers.list():
            for key, (name_fragment, label) in known_services.items():
                if name_fragment not in container.name:
                    continue
                if key in containers:
                    # Keep the first match, as the message promises (it used to be overwritten)
                    self.logger.error(
                        f"Multiple {label} containers found: {containers[key].id} and {container.id}. Using the first one."
                    )
                else:
                    containers[key] = container
                    self.logger.info(f"Found {label} container: {container.name} ({container}, {container.image})")
                break  # a container is assigned to at most one service

        for key, (_, label) in known_services.items():
            if key not in containers:
                self.logger.error(f"No {label} container found.")
                return None

        return containers
 | 
			
		||||
 | 
			
		||||
    def backup_elasticsearch(self, backup_path: str) -> int:
        """
        Backs up Elasticsearch data by creating a snapshot and copying it to the host.

        The snapshot repository location is read from the `path.repo` entry of the
        `elasticsearch` service environment in the Docker Compose file.

        Args:
            backup_path (str): Snapshot directory on the host that receives the repository folder.

        Returns:
            int: 0 on success, 1 on failure.
        """
        es_environment = self.compose_dict["services"]["elasticsearch"]["environment"]
        if "path.repo" not in es_environment:
            self.logger.error("Elasticsearch path.repo environment variable not found in Docker Compose file.")
            return 1

        elastic = self.containers["elastic"]
        es_container_backup_dir = es_environment["path.repo"]
        # `docker cp` creates <backup_path>/<last component of path.repo> on the host
        es_local_backup_dir = os.path.join(backup_path, os.path.basename(os.path.normpath(es_container_backup_dir)))
        repo_name = "backup"
        snapshot_name = f"snapshot_{self.timestamp}"

        def run_curl(command: str) -> dict:
            """Run a curl command inside the container and decode its JSON reply.

            A reply that is not a JSON object (e.g. a curl error message) is
            returned as `{"error": <raw text>}` instead of raising.
            """
            raw = elastic.exec_run(command).output.decode()
            if not raw:
                return {}
            try:
                reply = json.loads(raw)
            except json.JSONDecodeError:
                return {"error": raw}
            return reply if isinstance(reply, dict) else {"error": raw}

        # Register snapshot repo
        self.logger.info(f"Registering Elasticsearch snapshot repository '{repo_name}' at {es_container_backup_dir}...")
        response = run_curl(
            f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
            f"-H 'Content-Type: application/json' "
            f"-d '{{\"type\": \"fs\", \"settings\": {{\"location\": \"{es_container_backup_dir}\"}}}}'"
        )
        if "error" in response:
            self.logger.error(f"Failed to register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
            return 1
        self.logger.info(f"Elasticsearch snapshot repository registered: \n{pformat(response)}")

        # Trigger snapshot (wait_for_completion makes the request block until it is done)
        self.logger.info("Elasticsearch snapshot creation started...")
        response = run_curl(
            f"curl -s -X PUT localhost:9200/_snapshot/{repo_name}/{snapshot_name}?wait_for_completion=true"
        )
        if "error" in response:
            self.logger.error(f"Failed to create Elasticsearch snapshot: \n{pformat(response['error'])}")
            return 1
        self.logger.info(f"Elasticsearch snapshot created: \n{pformat(response)}")

        # Copy snapshot data from container
        self.logger.info(f"Copying Elasticsearch snapshot data from container to local directory: {es_local_backup_dir}")
        try:
            copy_result = subprocess.run([
                "docker",
                "cp",
                f"{elastic.id}:{es_container_backup_dir}",
                backup_path,
                "-q",
            ])
        except OSError as e:
            # e.g. the docker executable is not on PATH
            self.logger.error(f"Failed to run docker cp: {e}")
            return 1
        if copy_result.returncode != 0:
            self.logger.error(f"Failed to copy Elasticsearch snapshot data from container (exit code {copy_result.returncode}).")
            return 1

        # check files got copied
        if not os.path.exists(es_local_backup_dir) or not os.listdir(es_local_backup_dir):
            self.logger.error("Elasticsearch backup directory is empty. Backup failed.")
            return 1

        self.logger.info(f"Elasticsearch snapshot data copied to: {es_local_backup_dir}")
        return 0
 | 
			
		||||
    
 | 
			
		||||
    def restore_elasticsearch(self, backup_path: str) -> int:
        """
        Restores Elasticsearch data from a snapshot.

        The snapshot files are copied into the container's repository path, the repository is
        re-registered, existing indices are closed, and the snapshot named after the timestamp
        suffix of `backup_path` is restored.

        Args:
            backup_path (str): Snapshot directory on the host (e.g. `.../clearml_snapshot_2025-06-05-1030`).

        Returns:
            int: 0 on success, 1 on failure.
        """
        elastic = self.containers["elastic"]

        def run_curl(command: str) -> dict:
            """Run a curl command inside the container and decode its JSON reply.

            A reply that is not a JSON object (e.g. a curl error message) is
            returned as `{"error": <raw text>}` instead of raising.
            """
            raw = elastic.exec_run(command).output.decode()
            if not raw:
                return {}
            try:
                reply = json.loads(raw)
            except json.JSONDecodeError:
                return {"error": raw}
            return reply if isinstance(reply, dict) else {"error": raw}

        # Copy the snapshot files back into the container's repo path
        es_repo = self.compose_dict["services"]["elasticsearch"]["environment"]["path.repo"]
        # normpath drops a trailing slash so dirname/basename split the path the same
        # way the backup does
        es_repo_normalized = os.path.normpath(es_repo)
        es_repo_root = os.path.dirname(es_repo_normalized)
        host_snapshot_dir = os.path.join(backup_path, os.path.basename(es_repo_normalized))
        if not os.path.isdir(host_snapshot_dir):
            self.logger.error(f"Elasticsearch snapshot directory not found: {host_snapshot_dir}")
            return 1

        self.logger.info(f"Copying Elasticsearch snapshot files from {host_snapshot_dir} to container at {es_repo_root}")
        try:
            # No check=True: failures are reported through the returned status
            copy_result = subprocess.run([
                "docker", "cp",
                host_snapshot_dir,
                f"{elastic.id}:{es_repo_root}",
            ])
            copied = copy_result.returncode == 0
        except OSError as e:
            # e.g. the docker executable is not on PATH
            self.logger.error(f"Failed to run docker cp: {e}")
            copied = False
        if not copied:
            self.logger.error(f"Failed to copy Elasticsearch snapshot files from {host_snapshot_dir} to container.")
            return 1
        self.logger.info(f"Copied Elasticsearch snapshot into container at {es_repo}")

        # Re-register the repo
        self.logger.info("Re-registering Elasticsearch snapshot repository...")
        repo_name = "backup"
        response = run_curl(
            f"curl -s -X PUT localhost:9200/_snapshot/{repo_name} "
            f"-H 'Content-Type: application/json' "
            f"-d '{{\"type\":\"fs\",\"settings\":{{\"location\":\"{es_repo}\"}}}}'"
        )
        self.logger.info(f"Elasticsearch snapshot repository re-registration response: \n{pformat(response)}")
        if "error" in response:
            self.logger.error(f"Failed to re-register Elasticsearch snapshot repository: \n{pformat(response['error'])}")
            return 1
        self.logger.info("Elasticsearch snapshot repository re-registered successfully.")

        # Close any existing indices
        self.logger.info("Closing all Elasticsearch indices to avoid conflicts during restore...")
        raw_indices = elastic.exec_run("curl -s localhost:9200/_cat/indices?h=index").output.decode()
        # Strip every line: padding inside the joined list would split the curl command
        indices = [line.strip() for line in raw_indices.splitlines() if line.strip()]
        if indices:
            index_list = ",".join(indices)
            response = run_curl(f"curl -s -X POST localhost:9200/{index_list}/_close")
            self.logger.info(f"Close indices response: \n{pformat(response)}")
            if "error" in response:
                self.logger.error(f"Failed to close Elasticsearch indices: \n{pformat(response['error'])}")
                return 1
            self.logger.info("Closed all Elasticsearch indices.")
        else:
            self.logger.info("No Elasticsearch indices found to close.")

        # Trigger the restore
        # The snapshot name mirrors the folder name: clearml_snapshot_<ts> -> snapshot_<ts>.
        # Normalise first so a trailing slash on backup_path does not end up in the name.
        snapshot_dir_name = os.path.basename(os.path.normpath(backup_path))
        snap_timestamp = snapshot_dir_name.split("_")[-1]
        snap_name = f"snapshot_{snap_timestamp}"
        self.logger.info(f"Restoring Elasticsearch snapshot: {snap_name} from repository: {repo_name}...")
        response = run_curl(
            f"curl -s -X POST localhost:9200/_snapshot/{repo_name}/{snap_name}/_restore?wait_for_completion=true "
            f"-H 'Content-Type: application/json' -d '{{\"include_global_state\":true}}'"
        )
        if "error" in response:
            self.logger.error(f"Failed to restore Elasticsearch snapshot: {pformat(response['error'])}")
            return 1
        self.logger.info(f"Elasticsearch snapshot restored: {pformat(response)}")

        self.logger.info("Elasticsearch snapshot restored.")
        return 0
 | 
			
		||||
    
 | 
			
		||||
    def backup_mongodb(self, backup_path: str) -> int:
        """
        Backs up MongoDB data by creating a dump inside the container and copying it to the host.

        Args:
            backup_path (str): Snapshot directory on the host; the dump is copied to `<backup_path>/mongo_backup`.

        Returns:
            int: 0 on success, 1 on failure.
        """
        mongo = self.containers["mongo"]
        mongo_container_backup_dir = "/tmp/mongodump"
        mongo_backup_dir = os.path.join(backup_path, "mongo_backup")

        # clean up old dump directory inside the container if it exists
        self.logger.info(f"Cleaning up old MongoDB backup directory: {mongo_container_backup_dir}")
        mongo.exec_run(f"rm -rf {mongo_container_backup_dir}")

        # create the dump inside the container
        self.logger.info(f"Creating MongoDB dump inside the container: {mongo_container_backup_dir}")
        response = mongo.exec_run(f"mongodump --out {mongo_container_backup_dir}")
        if response.exit_code != 0:
            self.logger.error(f"Failed to create MongoDB dump: {response.output.decode()}")
            return 1
        self.logger.info(f"MongoDB dumped: {response.output.decode()}")

        # copy backup from container to host
        self.logger.info(f"Copying MongoDB backup data from container to local directory: {mongo_backup_dir}")
        try:
            copy_result = subprocess.run([
                "docker",
                "cp",
                f"{mongo.id}:{mongo_container_backup_dir}",
                mongo_backup_dir,
                "-q",
            ])
        except OSError as e:
            # e.g. the docker executable is not on PATH
            self.logger.error(f"Failed to run docker cp: {e}")
            return 1
        if copy_result.returncode != 0:
            self.logger.error(f"Failed to copy MongoDB dump from container (exit code {copy_result.returncode}).")
            return 1

        # check files got copied
        if not os.path.exists(mongo_backup_dir) or not os.listdir(mongo_backup_dir):
            self.logger.error("MongoDB backup directory is empty. Backup failed.")
            return 1

        self.logger.info(f"MongoDB backup data copied to: {mongo_backup_dir}")
        return 0
 | 
			
		||||
    
 | 
			
		||||
    def restore_mongodb(self, backup_path: str) -> int:
        """
        Restores MongoDB data from a snapshot by copying the dump back into the container and restoring it.

        Args:
            backup_path (str): Snapshot directory on the host containing the `mongo_backup` folder.

        Returns:
            int: 0 on success, 1 on failure.
        """
        mongo = self.containers["mongo"]

        # Copy dump back into container
        container_target = "/tmp/mongodump_restore"
        host_dump_dir = os.path.join(backup_path, "mongo_backup")
        if not os.path.isdir(host_dump_dir):
            self.logger.error(f"MongoDB dump directory not found: {host_dump_dir}")
            return 1

        self.logger.info(f"Copying MongoDB dump from {host_dump_dir} to container at {container_target}")
        try:
            # No check=True: failures are reported through the returned status
            copy_result = subprocess.run([
                "docker", "cp",
                host_dump_dir,
                f"{mongo.id}:{container_target}",
            ])
            copied = copy_result.returncode == 0
        except OSError as e:
            # e.g. the docker executable is not on PATH
            self.logger.error(f"Failed to run docker cp: {e}")
            copied = False
        if not copied:
            self.logger.error(f"Failed to copy MongoDB dump from {host_dump_dir} to container.")
            return 1
        self.logger.info(f"Copied Mongo dump into container at {container_target}")

        # Restore to overwrite existing data (--drop removes each collection before restoring it)
        self.logger.info("Restoring MongoDB data from dump...")
        response = mongo.exec_run(
            f"mongorestore --drop {container_target}",
            user="mongodb",
        )
        if response.exit_code != 0:
            self.logger.error(f"Failed to restore MongoDB data: {response.output.decode()}")
            return 1

        self.logger.info("MongoDB data restored successfully.")
        return 0
 | 
			
		||||
    
 | 
			
		||||
    def backup_redis(self, backup_path: str) -> int:
        """
        Back up Redis: issue `redis-cli SAVE` in the container, then copy `/data/dump.rdb` to the host.

        Args:
            backup_path (str): Snapshot directory on the host; receives `dump.rdb`.

        Returns:
            int: 0 on success, 1 if SAVE does not answer OK or the copy fails.
        """
        redis_container = self.containers["redis"]
        redis_local_backup_file = os.path.join(backup_path, "dump.rdb")

        # Ask Redis to write its dataset to disk
        self.logger.info("Triggering Redis SAVE to create a snapshot...")
        save_reply = redis_container.exec_run("redis-cli SAVE").output.decode()
        if not save_reply.startswith("OK"):
            self.logger.error(f"Failed to trigger Redis SAVE command: {save_reply}")
            return 1
        self.logger.info(f"Redis SAVE command response: {save_reply}")

        # Pull the dump file out of the container
        self.logger.info(f"Copying Redis dump.rdb from container to local file: {redis_local_backup_file}")
        copy_command = [
            "docker",
            "cp",
            f"{redis_container.id}:/data/dump.rdb",
            redis_local_backup_file,
            "-q",
        ]
        if subprocess.run(copy_command).returncode != 0:
            self.logger.error(f"Failed to copy Redis dump.rdb from container to {redis_local_backup_file}.")
            return 1

        self.logger.info(f"Redis backup file copied to: {redis_local_backup_file}")
        return 0
 | 
			
		||||
    
 | 
			
		||||
    def restore_redis(self, backup_path: str) -> int:
        """
        Restores Redis data from a snapshot by copying the dump.rdb file back into the container and restarting it.

        The container is stopped only after the dump file has been found on the host, and it is
        started again even when the copy fails, so a failed restore does not leave Redis down.

        Args:
            backup_path (str): Snapshot directory on the host containing `dump.rdb`.

        Returns:
            int: 0 on success, 1 on failure.
        """
        host_rdb = os.path.join(backup_path, "dump.rdb")
        if not os.path.isfile(host_rdb):
            self.logger.error(f"Redis dump file not found: {host_rdb}")
            return 1

        redis_container = self.containers["redis"]

        # Stop Redis to avoid racing writes
        redis_container.stop()
        self.logger.info("Redis container stopped for restore.")

        try:
            # Copy dump.rdb back into container
            try:
                # No check=True: failures are reported through the returned status
                copy_result = subprocess.run([
                    "docker", "cp",
                    host_rdb,
                    f"{redis_container.id}:/data/dump.rdb",
                ])
                copied = copy_result.returncode == 0
            except OSError as e:
                # e.g. the docker executable is not on PATH
                self.logger.error(f"Failed to run docker cp: {e}")
                copied = False
            if not copied:
                self.logger.error(f"Failed to copy Redis dump.rdb from {host_rdb} to container.")
                return 1
            self.logger.info("Copied dump.rdb into Redis container.")
        finally:
            # Restart Redis whether or not the copy succeeded
            redis_container.start()
            self.logger.info("Redis container restarted.")

        return 0
 | 
			
		||||
 | 
			
		||||
    def backup_fileserver(self, backup_path: str) -> int:
        """
        Backs up fileserver data by rsyncing the fileserver host path into `<backup_path>/fileserver`.

        The host path is taken from the `fileserver` service volume that mentions `/mnt/fileserver`
        in the Docker Compose file.

        Args:
            backup_path (str): Snapshot directory on the host.

        Returns:
            int: 0 on success, 1 on failure.
        """
        fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
        fileserver_path = None
        for volume in fileserver_volumes:
            if "/mnt/fileserver" in volume:
                # host side of the "host:container" mapping
                fileserver_path = volume.split(":")[0]
                self.logger.info(f"Fileserver path: {fileserver_path}")
                break

        # Ensure fileserver path exists
        if not fileserver_path:
            self.logger.error("Fileserver path not found in Docker Compose file.")
            return 1
        if not os.path.exists(fileserver_path):
            self.logger.error(f"Fileserver path does not exist: {fileserver_path}")
            return 1
        self.logger.info(f"Copying fileserver from {fileserver_path} with rsync...")

        # Copy fileserver data
        # Both paths carry an explicit trailing slash so rsync mirrors the CONTENTS of the
        # host directory into <backup_path>/fileserver. This keeps --delete confined to that
        # sub-directory (a host path written with a trailing slash used to make --delete
        # operate on the snapshot root) and makes the target name independent of the host
        # directory's name.
        fileserver_backup_dir = os.path.join(backup_path, "fileserver")
        rsync_source = os.path.join(os.path.normpath(fileserver_path), "")
        rsync_target = os.path.join(fileserver_backup_dir, "")
        try:
            # No check=True: failures are reported through the returned status
            response = subprocess.run(["rsync", "-av", "--delete", rsync_source, rsync_target])
        except OSError as e:
            # e.g. the rsync executable is not on PATH
            self.logger.error(f"Rsync failed: {e}.")
            return 1
        if response.returncode != 0:
            self.logger.error(f"Rsync failed: {response}.")
            return 1
        self.logger.info("Rsync successful.")

        # Check files got copied
        if not os.path.exists(fileserver_backup_dir) or not os.listdir(fileserver_backup_dir):
            self.logger.error("Fileserver backup directory is empty. Backup failed.")
            return 1

        self.logger.info(f"Fileserver data copied to: {fileserver_backup_dir}")
        return 0
 | 
			
		||||
 | 
			
		||||
    def restore_fileserver(self, backup_path: str) -> int:
        """
        Restores fileserver data from a snapshot by rsyncing it back to the live volume.

        The target is the host path of the `fileserver` service volume that mentions
        `/mnt/fileserver` in the Docker Compose file; the source is `<backup_path>/fileserver`.

        Args:
            backup_path (str): Snapshot directory on the host.

        Returns:
            int: 0 on success, 1 on failure.
        """
        # Read original volume mount from compose
        fileserver_volumes = self.compose_dict["services"]["fileserver"]["volumes"]
        fileserver_path = None
        for volume in fileserver_volumes:
            if "/mnt/fileserver" in volume:
                # host side of the "host:container" mapping
                fileserver_path = volume.split(":")[0]
                self.logger.info(f"Fileserver path: {fileserver_path}")
                break
        # Without this guard rsync would be invoked with None and raise TypeError
        if not fileserver_path:
            self.logger.error("Fileserver path not found in Docker Compose file.")
            return 1

        src = os.path.join(backup_path, "fileserver")
        if not os.path.isdir(src):
            self.logger.error(f"Fileserver backup directory not found: {src}")
            return 1
        self.logger.info(f"Restoring fileserver to {fileserver_path}")

        # Rsync backup back into the live volume
        # Trailing slashes make rsync mirror the CONTENTS of the backup folder into the
        # host directory, whatever that directory is called and however it is written
        # in the compose file.
        rsync_source = os.path.join(src, "")
        rsync_target = os.path.join(os.path.normpath(fileserver_path), "")
        try:
            # No check=True: failures are reported through the returned status
            response = subprocess.run([
                "rsync", "-av", "--delete",
                rsync_source, rsync_target,
            ])
        except OSError as e:
            # e.g. the rsync executable is not on PATH
            self.logger.error(f"Rsync failed: {e}.")
            return 1
        if response.returncode != 0:
            self.logger.error(f"Rsync failed: {response}.")
            return 1

        self.logger.info("Fileserver data restored successfully.")
        return 0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.command()
def create_snapshot(
    backup_root: str = typer.Option(
        help="Root directory where ClearML backups will be stored."
    ),
    docker_compose_file: str = typer.Option(
        help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml')."
    ),
    retention: int = typer.Option(
        0,
        help="Number of most recent snapshots to keep. Older snapshots will be deleted. Default is 0 (no clean up).",
    ),
):
    """Create a timestamped ClearML snapshot."""
    # NOTE: the docstring above is shown as CLI help text, so further notes
    # live in comments.
    # Flow: create the snapshot, prune old snapshots only after a successful
    # run (and only when retention > 0), then print a one-line summary.
    # A failed backup exits with status 1 so callers can detect it.
    tic = time.time()
    backup_manager = ClearMLBackupManager(docker_compose_file=docker_compose_file)
    status, backup_path = backup_manager.create_snapshot(backup_root)

    if status != 0:
        typer.secho(
            f"{datetime.now()} | Backup failed. Check snapshot logs for details: {backup_path}",
            fg=typer.colors.RED,
        )
        # Surface the failure to the calling shell instead of exiting 0.
        raise typer.Exit(code=1)

    if retention > 0:
        backup_manager.cleanup_old_backups(backup_root, keep_last=retention)

    elapsed = timedelta(seconds=int(time.time() - tic))
    typer.secho(
        f"{datetime.now()} | Backup completed in {str(elapsed)}. Snapshot located in {backup_path}.",
        fg=typer.colors.GREEN,
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.command()
def restore_snapshot(
    snapshot_path: str = typer.Option(
        help="Path to the ClearML snapshot directory to restore from."
    )
):
    """Restore a ClearML snapshot."""
    # NOTE: the docstring above is shown as CLI help text, so further notes
    # live in comments.
    # Flow: print the pre-flight checklist, ask for confirmation (declining
    # aborts), then restore using the docker-compose.yml stored inside the
    # snapshot directory. A failed restore exits with status 1.
    warnings = (
        "WARNING! This will overwrite existing ClearML data. Proceed with caution.",
        "Before you proceed, make sure that:",
        "- You have a manual backup of your current ClearML data (in case there are any on the current server instance).",
        "- The data subfolders are created with correct permissions (see https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server_linux_mac).",
        f"- You are using a docker-compose.yml and config/ copy from {snapshot_path}.",
        "- The target ClearML server instance is up and running.",
    )
    for line in warnings:
        typer.secho(line, fg=typer.colors.YELLOW)
    typer.confirm("Do you want to proceed with the restoration?", abort=True)

    # Drop every trailing slash (not just one); a path consisting only of
    # slashes is left untouched rather than collapsed to an empty string.
    snapshot_path = snapshot_path.rstrip("/") or snapshot_path

    backup_manager = ClearMLBackupManager(
        docker_compose_file=os.path.join(snapshot_path, "docker-compose.yml")
    )
    status = backup_manager.restore_snapshot(snapshot_path)
    if status != 0:
        typer.secho("Snapshot restoration failed. Check logs for details", fg=typer.colors.RED)
        # Surface the failure to the calling shell instead of exiting 0.
        raise typer.Exit(code=1)

    typer.secho("Snapshot restored successfully.", fg=typer.colors.GREEN)
 | 
			
		||||
    
 | 
			
		||||
    
 | 
			
		||||
@app.command()
def clear_schedule():
    """Clear the existing ClearML backup cron job."""
    # Operates on the crontab of the user running this process.
    account_name = pwd.getpwuid(os.getuid()).pw_name
    crontab = CronTab(user=account_name)

    # Jobs created by this tool are identified by their comment tag.
    tagged_jobs = [entry for entry in crontab if entry.comment == "clearml-backup-tool"]
    for entry in tagged_jobs:
        typer.secho(f"Clearing cron job: {entry}", fg=typer.colors.BLUE)
        crontab.remove(entry)

    # Persist the edited crontab (written even when nothing matched).
    crontab.write()
    typer.secho("Cleared all existing ClearML backup cron jobs.", fg=typer.colors.GREEN)
 | 
			
		||||
    
 | 
			
		||||
    
 | 
			
		||||
@app.command()
 | 
			
		||||
def setup_schedule(
 | 
			
		||||
    backup_root: str = typer.Option(
 | 
			
		||||
        "./clearml_backup",
 | 
			
		||||
        help="Root directory where ClearML backups will be stored. Default is './clearml_backup'.",
 | 
			
		||||
        prompt="Enter the backup root directory",
 | 
			
		||||
    ),
 | 
			
		||||
    docker_compose_file: str = typer.Option(
 | 
			
		||||
        "/opt/clearml/docker-compose.yml",
 | 
			
		||||
        help="Path to the Docker Compose file for ClearML server (typically '/opt/clearml/docker-compose.yml').",
 | 
			
		||||
        prompt="Enter the path to the Docker Compose file"
 | 
			
		||||
    ),
 | 
			
		||||
    retention: int = typer.Option(
 | 
			
		||||
        2,
 | 
			
		||||
        help="Number of most recent snapshots to keep. Older snapshots will be deleted. (0 = no cleanup).",
 | 
			
		||||
        prompt="Enter the number of most recent snapshots to keep (0 = no cleanup)"
 | 
			
		||||
    ),
 | 
			
		||||
    backup_period: str = typer.Option(
 | 
			
		||||
        "7d",
 | 
			
		||||
        help="Backup period for the cron job in the format '{number}{unit}} where unit is one of 'm' (minutes), 'h' (hours), 'd' (days)'.",
 | 
			
		||||
        prompt="Enter the backup period for the cron job (format: '{number}{unit}').",
 | 
			
		||||
    )
 | 
			
		||||
):
 | 
			
		||||
    """Set up a cron job to automatically create ClearML snapshots. You can run this without any arguments to go through an interactive setup."""
 | 
			
		||||
    assert re.match(r'^\d+[mhd]$', backup_period), "Backup period must be in the format '{number}{unit}' where unit is one of 'm', 'h', 'd'."
 | 
			
		||||
    
 | 
			
		||||
    user = pwd.getpwuid(os.getuid())
 | 
			
		||||
    cron = CronTab(user=user.pw_name)
 | 
			
		||||
    abs_backup_root = os.path.abspath(backup_root)
 | 
			
		||||
    abs_docker_compose_file = os.path.abspath(docker_compose_file)
 | 
			
		||||
    
 | 
			
		||||
    for job in cron:
 | 
			
		||||
        if job.comment == "clearml-backup-tool":
 | 
			
		||||
            typer.secho(f"Clearing cron job: {job}", fg=typer.colors.BLUE)
 | 
			
		||||
            cron.remove(job)
 | 
			
		||||
    cron.write()
 | 
			
		||||
    
 | 
			
		||||
    uv_path = subprocess.run(["which", "uv"], capture_output=True, text=True, check=True).stdout.strip()
 | 
			
		||||
    command = (f"{uv_path} run {os.path.abspath(__file__)} create-snapshot " 
 | 
			
		||||
               f"--backup-root {abs_backup_root} "
 | 
			
		||||
               f"--docker-compose-file {abs_docker_compose_file} "
 | 
			
		||||
               f"--retention {retention} "
 | 
			
		||||
               f"| tail -n 1 >> {abs_backup_root}/autobackup.log 2>&1"
 | 
			
		||||
               )
 | 
			
		||||
    job = cron.new(command=command, comment="clearml-backup-tool")
 | 
			
		||||
    num, unit = int(backup_period[:-1]), backup_period[-1]
 | 
			
		||||
    match unit:
 | 
			
		||||
        case 'm':
 | 
			
		||||
            job.minute.every(num)
 | 
			
		||||
        case 'h':
 | 
			
		||||
            job.hour.every(num)
 | 
			
		||||
        case 'd':
 | 
			
		||||
            job.day.every(num)
 | 
			
		||||
        case _:
 | 
			
		||||
            raise ValueError(f"Invalid backup period unit: {unit}. Must be one of 'm', 'h', 'd'.")
 | 
			
		||||
    cron.write()
 | 
			
		||||
    
 | 
			
		||||
    for job in cron:
 | 
			
		||||
        if job.comment == "clearml-backup-tool":
 | 
			
		||||
            break
 | 
			
		||||
    typer.secho(f"Set up cron job: {job}", fg=typer.colors.BLUE)
 | 
			
		||||
    typer.secho(f"Scheduled ClearML backup every {num}{unit}. Job will log to {abs_backup_root}/autobackup.log.", fg=typer.colors.GREEN)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Script entry point: when executed directly (not imported), run the `app`
# CLI object that the `@app.command()` functions above are registered on.
if __name__ == "__main__":
    app()
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user