Add support for auto-detecting torch and transformers accelerate distributed execution

allegroai 2024-01-06 12:34:32 +02:00
parent 141a183235
commit 801c7b4cd4
4 changed files with 180 additions and 11 deletions
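
Background for the changes below: both launchers export distinctive environment variables in every worker process, and the new detection code keys off those. A minimal sketch of the check, mirroring the code added in this commit (the launcher commands in the comments are typical invocations, not taken from this diff):

import os

# `torchrun ...` / `python -m torch.distributed.run ...` (torch elastic) set TORCHELASTIC_RUN_ID
is_torch_distributed = os.environ.get("TORCHELASTIC_RUN_ID") is not None
# `accelerate launch ...` sets ACCELERATE_DYNAMO_MODE among its ACCELERATE_* variables
is_transformers_distributed = os.environ.get("ACCELERATE_DYNAMO_MODE") is not None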

View File

@@ -102,7 +102,7 @@ class ScriptRequirements(object):
        for fname, lines in tfmodule.items():
            modules.add('tensorflow', fname, lines)
-       # if we have torch and it supports tensorboard, we should add that as well
+       # if we have torch, and it supports tensorboard, we should add that as well
        # (because it will not be detected automatically)
        if 'torch' in modules and 'tensorboard' not in modules and 'tensorboardX' not in modules:
            # noinspection PyBroadException
@@ -336,14 +336,14 @@ class _JupyterObserver(object):
            # noinspection PyBroadException
            try:
                # noinspection PyPackageRequirements
-               from nbconvert.exporters import PythonExporter
+               from nbconvert.exporters import PythonExporter  # noqa
                _script_exporter = PythonExporter()
            except Exception:
                _script_exporter = None
            if _script_exporter is None:
                # noinspection PyPackageRequirements
-               from nbconvert.exporters.script import ScriptExporter
+               from nbconvert.exporters.script import ScriptExporter  # noqa
                _script_exporter = ScriptExporter()
        except Exception as ex:
@@ -622,7 +622,7 @@ class ScriptInfo(object):
        # noinspection PyBroadException
        try:
            # noinspection PyPackageRequirements
-           from notebook.notebookapp import list_running_servers  # <= Notebook v6
+           from notebook.notebookapp import list_running_servers  # noqa  <= Notebook v6
            # noinspection PyBroadException
            try:
                jupyter_servers += list(list_running_servers())
@@ -637,7 +637,7 @@ class ScriptInfo(object):
        # noinspection PyBroadException
        try:
            # noinspection PyPackageRequirements
-           from jupyter_server.serverapp import list_running_servers
+           from jupyter_server.serverapp import list_running_servers  # noqa
            # noinspection PyBroadException
            try:
                jupyter_servers += list(list_running_servers())
@@ -724,7 +724,7 @@ class ScriptInfo(object):
        is_google_colab = False
        log_history = False
        colab_name = None
-       # check if this is google.colab, then there is no local file
+       # check if this is `google.colab`, then there is no local file
        is_google_colab = ScriptInfo.is_google_colab()
        if is_google_colab:
@@ -753,7 +753,7 @@ class ScriptInfo(object):
        if not entry_point.exists():
            # noinspection PyBroadException
            try:
-               alternative_entry_point = '-'.join(entry_point_filename.split('-')[:-5])+'.ipynb'
+               alternative_entry_point = '-'.join(entry_point_filename.split('-')[:-5]) + '.ipynb'
                # now we should try to find the actual file
                entry_point_alternative = (Path.cwd() / alternative_entry_point).absolute()
                if not entry_point_alternative.is_file():
@@ -828,7 +828,7 @@ class ScriptInfo(object):
        # returns tuple (notebook name, raw string notebook)
        # None, None if fails
        try:
-           from google.colab import _message
+           from google.colab import _message  # noqa
            notebook = _message.blocking_request('get_ipynb', timeout_sec=timeout)['ipynb']
            notebook_name = notebook.get("metadata", {}).get("colab", {}).get("name", "colab.ipynb")
@@ -995,6 +995,10 @@ class ScriptInfo(object):
        working_dir = cls._get_working_dir(repo_root)
        entry_point = cls._get_entry_point(repo_root, script_path)

+       # check if we are running with torch distributed or transformers accelerate,
+       # and if so, change the entry point to reflect it
+       entry_point = cls._detect_distributed_execution(entry_point, log)
+
        if check_uncommitted:
            # if we have a jupyter notebook, always store the entire notebook (instead of the git diff)
            if jupyter_filepath:
@@ -1010,7 +1014,7 @@ class ScriptInfo(object):
                if len(diff) > cls.max_diff_size_bytes:
                    messages.append(
                        "======> WARNING! Git diff too large to store "
-                       "({}kb), skipping uncommitted changes <======".format(len(diff)//1024))
+                       "({}kb), skipping uncommitted changes <======".format(len(diff) // 1024))
                    auxiliary_git_diff = diff
                    diff = '# WARNING! git diff too large to store, clear this section to execute without it.\n' \
                           '# full git diff available in Artifacts/auxiliary_git_diff\n' \
@@ -1065,6 +1069,52 @@ class ScriptInfo(object):
        return (ScriptInfoResult(script=script_info, warning_messages=messages, auxiliary_git_diff=auxiliary_git_diff),
                script_requirements)

+    @classmethod
+    def _detect_distributed_execution(cls, entry_point, log):
+        # check if we are running with torch distributed or transformers accelerate,
+        # and make sure we change the entry point to reflect it
+        is_torch_distributed = os.environ.get("TORCHELASTIC_RUN_ID") is not None
+        is_transformers_distributed = os.environ.get("ACCELERATE_DYNAMO_MODE") is not None
+        if not is_torch_distributed and not is_transformers_distributed:
+            return entry_point
+
+        # this is a distributed execution
+        # noinspection PyBroadException
+        try:
+            from psutil import Process  # noqa
+            cmdline = Process().parent().cmdline()
+            # first find the launcher module call,
+            # "torch.distributed.run" / "torch.distributed.launch" or "accelerate.commands.*"
+            if is_torch_distributed:
+                cmdstart_i = next(i for i, c in enumerate(cmdline) if c.lower().startswith("torch.distributed."))
+            elif is_transformers_distributed:
+                cmdstart_i = next(i for i, c in enumerate(cmdline) if c.lower().startswith("accelerate.commands."))
+            else:
+                raise Exception()  # we should not get here
+            cmdline = cmdline[cmdstart_i:]
+            # now find our entry point script among the remaining arguments
+            cmdend_i = next(i for i, c in enumerate(cmdline) if Path(c).stem == Path(entry_point).stem)
+            filearg = cmdline[cmdend_i]
+            # notice --args (script args) are passed on the Args section, we skip detecting them here
+            # we also remove the file argument from the cmd (it is the last one before the script args)
+            new_cmd = cmdline[:cmdend_i]
+            # we assume our entry point is the last parameter of the execution cmd line
+            if Path(filearg).stem == Path(entry_point).stem:
+                entry_point = "-m {} {}".format(" ".join(new_cmd), entry_point)
+                if log:
+                    log.info(
+                        "{} execution detected: adjusting entrypoint to "
+                        "reflect distributed execution arguments".format(
+                            "Torch Distributed" if is_torch_distributed else "Transformers Accelerate")
+                    )
+        except Exception:
+            if log:
+                log.warning("{} execution detected: Failed detecting launch arguments, skipping".format(
+                    "Torch Distributed" if is_torch_distributed else "Transformers Accelerate"))
+        return entry_point
+
    @staticmethod
    def __legacy_jupyter_notebook_server_json_parsing():
        # noinspection PyBroadException
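
To make the rewrite in `_detect_distributed_execution` concrete, a hypothetical worked example (the command line is illustrative, not taken from this diff): under torchrun the parent process command line might be ['python', '-m', 'torch.distributed.run', '--nproc_per_node=2', 'train.py'], and the method would produce:

from pathlib import Path

# illustrative parent cmdline and entry point (hypothetical values)
cmdline = ['python', '-m', 'torch.distributed.run', '--nproc_per_node=2', 'train.py']
entry_point = 'train.py'
# trim everything before the launcher module
start = next(i for i, c in enumerate(cmdline) if c.lower().startswith('torch.distributed.'))
cmdline = cmdline[start:]
# locate the script argument and drop it from the launcher command
end = next(i for i, c in enumerate(cmdline) if Path(c).stem == Path(entry_point).stem)
print('-m {} {}'.format(' '.join(cmdline[:end]), entry_point))
# prints: -m torch.distributed.run --nproc_per_node=2 train.py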

View File

@@ -186,7 +186,15 @@ def get_node_id(default=0):
    if node_id is None and (mpi_world_rank is not None or mpi_rank is not None):
        node_id = mpi_world_rank if mpi_world_rank is not None else mpi_rank

-   # if node is is till None, use the default
+   # if node is still None, use the global RANK
+   if node_id is None:
+       # noinspection PyBroadException
+       try:
+           node_id = int(os.environ.get("RANK"))
+       except Exception:
+           pass
+
+   # if node is still None, use the default
    if node_id is None:
        node_id = default
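
As a usage note for the RANK fallback above (values are illustrative): torch elastic exports a global RANK per worker, so when no MPI variables are present the node id resolves from it.

import os

# hypothetical worker environment under torchrun --nnodes=2 --nproc_per_node=4
os.environ["RANK"] = "5"  # global rank, set by the launcher per worker
node_id = int(os.environ.get("RANK"))  # -> 5, used only when no MPI rank is present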

View File

@@ -40,7 +40,8 @@ from .backend_config.defs import get_active_config_file, get_config_file
from .backend_api.services import tasks, projects, events
from .backend_api.session.session import (
    Session, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_HOST, ENV_WEB_HOST, ENV_FILES_HOST, )
-from .backend_api.session.defs import ENV_DEFERRED_TASK_INIT, ENV_IGNORE_MISSING_CONFIG, ENV_OFFLINE_MODE, MissingConfigError
+from .backend_api.session.defs import (ENV_DEFERRED_TASK_INIT, ENV_IGNORE_MISSING_CONFIG,
+                                       ENV_OFFLINE_MODE, MissingConfigError)
from .backend_interface.metrics import Metrics
from .backend_interface.model import Model as BackendModel
from .backend_interface.base import InterfaceBase
@@ -97,6 +98,8 @@ from .utilities.proxy_object import (
from .utilities.resource_monitor import ResourceMonitor
from .utilities.seed import make_deterministic
from .utilities.lowlevel.threads import get_current_thread_id
+from .utilities.lowlevel.distributed import get_torch_local_rank, get_torch_distributed_anchor_task_id, \
+    create_torch_distributed_anchor
from .utilities.process.mp import BackgroundMonitor, leave_process
from .utilities.process.exit_hooks import ExitHooks
from .utilities.matching import matches_any_wildcard
@@ -105,6 +108,7 @@ from .utilities.networking import get_private_ip
# noinspection PyProtectedMember
from .backend_interface.task.args import _Arguments
+
if TYPE_CHECKING:
    import pandas
    import numpy
@@ -527,10 +531,16 @@ class Task(_Task):
        is_deferred = False
        try:
            if not running_remotely():
+               # check torch distributed local rank (sub-processes reuse the rank-0 Task)
+               _local_rank = get_torch_local_rank()
+               if _local_rank is not None and _local_rank > 0:
+                   is_sub_process_task_id = get_torch_distributed_anchor_task_id(timeout=30)
+
                # only allow if running locally and creating the first Task
                # otherwise we ignore and perform in order
                if ENV_DEFERRED_TASK_INIT.get():
                    deferred_init = True
+
                if not is_sub_process_task_id and deferred_init and deferred_init != cls.__nested_deferred_init_flag:
                    def completed_cb(x):
                        Task.__main_task = x
@@ -571,6 +581,11 @@ class Task(_Task):
                        not auto_connect_frameworks.get('detect_repository', True)) else True,
                    auto_connect_streams=auto_connect_streams,
                )
+
+               # check if we are local rank 0 (local master),
+               # create an anchor with the task ID for the other processes
+               if _local_rank == 0:
+                   create_torch_distributed_anchor(task_id=task.id)
        except MissingConfigError as e:
            if not ENV_IGNORE_MISSING_CONFIG.get():
                raise
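
Taken together, the two Task.init changes above implement a simple handshake between the local worker processes; a minimal sketch of the flow, inferred from this diff (names match the imported helpers):

# flow for N local worker processes started by torchrun / accelerate launch:
#   local rank 0  - creates the Task, then publishes its ID via an anchor file
#   local rank >0 - waits (up to 30s) for the anchor file and reuses that Task ID
_local_rank = get_torch_local_rank()
if _local_rank is not None and _local_rank > 0:
    is_sub_process_task_id = get_torch_distributed_anchor_task_id(timeout=30)
# ... the Task is then created (rank 0) or attached to (rank > 0) ...
if _local_rank == 0:
    create_torch_distributed_anchor(task_id=task.id)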

View File

@@ -0,0 +1,96 @@
import os
from logging import getLogger
from time import sleep, time

from pathlib2 import Path


def get_torch_local_rank():
    """
    Return the local rank of this process; notice local rank 0 does not mean global rank 0.
    Return None if torch distributed is not running.
    """
    if os.environ.get("TORCHELASTIC_RUN_ID") is not None:
        # noinspection PyBroadException
        try:
            return int(os.environ.get("LOCAL_RANK"))
        except Exception:
            return None
    return None


def create_torch_distributed_anchor(task_id):
    """
    Create a temporary anchor file holding the Task ID created by local rank 0,
    so that the other local ranks can report to the same Task.
    If the calling process is not local rank 0, this is a no-op.
    Only call when running locally (i.e. without an agent);
    if running remotely there is no need to pass the Task ID, it will be passed externally.
    """
    local_file_name = ".clearml_torch_distributed_id"
    if get_torch_local_rank() != 0:
        return
    torch_dist_path = os.environ.get("TORCHELASTIC_ERROR_FILE")
    if not torch_dist_path:
        return
    # noinspection PyBroadException
    try:
        torch_dist_path = Path(torch_dist_path).parent.parent.parent
        # create the anchor file
        with open(torch_dist_path / local_file_name, "wt") as f:
            f.write(str(task_id) + "\n")
    except Exception:
        # we failed for some reason?
        getLogger().warning("Failed creating torch task ID anchor file: {}".format(torch_dist_path))


def get_torch_distributed_anchor_task_id(timeout=None):
    """
    Wait until the temporary anchor file appears, then read the Task ID created by local rank 0.
    Only call when running locally (i.e. without an agent);
    if running remotely there is no need to pass the Task ID, it will be passed externally.

    :return: Task ID of the local task to report to
    """
    # if we are local rank 0 (or torch distributed is not running), there is nothing to read
    _local_rank = get_torch_local_rank()
    if not _local_rank:
        return
    local_file_name = ".clearml_torch_distributed_id"
    torch_dist_path = os.environ.get("TORCHELASTIC_ERROR_FILE")
    if not torch_dist_path:
        return
    task_id = None
    # noinspection PyBroadException
    try:
        torch_dist_path = Path(torch_dist_path).parent.parent.parent / local_file_name
        tic = time()
        # wait until the anchor file exists
        while not torch_dist_path.is_file():
            # if we found nothing within the timeout, return None
            if timeout is not None and time() - tic > timeout:
                getLogger().warning("Failed detecting rank zero clearml Task ID, creating a new Task")
                return None
            # wait
            sleep(0.1)
        # read the anchor file
        with open(torch_dist_path, "rt") as f:
            task_id = f.read().strip(" \n")
    except Exception:
        # we failed for some reason?
        pass
    getLogger().warning("Torch Distributed Local Rank {} Task ID {} detected".format(_local_rank, task_id))
    return task_id
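
A hypothetical end-to-end exercise of these helpers (the TORCHELASTIC_* values and paths are illustrative; torch elastic normally sets them per worker process):

import os

# illustrative environment, normally provided by torch elastic
os.makedirs("/tmp/elastic/attempt_0/0", exist_ok=True)
os.environ["TORCHELASTIC_RUN_ID"] = "demo"
os.environ["TORCHELASTIC_ERROR_FILE"] = "/tmp/elastic/attempt_0/0/error.json"

os.environ["LOCAL_RANK"] = "0"
create_torch_distributed_anchor(task_id="abc123")        # rank 0 writes the anchor

os.environ["LOCAL_RANK"] = "1"
print(get_torch_distributed_anchor_task_id(timeout=5))   # other ranks read it: abc123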