From 14351e9f932358c2403fcfa139f71c742f6dd1e8 Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Thu, 25 May 2023 17:34:55 +0300 Subject: [PATCH] Add support for multi-node experiments --- clearml/task.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/clearml/task.py b/clearml/task.py index 8a356136..5e72e3e5 100644 --- a/clearml/task.py +++ b/clearml/task.py @@ -1692,6 +1692,15 @@ class Task(_Task): dist.init_process_group('gloo') run(config.get('node_rank'), config.get('total_num_nodes')) + When using the ClearML cloud autoscaler apps, one needs to make sure the nodes can reach eachother. + The machines need to be in the same security group, the `MASTER_PORT` needs to be exposed and the + `MASTER_ADDR` needs to be the right private ip of the instance the master is running on. + For example, to achieve this, one can set the following Docker arguments in the `Additional ClearML Configuration` section: + + .. code-block:: py + + agent.extra_docker_arguments=["--ipc=host", "--network=host", "-p", "29500:29500", "--env", "CLEARML_MULTI_NODE_MASTER_DEF_ADDR=`hostname -I | awk '{print $1}'`"]` + :param total_num_nodes: The total number of nodes to be enqueued, including the master node, which should already be enqueued when running remotely :param port: Port opened by the master node. If the environment variable `CLEARML_MULTI_NODE_MASTER_DEF_PORT` @@ -1702,8 +1711,13 @@ class Task(_Task): :param queue: The queue to enqueue the nodes to. Can be different than the queue the master node is enqueued to. If None, the nodes will be enqueued to the same queue as the master node :param wait: If True, the master node will wait for the other nodes to start - :param addr: The address of the master node's worker. If not set, it defaults to the private IP - of the machine the master is running on + :param addr: The address of the master node's worker. If the environment variable + `CLEARML_MULTI_NODE_MASTER_DEF_ADDR` is set, the value of this parameter will be set to + the one defined in `CLEARML_MULTI_NODE_MASTER_DEF_ADDR`. + If `CLEARML_MULTI_NODE_MASTER_DEF_ADDR` doesn't exist, but `MASTER_ADDR` does, then the value of this + parameter will be set to the one defined in `MASTER_ADDR`. If neither environment variables exist, + the value passed to the parameter will be used. If this value is None (default), the private IP of + the machine the master node is running on will be used. :return: A dictionary containing relevant information regarding the multi node run. This dictionary has the following entries: @@ -1724,10 +1738,14 @@ class Task(_Task): raise UsageError("Master task is not enqueued to any queue and the queue parameter is None") master_conf = { - "master_addr": get_private_ip(), - "master_port": int(os.environ.get("CLEARML_MULTI_NODE_MASTER_DEF_PORT", os.environ.get("MASTER_PORT", port))), + "master_addr": os.environ.get( + "CLEARML_MULTI_NODE_MASTER_DEF_ADDR", os.environ.get("MASTER_ADDR", addr or get_private_ip()) + ), + "master_port": int( + os.environ.get("CLEARML_MULTI_NODE_MASTER_DEF_PORT", os.environ.get("MASTER_PORT", port)) + ), "node_rank": 0, - "wait": wait + "wait": wait, } editable_conf = {"total_num_nodes": total_num_nodes, "queue": queue} editable_conf = self.connect(editable_conf, name=self._launch_multi_node_section) @@ -4650,14 +4668,3 @@ class Task(_Task): auto_connect_frameworks={'detect_repository': False}) \ if state['main'] else Task.get_task(task_id=state['id']) self.__dict__ = task.__dict__ - - def __getattr__(self, name): - try: - self.__getattribute__(name) - except AttributeError as e: - if self.__class__ is Task: - getLogger().warning( - "'clearml.Task' object has no attribute '{}'. Did you mean to import 'Task' from 'allegroai'?".format(name) - ) - raise e -