diff --git a/clearml/config/defs.py b/clearml/config/defs.py index 0cd78b23..7e632fb1 100644 --- a/clearml/config/defs.py +++ b/clearml/config/defs.py @@ -28,6 +28,9 @@ SUPPRESS_UPDATE_MESSAGE_ENV_VAR = EnvEntry("CLEARML_SUPPRESS_UPDATE_MESSAGE", "T MAX_SERIES_PER_METRIC = EnvEntry("CLEARML_MAX_SERIES_PER_METRIC", default=100, type=int) +# values are 0/None (task per node), 1/2 (multi-node reporting, colored console), -1 (only report rank 0 node) +ENV_MULTI_NODE_SINGLE_TASK = EnvEntry("CLEARML_MULTI_NODE_SINGLE_TASK", type=int, default=None) + JUPYTER_PASSWORD = EnvEntry("CLEARML_JUPYTER_PASSWORD") # Repository detection diff --git a/clearml/utilities/resource_monitor.py b/clearml/utilities/resource_monitor.py index 41465d6a..e31227cc 100644 --- a/clearml/utilities/resource_monitor.py +++ b/clearml/utilities/resource_monitor.py @@ -3,6 +3,7 @@ import os import platform import sys import warnings +from math import ceil, log10 from time import time import psutil @@ -12,7 +13,7 @@ from typing import Text from .process.mp import BackgroundMonitor from ..backend_api import Session from ..binding.frameworks.tensorflow_bind import IsTensorboardInit -from ..config import config +from ..config import config, ENV_MULTI_NODE_SINGLE_TASK try: from .gpu import gpustat @@ -103,6 +104,31 @@ class ResourceMonitor(BackgroundMonitor): if self._is_thread_mode_and_not_main_process(): return + multi_node_single_task_reporting = False + report_node_as_series = False + rank = 0 + world_size_digits = 0 + # check if we are in multi-node reporting to the same Task + if ENV_MULTI_NODE_SINGLE_TASK.get(): + # if resource monitoring is disabled, do nothing + if ENV_MULTI_NODE_SINGLE_TASK.get() < 0: + return + # we are reporting machines stats on a different machine over the same Task + multi_node_single_task_reporting = True + if ENV_MULTI_NODE_SINGLE_TASK.get() == 1: + # report per machine graph (unique title) + report_node_as_series = False + elif ENV_MULTI_NODE_SINGLE_TASK.get() == 2: + # report per machine series (i.e. merge title+series resource and have "node X" as different series) + report_node_as_series = True + + # noinspection PyBroadException + try: + rank = int(os.environ.get("RANK") or 0) + world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0))) + except Exception: + pass + seconds_since_started = 0 reported = 0 last_iteration = 0 @@ -196,9 +222,19 @@ class ResourceMonitor(BackgroundMonitor): # noinspection PyBroadException try: title = self._title_gpu if k.startswith('gpu_') else self._title_machine + series = k # 3 points after the dot + if multi_node_single_task_reporting: + if report_node_as_series: + title = "{}:{}".format(":".join(title.split(":")[:-1]), series) + series = "rank {:0{world_size_digits}d}".format( + rank, world_size_digits=world_size_digits) + else: + title = "{}:rank{:0{world_size_digits}d}".format( + title, rank, world_size_digits=world_size_digits) + value = round(v * 1000) / 1000. - self._task.get_logger().report_scalar(title=title, series=k, iteration=iteration, value=value) + self._task.get_logger().report_scalar(title=title, series=series, iteration=iteration, value=value) except Exception: pass # clear readouts if this is update is not averaged