mirror of
https://github.com/clearml/clearml
synced 2025-06-26 18:16:07 +00:00
Add CLEARML_MULTI_NODE_SINGLE_TASK (values -1, 0, 1, 2) for easier multi-node singe Task workloads
This commit is contained in:
parent
7dc601598b
commit
9594e5dddd
@ -28,6 +28,9 @@ SUPPRESS_UPDATE_MESSAGE_ENV_VAR = EnvEntry("CLEARML_SUPPRESS_UPDATE_MESSAGE", "T
|
||||
|
||||
MAX_SERIES_PER_METRIC = EnvEntry("CLEARML_MAX_SERIES_PER_METRIC", default=100, type=int)
|
||||
|
||||
# values are 0/None (task per node), 1/2 (multi-node reporting, colored console), -1 (only report rank 0 node)
|
||||
ENV_MULTI_NODE_SINGLE_TASK = EnvEntry("CLEARML_MULTI_NODE_SINGLE_TASK", type=int, default=None)
|
||||
|
||||
JUPYTER_PASSWORD = EnvEntry("CLEARML_JUPYTER_PASSWORD")
|
||||
|
||||
# Repository detection
|
||||
|
@ -3,6 +3,7 @@ import os
|
||||
import platform
|
||||
import sys
|
||||
import warnings
|
||||
from math import ceil, log10
|
||||
from time import time
|
||||
|
||||
import psutil
|
||||
@ -12,7 +13,7 @@ from typing import Text
|
||||
from .process.mp import BackgroundMonitor
|
||||
from ..backend_api import Session
|
||||
from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
|
||||
from ..config import config
|
||||
from ..config import config, ENV_MULTI_NODE_SINGLE_TASK
|
||||
|
||||
try:
|
||||
from .gpu import gpustat
|
||||
@ -103,6 +104,31 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
if self._is_thread_mode_and_not_main_process():
|
||||
return
|
||||
|
||||
multi_node_single_task_reporting = False
|
||||
report_node_as_series = False
|
||||
rank = 0
|
||||
world_size_digits = 0
|
||||
# check if we are in multi-node reporting to the same Task
|
||||
if ENV_MULTI_NODE_SINGLE_TASK.get():
|
||||
# if resource monitoring is disabled, do nothing
|
||||
if ENV_MULTI_NODE_SINGLE_TASK.get() < 0:
|
||||
return
|
||||
# we are reporting machines stats on a different machine over the same Task
|
||||
multi_node_single_task_reporting = True
|
||||
if ENV_MULTI_NODE_SINGLE_TASK.get() == 1:
|
||||
# report per machine graph (unique title)
|
||||
report_node_as_series = False
|
||||
elif ENV_MULTI_NODE_SINGLE_TASK.get() == 2:
|
||||
# report per machine series (i.e. merge title+series resource and have "node X" as different series)
|
||||
report_node_as_series = True
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
rank = int(os.environ.get("RANK") or 0)
|
||||
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
seconds_since_started = 0
|
||||
reported = 0
|
||||
last_iteration = 0
|
||||
@ -196,9 +222,19 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
title = self._title_gpu if k.startswith('gpu_') else self._title_machine
|
||||
series = k
|
||||
# 3 points after the dot
|
||||
if multi_node_single_task_reporting:
|
||||
if report_node_as_series:
|
||||
title = "{}:{}".format(":".join(title.split(":")[:-1]), series)
|
||||
series = "rank {:0{world_size_digits}d}".format(
|
||||
rank, world_size_digits=world_size_digits)
|
||||
else:
|
||||
title = "{}:rank{:0{world_size_digits}d}".format(
|
||||
title, rank, world_size_digits=world_size_digits)
|
||||
|
||||
value = round(v * 1000) / 1000.
|
||||
self._task.get_logger().report_scalar(title=title, series=k, iteration=iteration, value=value)
|
||||
self._task.get_logger().report_scalar(title=title, series=series, iteration=iteration, value=value)
|
||||
except Exception:
|
||||
pass
|
||||
# clear readouts if this is update is not averaged
|
||||
|
Loading…
Reference in New Issue
Block a user