Fix CLEARML_MULTI_NODE_SINGLE_TASK resource monitoring

This commit is contained in:
allegroai 2024-07-07 13:38:12 +03:00
parent e27d277e40
commit 253aee3b0e

View File

@@ -109,25 +109,29 @@ class ResourceMonitor(BackgroundMonitor):
rank = 0 rank = 0
world_size_digits = 0 world_size_digits = 0
# check if we are in multi-node reporting to the same Task # check if we are in multi-node reporting to the same Task
if ENV_MULTI_NODE_SINGLE_TASK.get(): # noinspection PyBroadException
# if resource monitoring is disabled, do nothing try:
if ENV_MULTI_NODE_SINGLE_TASK.get() < 0: if ENV_MULTI_NODE_SINGLE_TASK.get():
return # if resource monitoring is disabled, do nothing
# we are reporting machines stats on a different machine over the same Task if ENV_MULTI_NODE_SINGLE_TASK.get() < 0:
multi_node_single_task_reporting = True return
if ENV_MULTI_NODE_SINGLE_TASK.get() == 1: # we are reporting machines stats on a different machine over the same Task
# report per machine graph (unique title) multi_node_single_task_reporting = True
report_node_as_series = False if ENV_MULTI_NODE_SINGLE_TASK.get() == 1:
elif ENV_MULTI_NODE_SINGLE_TASK.get() == 2: # report per machine graph (unique title)
# report per machine series (i.e. merge title+series resource and have "node X" as different series) report_node_as_series = False
report_node_as_series = True elif ENV_MULTI_NODE_SINGLE_TASK.get() == 2:
# report per machine series (i.e. merge title+series resource and have "node X" as different series)
report_node_as_series = True
# noinspection PyBroadException # noinspection PyBroadException
try: try:
rank = int(os.environ.get("RANK") or 0) rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0))) world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
except Exception: except Exception:
pass pass
except Exception:
pass
seconds_since_started = 0 seconds_since_started = 0
reported = 0 reported = 0
@@ -342,14 +346,35 @@ class ResourceMonitor(BackgroundMonitor):
def get_logger_reported_titles(cls, task): def get_logger_reported_titles(cls, task):
# noinspection PyProtectedMember # noinspection PyProtectedMember
titles = list(task.get_logger()._get_used_title_series().keys()) titles = list(task.get_logger()._get_used_title_series().keys())
# noinspection PyBroadException
try: try:
titles.remove(cls._title_machine) multi_node = ENV_MULTI_NODE_SINGLE_TASK.get() is not None
except ValueError: except Exception:
pass multi_node = False
try:
titles.remove(cls._title_gpu) if multi_node:
except ValueError: title_machine = ":".join(cls._title_machine.split(":")[:-1])
pass title_gpu = ":".join(cls._title_gpu.split(":")[:-1])
if not title_machine:
title_machine = cls._title_machine
if not title_gpu:
title_gpu = cls._title_gpu
try:
titles = [t for t in titles if not t.startswith(title_machine) and not t.startswith(title_gpu)]
except ValueError:
pass
else:
try:
titles.remove(cls._title_machine)
except ValueError:
pass
try:
titles.remove(cls._title_gpu)
except ValueError:
pass
return titles return titles
def _get_process_used_memory(self): def _get_process_used_memory(self):