Fix CLEARML_MULTI_NODE_SINGLE_TASK resource monitoring

This commit is contained in:
allegroai 2024-07-07 13:38:12 +03:00
parent e27d277e40
commit 253aee3b0e

View File

@ -109,6 +109,8 @@ class ResourceMonitor(BackgroundMonitor):
rank = 0
world_size_digits = 0
# check if we are in multi-node reporting to the same Task
# noinspection PyBroadException
try:
if ENV_MULTI_NODE_SINGLE_TASK.get():
# if resource monitoring is disabled, do nothing
if ENV_MULTI_NODE_SINGLE_TASK.get() < 0:
@ -124,10 +126,12 @@ class ResourceMonitor(BackgroundMonitor):
# noinspection PyBroadException
try:
rank = int(os.environ.get("RANK") or 0)
rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
except Exception:
pass
except Exception:
pass
seconds_since_started = 0
reported = 0
@ -342,6 +346,26 @@ class ResourceMonitor(BackgroundMonitor):
def get_logger_reported_titles(cls, task):
# noinspection PyProtectedMember
titles = list(task.get_logger()._get_used_title_series().keys())
# noinspection PyBroadException
try:
multi_node = ENV_MULTI_NODE_SINGLE_TASK.get() is not None
except Exception:
multi_node = False
if multi_node:
title_machine = ":".join(cls._title_machine.split(":")[:-1])
title_gpu = ":".join(cls._title_gpu.split(":")[:-1])
if not title_machine:
title_machine = cls._title_machine
if not title_gpu:
title_gpu = cls._title_gpu
try:
titles = [t for t in titles if not t.startswith(title_machine) and not t.startswith(title_gpu)]
except ValueError:
pass
else:
try:
titles.remove(cls._title_machine)
except ValueError:
@ -350,6 +374,7 @@ class ResourceMonitor(BackgroundMonitor):
titles.remove(cls._title_gpu)
except ValueError:
pass
return titles
def _get_process_used_memory(self):