mirror of
https://github.com/clearml/clearml
synced 2025-02-07 05:18:50 +00:00
Fix CLEARML_MULTI_NODE_SINGLE_TASK resource monitoring
This commit is contained in:
parent
e27d277e40
commit
253aee3b0e
@ -109,6 +109,8 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
rank = 0
|
||||
world_size_digits = 0
|
||||
# check if we are in multi-node reporting to the same Task
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if ENV_MULTI_NODE_SINGLE_TASK.get():
|
||||
# if resource monitoring is disabled, do nothing
|
||||
if ENV_MULTI_NODE_SINGLE_TASK.get() < 0:
|
||||
@ -124,10 +126,12 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
rank = int(os.environ.get("RANK") or 0)
|
||||
rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
|
||||
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
seconds_since_started = 0
|
||||
reported = 0
|
||||
@ -342,6 +346,26 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
def get_logger_reported_titles(cls, task):
|
||||
# noinspection PyProtectedMember
|
||||
titles = list(task.get_logger()._get_used_title_series().keys())
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
multi_node = ENV_MULTI_NODE_SINGLE_TASK.get() is not None
|
||||
except Exception:
|
||||
multi_node = False
|
||||
|
||||
if multi_node:
|
||||
title_machine = ":".join(cls._title_machine.split(":")[:-1])
|
||||
title_gpu = ":".join(cls._title_gpu.split(":")[:-1])
|
||||
if not title_machine:
|
||||
title_machine = cls._title_machine
|
||||
if not title_gpu:
|
||||
title_gpu = cls._title_gpu
|
||||
|
||||
try:
|
||||
titles = [t for t in titles if not t.startswith(title_machine) and not t.startswith(title_gpu)]
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
titles.remove(cls._title_machine)
|
||||
except ValueError:
|
||||
@ -350,6 +374,7 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
titles.remove(cls._title_gpu)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return titles
|
||||
|
||||
def _get_process_used_memory(self):
|
||||
|
Loading…
Reference in New Issue
Block a user