mirror of
https://github.com/clearml/clearml
synced 2025-02-11 23:33:21 +00:00
Fix CLEARML_MULTI_NODE_SINGLE_TASK resource monitoring
This commit is contained in:
parent
e27d277e40
commit
253aee3b0e
@ -109,25 +109,29 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
rank = 0
|
rank = 0
|
||||||
world_size_digits = 0
|
world_size_digits = 0
|
||||||
# check if we are in multi-node reporting to the same Task
|
# check if we are in multi-node reporting to the same Task
|
||||||
if ENV_MULTI_NODE_SINGLE_TASK.get():
|
# noinspection PyBroadException
|
||||||
# if resource monitoring is disabled, do nothing
|
try:
|
||||||
if ENV_MULTI_NODE_SINGLE_TASK.get() < 0:
|
if ENV_MULTI_NODE_SINGLE_TASK.get():
|
||||||
return
|
# if resource monitoring is disabled, do nothing
|
||||||
# we are reporting machines stats on a different machine over the same Task
|
if ENV_MULTI_NODE_SINGLE_TASK.get() < 0:
|
||||||
multi_node_single_task_reporting = True
|
return
|
||||||
if ENV_MULTI_NODE_SINGLE_TASK.get() == 1:
|
# we are reporting machines stats on a different machine over the same Task
|
||||||
# report per machine graph (unique title)
|
multi_node_single_task_reporting = True
|
||||||
report_node_as_series = False
|
if ENV_MULTI_NODE_SINGLE_TASK.get() == 1:
|
||||||
elif ENV_MULTI_NODE_SINGLE_TASK.get() == 2:
|
# report per machine graph (unique title)
|
||||||
# report per machine series (i.e. merge title+series resource and have "node X" as different series)
|
report_node_as_series = False
|
||||||
report_node_as_series = True
|
elif ENV_MULTI_NODE_SINGLE_TASK.get() == 2:
|
||||||
|
# report per machine series (i.e. merge title+series resource and have "node X" as different series)
|
||||||
|
report_node_as_series = True
|
||||||
|
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
rank = int(os.environ.get("RANK") or 0)
|
rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
|
||||||
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
|
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
seconds_since_started = 0
|
seconds_since_started = 0
|
||||||
reported = 0
|
reported = 0
|
||||||
@ -342,14 +346,35 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
def get_logger_reported_titles(cls, task):
|
def get_logger_reported_titles(cls, task):
|
||||||
# noinspection PyProtectedMember
|
# noinspection PyProtectedMember
|
||||||
titles = list(task.get_logger()._get_used_title_series().keys())
|
titles = list(task.get_logger()._get_used_title_series().keys())
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
titles.remove(cls._title_machine)
|
multi_node = ENV_MULTI_NODE_SINGLE_TASK.get() is not None
|
||||||
except ValueError:
|
except Exception:
|
||||||
pass
|
multi_node = False
|
||||||
try:
|
|
||||||
titles.remove(cls._title_gpu)
|
if multi_node:
|
||||||
except ValueError:
|
title_machine = ":".join(cls._title_machine.split(":")[:-1])
|
||||||
pass
|
title_gpu = ":".join(cls._title_gpu.split(":")[:-1])
|
||||||
|
if not title_machine:
|
||||||
|
title_machine = cls._title_machine
|
||||||
|
if not title_gpu:
|
||||||
|
title_gpu = cls._title_gpu
|
||||||
|
|
||||||
|
try:
|
||||||
|
titles = [t for t in titles if not t.startswith(title_machine) and not t.startswith(title_gpu)]
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
titles.remove(cls._title_machine)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
titles.remove(cls._title_gpu)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
return titles
|
return titles
|
||||||
|
|
||||||
def _get_process_used_memory(self):
|
def _get_process_used_memory(self):
|
||||||
|
Loading…
Reference in New Issue
Block a user