Black formatting

clearml 2025-03-01 00:56:49 +02:00
parent 7d49bf7c05
commit ff5fe3b27c


@@ -22,17 +22,24 @@ except ImportError:
class ResourceMonitor(BackgroundMonitor):
- _title_machine = ':monitor:machine'
- _title_gpu = ':monitor:gpu'
+ _title_machine = ":monitor:machine"
+ _title_gpu = ":monitor:gpu"
_first_report_sec_default = 30.0
_wait_for_first_iteration_to_start_sec_default = 180.0
_max_wait_for_first_iteration_to_start_sec_default = 1800.0
_resource_monitor_instances = []
_multi_node_single_task = None
- def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
- first_report_sec=None, wait_for_first_iteration_to_start_sec=None,
- max_wait_for_first_iteration_to_start_sec=None, report_mem_used_per_process=True):
+ def __init__(
+ self,
+ task,
+ sample_frequency_per_sec=2.0,
+ report_frequency_sec=30.0,
+ first_report_sec=None,
+ wait_for_first_iteration_to_start_sec=None,
+ max_wait_for_first_iteration_to_start_sec=None,
+ report_mem_used_per_process=True
+ ):
super(ResourceMonitor, self).__init__(task=task, wait_period=sample_frequency_per_sec)
# noinspection PyProtectedMember
ResourceMonitor._resource_monitor_instances.append(self)
@@ -91,14 +98,13 @@ class ResourceMonitor(BackgroundMonitor):
self._debug_mode = False
if not self._gpustat:
- self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
+ self._task.get_logger().report_text("ClearML Monitor: GPU monitoring is not available")
else: # if running_remotely():
# noinspection PyBroadException
try:
- active_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES', '') or \
- os.environ.get('CUDA_VISIBLE_DEVICES', '')
+ active_gpus = os.environ.get("NVIDIA_VISIBLE_DEVICES", "") or os.environ.get("CUDA_VISIBLE_DEVICES", "")
if active_gpus and active_gpus != "all":
- self._active_gpus = [g.strip() for g in active_gpus.split(',')]
+ self._active_gpus = [g.strip() for g in active_gpus.split(",")]
except Exception:
pass
@@ -128,7 +134,7 @@ class ResourceMonitor(BackgroundMonitor):
# noinspection PyBroadException
try:
rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
rank = int(os.environ.get("RANK", os.environ.get("SLURM_PROCID")) or 0)
world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
except Exception:
pass
@@ -151,15 +157,16 @@ class ResourceMonitor(BackgroundMonitor):
pass
# add Task runtime_properties with the machine spec
- if Session.check_min_api_version('2.13'):
+ if Session.check_min_api_version("2.13"):
try:
machine_spec = self._get_machine_specs()
if machine_spec:
# noinspection PyProtectedMember
self._task._set_runtime_properties(runtime_properties=machine_spec)
except Exception as ex:
- logging.getLogger('clearml.resource_monitor').debug(
- 'Failed logging machine specification: {}'.format(ex))
+ logging.getLogger("clearml.resource_monitor").debug(
+ "Failed logging machine specification: {}".format(ex)
+ )
# last_iteration_interval = None
# last_iteration_ts = 0
@@ -183,14 +190,17 @@ class ResourceMonitor(BackgroundMonitor):
if IsTensorboardInit.tensorboard_used():
fallback_to_sec_as_iterations = False
elif seconds_since_started >= self.wait_for_first_iteration:
- self._task.get_logger().report_text('ClearML Monitor: Could not detect iteration reporting, '
- 'falling back to iterations as seconds-from-start')
+ self._task.get_logger().report_text(
+ "ClearML Monitor: Could not detect iteration reporting, "
+ "falling back to iterations as seconds-from-start"
+ )
fallback_to_sec_as_iterations = True
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration:
if self._check_logger_reported():
fallback_to_sec_as_iterations = False
- self._task.get_logger().report_text('ClearML Monitor: Reporting detected, '
- 'reverting back to iteration based reporting')
+ self._task.get_logger().report_text(
+ "ClearML Monitor: Reporting detected, " "reverting back to iteration based reporting"
+ )
clear_readouts = True
# if we do not have last_iteration, we just use seconds as iteration
@@ -228,8 +238,8 @@ class ResourceMonitor(BackgroundMonitor):
# noinspection PyBroadException
try:
# 3 digits after the dot
- value = round(v * 1000) / 1000.
- title = self._title_gpu if k.startswith('gpu_') else self._title_machine
+ value = round(v * 1000) / 1000.0
+ title = self._title_gpu if k.startswith("gpu_") else self._title_machine
series = k
if multi_node_single_task_reporting:
if report_node_as_series:
@@ -237,22 +247,26 @@ class ResourceMonitor(BackgroundMonitor):
# can always check the default cpu/gpu utilization
if rank == 0:
self._task.get_logger().report_scalar(
- title=title, series=series,
- iteration=iteration, value=value)
+ title=title, series=series, iteration=iteration, value=value
+ )
# now let's create an additional report
title = "{}:{}".format(":".join(title.split(":")[:-1]), series)
series = "rank {:0{world_size_digits}d}".format(
- rank, world_size_digits=world_size_digits)
+ rank, world_size_digits=world_size_digits
+ )
elif rank > 0:
title = "{}:rank{:0{world_size_digits}d}".format(
- title, rank, world_size_digits=world_size_digits)
+ title, rank, world_size_digits=world_size_digits
+ )
else:
# for rank 0 we keep the same original report so that external services
# can always check the default cpu/gpu utilization
pass
- self._task.get_logger().report_scalar(title=title, series=series, iteration=iteration, value=value)
+ self._task.get_logger().report_scalar(
+ title=title, series=series, iteration=iteration, value=value
+ )
except Exception:
pass
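Aside: a minimal standalone sketch of the zero-padded per-rank series naming reformatted in the hunk above; the rank and world_size values below are hypothetical stand-ins, while the monitor itself reads them from the RANK/SLURM_PROCID and WORLD_SIZE environment variables.

from math import ceil, log10

# hypothetical values; ResourceMonitor derives them from RANK / SLURM_PROCID and WORLD_SIZE
rank, world_size = 7, 16
world_size_digits = ceil(log10(world_size))  # field width that fits every rank index (2 for 16 ranks)

series = "rank {:0{world_size_digits}d}".format(rank, world_size_digits=world_size_digits)
print(series)  # prints "rank 07"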
@@ -269,7 +283,7 @@ class ResourceMonitor(BackgroundMonitor):
self._previous_readouts_ts = time()
for k, v in readouts.items():
# cumulative measurements
- if k.endswith('_mbs'):
+ if k.endswith("_mbs"):
v = (v - self._previous_readouts.get(k, v)) / elapsed
self._readouts[k] = self._readouts.get(k, 0.0) + v
@@ -296,15 +310,16 @@ class ResourceMonitor(BackgroundMonitor):
"cpu_usage": sum(cpu_usage) / float(len(cpu_usage)),
}
- bytes_per_megabyte = 1024 ** 2
+ bytes_per_megabyte = 1024**2
def bytes_to_megabytes(x):
return x / bytes_per_megabyte
virtual_memory = psutil.virtual_memory()
# stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
stats["memory_used_gb"] = bytes_to_megabytes(
self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
stats["memory_used_gb"] = (
bytes_to_megabytes(self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
)
stats["memory_free_gb"] = bytes_to_megabytes(virtual_memory.available) / 1024
disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
stats["disk_free_percent"] = 100.0 - disk_use_percentage
@@ -312,7 +327,7 @@ class ResourceMonitor(BackgroundMonitor):
if logging.root.level > logging.DEBUG: # If the logging level is bigger than debug, ignore
# psutil.sensors_temperatures warnings
warnings.simplefilter("ignore", category=RuntimeWarning)
- sensor_stat = (psutil.sensors_temperatures() if hasattr(psutil, "sensors_temperatures") else {})
+ sensor_stat = psutil.sensors_temperatures() if hasattr(psutil, "sensors_temperatures") else {}
if "coretemp" in sensor_stat and len(sensor_stat["coretemp"]):
stats["cpu_temperature"] = max([float(t.current) for t in sensor_stat["coretemp"]])
@@ -344,9 +359,10 @@ class ResourceMonitor(BackgroundMonitor):
# something happened and we can't use gpu stats,
self._gpustat_fail += 1
if self._gpustat_fail >= 3:
- msg = 'ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring'
+ msg = "ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring"
if self._debug_mode:
import traceback
msg += "\n" + traceback.format_exc()
self._task.get_logger().report_text(msg)
self._gpustat = None
@@ -410,9 +426,8 @@ class ResourceMonitor(BackgroundMonitor):
# only run the memory usage query once per reporting period
# because this memory query is relatively slow, and changes very little.
- if self._last_process_pool.get('cpu') and \
- (time() - self._last_process_pool['cpu'][0]) < self._report_frequency:
- return self._last_process_pool['cpu'][1]
+ if self._last_process_pool.get("cpu") and (time() - self._last_process_pool["cpu"][0]) < self._report_frequency:
+ return self._last_process_pool["cpu"][1]
# if we have no parent process, return 0 (it's an error)
if not self._process_info:
@@ -420,7 +435,7 @@ class ResourceMonitor(BackgroundMonitor):
self._last_process_id_list = []
mem_size = mem_usage_children(0, self._process_info)
- self._last_process_pool['cpu'] = time(), mem_size
+ self._last_process_pool["cpu"] = time(), mem_size
return mem_size
@@ -448,9 +463,14 @@ class ResourceMonitor(BackgroundMonitor):
# On the rest of the samples we return the previous memory measurement
# update mem used by our process and sub processes
- if self._gpu_memory_per_process and self._process_info and \
- (not self._last_process_pool.get('gpu') or
- (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
+ if (
+ self._gpu_memory_per_process
+ and self._process_info
+ and (
+ not self._last_process_pool.get("gpu")
+ or (time() - self._last_process_pool["gpu"][0]) >= self._report_frequency
+ )
+ ):
gpu_mem = {}
# noinspection PyBroadException
try:
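Aside: the condition reformatted in the hunk above gates the per-process GPU memory query to at most once per report period. A minimal standalone sketch of that time-based gate, with hypothetical stand-ins for the monitor's attributes and without the _gpu_memory_per_process/_process_info guards:

from time import time

report_frequency_sec = 30.0  # stands in for self._report_frequency
last_process_pool = {}       # stands in for self._last_process_pool

def need_gpu_mem_refresh():
    # refresh only when there is no cached sample, or the cached one is older than the report period
    cached = last_process_pool.get("gpu")
    return not cached or (time() - cached[0]) >= report_frequency_sec

if need_gpu_mem_refresh():
    gpu_mem = {0: 512}  # placeholder for the real per-GPU memory readings
    last_process_pool["gpu"] = time(), gpu_mem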
@@ -469,15 +489,15 @@ class ResourceMonitor(BackgroundMonitor):
gpu_mem[i] = 0
for p in g.processes:
- if p is not None and p['pid'] in self._last_process_id_list:
- gpu_mem[i] += p.get('gpu_memory_usage', 0)
+ if p is not None and p["pid"] in self._last_process_id_list:
+ gpu_mem[i] += p.get("gpu_memory_usage", 0)
- self._last_process_pool['gpu'] = time(), gpu_mem
+ self._last_process_pool["gpu"] = time(), gpu_mem
else:
# if we do no need to update the memory usage, run global query
# if we have no parent process (backward compatibility), return global stats
gpu_stat = self._gpustat.new_query(per_process_stats=False)
- gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
+ gpu_mem = self._last_process_pool["gpu"][1] if self._last_process_pool.get("gpu") else None
# generate the statistics dict for actual report
stats = {}
@@ -506,7 +526,7 @@ class ResourceMonitor(BackgroundMonitor):
)
)
self._gpu_utilization_warning_sent = True
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
stats["gpu_%d_mem_usage" % i] = 100.0 * float(g["memory.used"]) / float(g["memory.total"])
# already in MBs
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
# use previously sampled process gpu memory, or global if it does not exist
@@ -520,15 +540,15 @@ class ResourceMonitor(BackgroundMonitor):
# noinspection PyBroadException
try:
specs = {
- 'platform': str(sys.platform),
- 'python_version': str(platform.python_version()),
- 'python_exec': str(sys.executable),
- 'OS': str(platform.platform(aliased=True)),
- 'processor': str(platform.machine()),
- 'cpu_cores': int(psutil.cpu_count()),
- 'memory_gb': round(psutil.virtual_memory().total / 1024 ** 3, 1),
- 'hostname': str(platform.node()),
- 'gpu_count': 0,
+ "platform": str(sys.platform),
+ "python_version": str(platform.python_version()),
+ "python_exec": str(sys.executable),
+ "OS": str(platform.platform(aliased=True)),
+ "processor": str(platform.machine()),
+ "cpu_cores": int(psutil.cpu_count()),
+ "memory_gb": round(psutil.virtual_memory().total / 1024**3, 1),
+ "hostname": str(platform.node()),
+ "gpu_count": 0,
}
if self._gpustat:
gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
@@ -536,10 +556,10 @@ class ResourceMonitor(BackgroundMonitor):
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
specs.update(
gpu_count=int(len(gpus)),
- gpu_type=', '.join(g.name for g in gpus),
- gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
- gpu_driver_version=gpu_stat.driver_version or '',
- gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
+ gpu_type=", ".join(g.name for g in gpus),
+ gpu_memory=", ".join("{}GB".format(round(g.memory_total / 1024.0)) for g in gpus),
+ gpu_driver_version=gpu_stat.driver_version or "",
+ gpu_driver_cuda_version=gpu_stat.driver_cuda_version or "",
)
except Exception: