From ff5fe3b27c75515b14303a3500fdee1f616d5b82 Mon Sep 17 00:00:00 2001
From: clearml <>
Date: Sat, 1 Mar 2025 00:56:49 +0200
Subject: [PATCH] Black formatting

---
 clearml/utilities/resource_monitor.py | 130 +++++++++++++++-----------
 1 file changed, 75 insertions(+), 55 deletions(-)

diff --git a/clearml/utilities/resource_monitor.py b/clearml/utilities/resource_monitor.py
index c11c0750..79bd5a59 100644
--- a/clearml/utilities/resource_monitor.py
+++ b/clearml/utilities/resource_monitor.py
@@ -22,17 +22,24 @@ except ImportError:
 
 
 class ResourceMonitor(BackgroundMonitor):
-    _title_machine = ':monitor:machine'
-    _title_gpu = ':monitor:gpu'
+    _title_machine = ":monitor:machine"
+    _title_gpu = ":monitor:gpu"
     _first_report_sec_default = 30.0
     _wait_for_first_iteration_to_start_sec_default = 180.0
     _max_wait_for_first_iteration_to_start_sec_default = 1800.0
     _resource_monitor_instances = []
     _multi_node_single_task = None
 
-    def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
-                 first_report_sec=None, wait_for_first_iteration_to_start_sec=None,
-                 max_wait_for_first_iteration_to_start_sec=None, report_mem_used_per_process=True):
+    def __init__(
+        self,
+        task,
+        sample_frequency_per_sec=2.0,
+        report_frequency_sec=30.0,
+        first_report_sec=None,
+        wait_for_first_iteration_to_start_sec=None,
+        max_wait_for_first_iteration_to_start_sec=None,
+        report_mem_used_per_process=True
+    ):
         super(ResourceMonitor, self).__init__(task=task, wait_period=sample_frequency_per_sec)
         # noinspection PyProtectedMember
         ResourceMonitor._resource_monitor_instances.append(self)
@@ -91,14 +98,13 @@ class ResourceMonitor(BackgroundMonitor):
         self._debug_mode = False
 
         if not self._gpustat:
-            self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
+            self._task.get_logger().report_text("ClearML Monitor: GPU monitoring is not available")
         else:  # if running_remotely():
             # noinspection PyBroadException
             try:
-                active_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES', '') or \
-                    os.environ.get('CUDA_VISIBLE_DEVICES', '')
+                active_gpus = os.environ.get("NVIDIA_VISIBLE_DEVICES", "") or os.environ.get("CUDA_VISIBLE_DEVICES", "")
                 if active_gpus and active_gpus != "all":
-                    self._active_gpus = [g.strip() for g in active_gpus.split(',')]
+                    self._active_gpus = [g.strip() for g in active_gpus.split(",")]
             except Exception:
                 pass
 
@@ -128,7 +134,7 @@ class ResourceMonitor(BackgroundMonitor):
 
         # noinspection PyBroadException
         try:
-            rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
+            rank = int(os.environ.get("RANK", os.environ.get("SLURM_PROCID")) or 0)
             world_size_digits = ceil(log10(int(os.environ.get("WORLD_SIZE") or 0)))
         except Exception:
             pass
@@ -151,15 +157,16 @@ class ResourceMonitor(BackgroundMonitor):
                 pass
 
         # add Task runtime_properties with the machine spec
-        if Session.check_min_api_version('2.13'):
+        if Session.check_min_api_version("2.13"):
             try:
                 machine_spec = self._get_machine_specs()
                 if machine_spec:
                     # noinspection PyProtectedMember
                     self._task._set_runtime_properties(runtime_properties=machine_spec)
             except Exception as ex:
-                logging.getLogger('clearml.resource_monitor').debug(
-                    'Failed logging machine specification: {}'.format(ex))
+                logging.getLogger("clearml.resource_monitor").debug(
+                    "Failed logging machine specification: {}".format(ex)
+                )
 
         # last_iteration_interval = None
         # last_iteration_ts = 0
@@ -183,14 +190,17 @@ class ResourceMonitor(BackgroundMonitor):
                 if IsTensorboardInit.tensorboard_used():
                     fallback_to_sec_as_iterations = False
                 elif seconds_since_started >= self.wait_for_first_iteration:
-                    self._task.get_logger().report_text('ClearML Monitor: Could not detect iteration reporting, '
-                                                        'falling back to iterations as seconds-from-start')
+                    self._task.get_logger().report_text(
+                        "ClearML Monitor: Could not detect iteration reporting, "
+                        "falling back to iterations as seconds-from-start"
+                    )
                     fallback_to_sec_as_iterations = True
             elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration:
                 if self._check_logger_reported():
                     fallback_to_sec_as_iterations = False
-                    self._task.get_logger().report_text('ClearML Monitor: Reporting detected, '
-                                                        'reverting back to iteration based reporting')
+                    self._task.get_logger().report_text(
+                        "ClearML Monitor: Reporting detected, " "reverting back to iteration based reporting"
+                    )
                     clear_readouts = True
 
             # if we do not have last_iteration, we just use seconds as iteration
@@ -228,8 +238,8 @@ class ResourceMonitor(BackgroundMonitor):
             # noinspection PyBroadException
             try:
                 # 3 digits after the dot
-                value = round(v * 1000) / 1000.
-                title = self._title_gpu if k.startswith('gpu_') else self._title_machine
+                value = round(v * 1000) / 1000.0
+                title = self._title_gpu if k.startswith("gpu_") else self._title_machine
                 series = k
                 if multi_node_single_task_reporting:
                     if report_node_as_series:
@@ -237,22 +247,26 @@ class ResourceMonitor(BackgroundMonitor):
                         # can always check the default cpu/gpu utilization
                         if rank == 0:
                             self._task.get_logger().report_scalar(
-                                title=title, series=series,
-                                iteration=iteration, value=value)
+                                title=title, series=series, iteration=iteration, value=value
+                            )
 
                             # now let's create an additional report
                             title = "{}:{}".format(":".join(title.split(":")[:-1]), series)
                             series = "rank {:0{world_size_digits}d}".format(
-                                rank, world_size_digits=world_size_digits)
+                                rank, world_size_digits=world_size_digits
+                            )
                     elif rank > 0:
                         title = "{}:rank{:0{world_size_digits}d}".format(
-                            title, rank, world_size_digits=world_size_digits)
+                            title, rank, world_size_digits=world_size_digits
+                        )
                     else:
                         # for rank 0 we keep the same original report so that external services
                         # can always check the default cpu/gpu utilization
                         pass
 
-                self._task.get_logger().report_scalar(title=title, series=series, iteration=iteration, value=value)
+                self._task.get_logger().report_scalar(
+                    title=title, series=series, iteration=iteration, value=value
+                )
             except Exception:
                 pass
 
@@ -269,7 +283,7 @@ class ResourceMonitor(BackgroundMonitor):
         self._previous_readouts_ts = time()
         for k, v in readouts.items():
             # cumulative measurements
-            if k.endswith('_mbs'):
+            if k.endswith("_mbs"):
                 v = (v - self._previous_readouts.get(k, v)) / elapsed
 
             self._readouts[k] = self._readouts.get(k, 0.0) + v
@@ -296,15 +310,16 @@ class ResourceMonitor(BackgroundMonitor):
             "cpu_usage": sum(cpu_usage) / float(len(cpu_usage)),
         }
 
-        bytes_per_megabyte = 1024 ** 2
+        bytes_per_megabyte = 1024**2
 
         def bytes_to_megabytes(x):
            return x / bytes_per_megabyte
 
         virtual_memory = psutil.virtual_memory()
         # stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
-        stats["memory_used_gb"] = bytes_to_megabytes(
-            self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
+        stats["memory_used_gb"] = (
+            bytes_to_megabytes(self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
+        )
         stats["memory_free_gb"] = bytes_to_megabytes(virtual_memory.available) / 1024
         disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
         stats["disk_free_percent"] = 100.0 - disk_use_percentage
@@ -312,7 +327,7 @@ class ResourceMonitor(BackgroundMonitor):
             if logging.root.level > logging.DEBUG:  # If the logging level is bigger than debug, ignore
                 # psutil.sensors_temperatures warnings
                 warnings.simplefilter("ignore", category=RuntimeWarning)
-            sensor_stat = (psutil.sensors_temperatures() if hasattr(psutil, "sensors_temperatures") else {})
+            sensor_stat = psutil.sensors_temperatures() if hasattr(psutil, "sensors_temperatures") else {}
             if "coretemp" in sensor_stat and len(sensor_stat["coretemp"]):
                 stats["cpu_temperature"] = max([float(t.current) for t in sensor_stat["coretemp"]])
 
@@ -344,9 +359,10 @@ class ResourceMonitor(BackgroundMonitor):
             # something happened and we can't use gpu stats,
             self._gpustat_fail += 1
             if self._gpustat_fail >= 3:
-                msg = 'ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring'
+                msg = "ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring"
                 if self._debug_mode:
                     import traceback
+
                     msg += "\n" + traceback.format_exc()
                 self._task.get_logger().report_text(msg)
                 self._gpustat = None
@@ -410,9 +426,8 @@ class ResourceMonitor(BackgroundMonitor):
 
         # only run the memory usage query once per reporting period
        # because this memory query is relatively slow, and changes very little.
-        if self._last_process_pool.get('cpu') and \
-                (time() - self._last_process_pool['cpu'][0]) < self._report_frequency:
-            return self._last_process_pool['cpu'][1]
+        if self._last_process_pool.get("cpu") and (time() - self._last_process_pool["cpu"][0]) < self._report_frequency:
+            return self._last_process_pool["cpu"][1]
 
         # if we have no parent process, return 0 (it's an error)
         if not self._process_info:
@@ -420,7 +435,7 @@ class ResourceMonitor(BackgroundMonitor):
 
         self._last_process_id_list = []
         mem_size = mem_usage_children(0, self._process_info)
-        self._last_process_pool['cpu'] = time(), mem_size
+        self._last_process_pool["cpu"] = time(), mem_size
 
         return mem_size
 
@@ -448,9 +463,14 @@ class ResourceMonitor(BackgroundMonitor):
 
         # On the rest of the samples we return the previous memory measurement
         # update mem used by our process and sub processes
-        if self._gpu_memory_per_process and self._process_info and \
-                (not self._last_process_pool.get('gpu') or
-                 (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
+        if (
+            self._gpu_memory_per_process
+            and self._process_info
+            and (
+                not self._last_process_pool.get("gpu")
+                or (time() - self._last_process_pool["gpu"][0]) >= self._report_frequency
+            )
+        ):
             gpu_mem = {}
             # noinspection PyBroadException
             try:
@@ -469,15 +489,15 @@ class ResourceMonitor(BackgroundMonitor):
 
                 gpu_mem[i] = 0
                 for p in g.processes:
-                    if p is not None and p['pid'] in self._last_process_id_list:
-                        gpu_mem[i] += p.get('gpu_memory_usage', 0)
+                    if p is not None and p["pid"] in self._last_process_id_list:
+                        gpu_mem[i] += p.get("gpu_memory_usage", 0)
 
-            self._last_process_pool['gpu'] = time(), gpu_mem
+            self._last_process_pool["gpu"] = time(), gpu_mem
         else:
             # if we do no need to update the memory usage, run global query
             # if we have no parent process (backward compatibility), return global stats
             gpu_stat = self._gpustat.new_query(per_process_stats=False)
-            gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
+            gpu_mem = self._last_process_pool["gpu"][1] if self._last_process_pool.get("gpu") else None
 
         # generate the statistics dict for actual report
         stats = {}
@@ -506,7 +526,7 @@ class ResourceMonitor(BackgroundMonitor):
                         )
                     )
                     self._gpu_utilization_warning_sent = True
-            stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
+            stats["gpu_%d_mem_usage" % i] = 100.0 * float(g["memory.used"]) / float(g["memory.total"])
             # already in MBs
             stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
             # use previously sampled process gpu memory, or global if it does not exist
@@ -520,15 +540,15 @@ class ResourceMonitor(BackgroundMonitor):
         # noinspection PyBroadException
         try:
             specs = {
-                'platform': str(sys.platform),
-                'python_version': str(platform.python_version()),
-                'python_exec': str(sys.executable),
-                'OS': str(platform.platform(aliased=True)),
-                'processor': str(platform.machine()),
-                'cpu_cores': int(psutil.cpu_count()),
-                'memory_gb': round(psutil.virtual_memory().total / 1024 ** 3, 1),
-                'hostname': str(platform.node()),
-                'gpu_count': 0,
+                "platform": str(sys.platform),
+                "python_version": str(platform.python_version()),
+                "python_exec": str(sys.executable),
+                "OS": str(platform.platform(aliased=True)),
+                "processor": str(platform.machine()),
+                "cpu_cores": int(psutil.cpu_count()),
+                "memory_gb": round(psutil.virtual_memory().total / 1024**3, 1),
+                "hostname": str(platform.node()),
+                "gpu_count": 0,
             }
             if self._gpustat:
                 gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
@@ -536,10 +556,10 @@ class ResourceMonitor(BackgroundMonitor):
                     gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
                     specs.update(
                         gpu_count=int(len(gpus)),
-                        gpu_type=', '.join(g.name for g in gpus),
-                        gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
-                        gpu_driver_version=gpu_stat.driver_version or '',
-                        gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
+                        gpu_type=", ".join(g.name for g in gpus),
+                        gpu_memory=", ".join("{}GB".format(round(g.memory_total / 1024.0)) for g in gpus),
+                        gpu_driver_version=gpu_stat.driver_version or "",
+                        gpu_driver_cuda_version=gpu_stat.driver_cuda_version or "",
                     )
             except Exception: