From 302a8cbf75279c49d7ac93bea0b6397d46b39a94 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Sun, 26 Apr 2020 23:16:13 +0300
Subject: [PATCH] Report memory usage only for the experiment process (and sub-processes)

---
NOTE(review): this patch was recovered from a copy whose newlines and
indentation had been collapsed to single spaces. Line breaks were restored so
that every hunk matches its @@ header counts; the indentation of context
lines is inferred and must be verified against the tree before applying
(or apply with `git apply --ignore-whitespace`).

NOTE(review), on the added code (left unchanged here):
  * _get_process_used_memory caches its result for
    0.01 * self._report_frequency seconds (0.3 s with the default 30 s),
    while the comment above it says "once per reporting period" - confirm
    which one is intended.
  * Inside `for child in pr.children()` the code calls pr.memory_info()
    (the parent), not child.memory_info() - confirm this is deliberate.
  * The psutil calls are not guarded; presumably a child exiting during the
    walk can raise psutil.NoSuchProcess - verify against the caller.
  * Comment typo: "add out memory usage" presumably means "our".

 trains/config/default/sdk.conf       |  4 ++++
 trains/task.py                       |  3 ++-
 trains/utilities/resource_monitor.py | 36 ++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/trains/config/default/sdk.conf b/trains/config/default/sdk.conf
index cb854c08..d067a67e 100644
--- a/trains/config/default/sdk.conf
+++ b/trains/config/default/sdk.conf
@@ -157,6 +157,10 @@
 
             # Log all stdout & stderr
             log_stdout: true
+
+            # compatibility feature, report memory usage for the entire machine
+            # default (false), report only on the running process and its sub-processes
+            report_global_mem_used: false
         }
     }
 }
diff --git a/trains/task.py b/trains/task.py
index 781bd23a..b3a6dfa0 100644
--- a/trains/task.py
+++ b/trains/task.py
@@ -422,7 +422,8 @@ class Task(_Task):
             if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('xgboost', True):
                 PatchXGBoostModelIO.update_current_task(task)
         if auto_resource_monitoring and not is_sub_process_task_id:
-            task._resource_monitor = ResourceMonitor(task)
+            task._resource_monitor = ResourceMonitor(
+                task, report_mem_used_per_process=not config.get('development.worker.report_global_mem_used', False))
             task._resource_monitor.start()
 
         # make sure all random generators are initialized with new seed
diff --git a/trains/utilities/resource_monitor.py b/trains/utilities/resource_monitor.py
index c85c4582..9441f038 100644
--- a/trains/utilities/resource_monitor.py
+++ b/trains/utilities/resource_monitor.py
@@ -21,7 +21,7 @@ class ResourceMonitor(object):
 
     def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
                  first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0,
-                 max_wait_for_first_iteration_to_start_sec=1800.):
+                 max_wait_for_first_iteration_to_start_sec=1800., report_mem_used_per_process=True):
         self._task = task
         self._sample_frequency = sample_frequency_per_sec
         self._report_frequency = report_frequency_sec
@@ -37,6 +37,8 @@ class ResourceMonitor(object):
         self._gpustat_fail = 0
         self._gpustat = gpustat
         self._active_gpus = None
+        self._process_info = psutil.Process() if report_mem_used_per_process else None
+        self._last_process_pool = None
         if not self._gpustat:
             self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring is not available')
         else:  # if running_remotely():
@@ -197,7 +199,9 @@ class ResourceMonitor(object):
             return x / bytes_per_megabyte
 
         virtual_memory = psutil.virtual_memory()
-        stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
+        # stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
+        stats["memory_used_gb"] = bytes_to_megabytes(
+            self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
         stats["memory_free_gb"] = bytes_to_megabytes(virtual_memory.available) / 1024
         disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
         stats["disk_free_percent"] = 100.0-disk_use_percentage
@@ -257,3 +261,31 @@ class ResourceMonitor(object):
         except ValueError:
             pass
         return titles
+
+    def _get_process_used_memory(self):
+        def mem_usage_children(a_mem_size, pr, parent_mem=None):
+            # add out memory usage
+            our_mem = pr.memory_info()
+            mem_diff = our_mem.rss - parent_mem.rss if parent_mem else our_mem.rss
+            a_mem_size += mem_diff if mem_diff > 0 else 0
+            # now we are the parent
+            for child in pr.children():
+                # get the current memory
+                m = pr.memory_info()
+                mem_diff = m.rss - our_mem.rss
+                a_mem_size += mem_diff if mem_diff > 0 else 0
+                a_mem_size = mem_usage_children(a_mem_size, child, parent_mem=m)
+            return a_mem_size
+
+        # only run the memory usage query once per reporting period
+        # because this memory query is relatively slow, and changes very little.
+        if self._last_process_pool and (time() - self._last_process_pool[0]) < 0.01*self._report_frequency:
+            return self._last_process_pool[1]
+
+        # if we have no parent process, return 0 (it's an error)
+        if not self._process_info:
+            return 0
+
+        mem_size = mem_usage_children(0, self._process_info)
+        self._last_process_pool = time(), mem_size
+        return mem_size