Report memory usage only for the experiment process (and sub-processes)

This commit is contained in:
allegroai 2020-04-26 23:16:13 +03:00
parent 7e839204d1
commit 302a8cbf75
3 changed files with 40 additions and 3 deletions

View File

@ -157,6 +157,10 @@
# Log all stdout & stderr
log_stdout: true
# Compatibility feature: when true, report memory usage for the entire machine.
# Default (false): report memory usage only for the running process and its sub-processes.
report_global_mem_used: false
}
}
}

View File

@ -422,7 +422,8 @@ class Task(_Task):
if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('xgboost', True):
PatchXGBoostModelIO.update_current_task(task)
if auto_resource_monitoring and not is_sub_process_task_id:
task._resource_monitor = ResourceMonitor(task)
task._resource_monitor = ResourceMonitor(
task, report_mem_used_per_process=not config.get('development.worker.report_global_mem_used', False))
task._resource_monitor.start()
# make sure all random generators are initialized with new seed

View File

@ -21,7 +21,7 @@ class ResourceMonitor(object):
def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0,
max_wait_for_first_iteration_to_start_sec=1800.):
max_wait_for_first_iteration_to_start_sec=1800., report_mem_used_per_process=True):
self._task = task
self._sample_frequency = sample_frequency_per_sec
self._report_frequency = report_frequency_sec
@ -37,6 +37,8 @@ class ResourceMonitor(object):
self._gpustat_fail = 0
self._gpustat = gpustat
self._active_gpus = None
self._process_info = psutil.Process() if report_mem_used_per_process else None
self._last_process_pool = None
if not self._gpustat:
self._task.get_logger().report_text('TRAINS Monitor: GPU monitoring is not available')
else: # if running_remotely():
@ -197,7 +199,9 @@ class ResourceMonitor(object):
return x / bytes_per_megabyte
virtual_memory = psutil.virtual_memory()
stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
# stats["memory_used_gb"] = bytes_to_megabytes(virtual_memory.used) / 1024
stats["memory_used_gb"] = bytes_to_megabytes(
self._get_process_used_memory() if self._process_info else virtual_memory.used) / 1024
stats["memory_free_gb"] = bytes_to_megabytes(virtual_memory.available) / 1024
disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
stats["disk_free_percent"] = 100.0-disk_use_percentage
@ -257,3 +261,31 @@ class ResourceMonitor(object):
except ValueError:
pass
return titles
def _get_process_used_memory(self):
def mem_usage_children(a_mem_size, pr, parent_mem=None):
# add out memory usage
our_mem = pr.memory_info()
mem_diff = our_mem.rss - parent_mem.rss if parent_mem else our_mem.rss
a_mem_size += mem_diff if mem_diff > 0 else 0
# now we are the parent
for child in pr.children():
# get the current memory
m = pr.memory_info()
mem_diff = m.rss - our_mem.rss
a_mem_size += mem_diff if mem_diff > 0 else 0
a_mem_size = mem_usage_children(a_mem_size, child, parent_mem=m)
return a_mem_size
# only run the memory usage query once per reporting period
# because this memory query is relatively slow, and changes very little.
if self._last_process_pool and (time() - self._last_process_pool[0]) < 0.01*self._report_frequency:
return self._last_process_pool[1]
# if we have no parent process, return 0 (it's an error)
if not self._process_info:
return 0
mem_size = mem_usage_children(0, self._process_info)
self._last_process_pool = time(), mem_size
return mem_size