Fix GPU monitoring support on Windows (Trains Issue #177)

This commit is contained in:
allegroai 2020-08-10 08:07:51 +03:00
parent 6b333202e9
commit 4aacf9005e

View File

@@ -203,23 +203,24 @@ class GPUStatCollection(object):
process['pid'] = nv_process.pid process['pid'] = nv_process.pid
# noinspection PyBroadException # noinspection PyBroadException
try: try:
process['username'] = ps_process.username() # we do not actually use these, so no point in collecting them
# cmdline returns full path; # process['username'] = ps_process.username()
# as in `ps -o comm`, get short cmdnames. # # cmdline returns full path;
_cmdline = ps_process.cmdline() # # as in `ps -o comm`, get short cmdnames.
if not _cmdline: # _cmdline = ps_process.cmdline()
# sometimes, zombie or unknown (e.g. [kworker/8:2H]) # if not _cmdline:
process['command'] = '?' # # sometimes, zombie or unknown (e.g. [kworker/8:2H])
process['full_command'] = ['?'] # process['command'] = '?'
else: # process['full_command'] = ['?']
process['command'] = os.path.basename(_cmdline[0]) # else:
process['full_command'] = _cmdline # process['command'] = os.path.basename(_cmdline[0])
# process['full_command'] = _cmdline
# process['cpu_percent'] = ps_process.cpu_percent()
# process['cpu_memory_usage'] = \
# round((ps_process.memory_percent() / 100.0) *
# psutil.virtual_memory().total)
# Bytes to MBytes # Bytes to MBytes
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
process['cpu_percent'] = ps_process.cpu_percent()
process['cpu_memory_usage'] = \
round((ps_process.memory_percent() / 100.0) *
psutil.virtual_memory().total)
except Exception: except Exception:
# insufficient permissions # insufficient permissions
pass pass
@@ -290,12 +291,13 @@ class GPUStatCollection(object):
# e.g. nvidia-smi reset or reboot the system # e.g. nvidia-smi reset or reboot the system
pass pass
# TODO: Do not block if full process info is not requested # we do not actually use these, so no point in collecting them
time.sleep(0.1) # # TODO: Do not block if full process info is not requested
for process in processes: # time.sleep(0.1)
pid = process['pid'] # for process in processes:
cache_process = GPUStatCollection.global_processes[pid] # pid = process['pid']
process['cpu_percent'] = cache_process.cpu_percent() # cache_process = GPUStatCollection.global_processes[pid]
# process['cpu_percent'] = cache_process.cpu_percent()
index = N.nvmlDeviceGetIndex(handle) index = N.nvmlDeviceGetIndex(handle)
gpu_info = { gpu_info = {