mirror of
https://github.com/clearml/clearml-agent
synced 2025-04-22 23:24:26 +00:00
Fix GPU Windows monitoring support (Trains Issue #177)
This commit is contained in:
parent
6b333202e9
commit
4aacf9005e
@ -203,23 +203,24 @@ class GPUStatCollection(object):
|
|||||||
process['pid'] = nv_process.pid
|
process['pid'] = nv_process.pid
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
process['username'] = ps_process.username()
|
# we do not actually use these, so no point in collecting them
|
||||||
# cmdline returns full path;
|
# process['username'] = ps_process.username()
|
||||||
# as in `ps -o comm`, get short cmdnames.
|
# # cmdline returns full path;
|
||||||
_cmdline = ps_process.cmdline()
|
# # as in `ps -o comm`, get short cmdnames.
|
||||||
if not _cmdline:
|
# _cmdline = ps_process.cmdline()
|
||||||
# sometimes, zombie or unknown (e.g. [kworker/8:2H])
|
# if not _cmdline:
|
||||||
process['command'] = '?'
|
# # sometimes, zombie or unknown (e.g. [kworker/8:2H])
|
||||||
process['full_command'] = ['?']
|
# process['command'] = '?'
|
||||||
else:
|
# process['full_command'] = ['?']
|
||||||
process['command'] = os.path.basename(_cmdline[0])
|
# else:
|
||||||
process['full_command'] = _cmdline
|
# process['command'] = os.path.basename(_cmdline[0])
|
||||||
|
# process['full_command'] = _cmdline
|
||||||
|
# process['cpu_percent'] = ps_process.cpu_percent()
|
||||||
|
# process['cpu_memory_usage'] = \
|
||||||
|
# round((ps_process.memory_percent() / 100.0) *
|
||||||
|
# psutil.virtual_memory().total)
|
||||||
# Bytes to MBytes
|
# Bytes to MBytes
|
||||||
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
|
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
|
||||||
process['cpu_percent'] = ps_process.cpu_percent()
|
|
||||||
process['cpu_memory_usage'] = \
|
|
||||||
round((ps_process.memory_percent() / 100.0) *
|
|
||||||
psutil.virtual_memory().total)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# insufficient permissions
|
# insufficient permissions
|
||||||
pass
|
pass
|
||||||
@ -290,12 +291,13 @@ class GPUStatCollection(object):
|
|||||||
# e.g. nvidia-smi reset or reboot the system
|
# e.g. nvidia-smi reset or reboot the system
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# TODO: Do not block if full process info is not requested
|
# we do not actually use these, so no point in collecting them
|
||||||
time.sleep(0.1)
|
# # TODO: Do not block if full process info is not requested
|
||||||
for process in processes:
|
# time.sleep(0.1)
|
||||||
pid = process['pid']
|
# for process in processes:
|
||||||
cache_process = GPUStatCollection.global_processes[pid]
|
# pid = process['pid']
|
||||||
process['cpu_percent'] = cache_process.cpu_percent()
|
# cache_process = GPUStatCollection.global_processes[pid]
|
||||||
|
# process['cpu_percent'] = cache_process.cpu_percent()
|
||||||
|
|
||||||
index = N.nvmlDeviceGetIndex(handle)
|
index = N.nvmlDeviceGetIndex(handle)
|
||||||
gpu_info = {
|
gpu_info = {
|
||||||
|
Loading…
Reference in New Issue
Block a user