#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Implementation of gpustat

@author Jongwook Choi
@url https://github.com/wookayin/gpustat

@ copied from gpu-stat 0.6.0
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import json
import platform
import subprocess
import sys
from ctypes import c_uint32, byref, c_int64
from datetime import datetime
from typing import Optional, List, Iterator, TextIO, Any

import psutil

from ..gpu import pynvml as N
from ..gpu import pyrsmi as R

NOT_SUPPORTED = "Not Supported"
MB = 1024 * 1024

class GPUStat(object):
    def __init__(self, entry: dict) -> None:
        if not isinstance(entry, dict):
            raise TypeError("entry should be a dict, {} given".format(type(entry)))
        self.entry = entry

    def keys(self) -> Any:
        return self.entry.keys()

    def __getitem__(self, key: Any) -> Any:
        return self.entry[key]

    @property
    def index(self) -> int:
        """
        Returns the index of the GPU (as in nvidia-smi).
        """
        return self.entry["index"]

    @property
    def uuid(self) -> str:
        """
        Returns the uuid reported by nvidia-smi,
        e.g. GPU-12345678-abcd-abcd-uuid-123456abcdef
        """
        return self.entry["uuid"]

    @property
    def mig_index(self) -> Optional[int]:
        """
        Returns the index of the MIG partition (as in nvidia-smi).
        """
        return self.entry.get("mig_index")

    @property
    def mig_uuid(self) -> Optional[str]:
        """
        Returns the uuid of the MIG partition reported by nvidia-smi when running in MIG mode,
        e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
        """
        return self.entry.get("mig_uuid")

    @property
    def name(self) -> str:
        """
        Returns the name of the GPU card (e.g. GeForce Titan X)
        """
        return self.entry["name"]

    @property
    def memory_total(self) -> int:
        """
        Returns the total memory (in MB) as an integer.
        """
        return int(self.entry["memory.total"])

    @property
    def memory_used(self) -> int:
        """
        Returns the occupied memory (in MB) as an integer.
        """
        return int(self.entry["memory.used"])

    @property
    def memory_free(self) -> int:
        """
        Returns the free (available) memory (in MB) as an integer.
        """
        v = self.memory_total - self.memory_used
        return max(v, 0)

    @property
    def memory_available(self) -> int:
        """
        Returns the available memory (in MB) as an integer.
        Alias of memory_free.
        """
        return self.memory_free

    @property
    def temperature(self) -> Optional[int]:
        """
        Returns the temperature (in Celsius) of the GPU as an integer,
        or None if the information is not available.
        """
        v = self.entry["temperature.gpu"]
        return int(v) if v is not None else None

    @property
    def fan_speed(self) -> Optional[int]:
        """
        Returns the fan speed percentage (0-100) of maximum intended speed
        as an integer, or None if the information is not available.
        """
        v = self.entry["fan.speed"]
        return int(v) if v is not None else None

    @property
    def utilization(self) -> Optional[int]:
        """
        Returns the GPU utilization (as a percentage),
        or None if the information is not available.
        """
        v = self.entry["utilization.gpu"]
        return int(v) if v is not None else None

    @property
    def power_draw(self) -> Optional[int]:
        """
        Returns the GPU power usage in Watts,
        or None if the information is not available.
        """
        v = self.entry["power.draw"]
        return int(v) if v is not None else None

    @property
    def power_limit(self) -> Optional[int]:
        """
        Returns the (enforced) GPU power limit in Watts,
        or None if the information is not available.
        """
        v = self.entry["enforced.power.limit"]
        return int(v) if v is not None else None

    @property
    def processes(self) -> list:
        """
        Get the list of running processes on the GPU.
        """
        return self.entry["processes"]

    def jsonify(self) -> dict:
        o = dict(self.entry)
        if self.entry["processes"] is not None:
            o["processes"] = [{k: v for (k, v) in p.items() if k != "gpu_uuid"} for p in self.entry["processes"]]
        else:
            o["processes"] = "({})".format(NOT_SUPPORTED)
        return o
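
    # Illustrative sketch of a jsonify() result for one GPU, based on the keys
    # assembled by GPUStatCollection below; the values shown here are made up:
    #   {
    #       "index": 0, "uuid": "GPU-...", "name": "...",
    #       "temperature.gpu": 52, "fan.speed": 30, "utilization.gpu": 17,
    #       "power.draw": 70, "enforced.power.limit": 250,
    #       "memory.used": 1024, "memory.total": 16384,
    #       "processes": [{"pid": 1234, "gpu_memory_usage": 512}],
    #   }
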
class GPUStatCollection(object):
    global_processes = {}
    _initialized = False
    _device_count = None
    _gpu_device_info = {}
    _mig_device_info = {}

    def __init__(
        self,
        gpu_list: List[GPUStat],
        driver_version: Optional[str] = None,
        driver_cuda_version: Optional[str] = None,
    ) -> None:
        self.gpus = gpu_list

        # attach additional system information
        self.hostname = platform.node()
        self.query_time = datetime.now()
        self.driver_version = driver_version
        self.driver_cuda_version = driver_cuda_version

    @staticmethod
    def clean_processes() -> None:
        for pid in list(GPUStatCollection.global_processes.keys()):
            if not psutil.pid_exists(pid):
                del GPUStatCollection.global_processes[pid]

    @staticmethod
    def _new_query_amd(
        shutdown: bool = False,
        per_process_stats: bool = False,
        get_driver_info: bool = False,
    ) -> "GPUStatCollection":
        initialized = False
        if not GPUStatCollection._initialized:
            R.smi_initialize()
            GPUStatCollection._initialized = True
            initialized = True

        def get_gpu_info(index: int) -> dict:
            def amd_query_processes() -> List[R.rsmi_process_info_t]:
                num_procs = c_uint32()
                ret = R.rocm_lib.rsmi_compute_process_info_get(None, byref(num_procs))
                if R.rsmi_ret_ok(ret):
                    buff_sz = num_procs.value + 10
                    proc_info = (R.rsmi_process_info_t * buff_sz)()
                    ret = R.rocm_lib.rsmi_compute_process_info_get(byref(proc_info), byref(num_procs))
                    proc_info_list = [proc_info[i] for i in range(num_procs.value)] if R.rsmi_ret_ok(ret) else []
                    result_proc_info_list = []
                    # query VRAM usage explicitly, as rsmi_compute_process_info_get
                    # doesn't actually return VRAM usage
                    for proc_info in proc_info_list:
                        vram_query_proc_info = R.rsmi_process_info_t()
                        ret = R.rocm_lib.rsmi_compute_process_info_by_pid_get(
                            int(proc_info.process_id), byref(vram_query_proc_info)
                        )
                        if R.rsmi_ret_ok(ret):
                            proc_info.vram_usage = vram_query_proc_info.vram_usage
                            result_proc_info_list.append(proc_info)
                    return result_proc_info_list
                return []

            def get_fan_speed() -> Optional[float]:
                fan_level = c_int64()
                fan_max = c_int64()
                sensor_ind = c_uint32(0)

                ret = R.rocm_lib.rsmi_dev_fan_speed_get(index, sensor_ind, byref(fan_level))
                if not R.rsmi_ret_ok(ret, log_error=False):
                    return None

                ret = R.rocm_lib.rsmi_dev_fan_speed_max_get(index, sensor_ind, byref(fan_max))
                if not R.rsmi_ret_ok(ret, log_error=False):
                    return None

                # compare the ctypes values, not the ctypes objects themselves
                if fan_level.value <= 0 or fan_max.value <= 0:
                    return None

                return float(fan_level.value) / float(fan_max.value)

            def get_process_info(comp_process: R.rsmi_process_info_t) -> dict:
                process = {}
                pid = comp_process.process_id
                # skip global_processes caching because PID querying seems to be inconsistent atm
                # if pid not in GPUStatCollection.global_processes:
                #     GPUStatCollection.global_processes[pid] = psutil.Process(pid=pid)
                process["pid"] = pid
                try:
                    process["gpu_memory_usage"] = comp_process.vram_usage // MB
                except Exception:
                    pass
                return process

            if not GPUStatCollection._gpu_device_info.get(index):
                uuid = R.smi_get_device_id(index)
                name = R.smi_get_device_name(index)
                GPUStatCollection._gpu_device_info[index] = (name, uuid)

            name, uuid = GPUStatCollection._gpu_device_info[index]

            temperature = None  # TODO: fetch temperature. It should be possible
            fan_speed = get_fan_speed()

            try:
                memory_total = R.smi_get_device_memory_total(index)
            except Exception:
                memory_total = None

            try:
                memory_used = R.smi_get_device_memory_used(index)
            except Exception:
                memory_used = None

            try:
                utilization = R.smi_get_device_utilization(index)
            except Exception:
                utilization = None

            try:
                power = R.smi_get_device_average_power(index)
            except Exception:
                power = None

            power_limit = None  # TODO: find a way to fetch this

            processes = []
            if per_process_stats:
                try:
                    comp_processes = amd_query_processes()
                except Exception:
                    comp_processes = []
                for comp_process in comp_processes:
                    try:
                        process = get_process_info(comp_process)
                    except psutil.NoSuchProcess:
                        # skip process caching for now
                        pass
                    else:
                        processes.append(process)

            gpu_info = {
                "index": index,
                "uuid": uuid,
                "name": name,
                "temperature.gpu": temperature if temperature is not None else 0,
                "fan.speed": fan_speed if fan_speed is not None else 0,
                "utilization.gpu": utilization if utilization is not None else 100,
                "power.draw": power if power is not None else 0,
                "enforced.power.limit": power_limit if power_limit is not None else 0,
                # Convert bytes into MBytes
                "memory.used": memory_used // MB if memory_used is not None else 0,
                "memory.total": memory_total // MB if memory_total is not None else 100,
                "processes": None if (processes and all(p is None for p in processes)) else processes,
            }
            if per_process_stats:
                GPUStatCollection.clean_processes()
            return gpu_info

        gpu_list = []
        if GPUStatCollection._device_count is None:
            GPUStatCollection._device_count = R.smi_get_device_count()

        for index in range(GPUStatCollection._device_count):
            gpu_info = get_gpu_info(index)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        if shutdown and initialized:
            R.smi_shutdown()
            GPUStatCollection._initialized = False

        # noinspection PyProtectedMember
        driver_version = GPUStatCollection._get_amd_driver_version() if get_driver_info else None

        return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=None)

    @staticmethod
    def _get_amd_driver_version() -> Optional[str]:
        # query via subprocesses so the program doesn't crash (e.g. with a SEGFAULT)
        # while querying the driver version
        try:
            process = subprocess.Popen(["rocm-smi", "--showdriverversion", "--json"], stdout=subprocess.PIPE)
            out, _ = process.communicate()
            return json.loads(out)["system"]["Driver version"]
        except Exception:
            try:
                process = subprocess.Popen(
                    [
                        sys.executable,
                        "-c",
                        "from clearml.utilities.gpu.pyrsmi import smi_get_kernel_version, smi_initialize; "
                        + "smi_initialize(); "
                        + "print(smi_get_kernel_version())",
                    ],
                    # capture stdout so communicate() actually returns the printed version
                    stdout=subprocess.PIPE,
                )
                out, _ = process.communicate()
                return out.decode().strip()
            except Exception:
                return None

    @staticmethod
    def _running_in_amd_env() -> bool:
        # noinspection PyProtectedMember
        return bool(R._find_lib_rocm())

    @staticmethod
    def _new_query_nvidia(
        shutdown: bool = False,
        per_process_stats: bool = False,
        get_driver_info: bool = False,
    ) -> "GPUStatCollection":
        """Query the information of all the GPUs on the local machine"""
        initialized = False
        if not GPUStatCollection._initialized:
            N.nvmlInit()
            GPUStatCollection._initialized = True
            initialized = True

        def _decode(b: bytes) -> str:
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        def get_gpu_info(index: int, handle: Any, is_mig: bool = False) -> dict:
            """Get the information of one GPU, specified by its NVML handle"""

            def get_process_info(nv_process: Any) -> dict:
                """Get the process information for a specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = psutil.Process(pid=nv_process.pid)
                process["pid"] = nv_process.pid
                # noinspection PyBroadException
                try:
                    # ps_process = GPUStatCollection.global_processes[nv_process.pid]
                    # we do not actually use these, so no point in collecting them
                    # process['username'] = ps_process.username()
                    # # cmdline returns full path;
                    # # as in `ps -o comm`, get short cmdnames.
                    # _cmdline = ps_process.cmdline()
                    # if not _cmdline:
                    #     # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    #     process['command'] = '?'
                    #     process['full_command'] = ['?']
                    # else:
                    #     process['command'] = os.path.basename(_cmdline[0])
                    #     process['full_command'] = _cmdline
                    # process['cpu_percent'] = ps_process.cpu_percent()
                    # process['cpu_memory_usage'] = \
                    #     round((ps_process.memory_percent() / 100.0) *
                    #           psutil.virtual_memory().total)
                    # Bytes to MBytes
                    process["gpu_memory_usage"] = nv_process.usedGpuMemory // MB
                except Exception:
                    # insufficient permissions
                    pass
                return process

            device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
            if not device_info.get(index):
                name = _decode(N.nvmlDeviceGetName(handle))
                uuid = _decode(N.nvmlDeviceGetUUID(handle))
                device_info[index] = (name, uuid)

            name, uuid = device_info[index]

            try:
                temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if not per_process_stats or (nv_comp_processes is None and nv_graphics_processes is None):
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset or reboot the system
                        process = None
                    processes.append(process)

            # we do not actually use these, so no point in collecting them
            # # TODO: Do not block if full process info is not requested
            # time.sleep(0.1)
            # for process in processes:
            #     pid = process['pid']
            #     cache_process = GPUStatCollection.global_processes[pid]
            #     process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                "index": index,
                "uuid": uuid,
                "name": name,
                "temperature.gpu": temperature,
                "fan.speed": fan_speed,
                "utilization.gpu": utilization.gpu if utilization else None,
                "power.draw": power // 1000 if power is not None else None,
                "enforced.power.limit": power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                "memory.used": memory.used // MB if memory else None,
                "memory.total": memory.total // MB if memory else None,
                "processes": None if (processes and all(p is None for p in processes)) else processes,
            }
            if per_process_stats:
                GPUStatCollection.clean_processes()
            return gpu_info

        # 1. get the list of gpus and their status
        gpu_list = []
        if GPUStatCollection._device_count is None:
            GPUStatCollection._device_count = N.nvmlDeviceGetCount()

        for index in range(GPUStatCollection._device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(index, handle)
            mig_cnt = 0
            # noinspection PyBroadException
            try:
                mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
            except Exception:
                pass

            if mig_cnt <= 0:
                gpu_list.append(GPUStat(gpu_info))
                continue

            got_mig_info = False
            for mig_index in range(mig_cnt):
                # noinspection PyBroadException
                try:
                    mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
                    mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
                    mig_info["mig_name"] = mig_info["name"]
                    mig_info["name"] = gpu_info["name"]
                    mig_info["mig_index"] = mig_info["index"]
                    mig_info["mig_uuid"] = mig_info["uuid"]
                    mig_info["index"] = gpu_info["index"]
                    mig_info["uuid"] = gpu_info["uuid"]
                    mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
                    mig_info["fan.speed"] = gpu_info["fan.speed"]
                    gpu_list.append(GPUStat(mig_info))
                    got_mig_info = True
                except Exception:
                    pass
            if not got_mig_info:
                gpu_list.append(GPUStat(gpu_info))

        # 2. additional info (driver version, etc).
        if get_driver_info:
            try:
                driver_version = _decode(N.nvmlSystemGetDriverVersion())
            except N.NVMLError:
                driver_version = None  # N/A

            # noinspection PyBroadException
            try:
                cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
            except BaseException:
                # noinspection PyBroadException
                try:
                    cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
                except BaseException:
                    cuda_driver_version = None
            if cuda_driver_version:
                try:
                    cuda_driver_version = "{}.{}".format(
                        int(cuda_driver_version) // 1000,
                        (int(cuda_driver_version) % 1000) // 10,
                    )
                except (ValueError, TypeError):
                    pass
        else:
            driver_version = None
            cuda_driver_version = None

        # shut down NVML only when explicitly requested, and only if we initialized it here:
        if shutdown and initialized:
            N.nvmlShutdown()
            GPUStatCollection._initialized = False

        return GPUStatCollection(
            gpu_list,
            driver_version=driver_version,
            driver_cuda_version=cuda_driver_version,
        )

    @staticmethod
    def new_query(
        shutdown: bool = False,
        per_process_stats: bool = False,
        get_driver_info: bool = False,
    ) -> "GPUStatCollection":
        # noinspection PyProtectedMember
        if GPUStatCollection._running_in_amd_env():
            # noinspection PyProtectedMember
            return GPUStatCollection._new_query_amd(
                shutdown=shutdown,
                per_process_stats=per_process_stats,
                get_driver_info=get_driver_info,
            )
        else:
            # noinspection PyProtectedMember
            return GPUStatCollection._new_query_nvidia(
                shutdown=shutdown,
                per_process_stats=per_process_stats,
                get_driver_info=get_driver_info,
            )

    def __len__(self) -> int:
        return len(self.gpus)

    def __iter__(self) -> Iterator[GPUStat]:
        return iter(self.gpus)

    def __getitem__(self, index: int) -> GPUStat:
        return self.gpus[index]

    def __repr__(self) -> str:
        s = "GPUStatCollection(host=%s, [\n" % self.hostname
        s += "\n".join(" " + str(g) for g in self.gpus)
        s += "\n])"
        return s

    # --- Printing Functions ---
    def jsonify(self) -> dict:
        return {
            "hostname": self.hostname,
            "query_time": self.query_time,
            "gpus": [g.jsonify() for g in self],
        }

    def print_json(self, fp: TextIO = sys.stdout) -> None:
        def date_handler(obj: Any) -> str:
            if hasattr(obj, "isoformat"):
                return obj.isoformat()
            else:
                raise TypeError(type(obj))

        o = self.jsonify()
        json.dump(o, fp, indent=4, separators=(",", ": "), default=date_handler)
        fp.write("\n")
        fp.flush()


def new_query(
    shutdown: bool = False,
    per_process_stats: bool = False,
    get_driver_info: bool = False,
) -> GPUStatCollection:
    """
    Obtain a new GPUStatCollection instance by querying NVML (or ROCm SMI on AMD)
    to get the list of GPUs and running process information.
    """
    return GPUStatCollection.new_query(
        shutdown=shutdown,
        per_process_stats=per_process_stats,
        get_driver_info=get_driver_info,
    )
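

# Illustrative usage sketch: query the local GPUs once and dump the result as JSON to
# stdout. Run as a module (e.g. `python -m clearml.utilities.gpu.gpustat`, assuming that
# is this file's import path) so that the relative imports above resolve.
if __name__ == "__main__":
    stats = new_query(shutdown=True, per_process_stats=True, get_driver_info=True)
    stats.print_json(sys.stdout)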