Mirror of https://github.com/clearml/clearml, synced 2025-04-16 21:42:10 +00:00
Fix MIG GPU support
This commit is contained in:
parent 7a4154f054
commit f267466926
@@ -56,6 +56,21 @@ class GPUStat(object):
         """
         return self.entry['uuid']
 
+    @property
+    def mig_index(self):
+        """
+        Returns the index of the MIG partition (as in nvidia-smi).
+        """
+        return self.entry.get("mig_index")
+
+    @property
+    def mig_uuid(self):
+        """
+        Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode,
+        e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
+        """
+        return self.entry.get("mig_uuid")
+
     @property
     def name(self):
         """
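Both new properties read the entry dict with .get(), so on a plain (non-MIG) GPU they simply return None instead of raising the way entry['uuid'] would for a missing key. A minimal sketch of consuming them, using a stand-in class and made-up UUID values rather than the real GPUStat:

# Minimal sketch, not the real GPUStat: the entry dict and UUIDs are made-up examples.
class EntryView(object):
    def __init__(self, entry):
        self.entry = entry

    @property
    def mig_index(self):
        return self.entry.get("mig_index")

    @property
    def mig_uuid(self):
        return self.entry.get("mig_uuid")


plain_gpu = EntryView({"uuid": "GPU-11111111-2222-3333-4444-555555555555"})
mig_slice = EntryView({"uuid": "GPU-11111111-2222-3333-4444-555555555555",
                       "mig_index": 0,
                       "mig_uuid": "MIG-12345678-abcd-abcd-uuid-123456abcdef"})
assert plain_gpu.mig_uuid is None                 # non-MIG GPU: MIG fields absent
assert mig_slice.mig_uuid.startswith("MIG-")      # MIG slice: both fields populated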
@@ -160,6 +175,7 @@ class GPUStatCollection(object):
     _initialized = False
     _device_count = None
     _gpu_device_info = {}
+    _mig_device_info = {}
 
     def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
         self.gpus = gpu_list
@@ -190,7 +206,7 @@ class GPUStatCollection(object):
                 return b.decode()  # for python3, to unicode
             return b
 
-        def get_gpu_info(index, handle):
+        def get_gpu_info(index, handle, is_mig=False):
            """Get one GPU information specified by nvml handle"""
 
            def get_process_info(nv_process):
@@ -226,12 +242,13 @@ class GPUStatCollection(object):
                        pass
                return process
 
-            if not GPUStatCollection._gpu_device_info.get(index):
+            device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
+            if not device_info.get(index):
                name = _decode(N.nvmlDeviceGetName(handle))
                uuid = _decode(N.nvmlDeviceGetUUID(handle))
-                GPUStatCollection._gpu_device_info[index] = (name, uuid)
+                device_info[index] = (name, uuid)
 
-            name, uuid = GPUStatCollection._gpu_device_info[index]
+            name, uuid = device_info[index]
 
            try:
                temperature = N.nvmlDeviceGetTemperature(
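Routing the lookup through device_info keeps physical GPUs and MIG slices in separate caches, so MIG slice index 0 cannot clobber the name/uuid cached for physical GPU 0. A toy sketch of the same pattern with plain dicts and made-up values (no NVML calls):

# Toy sketch of the two-cache lookup above; names and uuids are made up.
_gpu_device_info = {}
_mig_device_info = {}


def cache_device_info(index, name, uuid, is_mig=False):
    device_info = _mig_device_info if is_mig else _gpu_device_info
    if not device_info.get(index):
        device_info[index] = (name, uuid)
    return device_info[index]


cache_device_info(0, "NVIDIA A100-SXM4-40GB", "GPU-aaaa")
cache_device_info(0, "NVIDIA A100 MIG 1g.5gb", "MIG-bbbb", is_mig=True)
assert _gpu_device_info[0][1] == "GPU-aaaa"
assert _mig_device_info[0][1] == "MIG-bbbb"   # same index, separate cache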
@@ -327,8 +344,36 @@ class GPUStatCollection(object):
        for index in range(GPUStatCollection._device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(index, handle)
-            gpu_stat = GPUStat(gpu_info)
-            gpu_list.append(gpu_stat)
+            mig_cnt = 0
+            # noinspection PyBroadException
+            try:
+                mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
+            except Exception:
+                pass
+
+            if mig_cnt <= 0:
+                gpu_list.append(GPUStat(gpu_info))
+                continue
+
+            got_mig_info = False
+            for mig_index in range(mig_cnt):
+                try:
+                    mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
+                    mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
+                    mig_info["mig_name"] = mig_info["name"]
+                    mig_info["name"] = gpu_info["name"]
+                    mig_info["mig_index"] = mig_info["index"]
+                    mig_info["mig_uuid"] = mig_info["uuid"]
+                    mig_info["index"] = gpu_info["index"]
+                    mig_info["uuid"] = gpu_info["uuid"]
+                    mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
+                    mig_info["fan.speed"] = gpu_info["fan.speed"]
+                    gpu_list.append(GPUStat(mig_info))
+                    got_mig_info = True
+                except Exception as e:
+                    pass
+            if not got_mig_info:
+                gpu_list.append(GPUStat(gpu_info))
 
        # 2. additional info (driver version, etc).
        if get_driver_info:
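The same enumeration can be reproduced outside ClearML with the standalone pynvml package (nvidia-ml-py); ClearML itself goes through its bundled NVML bindings, aliased N in the code above. A rough sketch, assuming a driver and nvidia-ml-py version that expose the MIG calls used below:

# Rough sketch with standalone pynvml (nvidia-ml-py); assumes MIG-capable driver/bindings.
import pynvml

pynvml.nvmlInit()
try:
    for index in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        try:
            mig_cnt = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
        except pynvml.NVMLError:
            mig_cnt = 0  # device or driver without MIG support
        if mig_cnt <= 0:
            print("GPU", index, pynvml.nvmlDeviceGetName(handle))
            continue
        for mig_index in range(mig_cnt):
            try:
                mig_handle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
            except pynvml.NVMLError:
                continue  # MIG slot not populated
            print("GPU", index, "MIG", mig_index, pynvml.nvmlDeviceGetUUID(mig_handle))
finally:
    pynvml.nvmlShutdown()

As in the diff, slots that raise are simply skipped, and a GPU with MIG disabled (or a query failure) is reported as a single device.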
@@ -12,6 +12,7 @@ from typing import Text
 from .process.mp import BackgroundMonitor
 from ..backend_api import Session
 from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
+from ..config import config
 
 try:
     from .gpu import gpustat
@@ -46,6 +47,11 @@ class ResourceMonitor(BackgroundMonitor):
         self._last_process_pool = {}
         self._last_process_id_list = []
         self._gpu_memory_per_process = True
+        self._default_gpu_utilization = config.get("resource_monitoring.default_gpu_utilization", 100)
+        # allow default_gpu_utilization as null in the config, in which case we don't log anything
+        if self._default_gpu_utilization is not None:
+            self._default_gpu_utilization = int(self._default_gpu_utilization)
+        self._gpu_utilization_warning_sent = False
 
         # noinspection PyBroadException
         try:
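Per the comment in the hunk, resource_monitoring.default_gpu_utilization can be a number, left unset (falls back to 100), or explicitly null to suppress the fixed value. A small sketch of that resolution logic, with a plain dict standing in for ClearML's config object:

# Sketch only: a plain dict stands in for ClearML's config object.
def resolve_default_gpu_utilization(cfg):
    value = cfg.get("resource_monitoring.default_gpu_utilization", 100)
    return int(value) if value is not None else None


assert resolve_default_gpu_utilization({}) == 100                                      # unset -> 100
assert resolve_default_gpu_utilization(
    {"resource_monitoring.default_gpu_utilization": 30}) == 30                         # explicit number
assert resolve_default_gpu_utilization(
    {"resource_monitoring.default_gpu_utilization": None}) is None                     # null -> report nothing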
@@ -314,13 +320,18 @@ class ResourceMonitor(BackgroundMonitor):
 
        return mem_size
 
-    def _skip_nonactive_gpu(self, idx, gpu):
+    def _skip_nonactive_gpu(self, gpu):
        if not self._active_gpus:
            return False
        # noinspection PyBroadException
        try:
            uuid = getattr(gpu, "uuid", None)
-            return str(idx) not in self._active_gpus and (not uuid or uuid not in self._active_gpus)
+            mig_uuid = getattr(gpu, "mig_uuid", None)
+            return (
+                str(gpu.index) not in self._active_gpus
+                and (not uuid or uuid not in self._active_gpus)
+                and (not mig_uuid or mig_uuid not in self._active_gpus)
+            )
        except Exception:
            pass
        return False
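With the signature change, a device counts as active if any of its index, uuid, or MIG uuid appears in the active-GPU list, and an empty list still means monitor everything. A standalone sketch with a made-up reading object:

# Standalone sketch; GpuReading and the identifiers are made-up examples.
class GpuReading(object):
    def __init__(self, index, uuid=None, mig_uuid=None):
        self.index, self.uuid, self.mig_uuid = index, uuid, mig_uuid


def skip_nonactive_gpu(gpu, active_gpus):
    if not active_gpus:
        return False  # nothing selected -> monitor everything
    uuid = getattr(gpu, "uuid", None)
    mig_uuid = getattr(gpu, "mig_uuid", None)
    return (
        str(gpu.index) not in active_gpus
        and (not uuid or uuid not in active_gpus)
        and (not mig_uuid or mig_uuid not in active_gpus)
    )


mig_slice = GpuReading(0, uuid="GPU-aaaa", mig_uuid="MIG-bbbb")
assert skip_nonactive_gpu(mig_slice, ["MIG-bbbb"]) is False  # selected by MIG uuid
assert skip_nonactive_gpu(mig_slice, ["1"]) is True          # not selected
assert skip_nonactive_gpu(mig_slice, []) is False            # empty list: keep all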
@@ -349,7 +360,7 @@ class ResourceMonitor(BackgroundMonitor):
                    self._gpu_memory_per_process = False
                    break
                # only monitor the active gpu's, if none were selected, monitor everything
-                if self._skip_nonactive_gpu(i, g):
+                if self._skip_nonactive_gpu(g):
                    continue
 
                gpu_mem[i] = 0
@@ -369,10 +380,27 @@ class ResourceMonitor(BackgroundMonitor):
 
            for i, g in enumerate(gpu_stat.gpus):
                # only monitor the active gpu's, if none were selected, monitor everything
-                if self._skip_nonactive_gpu(i, g):
+                if self._skip_nonactive_gpu(g):
                    continue
                stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
-                stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+                if g["utilization.gpu"] is not None:
+                    stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+                else:
+                    stats["gpu_%d_utilization" % i] = self._default_gpu_utilization
+                    if not self._gpu_utilization_warning_sent:
+                        if g.mig_index is not None:
+                            self._task.get_logger().report_text(
+                                "Running inside MIG, Nvidia driver cannot export utilization, pushing fixed value {}".format(  # noqa
+                                    self._default_gpu_utilization
+                                )
+                            )
+                        else:
+                            self._task.get_logger().report_text(
+                                "Nvidia driver cannot export utilization, pushing fixed value {}".format(
+                                    self._default_gpu_utilization
+                                )
+                            )
+                        self._gpu_utilization_warning_sent = True
                stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
                # already in MBs
                stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
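Condensed, the per-sample rule above is: report the driver's utilization when it is available, otherwise push the configured default (the log message notes that MIG slices are the typical case where the driver exports none). A short sketch with made-up readings:

# Short sketch with made-up readings of g["utilization.gpu"].
def reported_utilization(raw_value, default_gpu_utilization=100):
    return float(raw_value) if raw_value is not None else default_gpu_utilization


assert reported_utilization(37) == 37.0   # driver reported a value
assert reported_utilization(None) == 100  # e.g. a MIG slice: fall back to the default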
@@ -400,7 +428,7 @@ class ResourceMonitor(BackgroundMonitor):
        if self._gpustat:
            gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
            if gpu_stat.gpus:
-                gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(i, g)]
+                gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
                specs.update(
                    gpu_count=int(len(gpus)),
                    gpu_type=', '.join(g.name for g in gpus),