Fix MIG GPU support

allegroai 2024-05-01 10:30:41 +03:00
parent 7a4154f054
commit f267466926
2 changed files with 85 additions and 12 deletions


@@ -56,6 +56,21 @@ class GPUStat(object):
"""
return self.entry['uuid']
@property
def mig_index(self):
"""
Returns the index of the MIG partition (as in nvidia-smi).
"""
return self.entry.get("mig_index")
@property
def mig_uuid(self):
"""
Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode,
e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
"""
return self.entry.get("mig_uuid")
@property
def name(self):
"""
@@ -160,6 +175,7 @@ class GPUStatCollection(object):
_initialized = False
_device_count = None
_gpu_device_info = {}
_mig_device_info = {}
def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
self.gpus = gpu_list
@@ -190,7 +206,7 @@ class GPUStatCollection(object):
return b.decode() # for python3, to unicode
return b
def get_gpu_info(index, handle):
def get_gpu_info(index, handle, is_mig=False):
"""Get one GPU information specified by nvml handle"""
def get_process_info(nv_process):
@@ -226,12 +242,13 @@ class GPUStatCollection(object):
pass
return process
if not GPUStatCollection._gpu_device_info.get(index):
device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
if not device_info.get(index):
name = _decode(N.nvmlDeviceGetName(handle))
uuid = _decode(N.nvmlDeviceGetUUID(handle))
GPUStatCollection._gpu_device_info[index] = (name, uuid)
device_info[index] = (name, uuid)
name, uuid = GPUStatCollection._gpu_device_info[index]
name, uuid = device_info[index]
try:
temperature = N.nvmlDeviceGetTemperature(
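
As a small aside, a toy illustration of one reason the cached (name, uuid) pairs are kept in two separate dicts: MIG slice indices restart from 0 under every parent GPU, so they cannot share an index-keyed cache with the physical GPUs (values below are placeholders, not from the commit):

# Toy illustration only; names and uuids are placeholders.
gpu_cache, mig_cache = {}, {}

def remember(index, name, uuid, is_mig=False):
    cache = mig_cache if is_mig else gpu_cache
    if index not in cache:
        cache[index] = (name, uuid)
    return cache[index]

remember(0, "NVIDIA A100-SXM4-40GB", "GPU-placeholder")
remember(0, "NVIDIA A100 MIG 3g.20gb", "MIG-placeholder", is_mig=True)  # does not clobber GPU 0
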
@@ -327,8 +344,36 @@ class GPUStatCollection(object):
for index in range(GPUStatCollection._device_count):
handle = N.nvmlDeviceGetHandleByIndex(index)
gpu_info = get_gpu_info(index, handle)
gpu_stat = GPUStat(gpu_info)
gpu_list.append(gpu_stat)
mig_cnt = 0
# noinspection PyBroadException
try:
mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
except Exception:
pass
if mig_cnt <= 0:
gpu_list.append(GPUStat(gpu_info))
continue
got_mig_info = False
for mig_index in range(mig_cnt):
try:
mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
mig_info["mig_name"] = mig_info["name"]
mig_info["name"] = gpu_info["name"]
mig_info["mig_index"] = mig_info["index"]
mig_info["mig_uuid"] = mig_info["uuid"]
mig_info["index"] = gpu_info["index"]
mig_info["uuid"] = gpu_info["uuid"]
mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
mig_info["fan.speed"] = gpu_info["fan.speed"]
gpu_list.append(GPUStat(mig_info))
got_mig_info = True
except Exception as e:
pass
if not got_mig_info:
gpu_list.append(GPUStat(gpu_info))
# 2. additional info (driver version, etc).
if get_driver_info:
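
For reference, a condensed sketch of the NVML enumeration this loop builds on, written against the public nvidia-ml-py (pynvml) bindings rather than the vendored wrapper, with error handling trimmed to the essentials:

# Sketch using public pynvml bindings; the vendored module wraps the same NVML calls.
import pynvml as N

N.nvmlInit()
try:
    for index in range(N.nvmlDeviceGetCount()):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        try:
            mig_count = N.nvmlDeviceGetMaxMigDeviceCount(handle)  # 0 when MIG is unsupported/disabled
        except N.NVMLError:
            mig_count = 0
        for mig_index in range(mig_count):
            try:
                mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
            except N.NVMLError:
                continue  # this MIG slot is not configured
            print(index, mig_index, N.nvmlDeviceGetUUID(mig_handle))
finally:
    N.nvmlShutdown()
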


@@ -12,6 +12,7 @@ from typing import Text
from .process.mp import BackgroundMonitor
from ..backend_api import Session
from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
from ..config import config
try:
from .gpu import gpustat
@@ -46,6 +47,11 @@ class ResourceMonitor(BackgroundMonitor):
self._last_process_pool = {}
self._last_process_id_list = []
self._gpu_memory_per_process = True
self._default_gpu_utilization = config.get("resource_monitoring.default_gpu_utilization", 100)
# allow default_gpu_utilization as null in the config, in which case we don't log anything
if self._default_gpu_utilization is not None:
self._default_gpu_utilization = int(self._default_gpu_utilization)
self._gpu_utilization_warning_sent = False
# noinspection PyBroadException
try:
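
The commit only shows the configuration key; in clearml.conf the new setting would plausibly look as follows (the sdk nesting is an assumption, based on where settings read through this config object usually live):

sdk {
    resource_monitoring {
        # value reported when the driver cannot export utilization (e.g. inside MIG);
        # set to null to skip reporting the utilization metric altogether
        default_gpu_utilization: 100
    }
}
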
@@ -314,13 +320,18 @@ class ResourceMonitor(BackgroundMonitor):
return mem_size
def _skip_nonactive_gpu(self, idx, gpu):
def _skip_nonactive_gpu(self, gpu):
if not self._active_gpus:
return False
# noinspection PyBroadException
try:
uuid = getattr(gpu, "uuid", None)
return str(idx) not in self._active_gpus and (not uuid or uuid not in self._active_gpus)
mig_uuid = getattr(gpu, "mig_uuid", None)
return (
str(gpu.index) not in self._active_gpus
and (not uuid or uuid not in self._active_gpus)
and (not mig_uuid or mig_uuid not in self._active_gpus)
)
except Exception:
pass
return False
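
A self-contained restatement of the matching rule the rewritten helper applies, using stand-in objects (only the attribute names index, uuid and mig_uuid come from the commit; the rest is illustrative):

# Stand-in for GPUStat entries; the logic mirrors _skip_nonactive_gpu above.
from collections import namedtuple

GpuEntry = namedtuple("GpuEntry", "index uuid mig_uuid")

def skip_nonactive(gpu, active_gpus):
    if not active_gpus:
        return False  # nothing selected -> monitor everything
    return (
        str(gpu.index) not in active_gpus
        and (not gpu.uuid or gpu.uuid not in active_gpus)
        and (not gpu.mig_uuid or gpu.mig_uuid not in active_gpus)
    )

mig_slice = GpuEntry(index=0, uuid="GPU-placeholder", mig_uuid="MIG-placeholder")
print(skip_nonactive(mig_slice, ["MIG-placeholder"]))  # False: selected via its MIG uuid
print(skip_nonactive(mig_slice, ["1"]))                # True: only GPU index 1 was selected
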
@@ -349,7 +360,7 @@ class ResourceMonitor(BackgroundMonitor):
self._gpu_memory_per_process = False
break
# only monitor the active gpu's, if none were selected, monitor everything
if self._skip_nonactive_gpu(i, g):
if self._skip_nonactive_gpu(g):
continue
gpu_mem[i] = 0
@@ -369,10 +380,27 @@ class ResourceMonitor(BackgroundMonitor):
for i, g in enumerate(gpu_stat.gpus):
# only monitor the active gpu's, if none were selected, monitor everything
if self._skip_nonactive_gpu(i, g):
if self._skip_nonactive_gpu(g):
continue
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
if g["utilization.gpu"] is not None:
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
else:
stats["gpu_%d_utilization" % i] = self._default_gpu_utilization
if not self._gpu_utilization_warning_sent:
if g.mig_index is not None:
self._task.get_logger().report_text(
"Running inside MIG, Nvidia driver cannot export utilization, pushing fixed value {}".format( # noqa
self._default_gpu_utilization
)
)
else:
self._task.get_logger().report_text(
"Nvidia driver cannot export utilization, pushing fixed value {}".format(
self._default_gpu_utilization
)
)
self._gpu_utilization_warning_sent = True
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
# already in MBs
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
@@ -400,7 +428,7 @@ class ResourceMonitor(BackgroundMonitor):
if self._gpustat:
gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
if gpu_stat.gpus:
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(i, g)]
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
specs.update(
gpu_count=int(len(gpus)),
gpu_type=', '.join(g.name for g in gpus),
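
Since every MIG slice becomes its own GPUStat entry, each slice is counted individually in the reported machine specs; a toy example of the resulting values (names are placeholders):

# Toy example: two MIG slices on one physical card each count toward gpu_count.
from collections import namedtuple

Gpu = namedtuple("Gpu", "name")
gpus = [Gpu("NVIDIA A100-SXM4-40GB"), Gpu("NVIDIA A100-SXM4-40GB")]
specs = dict(gpu_count=int(len(gpus)), gpu_type=", ".join(g.name for g in gpus))
print(specs)  # {'gpu_count': 2, 'gpu_type': 'NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB'}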