Mirror of https://github.com/clearml/clearml (synced 2025-06-26 18:16:07 +00:00)
Fix MIG GPU support
commit f267466926
parent 7a4154f054
@@ -56,6 +56,21 @@ class GPUStat(object):
         """
         return self.entry['uuid']
 
+    @property
+    def mig_index(self):
+        """
+        Returns the index of the MIG partition (as in nvidia-smi).
+        """
+        return self.entry.get("mig_index")
+
+    @property
+    def mig_uuid(self):
+        """
+        Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode,
+        e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
+        """
+        return self.entry.get("mig_uuid")
+
     @property
     def name(self):
         """
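Both new properties read straight from the per-device entry dict that GPUStat wraps, using .get() so that plain (non-MIG) GPUs simply return None. A minimal sketch of that behaviour with hypothetical entry dicts (the values below are made up):

    # Hypothetical entry dicts; keys match the patch, values are invented.
    mig_entry = {
        "index": 0,
        "uuid": "GPU-12345678-abcd-abcd-uuid-123456abcdef",
        "mig_index": 1,
        "mig_uuid": "MIG-12345678-abcd-abcd-uuid-123456abcdef",
    }
    plain_entry = {"index": 0, "uuid": "GPU-12345678-abcd-abcd-uuid-123456abcdef"}

    print(mig_entry.get("mig_uuid"))    # MIG-12345678-abcd-abcd-uuid-123456abcdef
    print(plain_entry.get("mig_uuid"))  # None on a regular GPU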
@@ -160,6 +175,7 @@ class GPUStatCollection(object):
     _initialized = False
     _device_count = None
     _gpu_device_info = {}
+    _mig_device_info = {}
 
     def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
         self.gpus = gpu_list
@@ -190,7 +206,7 @@ class GPUStatCollection(object):
                 return b.decode()  # for python3, to unicode
             return b
 
-        def get_gpu_info(index, handle):
+        def get_gpu_info(index, handle, is_mig=False):
             """Get one GPU information specified by nvml handle"""
 
             def get_process_info(nv_process):
@@ -226,12 +242,13 @@ class GPUStatCollection(object):
                         pass
                 return process
 
-            if not GPUStatCollection._gpu_device_info.get(index):
+            device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
+            if not device_info.get(index):
                 name = _decode(N.nvmlDeviceGetName(handle))
                 uuid = _decode(N.nvmlDeviceGetUUID(handle))
-                GPUStatCollection._gpu_device_info[index] = (name, uuid)
+                device_info[index] = (name, uuid)
 
-            name, uuid = GPUStatCollection._gpu_device_info[index]
+            name, uuid = device_info[index]
 
             try:
                 temperature = N.nvmlDeviceGetTemperature(
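get_gpu_info now selects a (name, uuid) cache per mode, so a MIG slice at index 0 cannot overwrite the cached info of physical GPU 0. A minimal standalone sketch of that selection, with module-level dicts standing in for the GPUStatCollection class attributes and made-up device names:

    _gpu_device_info = {}
    _mig_device_info = {}

    def cache_device_info(index, name, uuid, is_mig=False):
        # Same pattern as the patch: pick the cache per mode, fill it once, reuse it.
        device_info = _mig_device_info if is_mig else _gpu_device_info
        if not device_info.get(index):
            device_info[index] = (name, uuid)
        return device_info[index]

    cache_device_info(0, "NVIDIA A100-SXM4-40GB", "GPU-aaaa")
    cache_device_info(0, "NVIDIA A100 MIG 1g.5gb", "MIG-bbbb", is_mig=True)
    print(_gpu_device_info[0])  # ('NVIDIA A100-SXM4-40GB', 'GPU-aaaa')
    print(_mig_device_info[0])  # ('NVIDIA A100 MIG 1g.5gb', 'MIG-bbbb')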
@@ -327,8 +344,36 @@ class GPUStatCollection(object):
         for index in range(GPUStatCollection._device_count):
             handle = N.nvmlDeviceGetHandleByIndex(index)
             gpu_info = get_gpu_info(index, handle)
-            gpu_stat = GPUStat(gpu_info)
-            gpu_list.append(gpu_stat)
+            mig_cnt = 0
+            # noinspection PyBroadException
+            try:
+                mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
+            except Exception:
+                pass
+
+            if mig_cnt <= 0:
+                gpu_list.append(GPUStat(gpu_info))
+                continue
+
+            got_mig_info = False
+            for mig_index in range(mig_cnt):
+                try:
+                    mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
+                    mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
+                    mig_info["mig_name"] = mig_info["name"]
+                    mig_info["name"] = gpu_info["name"]
+                    mig_info["mig_index"] = mig_info["index"]
+                    mig_info["mig_uuid"] = mig_info["uuid"]
+                    mig_info["index"] = gpu_info["index"]
+                    mig_info["uuid"] = gpu_info["uuid"]
+                    mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
+                    mig_info["fan.speed"] = gpu_info["fan.speed"]
+                    gpu_list.append(GPUStat(mig_info))
+                    got_mig_info = True
+                except Exception as e:
+                    pass
+            if not got_mig_info:
+                gpu_list.append(GPUStat(gpu_info))
 
         # 2. additional info (driver version, etc).
         if get_driver_info:
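This loop is the core of the fix: every physical device is probed for MIG slices, each readable slice becomes its own GPUStat entry (inheriting the parent's name, index, uuid, temperature and fan speed, while keeping the slice's own values under the mig_* keys), and the physical GPU is only reported directly when no slice could be read. A hedged standalone sketch of the same probe-and-enumerate pattern using pynvml directly, since the patch goes through clearml's bundled N wrapper; error handling and field mapping are simplified:

    import pynvml as N

    N.nvmlInit()
    try:
        for index in range(N.nvmlDeviceGetCount()):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            try:
                # 0 on non-MIG GPUs; raises on drivers that predate MIG support.
                mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
            except Exception:
                mig_cnt = 0

            if mig_cnt <= 0:
                print("GPU", index, N.nvmlDeviceGetName(handle))
                continue

            for mig_index in range(mig_cnt):
                # Slots beyond the currently configured MIG slices raise, so skip them.
                try:
                    mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
                except Exception:
                    continue
                print("  MIG slice", mig_index, N.nvmlDeviceGetUUID(mig_handle))
    finally:
        N.nvmlShutdown()

The hunks below are on the ResourceMonitor side, which consumes these GPUStat entries when reporting machine metrics.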
@@ -12,6 +12,7 @@ from typing import Text
 from .process.mp import BackgroundMonitor
 from ..backend_api import Session
 from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
+from ..config import config
 
 try:
     from .gpu import gpustat
@@ -46,6 +47,11 @@ class ResourceMonitor(BackgroundMonitor):
         self._last_process_pool = {}
         self._last_process_id_list = []
         self._gpu_memory_per_process = True
+        self._default_gpu_utilization = config.get("resource_monitoring.default_gpu_utilization", 100)
+        # allow default_gpu_utilization as null in the config, in which case we don't log anything
+        if self._default_gpu_utilization is not None:
+            self._default_gpu_utilization = int(self._default_gpu_utilization)
+        self._gpu_utilization_warning_sent = False
 
         # noinspection PyBroadException
         try:
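The constructor now reads resource_monitoring.default_gpu_utilization (default 100) and tracks whether the one-time warning was already emitted. A minimal sketch of the null-vs-int handling, with a plain dict standing in for clearml's config object:

    def resolve_default_gpu_utilization(cfg):
        # Mirrors the patch: 100 when unset, int() when set, None (null) disables the fallback.
        value = cfg.get("resource_monitoring.default_gpu_utilization", 100)
        return int(value) if value is not None else None

    print(resolve_default_gpu_utilization({}))                                                     # 100
    print(resolve_default_gpu_utilization({"resource_monitoring.default_gpu_utilization": "50"}))  # 50
    print(resolve_default_gpu_utilization({"resource_monitoring.default_gpu_utilization": None}))  # None

In clearml.conf this key would presumably sit under the sdk section (sdk.resource_monitoring.default_gpu_utilization), but that placement is an assumption; the diff only shows the config.get() call.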
@@ -314,13 +320,18 @@ class ResourceMonitor(BackgroundMonitor):
 
         return mem_size
 
-    def _skip_nonactive_gpu(self, idx, gpu):
+    def _skip_nonactive_gpu(self, gpu):
         if not self._active_gpus:
             return False
         # noinspection PyBroadException
         try:
             uuid = getattr(gpu, "uuid", None)
-            return str(idx) not in self._active_gpus and (not uuid or uuid not in self._active_gpus)
+            mig_uuid = getattr(gpu, "mig_uuid", None)
+            return (
+                str(gpu.index) not in self._active_gpus
+                and (not uuid or uuid not in self._active_gpus)
+                and (not mig_uuid or mig_uuid not in self._active_gpus)
+            )
         except Exception:
             pass
         return False
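With the index now taken from the GPUStat entry itself, a device is skipped only if neither its index, its uuid, nor its MIG uuid appears in the active-GPU selection, so MIG slices can be selected by their MIG-... uuid. A minimal sketch of the rule outside the class; FakeGPU and the uuids are made up:

    class FakeGPU:
        def __init__(self, index, uuid=None, mig_uuid=None):
            self.index = index
            self.uuid = uuid
            self.mig_uuid = mig_uuid

    def skip_nonactive_gpu(active_gpus, gpu):
        if not active_gpus:
            return False  # nothing selected -> monitor everything
        uuid = getattr(gpu, "uuid", None)
        mig_uuid = getattr(gpu, "mig_uuid", None)
        return (
            str(gpu.index) not in active_gpus
            and (not uuid or uuid not in active_gpus)
            and (not mig_uuid or mig_uuid not in active_gpus)
        )

    mig_slice = FakeGPU(0, uuid="GPU-aaaa", mig_uuid="MIG-bbbb")
    print(skip_nonactive_gpu(["MIG-bbbb"], mig_slice))  # False: selected via its MIG uuid
    print(skip_nonactive_gpu(["1"], mig_slice))         # True: not in the selection at all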
@@ -349,7 +360,7 @@ class ResourceMonitor(BackgroundMonitor):
                     self._gpu_memory_per_process = False
                     break
             # only monitor the active gpu's, if none were selected, monitor everything
-            if self._skip_nonactive_gpu(i, g):
+            if self._skip_nonactive_gpu(g):
                 continue
 
             gpu_mem[i] = 0
@@ -369,10 +380,27 @@ class ResourceMonitor(BackgroundMonitor):
 
         for i, g in enumerate(gpu_stat.gpus):
             # only monitor the active gpu's, if none were selected, monitor everything
-            if self._skip_nonactive_gpu(i, g):
+            if self._skip_nonactive_gpu(g):
                 continue
             stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
-            stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+            if g["utilization.gpu"] is not None:
+                stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
+            else:
+                stats["gpu_%d_utilization" % i] = self._default_gpu_utilization
+                if not self._gpu_utilization_warning_sent:
+                    if g.mig_index is not None:
+                        self._task.get_logger().report_text(
+                            "Running inside MIG, Nvidia driver cannot export utilization, pushing fixed value {}".format(  # noqa
+                                self._default_gpu_utilization
+                            )
+                        )
+                    else:
+                        self._task.get_logger().report_text(
+                            "Nvidia driver cannot export utilization, pushing fixed value {}".format(
+                                self._default_gpu_utilization
+                            )
+                        )
+                    self._gpu_utilization_warning_sent = True
             stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
             # already in MBs
             stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
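Under MIG the driver does not export per-slice utilization, so utilization.gpu comes back as None for those entries; the monitor now substitutes the configured default and logs a one-time notice (mentioning MIG when g.mig_index is set). A minimal sketch of the substitution with the logging stripped out:

    def gpu_utilization_metric(reported_utilization, default_gpu_utilization=100):
        # Use the driver's value when present, otherwise the configured fallback.
        if reported_utilization is not None:
            return float(reported_utilization)
        return default_gpu_utilization  # may be None if disabled via the config

    print(gpu_utilization_metric(37))    # 37.0
    print(gpu_utilization_metric(None))  # 100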
@@ -400,7 +428,7 @@ class ResourceMonitor(BackgroundMonitor):
         if self._gpustat:
             gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
             if gpu_stat.gpus:
-                gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(i, g)]
+                gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._skip_nonactive_gpu(g)]
                 specs.update(
                     gpu_count=int(len(gpus)),
                     gpu_type=', '.join(g.name for g in gpus),
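One visible consequence of listing MIG slices as separate entries: the hardware specs now count each selected slice on its own, and gpu_type repeats the parent card's name once per slice (the slice-level name is preserved under mig_name). A small illustration with made-up names:

    # Two MIG slices of the same physical card, as the patched query would list them.
    gpu_names = ["NVIDIA A100-SXM4-40GB", "NVIDIA A100-SXM4-40GB"]
    print(int(len(gpu_names)), "->", ', '.join(gpu_names))
    # 2 -> NVIDIA A100-SXM4-40GB, NVIDIA A100-SXM4-40GB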