Add logging of the NVIDIA driver and CUDA versions

This commit is contained in:
allegroai 2021-05-20 11:35:59 +03:00
parent d3929033c0
commit 07a22a38ac
3 changed files with 1943 additions and 62 deletions

View File

@ -161,13 +161,14 @@ class GPUStatCollection(object):
_device_count = None _device_count = None
_gpu_device_info = {} _gpu_device_info = {}
def __init__(self, gpu_list, driver_version=None): def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
self.gpus = gpu_list self.gpus = gpu_list
# attach additional system information # attach additional system information
self.hostname = platform.node() self.hostname = platform.node()
self.query_time = datetime.now() self.query_time = datetime.now()
self.driver_version = driver_version self.driver_version = driver_version
self.driver_cuda_version = driver_cuda_version
@staticmethod @staticmethod
def clean_processes(): def clean_processes():
@ -178,10 +179,11 @@ class GPUStatCollection(object):
@staticmethod @staticmethod
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False): def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
"""Query the information of all the GPUs on local machine""" """Query the information of all the GPUs on local machine"""
initialized = False
if not GPUStatCollection._initialized: if not GPUStatCollection._initialized:
N.nvmlInit() N.nvmlInit()
GPUStatCollection._initialized = True GPUStatCollection._initialized = True
initialized = True
def _decode(b): def _decode(b):
if isinstance(b, bytes): if isinstance(b, bytes):
@ -334,15 +336,32 @@ class GPUStatCollection(object):
driver_version = _decode(N.nvmlSystemGetDriverVersion()) driver_version = _decode(N.nvmlSystemGetDriverVersion())
except N.NVMLError: except N.NVMLError:
driver_version = None # N/A driver_version = None # N/A
# noinspection PyBroadException
try:
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
except BaseException:
# noinspection PyBroadException
try:
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
except BaseException:
cuda_driver_version = None
if cuda_driver_version:
try:
cuda_driver_version = '{}.{}'.format(
int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
except (ValueError, TypeError):
pass
else: else:
driver_version = None driver_version = None
cuda_driver_version = None
# no need to shutdown: # no need to shutdown:
if shutdown: if shutdown and initialized:
N.nvmlShutdown() N.nvmlShutdown()
GPUStatCollection._initialized = False GPUStatCollection._initialized = False
return GPUStatCollection(gpu_list, driver_version=driver_version) return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
def __len__(self): def __len__(self):
return len(self.gpus) return len(self.gpus)

File diff suppressed because it is too large Load Diff

View File

@ -349,14 +349,19 @@ class ResourceMonitor(BackgroundMonitor):
'gpu_count': 0, 'gpu_count': 0,
'gpu_type': '', 'gpu_type': '',
'gpu_memory': '', 'gpu_memory': '',
'driver_version': '',
'driver_cuda_version': '',
} }
if self._gpustat: if self._gpustat:
gpu_stat = self._gpustat.new_query(shutdown=True) gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
if gpu_stat.gpus: if gpu_stat.gpus:
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._active_gpus or i in self._active_gpus] gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._active_gpus or i in self._active_gpus]
specs['gpu_count'] = int(len(gpus)) specs['gpu_count'] = int(len(gpus))
specs['gpu_type'] = ', '.join(g.name for g in gpus) specs['gpu_type'] = ', '.join(g.name for g in gpus)
specs['gpu_memory'] = ', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus) specs['gpu_memory'] = ', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus)
specs['driver_version'] = gpu_stat.driver_version or ''
specs['driver_cuda_version'] = gpu_stat.driver_cuda_version or ''
except Exception: except Exception:
return {} return {}