mirror of
https://github.com/clearml/clearml
synced 2025-03-03 02:32:11 +00:00
Add log nvidia driver and cuda version
This commit is contained in:
parent
d3929033c0
commit
07a22a38ac
@ -161,13 +161,14 @@ class GPUStatCollection(object):
|
||||
_device_count = None
|
||||
_gpu_device_info = {}
|
||||
|
||||
def __init__(self, gpu_list, driver_version=None):
|
||||
def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
|
||||
self.gpus = gpu_list
|
||||
|
||||
# attach additional system information
|
||||
self.hostname = platform.node()
|
||||
self.query_time = datetime.now()
|
||||
self.driver_version = driver_version
|
||||
self.driver_cuda_version = driver_cuda_version
|
||||
|
||||
@staticmethod
|
||||
def clean_processes():
|
||||
@ -178,10 +179,11 @@ class GPUStatCollection(object):
|
||||
@staticmethod
|
||||
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||
"""Query the information of all the GPUs on local machine"""
|
||||
|
||||
initialized = False
|
||||
if not GPUStatCollection._initialized:
|
||||
N.nvmlInit()
|
||||
GPUStatCollection._initialized = True
|
||||
initialized = True
|
||||
|
||||
def _decode(b):
|
||||
if isinstance(b, bytes):
|
||||
@ -334,15 +336,32 @@ class GPUStatCollection(object):
|
||||
driver_version = _decode(N.nvmlSystemGetDriverVersion())
|
||||
except N.NVMLError:
|
||||
driver_version = None # N/A
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
|
||||
except BaseException:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
|
||||
except BaseException:
|
||||
cuda_driver_version = None
|
||||
if cuda_driver_version:
|
||||
try:
|
||||
cuda_driver_version = '{}.{}'.format(
|
||||
int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
else:
|
||||
driver_version = None
|
||||
cuda_driver_version = None
|
||||
|
||||
# no need to shutdown:
|
||||
if shutdown:
|
||||
if shutdown and initialized:
|
||||
N.nvmlShutdown()
|
||||
GPUStatCollection._initialized = False
|
||||
|
||||
return GPUStatCollection(gpu_list, driver_version=driver_version)
|
||||
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.gpus)
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -349,14 +349,19 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
'gpu_count': 0,
|
||||
'gpu_type': '',
|
||||
'gpu_memory': '',
|
||||
'driver_version': '',
|
||||
'driver_cuda_version': '',
|
||||
}
|
||||
if self._gpustat:
|
||||
gpu_stat = self._gpustat.new_query(shutdown=True)
|
||||
gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
|
||||
if gpu_stat.gpus:
|
||||
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._active_gpus or i in self._active_gpus]
|
||||
specs['gpu_count'] = int(len(gpus))
|
||||
specs['gpu_type'] = ', '.join(g.name for g in gpus)
|
||||
specs['gpu_memory'] = ', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus)
|
||||
specs['driver_version'] = gpu_stat.driver_version or ''
|
||||
specs['driver_cuda_version'] = gpu_stat.driver_cuda_version or ''
|
||||
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user