mirror of
https://github.com/clearml/clearml
synced 2025-03-03 10:42:00 +00:00
Add logging of the NVIDIA driver and CUDA versions
This commit is contained in:
parent
d3929033c0
commit
07a22a38ac
@ -161,13 +161,14 @@ class GPUStatCollection(object):
|
|||||||
_device_count = None
|
_device_count = None
|
||||||
_gpu_device_info = {}
|
_gpu_device_info = {}
|
||||||
|
|
||||||
def __init__(self, gpu_list, driver_version=None):
|
def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
|
||||||
self.gpus = gpu_list
|
self.gpus = gpu_list
|
||||||
|
|
||||||
# attach additional system information
|
# attach additional system information
|
||||||
self.hostname = platform.node()
|
self.hostname = platform.node()
|
||||||
self.query_time = datetime.now()
|
self.query_time = datetime.now()
|
||||||
self.driver_version = driver_version
|
self.driver_version = driver_version
|
||||||
|
self.driver_cuda_version = driver_cuda_version
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_processes():
|
def clean_processes():
|
||||||
@ -178,10 +179,11 @@ class GPUStatCollection(object):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||||
"""Query the information of all the GPUs on local machine"""
|
"""Query the information of all the GPUs on local machine"""
|
||||||
|
initialized = False
|
||||||
if not GPUStatCollection._initialized:
|
if not GPUStatCollection._initialized:
|
||||||
N.nvmlInit()
|
N.nvmlInit()
|
||||||
GPUStatCollection._initialized = True
|
GPUStatCollection._initialized = True
|
||||||
|
initialized = True
|
||||||
|
|
||||||
def _decode(b):
|
def _decode(b):
|
||||||
if isinstance(b, bytes):
|
if isinstance(b, bytes):
|
||||||
@ -334,15 +336,32 @@ class GPUStatCollection(object):
|
|||||||
driver_version = _decode(N.nvmlSystemGetDriverVersion())
|
driver_version = _decode(N.nvmlSystemGetDriverVersion())
|
||||||
except N.NVMLError:
|
except N.NVMLError:
|
||||||
driver_version = None # N/A
|
driver_version = None # N/A
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
|
||||||
|
except BaseException:
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
|
||||||
|
except BaseException:
|
||||||
|
cuda_driver_version = None
|
||||||
|
if cuda_driver_version:
|
||||||
|
try:
|
||||||
|
cuda_driver_version = '{}.{}'.format(
|
||||||
|
int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
driver_version = None
|
driver_version = None
|
||||||
|
cuda_driver_version = None
|
||||||
|
|
||||||
# no need to shutdown:
|
# no need to shutdown:
|
||||||
if shutdown:
|
if shutdown and initialized:
|
||||||
N.nvmlShutdown()
|
N.nvmlShutdown()
|
||||||
GPUStatCollection._initialized = False
|
GPUStatCollection._initialized = False
|
||||||
|
|
||||||
return GPUStatCollection(gpu_list, driver_version=driver_version)
|
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.gpus)
|
return len(self.gpus)
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -349,14 +349,19 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
'gpu_count': 0,
|
'gpu_count': 0,
|
||||||
'gpu_type': '',
|
'gpu_type': '',
|
||||||
'gpu_memory': '',
|
'gpu_memory': '',
|
||||||
|
'driver_version': '',
|
||||||
|
'driver_cuda_version': '',
|
||||||
}
|
}
|
||||||
if self._gpustat:
|
if self._gpustat:
|
||||||
gpu_stat = self._gpustat.new_query(shutdown=True)
|
gpu_stat = self._gpustat.new_query(shutdown=True, get_driver_info=True)
|
||||||
if gpu_stat.gpus:
|
if gpu_stat.gpus:
|
||||||
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._active_gpus or i in self._active_gpus]
|
gpus = [g for i, g in enumerate(gpu_stat.gpus) if not self._active_gpus or i in self._active_gpus]
|
||||||
specs['gpu_count'] = int(len(gpus))
|
specs['gpu_count'] = int(len(gpus))
|
||||||
specs['gpu_type'] = ', '.join(g.name for g in gpus)
|
specs['gpu_type'] = ', '.join(g.name for g in gpus)
|
||||||
specs['gpu_memory'] = ', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus)
|
specs['gpu_memory'] = ', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus)
|
||||||
|
specs['driver_version'] = gpu_stat.driver_version or ''
|
||||||
|
specs['driver_cuda_version'] = gpu_stat.driver_cuda_version or ''
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user