Update GPU stats and pynvml support

allegroai 2023-12-20 17:47:19 +02:00
parent faa97b6cc2
commit 55b065a114
4 changed files with 1340 additions and 85 deletions

View File

@@ -665,9 +665,12 @@ class K8sIntegration(Worker):
             return {target: results} if results else {}
         return results
 
+    def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
+        return f"{self.worker_id}:{task_id}"
+
     def _create_template_container(
         self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
-        docker_bash: str, clearml_conf_create_script: List[str]
+        docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str
     ) -> dict:
         container = self._get_docker_args(
             docker_args,
@@ -677,7 +680,6 @@ class K8sIntegration(Worker):
         )
 
         # Set worker ID
-        task_worker_id = f"{self.worker_id}:{task_id}"
         env_vars = container.get('env', [])
         found_worker_id = False
         for entry in env_vars:
@@ -734,7 +736,7 @@ class K8sIntegration(Worker):
         queue,
         task_id,
         namespace,
-        template=None,
+        template,
        pod_number=None
     ):
         if "apiVersion" not in template:
@@ -774,13 +776,16 @@ class K8sIntegration(Worker):
         containers = spec.setdefault('containers', [])
         spec.setdefault('restartPolicy', 'Never')
 
+        task_worker_id = self.get_task_worker_id(template, task_id, name, namespace, queue)
+
         container = self._create_template_container(
             pod_name=name,
             task_id=task_id,
             docker_image=docker_image,
             docker_args=docker_args,
             docker_bash=docker_bash,
-            clearml_conf_create_script=clearml_conf_create_script
+            clearml_conf_create_script=clearml_conf_create_script,
+            task_worker_id=task_worker_id
         )
 
         if containers:
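
The point of this change is to make the worker ID injected into the pod overridable: instead of being hard-coded inside _create_template_container(), the value now comes from the new get_task_worker_id() hook and is passed in as task_worker_id. A minimal sketch of a custom glue subclass overriding the hook (the subclass name and ID scheme are illustrative, not part of this commit; the import path assumes the class lives in clearml_agent.glue.k8s):

from clearml_agent.glue.k8s import K8sIntegration

class PodNamedK8sIntegration(K8sIntegration):
    # Illustrative override: include the namespace and pod name in the reported
    # worker ID instead of the default f"{self.worker_id}:{task_id}".
    def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
        return "{}:{}:{}:{}".format(self.worker_id, namespace, pod_name, task_id)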

View File

@@ -15,10 +15,8 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 import json
-import os.path
 import platform
 import sys
-import time
 
 from datetime import datetime
 from typing import Optional
@@ -164,13 +162,14 @@ class GPUStatCollection(object):
     _device_count = None
     _gpu_device_info = {}
 
-    def __init__(self, gpu_list, driver_version=None):
+    def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
         self.gpus = gpu_list
 
         # attach additional system information
         self.hostname = platform.node()
         self.query_time = datetime.now()
         self.driver_version = driver_version
+        self.driver_cuda_version = driver_cuda_version
 
     @staticmethod
     def clean_processes():
@@ -181,10 +180,11 @@ class GPUStatCollection(object):
     @staticmethod
     def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
         """Query the information of all the GPUs on local machine"""
-
+        initialized = False
         if not GPUStatCollection._initialized:
             N.nvmlInit()
             GPUStatCollection._initialized = True
+            initialized = True
 
         def _decode(b):
             if isinstance(b, bytes):
@@ -200,10 +200,10 @@ class GPUStatCollection(object):
             if nv_process.pid not in GPUStatCollection.global_processes:
                 GPUStatCollection.global_processes[nv_process.pid] = \
                     psutil.Process(pid=nv_process.pid)
-            ps_process = GPUStatCollection.global_processes[nv_process.pid]
             process['pid'] = nv_process.pid
             # noinspection PyBroadException
             try:
+                # ps_process = GPUStatCollection.global_processes[nv_process.pid]
                 # we do not actually use these, so no point in collecting them
                 # process['username'] = ps_process.username()
                 # # cmdline returns full path;
@@ -286,11 +286,11 @@ class GPUStatCollection(object):
                 for nv_process in nv_comp_processes + nv_graphics_processes:
                     try:
                         process = get_process_info(nv_process)
-                        processes.append(process)
                     except psutil.NoSuchProcess:
                         # TODO: add some reminder for NVML broken context
                         # e.g. nvidia-smi reset or reboot the system
-                        pass
+                        process = None
+                    processes.append(process)
 
                 # we do not actually use these, so no point in collecting them
                 # # TODO: Do not block if full process info is not requested
@@ -314,7 +314,7 @@ class GPUStatCollection(object):
                 # Convert bytes into MBytes
                 'memory.used': memory.used // MB if memory else None,
                 'memory.total': memory.total // MB if memory else None,
-                'processes': processes,
+                'processes': None if (processes and all(p is None for p in processes)) else processes
             }
             if per_process_stats:
                 GPUStatCollection.clean_processes()
@@ -337,15 +337,32 @@ class GPUStatCollection(object):
                 driver_version = _decode(N.nvmlSystemGetDriverVersion())
             except N.NVMLError:
                 driver_version = None  # N/A
+            # noinspection PyBroadException
+            try:
+                cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
+            except BaseException:
+                # noinspection PyBroadException
+                try:
+                    cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
+                except BaseException:
+                    cuda_driver_version = None
+            if cuda_driver_version:
+                try:
+                    cuda_driver_version = '{}.{}'.format(
+                        int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
+                except (ValueError, TypeError):
+                    pass
         else:
             driver_version = None
+            cuda_driver_version = None
 
         # no need to shutdown:
-        if shutdown:
+        if shutdown and initialized:
             N.nvmlShutdown()
             GPUStatCollection._initialized = False
 
-        return GPUStatCollection(gpu_list, driver_version=driver_version)
+        return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
 
     def __len__(self):
         return len(self.gpus)
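
The raw value NVML reports for the CUDA driver version is a single integer of the form major*1000 + minor*10 (for example 12020 for CUDA 12.2), which the code above formats as "major.minor". A standalone sketch of the same query and conversion using the pynvml package directly, outside the vendored N module (assumes pynvml/nvidia-ml-py is installed and an NVIDIA driver is present):

import pynvml

def cuda_driver_version_string():
    # Returns e.g. "12.2" for a raw NVML value of 12020, or None if unavailable.
    pynvml.nvmlInit()
    try:
        try:
            raw = pynvml.nvmlSystemGetCudaDriverVersion()
        except pynvml.NVMLError:
            # fall back to the _v2 variant, mirroring the diff above
            raw = pynvml.nvmlSystemGetCudaDriverVersion_v2()
        return "{}.{}".format(int(raw) // 1000, (int(raw) % 1000) // 10)
    except (pynvml.NVMLError, ValueError, TypeError):
        return None
    finally:
        pynvml.nvmlShutdown()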

File diff suppressed because it is too large

View File

@@ -266,7 +266,9 @@ class ResourceMonitor(object):
             gpu_stat = self._gpustat.new_query()
             for i, g in enumerate(gpu_stat.gpus):
                 # only monitor the active gpu's, if none were selected, monitor everything
-                if self._active_gpus and str(i) not in self._active_gpus:
-                    continue
+                if self._active_gpus:
+                    uuid = getattr(g, "uuid", None)
+                    if str(i) not in self._active_gpus and (not uuid or uuid not in self._active_gpus):
+                        continue
                 stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
                 stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]