Update GPU stats and pynvml support

allegroai 2023-12-20 17:47:19 +02:00
parent faa97b6cc2
commit 55b065a114
4 changed files with 1340 additions and 85 deletions


@@ -665,9 +665,12 @@ class K8sIntegration(Worker):
return {target: results} if results else {}
return results
def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
return f"{self.worker_id}:{task_id}"
def _create_template_container(
self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
docker_bash: str, clearml_conf_create_script: List[str]
docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str
) -> dict:
container = self._get_docker_args(
docker_args,
@@ -677,7 +680,6 @@ class K8sIntegration(Worker):
)
# Set worker ID
task_worker_id = f"{self.worker_id}:{task_id}"
env_vars = container.get('env', [])
found_worker_id = False
for entry in env_vars:
@@ -734,7 +736,7 @@ class K8sIntegration(Worker):
queue,
task_id,
namespace,
template=None,
template,
pod_number=None
):
if "apiVersion" not in template:
@@ -774,13 +776,16 @@ class K8sIntegration(Worker):
containers = spec.setdefault('containers', [])
spec.setdefault('restartPolicy', 'Never')
task_worker_id = self.get_task_worker_id(template, task_id, name, namespace, queue)
container = self._create_template_container(
pod_name=name,
task_id=task_id,
docker_image=docker_image,
docker_args=docker_args,
docker_bash=docker_bash,
clearml_conf_create_script=clearml_conf_create_script
clearml_conf_create_script=clearml_conf_create_script,
task_worker_id=task_worker_id
)
if containers:
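
Note: the change above moves the worker-ID string out of _create_template_container and into a dedicated get_task_worker_id hook (receiving the template, task ID, pod name, namespace and queue), whose result is then passed into the container template. A minimal sketch of how a subclass could override the new hook; the subclass name, the namespace-prefixed ID scheme and the import path are assumptions for illustration, not part of this commit:

# Hypothetical subclass illustrating the new get_task_worker_id hook.
# The import path below is an assumption; adjust it to wherever
# K8sIntegration lives in your clearml-agent installation.
from clearml_agent.glue.k8s import K8sIntegration

class NamespacedK8sIntegration(K8sIntegration):
    def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
        # The default implementation returns f"{self.worker_id}:{task_id}";
        # here the pod namespace is embedded as well, so worker IDs stay
        # unique when the same queue is served from several namespaces.
        return f"{self.worker_id}:{namespace}:{task_id}"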


@@ -15,10 +15,8 @@ from __future__ import print_function
from __future__ import unicode_literals
import json
import os.path
import platform
import sys
import time
from datetime import datetime
from typing import Optional
@@ -164,13 +162,14 @@ class GPUStatCollection(object):
_device_count = None
_gpu_device_info = {}
def __init__(self, gpu_list, driver_version=None):
def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
self.gpus = gpu_list
# attach additional system information
self.hostname = platform.node()
self.query_time = datetime.now()
self.driver_version = driver_version
self.driver_cuda_version = driver_cuda_version
@staticmethod
def clean_processes():
@@ -181,10 +180,11 @@ class GPUStatCollection(object):
@staticmethod
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
"""Query the information of all the GPUs on local machine"""
initialized = False
if not GPUStatCollection._initialized:
N.nvmlInit()
GPUStatCollection._initialized = True
initialized = True
def _decode(b):
if isinstance(b, bytes):
@@ -200,10 +200,10 @@ class GPUStatCollection(object):
if nv_process.pid not in GPUStatCollection.global_processes:
GPUStatCollection.global_processes[nv_process.pid] = \
psutil.Process(pid=nv_process.pid)
ps_process = GPUStatCollection.global_processes[nv_process.pid]
process['pid'] = nv_process.pid
# noinspection PyBroadException
try:
# ps_process = GPUStatCollection.global_processes[nv_process.pid]
# we do not actually use these, so no point in collecting them
# process['username'] = ps_process.username()
# # cmdline returns full path;
@@ -286,11 +286,11 @@ class GPUStatCollection(object):
for nv_process in nv_comp_processes + nv_graphics_processes:
try:
process = get_process_info(nv_process)
processes.append(process)
except psutil.NoSuchProcess:
# TODO: add some reminder for NVML broken context
# e.g. nvidia-smi reset or reboot the system
pass
process = None
processes.append(process)
# we do not actually use these, so no point in collecting them
# # TODO: Do not block if full process info is not requested
@@ -314,7 +314,7 @@ class GPUStatCollection(object):
# Convert bytes into MBytes
'memory.used': memory.used // MB if memory else None,
'memory.total': memory.total // MB if memory else None,
'processes': processes,
'processes': None if (processes and all(p is None for p in processes)) else processes
}
if per_process_stats:
GPUStatCollection.clean_processes()
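
Note: two related changes in the hunks above. A process that disappears between the NVML query and the psutil lookup is now recorded as a None placeholder instead of being dropped, and a per-GPU process list that ends up holding only placeholders is reported as None rather than a list. A standalone sketch of that collapse rule (the helper name is illustrative):

# Illustration of the placeholder-collapse rule: a process list that
# contains only None entries is reported as None instead of a list.
def collapse_processes(processes):
    if processes and all(p is None for p in processes):
        return None
    return processes

assert collapse_processes([None, None]) is None
assert collapse_processes([]) == []
assert collapse_processes([{"pid": 1234}, None]) == [{"pid": 1234}, None]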
@@ -337,15 +337,32 @@ class GPUStatCollection(object):
driver_version = _decode(N.nvmlSystemGetDriverVersion())
except N.NVMLError:
driver_version = None # N/A
# noinspection PyBroadException
try:
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
except BaseException:
# noinspection PyBroadException
try:
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
except BaseException:
cuda_driver_version = None
if cuda_driver_version:
try:
cuda_driver_version = '{}.{}'.format(
int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
except (ValueError, TypeError):
pass
else:
driver_version = None
cuda_driver_version = None
# no need to shutdown:
if shutdown:
if shutdown and initialized:
N.nvmlShutdown()
GPUStatCollection._initialized = False
return GPUStatCollection(gpu_list, driver_version=driver_version)
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
def __len__(self):
return len(self.gpus)
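
Note: the new driver_cuda_version field is read via nvmlSystemGetCudaDriverVersion (falling back to nvmlSystemGetCudaDriverVersion_v2), which reports the CUDA version supported by the installed driver as a single integer encoded as major * 1000 + minor * 10; the hunk above formats it as a major.minor string, and only calls nvmlShutdown when this query itself initialized NVML. A standalone sketch of the version conversion (the helper name is illustrative):

# Illustration of the CUDA driver version formatting used above:
# NVML encodes the version as an integer, e.g. 12020 -> "12.2".
def format_cuda_driver_version(raw):
    try:
        value = int(raw)
        return "{}.{}".format(value // 1000, (value % 1000) // 10)
    except (ValueError, TypeError):
        # Keep the raw value if it cannot be parsed, as the hunk above does
        return raw

assert format_cuda_driver_version("12020") == "12.2"
assert format_cuda_driver_version(11040) == "11.4"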

File diff suppressed because it is too large


@@ -266,8 +266,10 @@ class ResourceMonitor(object):
gpu_stat = self._gpustat.new_query()
for i, g in enumerate(gpu_stat.gpus):
# only monitor the active gpu's, if none were selected, monitor everything
if self._active_gpus and str(i) not in self._active_gpus:
continue
if self._active_gpus:
uuid = getattr(g, "uuid", None)
if str(i) not in self._active_gpus and (not uuid or uuid not in self._active_gpus):
continue
stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]
stats["gpu_mem_usage_{:d}".format(i)] = (