Update GPU stats and pynvml support

allegroai 2023-12-20 17:47:19 +02:00
parent faa97b6cc2
commit 55b065a114
4 changed files with 1340 additions and 85 deletions


@@ -665,9 +665,12 @@ class K8sIntegration(Worker):
return {target: results} if results else {}
return results
def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
return f"{self.worker_id}:{task_id}"
def _create_template_container(
self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
docker_bash: str, clearml_conf_create_script: List[str]
docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str
) -> dict:
container = self._get_docker_args(
docker_args,
@@ -677,7 +680,6 @@ class K8sIntegration(Worker):
)
# Set worker ID
task_worker_id = f"{self.worker_id}:{task_id}"
env_vars = container.get('env', [])
found_worker_id = False
for entry in env_vars:
@@ -734,7 +736,7 @@ class K8sIntegration(Worker):
queue,
task_id,
namespace,
template=None,
template,
pod_number=None
):
if "apiVersion" not in template:
@@ -774,13 +776,16 @@ class K8sIntegration(Worker):
containers = spec.setdefault('containers', [])
spec.setdefault('restartPolicy', 'Never')
task_worker_id = self.get_task_worker_id(template, task_id, name, namespace, queue)
container = self._create_template_container(
pod_name=name,
task_id=task_id,
docker_image=docker_image,
docker_args=docker_args,
docker_bash=docker_bash,
clearml_conf_create_script=clearml_conf_create_script
clearml_conf_create_script=clearml_conf_create_script,
task_worker_id=task_worker_id
)
if containers:
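
Note: the change above moves the worker-ID string out of _create_template_container and into a dedicated get_task_worker_id hook (receiving the template, task ID, pod name, namespace and queue), whose result is then passed into the container template. A minimal sketch of how a subclass could override the new hook; the subclass name, the namespace-prefixed ID scheme and the import path are assumptions for illustration, not part of this commit:

# Hypothetical subclass illustrating the new get_task_worker_id hook.
# The import path below is an assumption; adjust it to wherever
# K8sIntegration lives in your clearml-agent installation.
from clearml_agent.glue.k8s import K8sIntegration

class NamespacedK8sIntegration(K8sIntegration):
    def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
        # The default implementation returns f"{self.worker_id}:{task_id}";
        # here the pod namespace is embedded as well, so worker IDs stay
        # unique when the same queue is served from several namespaces.
        return f"{self.worker_id}:{namespace}:{task_id}"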


@@ -15,10 +15,8 @@ from __future__ import print_function
from __future__ import unicode_literals
import json
import os.path
import platform
import sys
import time
from datetime import datetime
from typing import Optional
@@ -164,13 +162,14 @@ class GPUStatCollection(object):
_device_count = None
_gpu_device_info = {}
def __init__(self, gpu_list, driver_version=None):
def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
self.gpus = gpu_list
# attach additional system information
self.hostname = platform.node()
self.query_time = datetime.now()
self.driver_version = driver_version
self.driver_cuda_version = driver_cuda_version
@staticmethod
def clean_processes():
@@ -181,10 +180,11 @@ class GPUStatCollection(object):
@staticmethod
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
"""Query the information of all the GPUs on local machine"""
initialized = False
if not GPUStatCollection._initialized:
N.nvmlInit()
GPUStatCollection._initialized = True
initialized = True
def _decode(b):
if isinstance(b, bytes):
@@ -200,10 +200,10 @@ class GPUStatCollection(object):
if nv_process.pid not in GPUStatCollection.global_processes:
GPUStatCollection.global_processes[nv_process.pid] = \
psutil.Process(pid=nv_process.pid)
ps_process = GPUStatCollection.global_processes[nv_process.pid]
process['pid'] = nv_process.pid
# noinspection PyBroadException
try:
# ps_process = GPUStatCollection.global_processes[nv_process.pid]
# we do not actually use these, so no point in collecting them
# process['username'] = ps_process.username()
# # cmdline returns full path;
@@ -286,11 +286,11 @@ class GPUStatCollection(object):
for nv_process in nv_comp_processes + nv_graphics_processes:
try:
process = get_process_info(nv_process)
processes.append(process)
except psutil.NoSuchProcess:
# TODO: add some reminder for NVML broken context
# e.g. nvidia-smi reset or reboot the system
pass
process = None
processes.append(process)
# we do not actually use these, so no point in collecting them
# # TODO: Do not block if full process info is not requested
@@ -314,7 +314,7 @@ class GPUStatCollection(object):
# Convert bytes into MBytes
'memory.used': memory.used // MB if memory else None,
'memory.total': memory.total // MB if memory else None,
'processes': processes,
'processes': None if (processes and all(p is None for p in processes)) else processes
}
if per_process_stats:
GPUStatCollection.clean_processes()
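
Note: two related changes in the hunks above. A process that disappears between the NVML query and the psutil lookup is now recorded as a None placeholder instead of being dropped, and a per-GPU process list that ends up holding only placeholders is reported as None rather than a list. A standalone sketch of that collapse rule (the helper name is illustrative):

# Illustration of the placeholder-collapse rule: a process list that
# contains only None entries is reported as None instead of a list.
def collapse_processes(processes):
    if processes and all(p is None for p in processes):
        return None
    return processes

assert collapse_processes([None, None]) is None
assert collapse_processes([]) == []
assert collapse_processes([{"pid": 1234}, None]) == [{"pid": 1234}, None]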
@@ -337,15 +337,32 @@ class GPUStatCollection(object):
driver_version = _decode(N.nvmlSystemGetDriverVersion())
except N.NVMLError:
driver_version = None # N/A
# noinspection PyBroadException
try:
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
except BaseException:
# noinspection PyBroadException
try:
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
except BaseException:
cuda_driver_version = None
if cuda_driver_version:
try:
cuda_driver_version = '{}.{}'.format(
int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
except (ValueError, TypeError):
pass
else:
driver_version = None
cuda_driver_version = None
# no need to shutdown:
if shutdown:
if shutdown and initialized:
N.nvmlShutdown()
GPUStatCollection._initialized = False
return GPUStatCollection(gpu_list, driver_version=driver_version)
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
def __len__(self):
return len(self.gpus)
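
Note: the new driver_cuda_version field is read via nvmlSystemGetCudaDriverVersion (falling back to nvmlSystemGetCudaDriverVersion_v2), which reports the CUDA version supported by the installed driver as a single integer encoded as major * 1000 + minor * 10; the hunk above formats it as a major.minor string, and only calls nvmlShutdown when this query itself initialized NVML. A standalone sketch of the version conversion (the helper name is illustrative):

# Illustration of the CUDA driver version formatting used above:
# NVML encodes the version as an integer, e.g. 12020 -> "12.2".
def format_cuda_driver_version(raw):
    try:
        value = int(raw)
        return "{}.{}".format(value // 1000, (value % 1000) // 10)
    except (ValueError, TypeError):
        # Keep the raw value if it cannot be parsed, as the hunk above does
        return raw

assert format_cuda_driver_version("12020") == "12.2"
assert format_cuda_driver_version(11040) == "11.4"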

File diff suppressed because it is too large


@@ -266,8 +266,10 @@ class ResourceMonitor(object):
gpu_stat = self._gpustat.new_query()
for i, g in enumerate(gpu_stat.gpus):
# only monitor the active gpu's, if none were selected, monitor everything
if self._active_gpus and str(i) not in self._active_gpus:
continue
if self._active_gpus:
uuid = getattr(g, "uuid", None)
if str(i) not in self._active_gpus and (not uuid or uuid not in self._active_gpus):
continue
stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]
stats["gpu_mem_usage_{:d}".format(i)] = (