Update GPU stats and pynvml support

This commit is contained in:
parent faa97b6cc2
commit 55b065a114
@@ -665,9 +665,12 @@ class K8sIntegration(Worker):
             return {target: results} if results else {}
         return results
 
+    def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
+        return f"{self.worker_id}:{task_id}"
+
     def _create_template_container(
         self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
-        docker_bash: str, clearml_conf_create_script: List[str]
+        docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str
     ) -> dict:
         container = self._get_docker_args(
             docker_args,
@@ -677,7 +680,6 @@ class K8sIntegration(Worker):
         )
 
         # Set worker ID
-        task_worker_id = f"{self.worker_id}:{task_id}"
         env_vars = container.get('env', [])
         found_worker_id = False
         for entry in env_vars:
@@ -734,7 +736,7 @@ class K8sIntegration(Worker):
         queue,
         task_id,
         namespace,
-        template=None,
+        template,
         pod_number=None
     ):
         if "apiVersion" not in template:
@@ -774,13 +776,16 @@ class K8sIntegration(Worker):
         containers = spec.setdefault('containers', [])
         spec.setdefault('restartPolicy', 'Never')
 
+        task_worker_id = self.get_task_worker_id(template, task_id, name, namespace, queue)
+
     container = self._create_template_container(
             pod_name=name,
             task_id=task_id,
             docker_image=docker_image,
             docker_args=docker_args,
             docker_bash=docker_bash,
-            clearml_conf_create_script=clearml_conf_create_script
+            clearml_conf_create_script=clearml_conf_create_script,
+            task_worker_id=task_worker_id
         )
 
         if containers:
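Note: the pod's worker ID is now produced by the new get_task_worker_id() hook rather than being hard-coded inside _create_template_container(), so a deployment can change the naming scheme without re-implementing the container-building logic. A minimal sketch of such an override, assuming the usual K8sIntegration import path; the subclass name and the queue-based suffix are purely illustrative:

    from clearml_agent.glue.k8s import K8sIntegration  # assumed import path

    class QueueAwareK8sGlue(K8sIntegration):  # hypothetical subclass
        def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
            # the default implementation above returns f"{self.worker_id}:{task_id}"
            return f"{self.worker_id}:{queue}:{task_id}"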
@@ -15,10 +15,8 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 import json
-import os.path
 import platform
-import sys
 import time
 from datetime import datetime
 from typing import Optional
 
@@ -164,13 +162,14 @@ class GPUStatCollection(object):
     _device_count = None
     _gpu_device_info = {}
 
-    def __init__(self, gpu_list, driver_version=None):
+    def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
         self.gpus = gpu_list
 
         # attach additional system information
         self.hostname = platform.node()
         self.query_time = datetime.now()
         self.driver_version = driver_version
+        self.driver_cuda_version = driver_cuda_version
 
     @staticmethod
     def clean_processes():
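Note: with the extra constructor argument, the CUDA driver version reported by NVML is carried on the collection alongside driver_version. A hedged usage sketch; the module path is assumed and the printed values are examples only:

    from clearml_agent.helper.gpu.gpustat import GPUStatCollection  # assumed module path

    stats = GPUStatCollection.new_query(get_driver_info=True)
    print(stats.driver_version)       # e.g. "535.104.05"
    print(stats.driver_cuda_version)  # e.g. "12.2"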
@@ -181,10 +180,11 @@ class GPUStatCollection(object):
     @staticmethod
     def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
         """Query the information of all the GPUs on local machine"""
-
+        initialized = False
         if not GPUStatCollection._initialized:
             N.nvmlInit()
             GPUStatCollection._initialized = True
+            initialized = True
 
         def _decode(b):
             if isinstance(b, bytes):
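Note: the local initialized flag records whether this particular call performed nvmlInit(), so the shutdown branch later in the function only calls nvmlShutdown() for a handle it created itself. Illustrative consequence (not part of the diff):

    GPUStatCollection.new_query()               # first call: runs nvmlInit() and keeps NVML initialized
    GPUStatCollection.new_query(shutdown=True)  # did not init NVML itself, so it no longer shuts it down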
@@ -200,10 +200,10 @@ class GPUStatCollection(object):
                 if nv_process.pid not in GPUStatCollection.global_processes:
                     GPUStatCollection.global_processes[nv_process.pid] = \
                         psutil.Process(pid=nv_process.pid)
-                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                 process['pid'] = nv_process.pid
                 # noinspection PyBroadException
                 try:
+                    # ps_process = GPUStatCollection.global_processes[nv_process.pid]
                     # we do not actually use these, so no point in collecting them
                     # process['username'] = ps_process.username()
                     # # cmdline returns full path;
@@ -286,11 +286,11 @@ class GPUStatCollection(object):
                 for nv_process in nv_comp_processes + nv_graphics_processes:
                     try:
                         process = get_process_info(nv_process)
-                        processes.append(process)
                     except psutil.NoSuchProcess:
                         # TODO: add some reminder for NVML broken context
                         # e.g. nvidia-smi reset  or  reboot the system
-                        pass
+                        process = None
+                    processes.append(process)
 
                 # we do not actually use these, so no point in collecting them
                 # # TODO: Do not block if full process info is not requested
@@ -314,7 +314,7 @@ class GPUStatCollection(object):
                 # Convert bytes into MBytes
                 'memory.used': memory.used // MB if memory else None,
                 'memory.total': memory.total // MB if memory else None,
-                'processes': processes,
+                'processes': None if (processes and all(p is None for p in processes)) else processes
             }
             if per_process_stats:
                 GPUStatCollection.clean_processes()
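Note: because failed psutil lookups now leave a None placeholder per process, the 'processes' field can tell apart "no GPU processes" (empty list) from "processes present but none inspectable" (collapsed to None). A worked example of the normalization expression with made-up inputs:

    processes = [None, None]           # two NVML processes, neither resolvable via psutil
    print(None if (processes and all(p is None for p in processes)) else processes)  # -> None
    processes = [{'pid': 1234}, None]  # at least one resolved process: list is kept as-is
    print(None if (processes and all(p is None for p in processes)) else processes)  # -> [{'pid': 1234}, None]
    processes = []                     # no GPU processes at all
    print(None if (processes and all(p is None for p in processes)) else processes)  # -> []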
@@ -337,15 +337,32 @@ class GPUStatCollection(object):
                 driver_version = _decode(N.nvmlSystemGetDriverVersion())
             except N.NVMLError:
                 driver_version = None  # N/A
+
+            # noinspection PyBroadException
+            try:
+                cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
+            except BaseException:
+                # noinspection PyBroadException
+                try:
+                    cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
+                except BaseException:
+                    cuda_driver_version = None
+            if cuda_driver_version:
+                try:
+                    cuda_driver_version = '{}.{}'.format(
+                        int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
+                except (ValueError, TypeError):
+                    pass
         else:
             driver_version = None
+            cuda_driver_version = None
 
         # no need to shutdown:
-        if shutdown:
+        if shutdown and initialized:
             N.nvmlShutdown()
             GPUStatCollection._initialized = False
 
-        return GPUStatCollection(gpu_list, driver_version=driver_version)
+        return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
 
     def __len__(self):
         return len(self.gpus)
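Note: nvmlSystemGetCudaDriverVersion() reports the CUDA version encoded as major*1000 + minor*10, which the new code formats as "major.minor". A small standalone check of that arithmetic; the 12040 input is an example value:

    cuda_driver_version = "12040"  # example raw value, corresponds to CUDA 12.4
    print('{}.{}'.format(int(cuda_driver_version) // 1000, (int(cuda_driver_version) % 1000) // 10))  # -> 12.4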
(File diff suppressed because it is too large)
@@ -266,8 +266,10 @@ class ResourceMonitor(object):
                 gpu_stat = self._gpustat.new_query()
                 for i, g in enumerate(gpu_stat.gpus):
                     # only monitor the active gpu's, if none were selected, monitor everything
-                    if self._active_gpus and str(i) not in self._active_gpus:
-                        continue
+                    if self._active_gpus:
+                        uuid = getattr(g, "uuid", None)
+                        if str(i) not in self._active_gpus and (not uuid or uuid not in self._active_gpus):
+                            continue
                     stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
                     stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]
                     stats["gpu_mem_usage_{:d}".format(i)] = (
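Note: a GPU is now monitored if either its index or its UUID appears in the active-GPU list, so selections expressed as NVML UUIDs are no longer skipped. A minimal sketch of the matching logic with made-up values:

    class FakeGpu:  # stand-in for one gpustat entry; the UUID is made up
        uuid = "GPU-8f6c0aaa-1111-2222-3333-444444444444"

    active_gpus = ["GPU-8f6c0aaa-1111-2222-3333-444444444444"]  # selection by UUID instead of index
    i, g = 0, FakeGpu()

    uuid = getattr(g, "uuid", None)
    skip = str(i) not in active_gpus and (not uuid or uuid not in active_gpus)
    print(skip)  # -> False: matched by UUID, so this GPU is sampled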