Mirror of https://github.com/clearml/clearml (synced 2025-06-26 18:16:07 +00:00)
Fix GPU memory used reports 0 when memory can not be queried per process

parent 0442579e23
commit c5675733a0
@@ -285,11 +285,11 @@ class GPUStatCollection(object):
                 for nv_process in nv_comp_processes + nv_graphics_processes:
                     try:
                         process = get_process_info(nv_process)
-                        processes.append(process)
                     except psutil.NoSuchProcess:
                         # TODO: add some reminder for NVML broken context
                         # e.g. nvidia-smi reset  or  reboot the system
-                        pass
+                        process = None
+                    processes.append(process)
 
                 # we do not actually use these, so no point in collecting them
                 # # TODO: Do not block if full process info is not requested
@@ -313,7 +313,7 @@ class GPUStatCollection(object):
                 # Convert bytes into MBytes
                 'memory.used': memory.used // MB if memory else None,
                 'memory.total': memory.total // MB if memory else None,
-                'processes': processes,
+                'processes': None if (processes and all(p is None for p in processes)) else processes
             }
             if per_process_stats:
                 GPUStatCollection.clean_processes()
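The two hunks above change what the collector reports: a process that vanishes between the NVML enumeration and the psutil lookup is now recorded as None instead of being dropped, and if every entry came back None the GPU's 'processes' field itself becomes None. A caller can then tell "no processes on this GPU" (empty list) apart from "per-process memory cannot be queried" (None). A small illustrative helper, not part of ClearML, showing how a consumer might read the field:

def describe_processes(processes):
    # Illustrative only: interpret the reworked 'processes' field.
    if processes is None:
        return "per-process GPU memory cannot be queried on this system"
    if not processes:
        return "no processes are using this GPU"
    resolved = [p for p in processes if p is not None]
    return "%d NVML processes, %d resolved via psutil" % (len(processes), len(resolved))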
@@ -43,6 +43,8 @@ class ResourceMonitor(BackgroundMonitor):
         self._process_info = psutil.Process() if report_mem_used_per_process else None
         self._last_process_pool = {}
         self._last_process_id_list = []
+        self._gpu_memory_per_process = True
+
         if not self._gpustat:
             self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
         else:  # if running_remotely():
@@ -311,16 +313,28 @@ class ResourceMonitor(BackgroundMonitor):
         # update mem used by our process and sub processes
         if self._process_info and (not self._last_process_pool.get('gpu') or
                                    (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
-            gpu_stat = self._gpustat.new_query(per_process_stats=True)
             gpu_mem = {}
-            for i, g in enumerate(gpu_stat.gpus):
-                # only monitor the active gpu's, if none were selected, monitor everything
-                if self._active_gpus and i not in self._active_gpus:
-                    continue
-                gpu_mem[i] = 0
-                for p in g.processes:
-                    if p['pid'] in self._last_process_id_list:
-                        gpu_mem[i] += p.get('gpu_memory_usage', 0)
+            if self._gpu_memory_per_process:
+                # noinspection PyBroadException
+                try:
+                    gpu_stat = self._gpustat.new_query(per_process_stats=True)
+                except Exception:
+                    gpu_stat = self._gpustat.new_query(per_process_stats=False)
+
+                for i, g in enumerate(gpu_stat.gpus):
+                    # if processes is None, that means we can't query GPU memory usage per proces, so we can stop
+                    if g.processes is None:
+                        self._gpu_memory_per_process = False
+                        break
+                    # only monitor the active gpu's, if none were selected, monitor everything
+                    if self._active_gpus and i not in self._active_gpus:
+                        continue
+
+                    gpu_mem[i] = 0
+                    for p in g.processes:
+                        if p is not None and p['pid'] in self._last_process_id_list:
+                            gpu_mem[i] += p.get('gpu_memory_usage', 0)
+
             self._last_process_pool['gpu'] = time(), gpu_mem
         else:
             # if we do no need to update the memory usage, run global query
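On the monitor side, only memory attributed to the task's own process tree (the pids kept in self._last_process_id_list) is summed, and entries that came back as None are now skipped. As a hedged sketch of how such a pid list can be assembled with psutil (ClearML maintains its own list elsewhere and may do this differently):

import psutil

def current_process_tree_pids():
    # Hypothetical helper: this process plus all of its descendants.
    me = psutil.Process()
    return [me.pid] + [child.pid for child in me.children(recursive=True)]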