mirror of
https://github.com/clearml/clearml-agent
synced 2025-05-09 14:21:09 +00:00
Update GPU stats and pynvml support
This commit is contained in:
parent
faa97b6cc2
commit
55b065a114
@ -665,9 +665,12 @@ class K8sIntegration(Worker):
|
|||||||
return {target: results} if results else {}
|
return {target: results} if results else {}
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def get_task_worker_id(self, template, task_id, pod_name, namespace, queue):
|
||||||
|
return f"{self.worker_id}:{task_id}"
|
||||||
|
|
||||||
def _create_template_container(
|
def _create_template_container(
|
||||||
self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
|
self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
|
||||||
docker_bash: str, clearml_conf_create_script: List[str]
|
docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str
|
||||||
) -> dict:
|
) -> dict:
|
||||||
container = self._get_docker_args(
|
container = self._get_docker_args(
|
||||||
docker_args,
|
docker_args,
|
||||||
@ -677,7 +680,6 @@ class K8sIntegration(Worker):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Set worker ID
|
# Set worker ID
|
||||||
task_worker_id = f"{self.worker_id}:{task_id}"
|
|
||||||
env_vars = container.get('env', [])
|
env_vars = container.get('env', [])
|
||||||
found_worker_id = False
|
found_worker_id = False
|
||||||
for entry in env_vars:
|
for entry in env_vars:
|
||||||
@ -734,7 +736,7 @@ class K8sIntegration(Worker):
|
|||||||
queue,
|
queue,
|
||||||
task_id,
|
task_id,
|
||||||
namespace,
|
namespace,
|
||||||
template=None,
|
template,
|
||||||
pod_number=None
|
pod_number=None
|
||||||
):
|
):
|
||||||
if "apiVersion" not in template:
|
if "apiVersion" not in template:
|
||||||
@ -774,13 +776,16 @@ class K8sIntegration(Worker):
|
|||||||
containers = spec.setdefault('containers', [])
|
containers = spec.setdefault('containers', [])
|
||||||
spec.setdefault('restartPolicy', 'Never')
|
spec.setdefault('restartPolicy', 'Never')
|
||||||
|
|
||||||
|
task_worker_id = self.get_task_worker_id(template, task_id, name, namespace, queue)
|
||||||
|
|
||||||
container = self._create_template_container(
|
container = self._create_template_container(
|
||||||
pod_name=name,
|
pod_name=name,
|
||||||
task_id=task_id,
|
task_id=task_id,
|
||||||
docker_image=docker_image,
|
docker_image=docker_image,
|
||||||
docker_args=docker_args,
|
docker_args=docker_args,
|
||||||
docker_bash=docker_bash,
|
docker_bash=docker_bash,
|
||||||
clearml_conf_create_script=clearml_conf_create_script
|
clearml_conf_create_script=clearml_conf_create_script,
|
||||||
|
task_worker_id=task_worker_id
|
||||||
)
|
)
|
||||||
|
|
||||||
if containers:
|
if containers:
|
||||||
|
@ -15,10 +15,8 @@ from __future__ import print_function
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os.path
|
|
||||||
import platform
|
import platform
|
||||||
import sys
|
import sys
|
||||||
import time
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@ -164,13 +162,14 @@ class GPUStatCollection(object):
|
|||||||
_device_count = None
|
_device_count = None
|
||||||
_gpu_device_info = {}
|
_gpu_device_info = {}
|
||||||
|
|
||||||
def __init__(self, gpu_list, driver_version=None):
|
def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
|
||||||
self.gpus = gpu_list
|
self.gpus = gpu_list
|
||||||
|
|
||||||
# attach additional system information
|
# attach additional system information
|
||||||
self.hostname = platform.node()
|
self.hostname = platform.node()
|
||||||
self.query_time = datetime.now()
|
self.query_time = datetime.now()
|
||||||
self.driver_version = driver_version
|
self.driver_version = driver_version
|
||||||
|
self.driver_cuda_version = driver_cuda_version
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_processes():
|
def clean_processes():
|
||||||
@ -181,10 +180,11 @@ class GPUStatCollection(object):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||||
"""Query the information of all the GPUs on local machine"""
|
"""Query the information of all the GPUs on local machine"""
|
||||||
|
initialized = False
|
||||||
if not GPUStatCollection._initialized:
|
if not GPUStatCollection._initialized:
|
||||||
N.nvmlInit()
|
N.nvmlInit()
|
||||||
GPUStatCollection._initialized = True
|
GPUStatCollection._initialized = True
|
||||||
|
initialized = True
|
||||||
|
|
||||||
def _decode(b):
|
def _decode(b):
|
||||||
if isinstance(b, bytes):
|
if isinstance(b, bytes):
|
||||||
@ -200,10 +200,10 @@ class GPUStatCollection(object):
|
|||||||
if nv_process.pid not in GPUStatCollection.global_processes:
|
if nv_process.pid not in GPUStatCollection.global_processes:
|
||||||
GPUStatCollection.global_processes[nv_process.pid] = \
|
GPUStatCollection.global_processes[nv_process.pid] = \
|
||||||
psutil.Process(pid=nv_process.pid)
|
psutil.Process(pid=nv_process.pid)
|
||||||
ps_process = GPUStatCollection.global_processes[nv_process.pid]
|
|
||||||
process['pid'] = nv_process.pid
|
process['pid'] = nv_process.pid
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
|
# ps_process = GPUStatCollection.global_processes[nv_process.pid]
|
||||||
# we do not actually use these, so no point in collecting them
|
# we do not actually use these, so no point in collecting them
|
||||||
# process['username'] = ps_process.username()
|
# process['username'] = ps_process.username()
|
||||||
# # cmdline returns full path;
|
# # cmdline returns full path;
|
||||||
@ -286,11 +286,11 @@ class GPUStatCollection(object):
|
|||||||
for nv_process in nv_comp_processes + nv_graphics_processes:
|
for nv_process in nv_comp_processes + nv_graphics_processes:
|
||||||
try:
|
try:
|
||||||
process = get_process_info(nv_process)
|
process = get_process_info(nv_process)
|
||||||
processes.append(process)
|
|
||||||
except psutil.NoSuchProcess:
|
except psutil.NoSuchProcess:
|
||||||
# TODO: add some reminder for NVML broken context
|
# TODO: add some reminder for NVML broken context
|
||||||
# e.g. nvidia-smi reset or reboot the system
|
# e.g. nvidia-smi reset or reboot the system
|
||||||
pass
|
process = None
|
||||||
|
processes.append(process)
|
||||||
|
|
||||||
# we do not actually use these, so no point in collecting them
|
# we do not actually use these, so no point in collecting them
|
||||||
# # TODO: Do not block if full process info is not requested
|
# # TODO: Do not block if full process info is not requested
|
||||||
@ -314,7 +314,7 @@ class GPUStatCollection(object):
|
|||||||
# Convert bytes into MBytes
|
# Convert bytes into MBytes
|
||||||
'memory.used': memory.used // MB if memory else None,
|
'memory.used': memory.used // MB if memory else None,
|
||||||
'memory.total': memory.total // MB if memory else None,
|
'memory.total': memory.total // MB if memory else None,
|
||||||
'processes': processes,
|
'processes': None if (processes and all(p is None for p in processes)) else processes
|
||||||
}
|
}
|
||||||
if per_process_stats:
|
if per_process_stats:
|
||||||
GPUStatCollection.clean_processes()
|
GPUStatCollection.clean_processes()
|
||||||
@ -337,15 +337,32 @@ class GPUStatCollection(object):
|
|||||||
driver_version = _decode(N.nvmlSystemGetDriverVersion())
|
driver_version = _decode(N.nvmlSystemGetDriverVersion())
|
||||||
except N.NVMLError:
|
except N.NVMLError:
|
||||||
driver_version = None # N/A
|
driver_version = None # N/A
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
|
||||||
|
except BaseException:
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
|
||||||
|
except BaseException:
|
||||||
|
cuda_driver_version = None
|
||||||
|
if cuda_driver_version:
|
||||||
|
try:
|
||||||
|
cuda_driver_version = '{}.{}'.format(
|
||||||
|
int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
driver_version = None
|
driver_version = None
|
||||||
|
cuda_driver_version = None
|
||||||
|
|
||||||
# no need to shutdown:
|
# no need to shutdown:
|
||||||
if shutdown:
|
if shutdown and initialized:
|
||||||
N.nvmlShutdown()
|
N.nvmlShutdown()
|
||||||
GPUStatCollection._initialized = False
|
GPUStatCollection._initialized = False
|
||||||
|
|
||||||
return GPUStatCollection(gpu_list, driver_version=driver_version)
|
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.gpus)
|
return len(self.gpus)
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -266,7 +266,9 @@ class ResourceMonitor(object):
|
|||||||
gpu_stat = self._gpustat.new_query()
|
gpu_stat = self._gpustat.new_query()
|
||||||
for i, g in enumerate(gpu_stat.gpus):
|
for i, g in enumerate(gpu_stat.gpus):
|
||||||
# only monitor the active gpu's, if none were selected, monitor everything
|
# only monitor the active gpu's, if none were selected, monitor everything
|
||||||
if self._active_gpus and str(i) not in self._active_gpus:
|
if self._active_gpus:
|
||||||
|
uuid = getattr(g, "uuid", None)
|
||||||
|
if str(i) not in self._active_gpus and (not uuid or uuid not in self._active_gpus):
|
||||||
continue
|
continue
|
||||||
stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
|
stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
|
||||||
stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]
|
stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]
|
||||||
|
Loading…
Reference in New Issue
Block a user