From 553c72e06a78d274cfd06b1b7a63fafc4ce0603d Mon Sep 17 00:00:00 2001 From: clearml <> Date: Sun, 22 Jun 2025 22:37:27 +0300 Subject: [PATCH] Support NVIDIA_VISIBLE_DEVICES using volume mounts --- clearml_agent/helper/resource_monitor.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/clearml_agent/helper/resource_monitor.py b/clearml_agent/helper/resource_monitor.py index fa94682..50c68f3 100644 --- a/clearml_agent/helper/resource_monitor.py +++ b/clearml_agent/helper/resource_monitor.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, division import logging import re +import os import shlex from collections import deque from itertools import starmap @@ -112,7 +113,15 @@ class ResourceMonitor(object): active_gpus = Session.get_nvidia_visible_env() # None means no filtering, report all gpus if active_gpus and active_gpus != "all": - self._active_gpus = [g.strip() for g in str(active_gpus).split(',')] + if os.path.isdir(active_gpus): + try: + self._active_gpus = os.listdir(active_gpus) + except OSError as e: + log.warning( + "Failed listing {}: {}".format(active_gpus, e) + ) + else: + self._active_gpus = [g.strip() for g in active_gpus.split(",")] except Exception: pass self._cluster_report_interval_sec = int(session.config.get(