mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Improve resource monitor
This commit is contained in:
parent
a4ebf8293d
commit
6a4fcda1bf
@ -1396,7 +1396,7 @@ class Worker(ServiceCommandSection):
|
|||||||
def _setup_dynamic_gpus(self, gpu_queues):
|
def _setup_dynamic_gpus(self, gpu_queues):
|
||||||
available_gpus = self.get_runtime_properties()
|
available_gpus = self.get_runtime_properties()
|
||||||
if available_gpus is None:
|
if available_gpus is None:
|
||||||
raise ValueError("Dynamic GPU allocation is not supported by the ClearML-server")
|
raise ValueError("Dynamic GPU allocation is not supported by your ClearML-server")
|
||||||
available_gpus = [prop["value"] for prop in available_gpus if prop["key"] == 'available_gpus']
|
available_gpus = [prop["value"] for prop in available_gpus if prop["key"] == 'available_gpus']
|
||||||
if available_gpus:
|
if available_gpus:
|
||||||
gpus = []
|
gpus = []
|
||||||
@ -1413,7 +1413,9 @@ class Worker(ServiceCommandSection):
|
|||||||
|
|
||||||
if not self.set_runtime_properties(
|
if not self.set_runtime_properties(
|
||||||
key='available_gpus', value=','.join(str(g) for g in available_gpus)):
|
key='available_gpus', value=','.join(str(g) for g in available_gpus)):
|
||||||
raise ValueError("Dynamic GPU allocation is not supported by the ClearML-server")
|
raise ValueError("Dynamic GPU allocation is not supported by your ClearML-server")
|
||||||
|
|
||||||
|
self.cluster_report_monitor(available_gpus=available_gpus, gpu_queues=gpu_queues)
|
||||||
|
|
||||||
return available_gpus, gpu_queues
|
return available_gpus, gpu_queues
|
||||||
|
|
||||||
@ -1809,7 +1811,7 @@ class Worker(ServiceCommandSection):
|
|||||||
available_gpus = self._dynamic_gpu_get_available(gpu_indexes)
|
available_gpus = self._dynamic_gpu_get_available(gpu_indexes)
|
||||||
if not self.set_runtime_properties(
|
if not self.set_runtime_properties(
|
||||||
key='available_gpus', value=','.join(str(g) for g in available_gpus)):
|
key='available_gpus', value=','.join(str(g) for g in available_gpus)):
|
||||||
raise ValueError("Dynamic GPU allocation is not supported by the ClearML-server")
|
raise ValueError("Dynamic GPU allocation is not supported by your ClearML-server")
|
||||||
|
|
||||||
def report_monitor(self, report):
|
def report_monitor(self, report):
|
||||||
if not self.monitor:
|
if not self.monitor:
|
||||||
@ -1818,6 +1820,13 @@ class Worker(ServiceCommandSection):
|
|||||||
self.monitor.set_report(report)
|
self.monitor.set_report(report)
|
||||||
self.monitor.send_report()
|
self.monitor.send_report()
|
||||||
|
|
||||||
|
def cluster_report_monitor(self, available_gpus, gpu_queues):
|
||||||
|
if not self.monitor:
|
||||||
|
self.new_monitor()
|
||||||
|
self.monitor.setup_cluster_report(
|
||||||
|
worker_id=self.worker_id, available_gpus=available_gpus, gpu_queues=gpu_queues
|
||||||
|
)
|
||||||
|
|
||||||
def stop_monitor(self):
|
def stop_monitor(self):
|
||||||
if self.monitor:
|
if self.monitor:
|
||||||
self.monitor.stop()
|
self.monitor.stop()
|
||||||
|
@ -7,7 +7,7 @@ from collections import deque
|
|||||||
from itertools import starmap
|
from itertools import starmap
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from time import time
|
from time import time
|
||||||
from typing import Text, Sequence
|
from typing import Text, Sequence, List, Dict, Optional
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
import psutil
|
import psutil
|
||||||
@ -54,6 +54,14 @@ class ResourceMonitor(object):
|
|||||||
if value is not None
|
if value is not None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@attr.s
|
||||||
|
class ClusterReport:
|
||||||
|
cluster_key = attr.ib(type=str)
|
||||||
|
max_gpus = attr.ib(type=int, default=None)
|
||||||
|
max_workers = attr.ib(type=int, default=None)
|
||||||
|
max_cpus = attr.ib(type=int, default=None)
|
||||||
|
resource_groups = attr.ib(type=Sequence[str], factory=list)
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
session, # type: Session
|
session, # type: Session
|
||||||
@ -61,7 +69,7 @@ class ResourceMonitor(object):
|
|||||||
sample_frequency_per_sec=2.0,
|
sample_frequency_per_sec=2.0,
|
||||||
report_frequency_sec=30.0,
|
report_frequency_sec=30.0,
|
||||||
first_report_sec=None,
|
first_report_sec=None,
|
||||||
worker_tags=None,
|
worker_tags=None
|
||||||
):
|
):
|
||||||
self.session = session
|
self.session = session
|
||||||
self.queue = deque(maxlen=1)
|
self.queue = deque(maxlen=1)
|
||||||
@ -92,6 +100,7 @@ class ResourceMonitor(object):
|
|||||||
else:
|
else:
|
||||||
# None means no filtering, report all gpus
|
# None means no filtering, report all gpus
|
||||||
self._active_gpus = None
|
self._active_gpus = None
|
||||||
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
active_gpus = Session.get_nvidia_visible_env()
|
active_gpus = Session.get_nvidia_visible_env()
|
||||||
# None means no filtering, report all gpus
|
# None means no filtering, report all gpus
|
||||||
@ -99,6 +108,10 @@ class ResourceMonitor(object):
|
|||||||
self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
|
self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
self._cluster_report_interval_sec = int(session.config.get(
|
||||||
|
"agent.resource_monitoring.cluster_report_interval_sec", 60
|
||||||
|
))
|
||||||
|
self._cluster_report = None
|
||||||
|
|
||||||
def set_report(self, report):
|
def set_report(self, report):
|
||||||
# type: (ResourceMonitor.StatusReport) -> ()
|
# type: (ResourceMonitor.StatusReport) -> ()
|
||||||
@ -130,6 +143,7 @@ class ResourceMonitor(object):
|
|||||||
)
|
)
|
||||||
log.debug("sending report: %s", report)
|
log.debug("sending report: %s", report)
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
self.session.get(service="workers", action="status_report", **report)
|
self.session.get(service="workers", action="status_report", **report)
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -137,7 +151,76 @@ class ResourceMonitor(object):
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def send_cluster_report(self) -> bool:
|
||||||
|
if not self.session.feature_set == "basic":
|
||||||
|
return False
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
properties = {
|
||||||
|
"max_cpus": self._cluster_report.max_cpus,
|
||||||
|
"max_gpus": self._cluster_report.max_gpus,
|
||||||
|
"max_workers": self._cluster_report.max_workers,
|
||||||
|
}
|
||||||
|
payload = {
|
||||||
|
"key": self._cluster_report.cluster_key,
|
||||||
|
"timestamp": int(time() * 1000),
|
||||||
|
"timeout": int(self._cluster_report_interval_sec * 2),
|
||||||
|
# "resource_groups": self._cluster_report.resource_groups, # yet to be supported
|
||||||
|
"properties": {k: v for k, v in properties.items() if v is not None},
|
||||||
|
}
|
||||||
|
self.session.post(service="workers", action="cluster_report", **payload)
|
||||||
|
except Exception as ex:
|
||||||
|
log.warning("Failed sending cluster report: %s", ex)
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
def setup_cluster_report(self, available_gpus, gpu_queues, worker_id=None, cluster_key=None, resource_groups=None):
|
||||||
|
# type: (List[int], Dict[str, int], Optional[str], Optional[str], Optional[List[str]]) -> ()
|
||||||
|
"""
|
||||||
|
Set up a cluster report for the enterprise server dashboard feature.
|
||||||
|
If a worker_id is provided, cluster_key and resource_groups are inferred from it.
|
||||||
|
"""
|
||||||
|
if self.session.feature_set == "basic":
|
||||||
|
return
|
||||||
|
|
||||||
|
if not worker_id and not cluster_key:
|
||||||
|
print("Error: cannot set up dashboard reporting - worker_id or cluster key are required")
|
||||||
|
return
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
if not cluster_key:
|
||||||
|
worker_id_parts = worker_id.split(":")
|
||||||
|
if len(worker_id_parts) < 3:
|
||||||
|
cluster_key = self.session.config.get("agent.resource_dashboard.default_cluster_name", "onprem")
|
||||||
|
resource_group = ":".join((cluster_key, worker_id_parts[0]))
|
||||||
|
print(
|
||||||
|
'WARNING: your worker ID "{}" is not suitable for proper resource dashboard reporting, please '
|
||||||
|
'set up agent.worker_name to be at least two colon-separated parts (i.e. "<category>:<name>"). '
|
||||||
|
'Using "{}" as the resource dashboard category and "{}" as the resource group.'.format(
|
||||||
|
worker_id, cluster_key, resource_group
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cluster_key = worker_id_parts[0]
|
||||||
|
resource_group = ":".join((worker_id_parts[:2]))
|
||||||
|
|
||||||
|
resource_groups = [resource_group]
|
||||||
|
|
||||||
|
self._cluster_report = ResourceMonitor.ClusterReport(
|
||||||
|
cluster_key=cluster_key,
|
||||||
|
max_gpus=len(available_gpus),
|
||||||
|
max_workers=len(available_gpus) // min(x for x, _ in gpu_queues.values()),
|
||||||
|
resource_groups=resource_groups
|
||||||
|
)
|
||||||
|
|
||||||
|
self.send_cluster_report()
|
||||||
|
except Exception as ex:
|
||||||
|
print("Error: failed setting cluster report: {}".format(ex))
|
||||||
|
|
||||||
def _daemon(self):
|
def _daemon(self):
|
||||||
|
last_cluster_report = 0
|
||||||
seconds_since_started = 0
|
seconds_since_started = 0
|
||||||
reported = 0
|
reported = 0
|
||||||
try:
|
try:
|
||||||
@ -177,6 +260,15 @@ class ResourceMonitor(object):
|
|||||||
|
|
||||||
# count reported iterations
|
# count reported iterations
|
||||||
reported += 1
|
reported += 1
|
||||||
|
|
||||||
|
if (
|
||||||
|
self._cluster_report and
|
||||||
|
self._cluster_report_interval_sec
|
||||||
|
and time() - last_cluster_report > self._cluster_report_interval_sec
|
||||||
|
):
|
||||||
|
if self.send_cluster_report():
|
||||||
|
last_cluster_report = time()
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
log.exception("Error reporting monitoring info: %s", str(ex))
|
log.exception("Error reporting monitoring info: %s", str(ex))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user