From d4e136307cecd35b1e40174ad2b3132e981dc7c4 Mon Sep 17 00:00:00 2001
From: cthorey
Date: Mon, 8 Apr 2024 10:42:08 +0200
Subject: [PATCH] Fix autoscaler should recheck that the worker is still IDLE before shutting it down (#1240)

---
 clearml/automation/auto_scaler.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/clearml/automation/auto_scaler.py b/clearml/automation/auto_scaler.py
index fe801bc6..2218fb2a 100644
--- a/clearml/automation/auto_scaler.py
+++ b/clearml/automation/auto_scaler.py
@@ -198,6 +198,13 @@ class AutoScaler(object):
             instance_type=resource_conf["instance_type"],
         )
 
+    def is_worker_still_idle(self, worker_id):
+        self.logger.info("Checking if worker %r is still idle", worker_id)
+        for worker in self.api_client.workers.get_all():
+            if worker.id == worker_id:
+                return getattr(worker, 'task', None) is None
+        return True
+
     def supervisor(self):
         """
         Spin up or down resources as necessary.
@@ -323,6 +330,9 @@ class AutoScaler(object):
                     continue
                 # Remove from both cloud and clearml all instances that are idle for longer than MAX_IDLE_TIME_MIN
                 if time() - timestamp > self.max_idle_time_min * MINUTE:
+                    if not self.is_worker_still_idle(worker_id):
+                        # Skip worker if no more idle
+                        continue
                     wid = WorkerId(worker_id)
                     cloud_id = wid.cloud_id
                     self.driver.spin_down_worker(cloud_id)