Fix: autoscaler should recheck that the worker is still IDLE before shutting it down (#1240)

This commit is contained in:
cthorey 2024-04-08 10:42:08 +02:00 committed by GitHub
parent 90998403ee
commit d4e136307c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -198,6 +198,13 @@ class AutoScaler(object):
instance_type=resource_conf["instance_type"],
)
def is_worker_still_idle(self, worker_id):
    """
    Report whether the worker with the given id currently has no task.

    The worker list is fetched via ``self.api_client.workers.get_all()`` and
    the first entry whose ``id`` equals ``worker_id`` is inspected.

    :param worker_id: Id of the worker to look up in the fetched worker list.
    :return: ``True`` when the matching worker has no ``task`` attribute or its
        ``task`` is ``None``; also ``True`` when no listed worker has this id.
        ``False`` when the matching worker has a task set.
    """
    self.logger.info("Checking if worker %r is still idle", worker_id)
    candidates = (
        entry
        for entry in self.api_client.workers.get_all()
        if entry.id == worker_id
    )
    match = next(candidates, None)
    if match is None:
        # No worker with this id was listed: report it as idle.
        # NOTE(review): presumably an unlisted worker is safe to spin down - confirm.
        return True
    return getattr(match, "task", None) is None
def supervisor(self):
"""
Spin up or down resources as necessary.
@ -323,6 +330,9 @@ class AutoScaler(object):
continue
# Remove from both cloud and clearml all instances that are idle for longer than MAX_IDLE_TIME_MIN
if time() - timestamp > self.max_idle_time_min * MINUTE:
if not self.is_worker_still_idle(worker_id):
# Skip the worker if it is no longer idle
continue
wid = WorkerId(worker_id)
cloud_id = wid.cloud_id
self.driver.spin_down_worker(cloud_id)