mirror of
https://github.com/clearml/clearml
synced 2025-01-31 00:56:57 +00:00
Fix autoscaler should recheck that the worker is still IDLE before shutting it down (#1240)
This commit is contained in:
parent
90998403ee
commit
d4e136307c
@ -198,6 +198,13 @@ class AutoScaler(object):
|
||||
instance_type=resource_conf["instance_type"],
|
||||
)
|
||||
|
||||
def is_worker_still_idle(self, worker_id):
    """
    Re-query the registered workers and report whether `worker_id` is idle.

    :param worker_id: id compared against the `id` attribute of each worker
        returned by `self.api_client.workers.get_all()`.
    :return: For the first worker whose id matches, True when it has no
        `task` attribute or that attribute is None, otherwise False.
        True as well when no listed worker has a matching id.
    """
    self.logger.info("Checking if worker %r is still idle", worker_id)
    registered_workers = self.api_client.workers.get_all()
    # Lazily pick the first entry with the requested id (None if there is no such entry)
    matching_worker = next(
        (candidate for candidate in registered_workers if candidate.id == worker_id),
        None,
    )
    if matching_worker is None:
        # No listed worker carries this id: reported as idle
        return True
    return getattr(matching_worker, 'task', None) is None
|
||||
|
||||
def supervisor(self):
|
||||
"""
|
||||
Spin up or down resources as necessary.
|
||||
@ -323,6 +330,9 @@ class AutoScaler(object):
|
||||
continue
|
||||
# Remove from both cloud and clearml all instances that are idle for longer than MAX_IDLE_TIME_MIN
|
||||
if time() - timestamp > self.max_idle_time_min * MINUTE:
|
||||
if not self.is_worker_still_idle(worker_id):
|
||||
# Skip worker if it is no longer idle
|
||||
continue
|
||||
wid = WorkerId(worker_id)
|
||||
cloud_id = wid.cloud_id
|
||||
self.driver.spin_down_worker(cloud_id)
|
||||
|
Loading…
Reference in New Issue
Block a user