mirror of
https://github.com/clearml/clearml
synced 2025-05-07 14:24:31 +00:00
Fix autoscaler spin down stuck machine, ignore unknown stale workers
This commit is contained in:
parent
d53dbbf697
commit
9744d63796
@@ -234,7 +234,17 @@ class AutoScaler(object):
|
|||||||
previous_workers.add(worker.id)
|
previous_workers.add(worker.id)
|
||||||
|
|
||||||
for worker_id in self.stale_workers(spun_workers):
|
for worker_id in self.stale_workers(spun_workers):
|
||||||
del spun_workers[worker_id]
|
out = spun_workers.pop(worker_id, None)
|
||||||
|
if out is None:
|
||||||
|
self.logger.warning('Ignoring unknown stale worker: %r', worker_id)
|
||||||
|
continue
|
||||||
|
resource = out[0]
|
||||||
|
try:
|
||||||
|
self.logger.info('Spinning down stuck worker: %r', worker_id)
|
||||||
|
self.driver.spin_down_worker(WorkerId(worker_id).cloud_id)
|
||||||
|
up_machines[resource] -= 1
|
||||||
|
except Exception as err:
|
||||||
|
self.logger.info('Cannot spin down %r: %r', worker_id, err)
|
||||||
|
|
||||||
self.update_idle_workers(all_workers, idle_workers)
|
self.update_idle_workers(all_workers, idle_workers)
|
||||||
required_idle_resources = [] # idle resources we'll need to keep running
|
required_idle_resources = [] # idle resources we'll need to keep running
|
||||||
|
Loading…
Reference in New Issue
Block a user