1
0
mirror of https://github.com/clearml/clearml synced 2025-03-04 11:09:15 +00:00

Fix a bug where a resource name whose prefix matches a resource type could cause the AutoScaler to avoid spinning down idle instances

Fix idle worker entries so they contain the resource name and not the instance type (since the value is later matched against a resource name)
This commit is contained in:
allegroai 2022-03-30 17:17:35 +03:00
parent 470aa8c52d
commit 6142524b84

View File

@ -262,9 +262,9 @@ class AutoScaler(object):
# If we have an idle worker matching the required resource, # If we have an idle worker matching the required resource,
# remove it from the required allocation resources # remove it from the required allocation resources
free_queue_resources = [ free_queue_resources = [
resource resource_name
for _, resource, _ in idle_workers.values() for _, resource_name, _ in idle_workers.values()
if any(q_r for q_r in queue_resources if resource in q_r[0]) if any(q_r for q_r in queue_resources if resource_name == q_r[0])
] ]
# if we have an instance waiting to be spun # if we have an instance waiting to be spun
# remove it from the required allocation resources # remove it from the required allocation resources
@ -317,9 +317,9 @@ class AutoScaler(object):
# Go over the idle workers list, and spin down idle workers # Go over the idle workers list, and spin down idle workers
for worker_id in list(idle_workers): for worker_id in list(idle_workers):
timestamp, resources, worker = idle_workers[worker_id] timestamp, resource_name, worker = idle_workers[worker_id]
# skip resource types that might be needed # skip resource types that might be needed
if resources in required_idle_resources: if resource_name in required_idle_resources:
continue continue
# Remove from both cloud and clearml all instances that are idle for longer than MAX_IDLE_TIME_MIN # Remove from both cloud and clearml all instances that are idle for longer than MAX_IDLE_TIME_MIN
if time() - timestamp > self.max_idle_time_min * MINUTE: if time() - timestamp > self.max_idle_time_min * MINUTE:
@ -346,7 +346,7 @@ class AutoScaler(object):
task = getattr(worker, 'task', None) task = getattr(worker, 'task', None)
if not task: if not task:
if worker.id not in idle_workers: if worker.id not in idle_workers:
resource_name = WorkerId(worker.id).instance_type resource_name = WorkerId(worker.id).name
worker_time = worker_last_time(worker) worker_time = worker_last_time(worker)
idle_workers[worker.id] = (worker_time, resource_name, worker) idle_workers[worker.id] = (worker_time, resource_name, worker)
elif worker.id in idle_workers: elif worker.id in idle_workers: