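Fix AWS autoscaler wizard to match the autoscaler requirements (queues cannot share the same resource type, so when a resource type is attached to a second queue the wizard automatically duplicates it under a new name of the form `<resource type>_<queue name>`)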

This commit is contained in:
allegroai 2020-07-30 15:17:59 +03:00
parent 6dca60aef2
commit 87088a8c0f
2 changed files with 124 additions and 47 deletions

View File

@ -1,7 +1,8 @@
from argparse import ArgumentParser from argparse import ArgumentParser
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib2 import Path
from typing import Tuple from typing import Tuple
from itertools import chain
import yaml import yaml
from six.moves import input from six.moves import input
@ -12,7 +13,7 @@ from trains.config import running_remotely
from trains.utilities.wizard.user_input import get_input, input_int, input_bool from trains.utilities.wizard.user_input import get_input, input_int, input_bool
CONF_FILE = "aws_autoscaler.yaml" CONF_FILE = "aws_autoscaler.yaml"
DEFAULT_DOCKER_IMAGE = "nvidia/cuda" DEFAULT_DOCKER_IMAGE = "nvidia/cuda:10.1-runtime-ubuntu18.04"
def main(): def main():
@ -23,13 +24,23 @@ def main():
action="store_true", action="store_true",
default=False, default=False,
) )
parser.add_argument(
"--remote",
help="Run the autoscaler as a service, launch on the `services` queue",
action="store_true",
default=False,
)
args = parser.parse_args() args = parser.parse_args()
if running_remotely(): if running_remotely():
hyper_params = AwsAutoScaler.Settings().as_dict() hyper_params = AwsAutoScaler.Settings().as_dict()
configurations = AwsAutoScaler.Configuration().as_dict() configurations = AwsAutoScaler.Configuration().as_dict()
else: else:
print("AWS Autoscaler setup\n") print("AWS Autoscaler setup wizard\n"
"---------------------------\n"
"Follow the wizard to configure your AWS auto-scaler service.\n"
"Once completed, you will be able to view and change the configuration in the trains-server web UI.\n"
"It means there is no need to worry about typos or mistakes :)\n")
config_file = Path(CONF_FILE).absolute() config_file = Path(CONF_FILE).absolute()
if config_file.exists() and input_bool( if config_file.exists() and input_bool(
@ -43,6 +54,7 @@ def main():
else: else:
configurations, hyper_params = run_wizard() configurations, hyper_params = run_wizard()
# noinspection PyBroadException
try: try:
with config_file.open("w+") as f: with config_file.open("w+") as f:
conf = { conf = {
@ -58,12 +70,19 @@ def main():
) )
return return
task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler") task = Task.init(project_name="DevOps", task_name="AWS Auto-Scaler", task_type=Task.TaskTypes.service)
task.connect(hyper_params) task.connect(hyper_params)
task.connect_configuration(configurations) task.connect_configuration(configurations)
autoscaler = AwsAutoScaler(hyper_params, configurations) if args.remote or args.run:
print("Running AWS auto-scaler as a service\nExecution log {}".format(task.get_output_log_web_page()))
if args.remote:
# if we are running remotely enqueue this run, and leave the process
# the trains-agent services will pick it up and execute it for us.
task.execute_remotely(queue_name='services')
autoscaler = AwsAutoScaler(hyper_params, configurations)
if running_remotely() or args.run: if running_remotely() or args.run:
autoscaler.start() autoscaler.start()
@ -78,7 +97,10 @@ def run_wizard():
hyper_params.cloud_credentials_secret = get_input( hyper_params.cloud_credentials_secret = get_input(
"AWS Secret Access Key", required=True "AWS Secret Access Key", required=True
) )
hyper_params.cloud_credentials_region = get_input("AWS region name", required=True) hyper_params.cloud_credentials_region = get_input(
"AWS region name",
"[us-east-1b]",
default='us-east-1b')
# get GIT User/Pass for cloning # get GIT User/Pass for cloning
print( print(
"\nGIT credentials:" "\nGIT credentials:"
@ -103,93 +125,147 @@ def run_wizard():
hyper_params.default_docker_image = get_input( hyper_params.default_docker_image = get_input(
"default docker image/parameters", "default docker image/parameters",
"to use [default is {}]".format(DEFAULT_DOCKER_IMAGE), "to use [{}]".format(DEFAULT_DOCKER_IMAGE),
default=DEFAULT_DOCKER_IMAGE, default=DEFAULT_DOCKER_IMAGE,
new_line=True, new_line=True,
) )
print("\nDefine the type of machines you want the autoscaler to use") print("\nConfigure the machine types for the auto-scaler:")
print("------------------------------------------------")
resource_configurations = {} resource_configurations = {}
while True: while True:
resource_name = get_input( a_resource = {
"machine type name",
"(remember it, we will later use it in the budget section)",
required=True,
new_line=True,
)
resource_configurations[resource_name] = {
"instance_type": get_input( "instance_type": get_input(
"instance type", "Amazon instance type",
"for resource '{}' [default is 'g4dn.4xlarge']".format(resource_name), "['g4dn.4xlarge']",
question='Select',
default="g4dn.4xlarge", default="g4dn.4xlarge",
), ),
"is_spot": input_bool( "is_spot": input_bool(
"is '{}' resource using spot instances? [t/F]".format(resource_name) "Use spot instances? [y/N]"
), ),
"availability_zone": get_input( "availability_zone": get_input(
"availability zone", "availability zone",
"for resource '{}' [default is 'us-east-1b']".format(resource_name), "['us-east-1b']",
question='Select',
default="us-east-1b", default="us-east-1b",
), ),
"ami_id": get_input( "ami_id": get_input(
"ami_id", "the Amazon Machine Image id",
"for resource '{}' [default is 'ami-07c95cafbb788face']".format( "['ami-07c95cafbb788face']",
resource_name question='Select',
),
default="ami-07c95cafbb788face", default="ami-07c95cafbb788face",
), ),
"ebs_device_name": get_input( "ebs_device_name": get_input(
"ebs_device_name", "the Amazon EBS device",
"for resource '{}' [default is '/dev/xvda']".format(resource_name), "['/dev/xvda']",
default="/dev/xvda", default="/dev/xvda",
), ),
"ebs_volume_size": input_int( "ebs_volume_size": input_int(
"ebs_volume_size", "the Amazon EBS volume size",
" for resource '{}' [default is '100']".format(resource_name), "(in GiB) [100]",
default=100, default=100,
), ),
"ebs_volume_type": get_input( "ebs_volume_type": get_input(
"ebs_volume_type", "the Amazon EBS volume type",
"for resource '{}' [default is 'gp2']".format(resource_name), "['gp2']",
default="gp2", default="gp2",
), ),
} }
if not input_bool("\nDefine another resource? [y/N]"):
while True:
resource_name = get_input(
"a name for this instance type",
"(used in the budget section) For example 'aws4gpu'",
question='Select',
required=True,
)
if resource_name in resource_configurations:
print("\tError: instance type '{}' already used!".format(resource_name))
continue
break break
resource_configurations[resource_name] = a_resource
if not input_bool("\nDefine another instance type? [y/N]"):
break
configurations.resource_configurations = resource_configurations configurations.resource_configurations = resource_configurations
configurations.extra_vm_bash_script = input( configurations.extra_vm_bash_script = input(
"\nEnter any pre-execution bash script to be executed on the newly created instances: " "\nEnter any pre-execution bash script to be executed on the newly created instances []: "
) )
print("\nSet up the budget\n") print("\nDefine the machines budget:")
print("-----------------------------")
resource_configurations_names = list(configurations.resource_configurations.keys())
queues = defaultdict(list) queues = defaultdict(list)
while True: while True:
queue_name = get_input("queue name", required=True)
while True: while True:
queue_type = get_input( queue_name = get_input("a queue name (for example: 'aws_4gpu_machines')", question='Select', required=True)
"queue type", if queue_name in queues:
"(use the resources names defined earlier)", print("\tError: queue name '{}' already used!".format(queue_name))
required=True, continue
) break
max_instances = input_int(
"maximum number of instances allowed", required=True while True:
) valid_instances = [k for k in resource_configurations_names
queues[queue_name].append((queue_type, max_instances)) if k not in (q[0] for q in queues[queue_name])]
while True:
queue_type = get_input(
"a instance type to attach to the queue",
"{}".format(valid_instances),
question="Select",
required=True,
)
if queue_type not in configurations.resource_configurations:
print("\tError: instance type '{}' not in predefined instances {}!".format(
queue_type, list(configurations.resource_configurations.keys())))
continue
if queue_type in (q[0] for q in queues[queue_name]):
print("\tError: instance type '{}' already in {}!".format(
queue_type, queue_name))
continue
if queue_type in [q[0] for q in chain.from_iterable(queues.values())]:
queue_type_new = '{}_{}'.format(queue_type, queue_name)
print("\tInstance type '{}' already used, renaming instance to {}".format(
queue_type, queue_type_new))
configurations.resource_configurations[queue_type_new] = \
dict(**configurations.resource_configurations[queue_type])
queue_type = queue_type_new
# make sure the renamed name is not reused
if queue_type in (q[0] for q in queues[queue_name]):
print("\tError: instance type '{}' already in {}!".format(
queue_type, queue_name))
continue
if not input_bool("\nAdd another type to queue? [y/N]: "):
break break
if not input_bool("Define another queue? [y/N]: "): max_instances = input_int(
"maximum number of '{}' instances to spin simultaneously (example: 3)".format(queue_type),
required=True
)
queues[queue_name].append((queue_type, max_instances))
valid_instances = [k for k in configurations.resource_configurations.keys()
if k not in (q[0] for q in queues[queue_name])]
if not valid_instances:
break
if not input_bool("Do you wish to add another instance type to queue? [y/N]: "):
break
if not input_bool("\nAdd another queue? [y/N]: "):
break break
configurations.queues = dict(queues) configurations.queues = dict(queues)
hyper_params.max_idle_time_min = input_int( hyper_params.max_idle_time_min = input_int(
"maximum idle time", "maximum idle time",
"for the autoscaler (in minutes, default is 15)", "for the auto-scaler to spin down an instance (in minutes) [15]",
default=15, default=15,
new_line=True, new_line=True,
) )
hyper_params.polling_interval_time_min = input_int( hyper_params.polling_interval_time_min = input_int(
"polling interval", "for the autoscaler (in minutes, default is 5)", default=5, "instances polling interval", "for the auto-scaler (in minutes) [5]", default=5,
) )
return configurations.as_dict(), hyper_params.as_dict() return configurations.as_dict(), hyper_params.as_dict()

View File

@ -2,3 +2,4 @@ boto3
pyYaml pyYaml
six six
trains trains
pathlib2