From 87088a8c0f33cf0de3d5d500385d52a0653f0482 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Thu, 30 Jul 2020 15:17:59 +0300 Subject: [PATCH] Fix AWS autoscaler wizard to match the autoscaler requirements (queues cannot share the same resource type, so we automatically duplicate them) --- .../services/aws-autoscaler/aws_autoscaler.py | 168 +++++++++++++----- .../services/aws-autoscaler/requirements.txt | 3 +- 2 files changed, 124 insertions(+), 47 deletions(-) diff --git a/examples/services/aws-autoscaler/aws_autoscaler.py b/examples/services/aws-autoscaler/aws_autoscaler.py index 8cd1ed83..2698b01c 100644 --- a/examples/services/aws-autoscaler/aws_autoscaler.py +++ b/examples/services/aws-autoscaler/aws_autoscaler.py @@ -1,7 +1,8 @@ from argparse import ArgumentParser from collections import defaultdict -from pathlib import Path +from pathlib2 import Path from typing import Tuple +from itertools import chain import yaml from six.moves import input @@ -12,7 +13,7 @@ from trains.config import running_remotely from trains.utilities.wizard.user_input import get_input, input_int, input_bool CONF_FILE = "aws_autoscaler.yaml" -DEFAULT_DOCKER_IMAGE = "nvidia/cuda" +DEFAULT_DOCKER_IMAGE = "nvidia/cuda:10.1-runtime-ubuntu18.04" def main(): @@ -23,13 +24,23 @@ def main(): action="store_true", default=False, ) + parser.add_argument( + "--remote", + help="Run the autoscaler as a service, launch on the `services` queue", + action="store_true", + default=False, + ) args = parser.parse_args() if running_remotely(): hyper_params = AwsAutoScaler.Settings().as_dict() configurations = AwsAutoScaler.Configuration().as_dict() else: - print("AWS Autoscaler setup\n") + print("AWS Autoscaler setup wizard\n" + "---------------------------\n" + "Follow the wizard to configure your AWS auto-scaler service.\n" + "Once completed, you will be able to view and change the configuration in the trains-server web UI.\n" + "It means there is no need to worry about typos or mistakes :)\n") config_file = Path(CONF_FILE).absolute() if config_file.exists() and input_bool( @@ -43,6 +54,7 @@ def main(): else: configurations, hyper_params = run_wizard() + # noinspection PyBroadException try: with config_file.open("w+") as f: conf = { @@ -58,12 +70,19 @@ def main(): ) return - task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler") + task = Task.init(project_name="DevOps", task_name="AWS Auto-Scaler", task_type=Task.TaskTypes.service) task.connect(hyper_params) task.connect_configuration(configurations) - autoscaler = AwsAutoScaler(hyper_params, configurations) + if args.remote or args.run: + print("Running AWS auto-scaler as a service\nExecution log {}".format(task.get_output_log_web_page())) + if args.remote: + # if we are running remotely enqueue this run, and leave the process + # the trains-agent services will pick it up and execute it for us. + task.execute_remotely(queue_name='services') + + autoscaler = AwsAutoScaler(hyper_params, configurations) if running_remotely() or args.run: autoscaler.start() @@ -78,7 +97,10 @@ def run_wizard(): hyper_params.cloud_credentials_secret = get_input( "AWS Secret Access Key", required=True ) - hyper_params.cloud_credentials_region = get_input("AWS region name", required=True) + hyper_params.cloud_credentials_region = get_input( + "AWS region name", + "[us-east-1b]", + default='us-east-1b') # get GIT User/Pass for cloning print( "\nGIT credentials:" @@ -103,93 +125,147 @@ def run_wizard(): hyper_params.default_docker_image = get_input( "default docker image/parameters", - "to use [default is {}]".format(DEFAULT_DOCKER_IMAGE), + "to use [{}]".format(DEFAULT_DOCKER_IMAGE), default=DEFAULT_DOCKER_IMAGE, new_line=True, ) - print("\nDefine the type of machines you want the autoscaler to use") + print("\nConfigure the machine types for the auto-scaler:") + print("------------------------------------------------") resource_configurations = {} while True: - resource_name = get_input( - "machine type name", - "(remember it, we will later use it in the budget section)", - required=True, - new_line=True, - ) - resource_configurations[resource_name] = { + a_resource = { "instance_type": get_input( - "instance type", - "for resource '{}' [default is 'g4dn.4xlarge']".format(resource_name), + "Amazon instance type", + "['g4dn.4xlarge']", + question='Select', default="g4dn.4xlarge", ), "is_spot": input_bool( - "is '{}' resource using spot instances? [t/F]".format(resource_name) + "Use spot instances? [y/N]" ), "availability_zone": get_input( "availability zone", - "for resource '{}' [default is 'us-east-1b']".format(resource_name), + "['us-east-1b']", + question='Select', default="us-east-1b", ), "ami_id": get_input( - "ami_id", - "for resource '{}' [default is 'ami-07c95cafbb788face']".format( - resource_name - ), + "the Amazon Machine Image id", + "['ami-07c95cafbb788face']", + question='Select', default="ami-07c95cafbb788face", ), "ebs_device_name": get_input( - "ebs_device_name", - "for resource '{}' [default is '/dev/xvda']".format(resource_name), + "the Amazon EBS device", + "['/dev/xvda']", default="/dev/xvda", ), "ebs_volume_size": input_int( - "ebs_volume_size", - " for resource '{}' [default is '100']".format(resource_name), + "the Amazon EBS volume size", + "(in GiB) [100]", default=100, ), "ebs_volume_type": get_input( - "ebs_volume_type", - "for resource '{}' [default is 'gp2']".format(resource_name), + "the Amazon EBS volume type", + "['gp2']", default="gp2", ), } - if not input_bool("\nDefine another resource? [y/N]"): + + while True: + resource_name = get_input( + "a name for this instance type", + "(used in the budget section) For example 'aws4gpu'", + question='Select', + required=True, + ) + if resource_name in resource_configurations: + print("\tError: instance type '{}' already used!".format(resource_name)) + continue break + resource_configurations[resource_name] = a_resource + + if not input_bool("\nDefine another instance type? [y/N]"): + break + configurations.resource_configurations = resource_configurations configurations.extra_vm_bash_script = input( - "\nEnter any pre-execution bash script to be executed on the newly created instances: " + "\nEnter any pre-execution bash script to be executed on the newly created instances []: " ) - print("\nSet up the budget\n") + print("\nDefine the machines budget:") + print("-----------------------------") + resource_configurations_names = list(configurations.resource_configurations.keys()) queues = defaultdict(list) while True: - queue_name = get_input("queue name", required=True) while True: - queue_type = get_input( - "queue type", - "(use the resources names defined earlier)", - required=True, - ) - max_instances = input_int( - "maximum number of instances allowed", required=True - ) - queues[queue_name].append((queue_type, max_instances)) + queue_name = get_input("a queue name (for example: 'aws_4gpu_machines')", question='Select', required=True) + if queue_name in queues: + print("\tError: queue name '{}' already used!".format(queue_name)) + continue + break + + while True: + valid_instances = [k for k in resource_configurations_names + if k not in (q[0] for q in queues[queue_name])] + while True: + queue_type = get_input( + "a instance type to attach to the queue", + "{}".format(valid_instances), + question="Select", + required=True, + ) + if queue_type not in configurations.resource_configurations: + print("\tError: instance type '{}' not in predefined instances {}!".format( + queue_type, list(configurations.resource_configurations.keys()))) + continue + + if queue_type in (q[0] for q in queues[queue_name]): + print("\tError: instance type '{}' already in {}!".format( + queue_type, queue_name)) + continue + + if queue_type in [q[0] for q in chain.from_iterable(queues.values())]: + queue_type_new = '{}_{}'.format(queue_type, queue_name) + print("\tInstance type '{}' already used, renaming instance to {}".format( + queue_type, queue_type_new)) + configurations.resource_configurations[queue_type_new] = \ + dict(**configurations.resource_configurations[queue_type]) + queue_type = queue_type_new + + # make sure the renamed name is not reused + if queue_type in (q[0] for q in queues[queue_name]): + print("\tError: instance type '{}' already in {}!".format( + queue_type, queue_name)) + continue - if not input_bool("\nAdd another type to queue? [y/N]: "): break - if not input_bool("Define another queue? [y/N]: "): + max_instances = input_int( + "maximum number of '{}' instances to spin simultaneously (example: 3)".format(queue_type), + required=True + ) + + queues[queue_name].append((queue_type, max_instances)) + valid_instances = [k for k in configurations.resource_configurations.keys() + if k not in (q[0] for q in queues[queue_name])] + if not valid_instances: + break + + if not input_bool("Do you wish to add another instance type to queue? [y/N]: "): + break + if not input_bool("\nAdd another queue? [y/N]: "): break configurations.queues = dict(queues) hyper_params.max_idle_time_min = input_int( "maximum idle time", - "for the autoscaler (in minutes, default is 15)", + "for the auto-scaler to spin down an instance (in minutes) [15]", default=15, new_line=True, ) hyper_params.polling_interval_time_min = input_int( - "polling interval", "for the autoscaler (in minutes, default is 5)", default=5, + "instances polling interval", "for the auto-scaler (in minutes) [5]", default=5, ) return configurations.as_dict(), hyper_params.as_dict() diff --git a/examples/services/aws-autoscaler/requirements.txt b/examples/services/aws-autoscaler/requirements.txt index d21f0080..17225017 100644 --- a/examples/services/aws-autoscaler/requirements.txt +++ b/examples/services/aws-autoscaler/requirements.txt @@ -1,4 +1,5 @@ boto3 pyYaml six -trains \ No newline at end of file +trains +pathlib2