Fix AWS autoscaler wizard to match the autoscaler requirements (queues cannot share the same resource type, so we automatically duplicate them)

2025-05-01 19:45:16 +00:00 · 2020-07-30 15:17:59 +03:00 · 2020-07-30 15:17:59 +03:00 · 87088a8c0f
commit 87088a8c0f
parent 6dca60aef2
2 changed files with 124 additions and 47 deletions
--- a/examples/services/aws-autoscaler/aws_autoscaler.py
+++ b/examples/services/aws-autoscaler/aws_autoscaler.py
@ -1,7 +1,8 @@
 from argparse import ArgumentParser
 from collections import defaultdict
-from pathlib import Path
+from pathlib2 import Path
 from typing import Tuple
 from itertools import chain
 import yaml
 from six.moves import input
@ -12,7 +13,7 @@ from trains.config import running_remotely
 from trains.utilities.wizard.user_input import get_input, input_int, input_bool
 CONF_FILE = "aws_autoscaler.yaml"
-DEFAULT_DOCKER_IMAGE = "nvidia/cuda"
+DEFAULT_DOCKER_IMAGE = "nvidia/cuda:10.1-runtime-ubuntu18.04"
 def main():
@ -23,13 +24,23 @@ def main():
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--remote",
        help="Run the autoscaler as a service, launch on the `services` queue",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()
    if running_remotely():
        hyper_params = AwsAutoScaler.Settings().as_dict()
        configurations = AwsAutoScaler.Configuration().as_dict()
    else:
-        print("AWS Autoscaler setup\n")
+        print("AWS Autoscaler setup wizard\n"
              "---------------------------\n"
              "Follow the wizard to configure your AWS auto-scaler service.\n"
              "Once completed, you will be able to view and change the configuration in the trains-server web UI.\n"
              "It means there is no need to worry about typos or mistakes :)\n")
        config_file = Path(CONF_FILE).absolute()
        if config_file.exists() and input_bool(
@ -43,6 +54,7 @@ def main():
        else:
            configurations, hyper_params = run_wizard()
            # noinspection PyBroadException
            try:
                with config_file.open("w+") as f:
                    conf = {
@ -58,12 +70,19 @@ def main():
                )
                return
-    task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler")
+    task = Task.init(project_name="DevOps", task_name="AWS Auto-Scaler", task_type=Task.TaskTypes.service)
    task.connect(hyper_params)
    task.connect_configuration(configurations)
-    autoscaler = AwsAutoScaler(hyper_params, configurations)
+    if args.remote or args.run:
        print("Running AWS auto-scaler as a service\nExecution log {}".format(task.get_output_log_web_page()))
    if args.remote:
        # if we are running remotely enqueue this run, and leave the process
        # the trains-agent services will pick it up and execute it for us.
        task.execute_remotely(queue_name='services')
    autoscaler = AwsAutoScaler(hyper_params, configurations)
    if running_remotely() or args.run:
        autoscaler.start()
@ -78,7 +97,10 @@ def run_wizard():
    hyper_params.cloud_credentials_secret = get_input(
        "AWS Secret Access Key", required=True
    )
-    hyper_params.cloud_credentials_region = get_input("AWS region name", required=True)
+    hyper_params.cloud_credentials_region = get_input(
        "AWS region name",
        "[us-east-1b]",
        default='us-east-1b')
    # get GIT User/Pass for cloning
    print(
        "\nGIT credentials:"
@ -103,93 +125,147 @@ def run_wizard():
    hyper_params.default_docker_image = get_input(
        "default docker image/parameters",
-        "to use [default is {}]".format(DEFAULT_DOCKER_IMAGE),
+        "to use [{}]".format(DEFAULT_DOCKER_IMAGE),
        default=DEFAULT_DOCKER_IMAGE,
        new_line=True,
    )
-    print("\nDefine the type of machines you want the autoscaler to use")
+    print("\nConfigure the machine types for the auto-scaler:")
    print("------------------------------------------------")
    resource_configurations = {}
    while True:
-        resource_name = get_input(
+        a_resource = {
            "machine type name",
            "(remember it, we will later use it in the budget section)",
            required=True,
            new_line=True,
        )
        resource_configurations[resource_name] = {
            "instance_type": get_input(
-                "instance type",
+                "Amazon instance type",
-                "for resource '{}' [default is 'g4dn.4xlarge']".format(resource_name),
+                "['g4dn.4xlarge']",
                question='Select',
                default="g4dn.4xlarge",
            ),
            "is_spot": input_bool(
-                "is '{}' resource using spot instances? [t/F]".format(resource_name)
+                "Use spot instances? [y/N]"
            ),
            "availability_zone": get_input(
                "availability zone",
-                "for resource '{}' [default is 'us-east-1b']".format(resource_name),
+                "['us-east-1b']",
                question='Select',
                default="us-east-1b",
            ),
            "ami_id": get_input(
-                "ami_id",
+                "the Amazon Machine Image id",
-                "for resource '{}' [default is 'ami-07c95cafbb788face']".format(
+                "['ami-07c95cafbb788face']",
-                    resource_name
+                question='Select',
                ),
                default="ami-07c95cafbb788face",
            ),
            "ebs_device_name": get_input(
-                "ebs_device_name",
+                "the Amazon EBS device",
-                "for resource '{}' [default is '/dev/xvda']".format(resource_name),
+                "['/dev/xvda']",
                default="/dev/xvda",
            ),
            "ebs_volume_size": input_int(
-                "ebs_volume_size",
+                "the Amazon EBS volume size",
-                " for resource '{}' [default is '100']".format(resource_name),
+                "(in GiB) [100]",
                default=100,
            ),
            "ebs_volume_type": get_input(
-                "ebs_volume_type",
+                "the Amazon EBS volume type",
-                "for resource '{}' [default is 'gp2']".format(resource_name),
+                "['gp2']",
                default="gp2",
            ),
        }
-        if not input_bool("\nDefine another resource? [y/N]"):
+
        while True:
            resource_name = get_input(
                "a name for this instance type",
                "(used in the budget section) For example 'aws4gpu'",
                question='Select',
                required=True,
            )
            if resource_name in resource_configurations:
                print("\tError: instance type '{}' already used!".format(resource_name))
                continue
            break
        resource_configurations[resource_name] = a_resource
        if not input_bool("\nDefine another instance type? [y/N]"):
            break
    configurations.resource_configurations = resource_configurations
    configurations.extra_vm_bash_script = input(
-        "\nEnter any pre-execution bash script to be executed on the newly created instances: "
+        "\nEnter any pre-execution bash script to be executed on the newly created instances []: "
    )
-    print("\nSet up the budget\n")
+    print("\nDefine the machines budget:")
    print("-----------------------------")
    resource_configurations_names = list(configurations.resource_configurations.keys())
    queues = defaultdict(list)
    while True:
        queue_name = get_input("queue name", required=True)
        while True:
-            queue_type = get_input(
+            queue_name = get_input("a queue name (for example: 'aws_4gpu_machines')", question='Select', required=True)
-                "queue type",
+            if queue_name in queues:
-                "(use the resources names defined earlier)",
+                print("\tError: queue name '{}' already used!".format(queue_name))
-                required=True,
+                continue
-            )
+            break
-            max_instances = input_int(
+
-                "maximum number of instances allowed", required=True
+        while True:
-            )
+            valid_instances = [k for k in resource_configurations_names
-            queues[queue_name].append((queue_type, max_instances))
+                               if k not in (q[0] for q in queues[queue_name])]
            while True:
                queue_type = get_input(
                    "a instance type to attach to the queue",
                    "{}".format(valid_instances),
                    question="Select",
                    required=True,
                )
                if queue_type not in configurations.resource_configurations:
                    print("\tError: instance type '{}' not in predefined instances {}!".format(
                        queue_type, list(configurations.resource_configurations.keys())))
                    continue
                if queue_type in (q[0] for q in queues[queue_name]):
                    print("\tError: instance type '{}' already in {}!".format(
                        queue_type, queue_name))
                    continue
                if queue_type in [q[0] for q in chain.from_iterable(queues.values())]:
                    queue_type_new = '{}_{}'.format(queue_type, queue_name)
                    print("\tInstance type '{}' already used, renaming instance to {}".format(
                        queue_type, queue_type_new))
                    configurations.resource_configurations[queue_type_new] = \
                        dict(**configurations.resource_configurations[queue_type])
                    queue_type = queue_type_new
                    # make sure the renamed name is not reused
                    if queue_type in (q[0] for q in queues[queue_name]):
                        print("\tError: instance type '{}' already in {}!".format(
                            queue_type, queue_name))
                        continue
            if not input_bool("\nAdd another type to queue? [y/N]: "):
                break
-        if not input_bool("Define another queue? [y/N]: "):
+            max_instances = input_int(
                "maximum number of '{}' instances to spin simultaneously (example: 3)".format(queue_type),
                required=True
            )
            queues[queue_name].append((queue_type, max_instances))
            valid_instances = [k for k in configurations.resource_configurations.keys()
                               if k not in (q[0] for q in queues[queue_name])]
            if not valid_instances:
                break
            if not input_bool("Do you wish to add another instance type to queue? [y/N]: "):
                break
        if not input_bool("\nAdd another queue? [y/N]: "):
            break
    configurations.queues = dict(queues)
    hyper_params.max_idle_time_min = input_int(
        "maximum idle time",
-        "for the autoscaler (in minutes, default is 15)",
+        "for the auto-scaler to spin down an instance (in minutes) [15]",
        default=15,
        new_line=True,
    )
    hyper_params.polling_interval_time_min = input_int(
-        "polling interval", "for the autoscaler (in minutes, default is 5)", default=5,
+        "instances polling interval", "for the auto-scaler (in minutes) [5]", default=5,
    )
    return configurations.as_dict(), hyper_params.as_dict()
--- a/examples/services/aws-autoscaler/requirements.txt
+++ b/examples/services/aws-autoscaler/requirements.txt
@ -2,3 +2,4 @@ boto3
 pyYaml
 six
 trains
 pathlib2