Fix AWS autoscaler wizard to match the autoscaler requirements (queues cannot share the same resource type, so we automatically duplicate them)

2025-06-26 18:16:07 +00:00 · 2020-07-30 15:17:59 +03:00 · 2020-07-30 15:17:59 +03:00 · 87088a8c0f
commit 87088a8c0f
parent 6dca60aef2
2 changed files with 124 additions and 47 deletions
--- a/examples/services/aws-autoscaler/aws_autoscaler.py
+++ b/examples/services/aws-autoscaler/aws_autoscaler.py
@ -1,7 +1,8 @@
 from argparse import ArgumentParser
 from collections import defaultdict
-from pathlib import Path
+from pathlib2 import Path
 from typing import Tuple
+from itertools import chain

 import yaml
 from six.moves import input
@ -12,7 +13,7 @@ from trains.config import running_remotely
 from trains.utilities.wizard.user_input import get_input, input_int, input_bool

 CONF_FILE = "aws_autoscaler.yaml"
-DEFAULT_DOCKER_IMAGE = "nvidia/cuda"
+DEFAULT_DOCKER_IMAGE = "nvidia/cuda:10.1-runtime-ubuntu18.04"


 def main():
@ -23,13 +24,23 @@ def main():
        action="store_true",
        default=False,
    )
+    parser.add_argument(
+        "--remote",
+        help="Run the autoscaler as a service, launch on the `services` queue",
+        action="store_true",
+        default=False,
+    )
    args = parser.parse_args()

    if running_remotely():
        hyper_params = AwsAutoScaler.Settings().as_dict()
        configurations = AwsAutoScaler.Configuration().as_dict()
    else:
-        print("AWS Autoscaler setup\n")
+        print("AWS Autoscaler setup wizard\n"
+              "---------------------------\n"
+              "Follow the wizard to configure your AWS auto-scaler service.\n"
+              "Once completed, you will be able to view and change the configuration in the trains-server web UI.\n"
+              "It means there is no need to worry about typos or mistakes :)\n")

        config_file = Path(CONF_FILE).absolute()
        if config_file.exists() and input_bool(
@ -43,6 +54,7 @@ def main():
        else:
            configurations, hyper_params = run_wizard()

+            # noinspection PyBroadException
            try:
                with config_file.open("w+") as f:
                    conf = {
@ -58,12 +70,19 @@ def main():
                )
                return

-    task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler")
+    task = Task.init(project_name="DevOps", task_name="AWS Auto-Scaler", task_type=Task.TaskTypes.service)
    task.connect(hyper_params)
    task.connect_configuration(configurations)

-    autoscaler = AwsAutoScaler(hyper_params, configurations)
+    if args.remote or args.run:
+        print("Running AWS auto-scaler as a service\nExecution log {}".format(task.get_output_log_web_page()))

+    if args.remote:
+        # if we are running remotely enqueue this run, and leave the process
+        # the trains-agent services will pick it up and execute it for us.
+        task.execute_remotely(queue_name='services')
+
+    autoscaler = AwsAutoScaler(hyper_params, configurations)
    if running_remotely() or args.run:
        autoscaler.start()

@ -78,7 +97,10 @@ def run_wizard():
    hyper_params.cloud_credentials_secret = get_input(
        "AWS Secret Access Key", required=True
    )
-    hyper_params.cloud_credentials_region = get_input("AWS region name", required=True)
+    hyper_params.cloud_credentials_region = get_input(
+        "AWS region name",
+        "[us-east-1b]",
+        default='us-east-1b')
    # get GIT User/Pass for cloning
    print(
        "\nGIT credentials:"
@ -103,93 +125,147 @@ def run_wizard():

    hyper_params.default_docker_image = get_input(
        "default docker image/parameters",
-        "to use [default is {}]".format(DEFAULT_DOCKER_IMAGE),
+        "to use [{}]".format(DEFAULT_DOCKER_IMAGE),
        default=DEFAULT_DOCKER_IMAGE,
        new_line=True,
    )
-    print("\nDefine the type of machines you want the autoscaler to use")
+    print("\nConfigure the machine types for the auto-scaler:")
+    print("------------------------------------------------")
    resource_configurations = {}
    while True:
-        resource_name = get_input(
-            "machine type name",
-            "(remember it, we will later use it in the budget section)",
-            required=True,
-            new_line=True,
-        )
-        resource_configurations[resource_name] = {
+        a_resource = {
            "instance_type": get_input(
-                "instance type",
-                "for resource '{}' [default is 'g4dn.4xlarge']".format(resource_name),
+                "Amazon instance type",
+                "['g4dn.4xlarge']",
+                question='Select',
                default="g4dn.4xlarge",
            ),
            "is_spot": input_bool(
-                "is '{}' resource using spot instances? [t/F]".format(resource_name)
+                "Use spot instances? [y/N]"
            ),
            "availability_zone": get_input(
                "availability zone",
-                "for resource '{}' [default is 'us-east-1b']".format(resource_name),
+                "['us-east-1b']",
+                question='Select',
                default="us-east-1b",
            ),
            "ami_id": get_input(
-                "ami_id",
-                "for resource '{}' [default is 'ami-07c95cafbb788face']".format(
-                    resource_name
-                ),
+                "the Amazon Machine Image id",
+                "['ami-07c95cafbb788face']",
+                question='Select',
                default="ami-07c95cafbb788face",
            ),
            "ebs_device_name": get_input(
-                "ebs_device_name",
-                "for resource '{}' [default is '/dev/xvda']".format(resource_name),
+                "the Amazon EBS device",
+                "['/dev/xvda']",
                default="/dev/xvda",
            ),
            "ebs_volume_size": input_int(
-                "ebs_volume_size",
-                " for resource '{}' [default is '100']".format(resource_name),
+                "the Amazon EBS volume size",
+                "(in GiB) [100]",
                default=100,
            ),
            "ebs_volume_type": get_input(
-                "ebs_volume_type",
-                "for resource '{}' [default is 'gp2']".format(resource_name),
+                "the Amazon EBS volume type",
+                "['gp2']",
                default="gp2",
            ),
        }
-        if not input_bool("\nDefine another resource? [y/N]"):
+
+        while True:
+            resource_name = get_input(
+                "a name for this instance type",
+                "(used in the budget section) For example 'aws4gpu'",
+                question='Select',
+                required=True,
+            )
+            if resource_name in resource_configurations:
+                print("\tError: instance type '{}' already used!".format(resource_name))
+                continue
            break
+        resource_configurations[resource_name] = a_resource
+
+        if not input_bool("\nDefine another instance type? [y/N]"):
+            break
+
    configurations.resource_configurations = resource_configurations

    configurations.extra_vm_bash_script = input(
-        "\nEnter any pre-execution bash script to be executed on the newly created instances: "
+        "\nEnter any pre-execution bash script to be executed on the newly created instances []: "
    )

-    print("\nSet up the budget\n")
+    print("\nDefine the machines budget:")
+    print("-----------------------------")
+    resource_configurations_names = list(configurations.resource_configurations.keys())
    queues = defaultdict(list)
    while True:
-        queue_name = get_input("queue name", required=True)
        while True:
-            queue_type = get_input(
-                "queue type",
-                "(use the resources names defined earlier)",
-                required=True,
-            )
-            max_instances = input_int(
-                "maximum number of instances allowed", required=True
-            )
-            queues[queue_name].append((queue_type, max_instances))
+            queue_name = get_input("a queue name (for example: 'aws_4gpu_machines')", question='Select', required=True)
+            if queue_name in queues:
+                print("\tError: queue name '{}' already used!".format(queue_name))
+                continue
+            break
+
+        while True:
+            valid_instances = [k for k in resource_configurations_names
+                               if k not in (q[0] for q in queues[queue_name])]
+            while True:
+                queue_type = get_input(
+                    "a instance type to attach to the queue",
+                    "{}".format(valid_instances),
+                    question="Select",
+                    required=True,
+                )
+                if queue_type not in configurations.resource_configurations:
+                    print("\tError: instance type '{}' not in predefined instances {}!".format(
+                        queue_type, list(configurations.resource_configurations.keys())))
+                    continue
+
+                if queue_type in (q[0] for q in queues[queue_name]):
+                    print("\tError: instance type '{}' already in {}!".format(
+                        queue_type, queue_name))
+                    continue
+
+                if queue_type in [q[0] for q in chain.from_iterable(queues.values())]:
+                    queue_type_new = '{}_{}'.format(queue_type, queue_name)
+                    print("\tInstance type '{}' already used, renaming instance to {}".format(
+                        queue_type, queue_type_new))
+                    configurations.resource_configurations[queue_type_new] = \
+                        dict(**configurations.resource_configurations[queue_type])
+                    queue_type = queue_type_new
+
+                    # make sure the renamed name is not reused
+                    if queue_type in (q[0] for q in queues[queue_name]):
+                        print("\tError: instance type '{}' already in {}!".format(
+                            queue_type, queue_name))
+                        continue

-            if not input_bool("\nAdd another type to queue? [y/N]: "):
                break
-        if not input_bool("Define another queue? [y/N]: "):
+            max_instances = input_int(
+                "maximum number of '{}' instances to spin simultaneously (example: 3)".format(queue_type),
+                required=True
+            )
+
+            queues[queue_name].append((queue_type, max_instances))
+            valid_instances = [k for k in configurations.resource_configurations.keys()
+                               if k not in (q[0] for q in queues[queue_name])]
+            if not valid_instances:
+                break
+
+            if not input_bool("Do you wish to add another instance type to queue? [y/N]: "):
+                break
+        if not input_bool("\nAdd another queue? [y/N]: "):
            break
    configurations.queues = dict(queues)

    hyper_params.max_idle_time_min = input_int(
        "maximum idle time",
-        "for the autoscaler (in minutes, default is 15)",
+        "for the auto-scaler to spin down an instance (in minutes) [15]",
        default=15,
        new_line=True,
    )
    hyper_params.polling_interval_time_min = input_int(
-        "polling interval", "for the autoscaler (in minutes, default is 5)", default=5,
+        "instances polling interval", "for the auto-scaler (in minutes) [5]", default=5,
    )

    return configurations.as_dict(), hyper_params.as_dict()
--- a/examples/services/aws-autoscaler/requirements.txt
+++ b/examples/services/aws-autoscaler/requirements.txt
@ -1,4 +1,5 @@
 boto3
 pyYaml
 six
-trains
+trains
+pathlib2