Add AWS EC2 Auto-Scaler service example

allegroai 2020-07-11 01:32:51 +03:00
parent 25fd425bf7
commit 8d7740ea68
4 changed files with 693 additions and 0 deletions
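For context, the example is interactive: running the first file below locally starts a short wizard, stores the answers in aws_autoscaler.yaml, and registers a Task so the autoscaler can also be launched remotely. A rough usage sketch (the local file name aws_autoscaler.py is an assumption, not part of the commit text shown here):

# python aws_autoscaler.py         -> run the wizard and write aws_autoscaler.yaml
# python aws_autoscaler.py --run   -> run the wizard, then start the autoscaler locally
# Enqueuing the created "AWS Auto-Scaler" task (project "Auto-Scaler") runs it on a trains-agent machine,
# where running_remotely() is True and the connected parameters are taken from the Task.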

View File

@@ -0,0 +1,260 @@
import distutils.util
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from typing import Optional, Tuple

import yaml
from six.moves import input

from trains import Task
from trains.automation.aws_auto_scaler import AwsAutoScaler
from trains.config import running_remotely

CONF_FILE = "aws_autoscaler.yaml"
DEFAULT_DOCKER_IMAGE = "nvidia/cuda"


def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--run",
        help="Run the autoscaler after the wizard has finished",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    if running_remotely():
        hyper_params = AwsAutoScaler.Settings().as_dict()
        configurations = AwsAutoScaler.Configuration().as_dict()
    else:
        print("AWS Autoscaler setup\n")

        config_file = Path(CONF_FILE).absolute()
        if config_file.exists() and input_bool(
            "Load configurations from config file '{}' [Y/n]? ".format(str(CONF_FILE)),
            default=True,
        ):
            with config_file.open("r") as f:
                conf = yaml.load(f, Loader=yaml.SafeLoader)
            hyper_params = conf["hyper_params"]
            configurations = conf["configurations"]
        else:
            configurations, hyper_params = run_wizard()
            try:
                with config_file.open("w+") as f:
                    conf = {
                        "hyper_params": hyper_params,
                        "configurations": configurations,
                    }
                    yaml.safe_dump(conf, f)
            except Exception:
                print(
                    "Error! Could not write configuration file at: {}".format(
                        str(CONF_FILE)
                    )
                )
                return

    task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler")
    task.connect(hyper_params)
    task.connect_configuration(configurations)

    autoscaler = AwsAutoScaler(hyper_params, configurations)
    if running_remotely() or args.run:
        autoscaler.start()


def run_wizard():
    # type: () -> Tuple[dict, dict]

    hyper_params = AwsAutoScaler.Settings()
    configurations = AwsAutoScaler.Configuration()

    hyper_params.cloud_credentials_key = get_input("AWS Access Key ID", required=True)
    hyper_params.cloud_credentials_secret = get_input(
        "AWS Secret Access Key", required=True
    )
    hyper_params.cloud_credentials_region = get_input("AWS region name", required=True)

    # get GIT user/password for cloning
    print(
        "\nGIT credentials:"
        "\nEnter GIT username for repository cloning (leave blank for SSH key authentication): [] ",
        end="",
    )
    git_user = input()
    if git_user.strip():
        print("Enter password for user '{}': ".format(git_user), end="")
        git_pass = input()
        print(
            "Git repository cloning will use user={} password={}".format(
                git_user, git_pass
            )
        )
    else:
        git_user = None
        git_pass = None
    hyper_params.git_user = git_user
    hyper_params.git_pass = git_pass

    hyper_params.default_docker_image = get_input(
        "default docker image/parameters",
        "to use [default is {}]".format(DEFAULT_DOCKER_IMAGE),
        default=DEFAULT_DOCKER_IMAGE,
        new_line=True,
    )

    print("\nDefine the type of machines you want the autoscaler to use")
    resource_configurations = {}
    while True:
        resource_name = get_input(
            "machine type name",
            "(remember it, we will later use it in the budget section)",
            required=True,
            new_line=True,
        )
        resource_configurations[resource_name] = {
            "instance_type": get_input(
                "instance type",
                "for resource '{}' [default is 'g4dn.4xlarge']".format(resource_name),
                default="g4dn.4xlarge",
            ),
            "is_spot": input_bool(
                "is '{}' resource using spot instances? [t/F]".format(resource_name)
            ),
            "availability_zone": get_input(
                "availability zone",
                "for resource '{}' [default is 'us-east-1b']".format(resource_name),
                default="us-east-1b",
            ),
            "ami_id": get_input(
                "ami_id",
                "for resource '{}' [default is 'ami-07c95cafbb788face']".format(
                    resource_name
                ),
                default="ami-07c95cafbb788face",
            ),
            "ebs_device_name": get_input(
                "ebs_device_name",
                "for resource '{}' [default is '/dev/xvda']".format(resource_name),
                default="/dev/xvda",
            ),
            "ebs_volume_size": input_int(
                "ebs_volume_size",
                "for resource '{}' [default is '100']".format(resource_name),
                default=100,
            ),
            "ebs_volume_type": get_input(
                "ebs_volume_type",
                "for resource '{}' [default is 'gp2']".format(resource_name),
                default="gp2",
            ),
        }
        if not input_bool("\nDefine another resource? [y/N]"):
            break
    configurations.resource_configurations = resource_configurations

    configurations.extra_vm_bash_script = input(
        "\nEnter any pre-execution bash script to be executed on the newly created instances: "
    )

    print("\nSet up the budget\n")
    queues = defaultdict(list)
    while True:
        queue_name = get_input("queue name", required=True)
        while True:
            queue_type = get_input(
                "queue type",
                "(use the resource names defined earlier)",
                required=True,
            )
            max_instances = input_int(
                "maximum number of instances allowed", required=True
            )
            queues[queue_name].append((queue_type, max_instances))
            if not input_bool("\nAdd another type to queue? [y/N]: "):
                break
        if not input_bool("Define another queue? [y/N]: "):
            break
    configurations.queues = dict(queues)

    hyper_params.max_idle_time_min = input_int(
        "maximum idle time",
        "for the autoscaler (in minutes, default is 15)",
        default=15,
        new_line=True,
    )
    hyper_params.polling_interval_time_min = input_int(
        "polling interval", "for the autoscaler (in minutes, default is 5)", default=5,
    )

    return configurations.as_dict(), hyper_params.as_dict()


def get_input(
    key,  # type: str
    description="",  # type: str
    question="Enter",  # type: str
    required=False,  # type: bool
    default=None,  # type: Optional[str]
    new_line=False,  # type: bool
):
    # type: (...) -> Optional[str]
    if new_line:
        print()
    while True:
        value = input("{} {} {}: ".format(question, key, description))
        if not value.strip() and required:
            print("{} is required".format(key))
        elif not (value.strip() or required):
            return default
        else:
            return value


def input_int(
    key,  # type: str
    description="",  # type: str
    required=False,  # type: bool
    default=None,  # type: Optional[int]
    new_line=False,  # type: bool
):
    # type: (...) -> Optional[int]
    while True:
        try:
            value = int(
                get_input(
                    key,
                    description,
                    required=required,
                    default=default,
                    new_line=new_line,
                )
            )
            return value
        except ValueError:
            print(
                "Invalid input: {} should be a number. Please enter an integer".format(
                    key
                )
            )


def input_bool(question, default=False):
    # type: (str, bool) -> bool
    while True:
        try:
            response = input("{}: ".format(question)).lower()
            if not response:
                return default
            return bool(distutils.util.strtobool(response))
        except ValueError:
            print("Invalid input: please enter yes or no")


if __name__ == "__main__":
    main()

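For reference, a minimal sketch of the aws_autoscaler.yaml layout the wizard writes above, expressed as the Python dictionary handed to yaml.safe_dump(). The keys mirror AwsAutoScaler.Settings and AwsAutoScaler.Configuration; every concrete value below (resource name, queue name, AMI ID, sizes) is a placeholder, not a recommendation:

import yaml

conf = {
    "hyper_params": {
        "cloud_credentials_key": "<AWS access key id>",
        "cloud_credentials_secret": "<AWS secret access key>",
        "cloud_credentials_region": "us-east-1",
        "git_user": None,
        "git_pass": None,
        "default_docker_image": "nvidia/cuda",
        "max_idle_time_min": 15,
        "polling_interval_time_min": 5,
        "workers_prefix": "dynamic_aws",
        "cloud_provider": "AWS",
    },
    "configurations": {
        "resource_configurations": {
            "gpu_machine": {  # resource name, referenced by the queues section below
                "instance_type": "g4dn.4xlarge",
                "is_spot": False,
                "availability_zone": "us-east-1b",
                "ami_id": "<AMI id with docker pre-installed>",
                "ebs_device_name": "/dev/xvda",
                "ebs_volume_size": 100,
                "ebs_volume_type": "gp2",
            },
        },
        "queues": {
            # queue name -> list of [resource name, max instances] pairs
            "gpu_queue": [["gpu_machine", 2]],
        },
        "extra_trains_conf": "",
        "extra_vm_bash_script": "",
    },
}

with open("aws_autoscaler.yaml", "w") as f:
    yaml.safe_dump(conf, f)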
View File

@@ -0,0 +1,4 @@
boto3
PyYAML
six
trains

View File

@@ -0,0 +1,244 @@
import os
import re
from itertools import chain
from operator import itemgetter
from time import sleep, time
from typing import Union

import attr
from attr.validators import instance_of

from ..backend_api import Session
from ..backend_api.session.client import APIClient


class AutoScaler(object):
    @attr.s
    class Settings(object):
        git_user = attr.ib(default=None)
        git_pass = attr.ib(default=None)
        cloud_credentials_key = attr.ib(default=None)
        cloud_credentials_secret = attr.ib(default=None)
        cloud_credentials_region = attr.ib(default=None)
        default_docker_image = attr.ib(default="nvidia/cuda")
        max_idle_time_min = attr.ib(validator=instance_of(int), default=15)
        polling_interval_time_min = attr.ib(validator=instance_of(int), default=5)
        workers_prefix = attr.ib(default="dynamic_worker")
        cloud_provider = attr.ib(default="")

        def as_dict(self):
            return attr.asdict(self)

    @attr.s
    class Configuration(object):
        resource_configurations = attr.ib(default=None)
        queues = attr.ib(default=None)
        extra_trains_conf = attr.ib(default="")
        extra_vm_bash_script = attr.ib(default="")

        def as_dict(self):
            return attr.asdict(self)

    def __init__(self, settings, configuration):
        # type: (Union[dict, AutoScaler.Settings], Union[dict, AutoScaler.Configuration]) -> None
        if isinstance(settings, dict):
            settings = self.Settings(**settings)
        if isinstance(configuration, dict):
            configuration = self.Configuration(**configuration)

        self.web_server = Session.get_app_server_host()
        self.api_server = Session.get_api_server_host()
        self.files_server = Session.get_files_server_host()

        session = Session()
        self.access_key = session.access_key
        self.secret_key = session.secret_key

        self.git_user = settings.git_user
        self.git_pass = settings.git_pass
        self.cloud_credentials_key = settings.cloud_credentials_key
        self.cloud_credentials_secret = settings.cloud_credentials_secret
        self.cloud_credentials_region = settings.cloud_credentials_region
        self.default_docker_image = settings.default_docker_image

        self.extra_trains_conf = configuration.extra_trains_conf
        self.extra_vm_bash_script = configuration.extra_vm_bash_script
        self.resource_configurations = configuration.resource_configurations
        self.queues = configuration.queues

        if not self.sanity_check():
            return

        self.max_idle_time_min = int(settings.max_idle_time_min)
        self.polling_interval_time_min = int(settings.polling_interval_time_min)

        self.workers_prefix = settings.workers_prefix
        self.cloud_provider = settings.cloud_provider

    def sanity_check(self):
        # Sanity check - validate queue resources
        if len(set(map(itemgetter(0), chain(*self.queues.values())))) != sum(
            map(len, self.queues.values())
        ):
            print(
                "Error: at least one resource name is used in multiple queues. "
                "A resource name can only appear in a single queue definition."
            )
            return False
        return True

    def start(self):
        # Loop forever, it is okay we are stateless
        while True:
            try:
                self.supervisor()
            except Exception as ex:
                print(
                    "Warning! exception occurred: {ex}\nRetrying in 15 seconds".format(
                        ex=ex
                    )
                )
                sleep(15)

    def spin_up_worker(self, resource, worker_id_prefix, queue_name):
        """
        Creates a new worker for trains (cloud-specific implementation).
        First, create an instance in the cloud and install some required packages.
        Then, define trains-agent environment variables and run trains-agent for the specified queue.
        NOTE: - Will wait until the instance is running
              - This implementation assumes the instance image already has docker installed

        :param str resource: resource name, as defined in self.resource_configurations and self.queues.
        :param str worker_id_prefix: worker name prefix
        :param str queue_name: trains queue to listen to
        """
        pass

    def spin_down_worker(self, instance_id):
        """
        Destroys the cloud instance (cloud-specific implementation).

        :param instance_id: Cloud instance ID to be destroyed
        :type instance_id: str
        """
        pass

    def supervisor(self):
        """
        Spin up or down resources as necessary.
        - For every queue in self.queues do the following:
            1. Check if there are tasks waiting in the queue.
            2. Check if there are enough idle workers available for those tasks.
            3. If more instances are required, and the maximum number of instances allowed has not been
               reached, create the required instances. Resources are chosen according to their order in
               self.queues, and no more instances are created once the maximum defined there is reached.
        - Spin down instances according to their idle time: an instance that has been idle for more than
          self.max_idle_time_min minutes is removed.
        """
        # A worker's id in trains is composed of prefix, name, instance_type and cloud_id, separated by ':'
        workers_pattern = re.compile(
            r"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)"
        )

        # Set up the environment variables for trains
        os.environ["TRAINS_API_HOST"] = self.api_server
        os.environ["TRAINS_WEB_HOST"] = self.web_server
        os.environ["TRAINS_FILES_HOST"] = self.files_server
        os.environ["TRAINS_API_ACCESS_KEY"] = self.access_key
        os.environ["TRAINS_API_SECRET_KEY"] = self.secret_key
        api_client = APIClient()

        # Verify the requested queues exist and create those that don't exist
        all_queues = [q.name for q in list(api_client.queues.get_all())]
        missing_queues = [q for q in self.queues if q not in all_queues]
        for q in missing_queues:
            api_client.queues.create(q)

        idle_workers = {}
        while True:
            queue_name_to_id = {
                queue.name: queue.id for queue in api_client.queues.get_all()
            }
            resource_to_queue = {
                item[0]: queue
                for queue, resources in self.queues.items()
                for item in resources
            }
            all_workers = [
                worker
                for worker in api_client.workers.get_all()
                if workers_pattern.match(worker.id)
                and workers_pattern.match(worker.id)["prefix"] == self.workers_prefix
            ]

            # Workers without a task are added to the idle list
            for worker in all_workers:
                if not hasattr(worker, "task") or not worker.task:
                    if worker.id not in idle_workers:
                        resource_name = workers_pattern.match(worker.id)["name"]
                        idle_workers[worker.id] = (time(), resource_name, worker)
                elif (
                    hasattr(worker, "task")
                    and worker.task
                    and worker.id in idle_workers
                ):
                    idle_workers.pop(worker.id, None)

            required_idle_resources = []  # idle resources we'll need to keep running
            allocate_new_resources = []  # resources that will need to be started
            # Check if we have tasks waiting on one of the designated queues
            for queue in self.queues:
                entries = api_client.queues.get_by_id(queue_name_to_id[queue]).entries
                if entries and len(entries) > 0:
                    queue_resources = self.queues[queue]

                    # If we have an idle worker matching the required resource,
                    # remove it from the required allocation resources
                    resource_names = [name for name, _ in queue_resources]
                    free_queue_resources = [
                        resource
                        for _, resource, _ in idle_workers.values()
                        if resource in resource_names
                    ]
                    required_idle_resources.extend(free_queue_resources)
                    spin_up_count = len(entries) - len(free_queue_resources)
                    spin_up_resources = []

                    # Add as many resources as possible to handle this queue's entries
                    for resource, max_instances in queue_resources:
                        if len(spin_up_resources) >= spin_up_count:
                            break
                        max_allowed = int(max_instances) - len(
                            [
                                worker
                                for worker in all_workers
                                if workers_pattern.match(worker.id)["name"] == resource
                            ]
                        )
                        spin_up_resources.extend(
                            [resource] * min(max_allowed, spin_up_count)
                        )
                    allocate_new_resources.extend(spin_up_resources)

            # Now we actually spin the new machines
            for resource in allocate_new_resources:
                self.spin_up_worker(
                    resource, self.workers_prefix, resource_to_queue[resource]
                )

            # Go over the idle workers list, and spin down idle workers
            for timestamp, resources, worker in idle_workers.values():
                # skip resource types that might be needed
                if resources in required_idle_resources:
                    continue
                # Remove from both aws and trains all instances that have been idle for longer than max_idle_time_min
                if time() - timestamp > self.max_idle_time_min * 60.0:
                    cloud_id = workers_pattern.match(worker.id)["cloud_id"]
                    self.spin_down_worker(cloud_id)
                    worker.unregister()

            # Nothing else to do
            sleep(self.polling_interval_time_min * 60.0)

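The base class is deliberately cloud-agnostic: supervisor() drives everything and relies only on the spin_up_worker()/spin_down_worker() overrides plus the worker-id naming convention. A minimal sketch of what a subclass for another backend might look like, mirroring the AWS implementation in the next file (the class name and the "MyCloud" backend are hypothetical):

import attr

from trains.automation.auto_scaler import AutoScaler


class MyCloudAutoScaler(AutoScaler):
    @attr.s
    class Settings(AutoScaler.Settings):
        # make workers started by this scaler identifiable in the workers list
        workers_prefix = attr.ib(default="dynamic_mycloud")
        cloud_provider = attr.ib(default="MyCloud")

    def spin_up_worker(self, resource, worker_id_prefix, queue_name):
        # hypothetical: create an instance from self.resource_configurations[resource],
        # boot it with a startup script that exports
        # TRAINS_WORKER_ID="<worker_id_prefix>:<resource>:<instance type>:<cloud instance id>"
        # and runs trains-agent on queue_name, then wait until the instance is running
        raise NotImplementedError

    def spin_down_worker(self, instance_id):
        # hypothetical: terminate the instance identified by instance_id
        raise NotImplementedError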
View File

@@ -0,0 +1,185 @@
import base64
from typing import Union

import attr

from .auto_scaler import AutoScaler
from .. import Task

try:
    # noinspection PyPackageRequirements
    import boto3

    Task.add_requirements('boto3')
except ImportError:
    raise ValueError(
        "AwsAutoScaler requires 'boto3' package, it was not found\n"
        "install with: pip install boto3"
    )


class AwsAutoScaler(AutoScaler):
    @attr.s
    class Settings(AutoScaler.Settings):
        workers_prefix = attr.ib(default="dynamic_aws")
        cloud_provider = attr.ib(default="AWS")

    def __init__(self, settings, configuration):
        # type: (Union[dict, AwsAutoScaler.Settings], Union[dict, AwsAutoScaler.Configuration]) -> None
        super(AwsAutoScaler, self).__init__(settings, configuration)

    def spin_up_worker(self, resource, worker_id_prefix, queue_name):
        """
        Creates a new worker for trains.
        First, create an instance in the cloud and install some required packages.
        Then, define trains-agent environment variables and run trains-agent for the specified queue.
        NOTE: - Will wait until the instance is running
              - This implementation assumes the instance image already has docker installed

        :param str resource: resource name, as defined in self.resource_configurations and self.queues.
        :param str worker_id_prefix: worker name prefix
        :param str queue_name: trains queue to listen to
        """
        resource_conf = self.resource_configurations[resource]
        # Add worker type and AWS instance type to the worker name.
        worker_id = "{worker_id_prefix}:{worker_type}:{instance_type}".format(
            worker_id_prefix=worker_id_prefix,
            worker_type=resource,
            instance_type=resource_conf["instance_type"],
        )

        # The user_data script runs automatically when the instance starts. It installs the packages required
        # by trains-agent, configures the agent using environment variables, and runs it on the requested queue.
        user_data = """#!/bin/bash
sudo apt-get update
sudo apt-get install -y python3-dev
sudo apt-get install -y python3-pip
sudo apt-get install -y gcc
sudo apt-get install -y git
sudo apt-get install -y build-essential
python3 -m pip install -U pip
python3 -m pip install virtualenv
python3 -m virtualenv trains_agent_venv
source trains_agent_venv/bin/activate
python -m pip install trains-agent
echo 'agent.git_user=\"{git_user}\"' >> /root/trains.conf
echo 'agent.git_pass=\"{git_pass}\"' >> /root/trains.conf
echo "{trains_conf}" >> /root/trains.conf
export TRAINS_API_HOST={api_server}
export TRAINS_WEB_HOST={web_server}
export TRAINS_FILES_HOST={files_server}
export DYNAMIC_INSTANCE_ID=`curl http://169.254.169.254/latest/meta-data/instance-id`
export TRAINS_WORKER_ID={worker_id}:$DYNAMIC_INSTANCE_ID
export TRAINS_API_ACCESS_KEY='{access_key}'
export TRAINS_API_SECRET_KEY='{secret_key}'
{bash_script}
source ~/.bashrc
python -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' {docker}
shutdown
""".format(
            api_server=self.api_server,
            web_server=self.web_server,
            files_server=self.files_server,
            worker_id=worker_id,
            access_key=self.access_key,
            secret_key=self.secret_key,
            queue=queue_name,
            git_user=self.git_user,
            git_pass=self.git_pass,
            trains_conf=self.extra_trains_conf,
            bash_script=self.extra_vm_bash_script,
            docker="--docker '{}'".format(self.default_docker_image)
            if self.default_docker_image
            else "",
        )

        ec2 = boto3.client(
            "ec2",
            aws_access_key_id=self.cloud_credentials_key or None,
            aws_secret_access_key=self.cloud_credentials_secret or None,
            region_name=self.cloud_credentials_region,
        )

        if resource_conf["is_spot"]:
            # Create a request for a spot instance in AWS
            encoded_user_data = base64.b64encode(user_data.encode("ascii")).decode(
                "ascii"
            )
            instances = ec2.request_spot_instances(
                LaunchSpecification={
                    "ImageId": resource_conf["ami_id"],
                    "InstanceType": resource_conf["instance_type"],
                    "Placement": {
                        "AvailabilityZone": resource_conf["availability_zone"]
                    },
                    "UserData": encoded_user_data,
                    "BlockDeviceMappings": [
                        {
                            "DeviceName": resource_conf["ebs_device_name"],
                            "Ebs": {
                                "VolumeSize": resource_conf["ebs_volume_size"],
                                "VolumeType": resource_conf["ebs_volume_type"],
                            },
                        }
                    ],
                }
            )

            # Wait until the spot request is fulfilled
            request_id = instances["SpotInstanceRequests"][0]["SpotInstanceRequestId"]
            waiter = ec2.get_waiter("spot_instance_request_fulfilled")
            waiter.wait(SpotInstanceRequestIds=[request_id])
            # Get the instance object for later use
            response = ec2.describe_spot_instance_requests(
                SpotInstanceRequestIds=[request_id]
            )
            instance_id = response["SpotInstanceRequests"][0]["InstanceId"]
        else:
            # Create a new EC2 instance
            instances = ec2.run_instances(
                ImageId=resource_conf["ami_id"],
                MinCount=1,
                MaxCount=1,
                InstanceType=resource_conf["instance_type"],
                UserData=user_data,
                InstanceInitiatedShutdownBehavior="terminate",
                BlockDeviceMappings=[
                    {
                        "DeviceName": resource_conf["ebs_device_name"],
                        "Ebs": {
                            "VolumeSize": resource_conf["ebs_volume_size"],
                            "VolumeType": resource_conf["ebs_volume_type"],
                        },
                    }
                ],
            )

            # Get the instance object for later use
            instance_id = instances["Instances"][0]["InstanceId"]

        # Wait until the instance is in the running state
        instance = boto3.resource(
            "ec2",
            aws_access_key_id=self.cloud_credentials_key or None,
            aws_secret_access_key=self.cloud_credentials_secret or None,
            region_name=self.cloud_credentials_region,
        ).Instance(instance_id)
        instance.wait_until_running()

    # Cloud-specific implementation (currently, only AWS EC2 is supported)
    def spin_down_worker(self, instance_id):
        """
        Destroys the cloud instance.

        :param instance_id: Cloud instance ID to be destroyed (currently, only AWS EC2 is supported)
        :type instance_id: str
        """
        try:
            boto3.resource(
                "ec2",
                aws_access_key_id=self.cloud_credentials_key or None,
                aws_secret_access_key=self.cloud_credentials_secret or None,
                region_name=self.cloud_credentials_region,
            ).instances.filter(InstanceIds=[instance_id]).terminate()
        except Exception as ex:
            raise ex
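The contract that ties supervisor() to the AWS implementation above is the worker-id naming convention: prefix, resource name, EC2 instance type and cloud instance id, separated by ':' (the last field is appended at boot time by the user_data script via TRAINS_WORKER_ID). A small parsing sketch; the worker id below is made up for illustration:

import re

workers_pattern = re.compile(
    r"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)"
)

worker_id = "dynamic_aws:gpu_machine:g4dn.4xlarge:i-0123456789abcdef0"  # hypothetical
match = workers_pattern.match(worker_id)
print(match["prefix"])         # "dynamic_aws"          -> compared with workers_prefix
print(match["name"])           # "gpu_machine"          -> resource name from the budget
print(match["instance_type"])  # "g4dn.4xlarge"         -> EC2 instance type
print(match["cloud_id"])       # "i-0123456789abcdef0"  -> passed to spin_down_worker()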