2021-02-01 21:44:01 +00:00
|
|
|
import json
|
2020-07-10 22:32:51 +00:00
|
|
|
from argparse import ArgumentParser
|
|
|
|
from collections import defaultdict
|
2020-07-30 12:17:59 +00:00
|
|
|
from itertools import chain
|
2021-11-17 17:15:48 +00:00
|
|
|
from pathlib import Path
|
2020-10-12 08:12:33 +00:00
|
|
|
from typing import Tuple
|
2020-07-10 22:32:51 +00:00
|
|
|
|
|
|
|
import yaml
|
|
|
|
|
2020-12-22 21:25:37 +00:00
|
|
|
from clearml import Task
|
2021-11-17 17:15:48 +00:00
|
|
|
from clearml.automation.auto_scaler import AutoScaler, ScalerConfig
|
|
|
|
from clearml.automation.aws_driver import AWSDriver
|
2020-12-22 21:25:37 +00:00
|
|
|
from clearml.config import running_remotely
|
|
|
|
from clearml.utilities.wizard.user_input import (
|
2021-11-17 17:15:48 +00:00
|
|
|
get_input, input_bool, input_int, input_list, multiline_input
|
2020-10-12 08:12:33 +00:00
|
|
|
)
|
2020-07-10 22:32:51 +00:00
|
|
|
|
2020-07-30 12:17:59 +00:00
|
|
|
# Docker image that run_wizard() offers as the default answer for the
# "default docker image/parameters" prompt.
DEFAULT_DOCKER_IMAGE = "nvidia/cuda:10.1-runtime-ubuntu18.04"

# Baseline configuration layout shared by the whole script:
#   * main() uses it as-is when the process is running remotely,
#   * run_wizard() uses it as the starting point for the interactive answers.
# NOTE(review): 'default_docker_image' below is 'nvidia/cuda' while the wizard
# default is DEFAULT_DOCKER_IMAGE - confirm whether the difference is intended.
default_config = dict(
    # Flat key/value settings; main() passes this section to Task.connect()
    hyper_params=dict(
        git_user='',
        git_pass='',
        cloud_credentials_key='',
        cloud_credentials_secret='',
        cloud_credentials_region=None,
        default_docker_image='nvidia/cuda',
        max_idle_time_min=15,
        polling_interval_time_min=5,
        max_spin_up_time_min=30,
        workers_prefix='dynamic_worker',
        cloud_provider='',
    ),
    # Structured settings; main() stores this section as the "General"
    # configuration object of the task (serialized as JSON)
    configurations=dict(
        resource_configurations=None,
        queues=None,
        extra_trains_conf='',
        extra_clearml_conf='',
        extra_vm_bash_script='',
    ),
)
|
|
|
|
|
|
|
|
|
2020-07-10 22:32:51 +00:00
|
|
|
def main():
    """Command-line entry point of the AWS auto-scaler example.

    Flow:
      1. Parse ``--run``, ``--remote`` and ``--config-file``.
      2. Obtain the configuration: ``default_config`` when running remotely,
         otherwise either load it from the YAML config file (if it exists and
         the user agrees) or collect it with ``run_wizard()`` and write it to
         the config file.
      3. Create the ClearML task, connect the hyper-parameters and sync the
         "General" configuration object.
      4. Optionally enqueue the task on the ``services`` queue (``--remote``).
      5. Build the driver and auto-scaler, and start the auto-scaler when
         running remotely or when ``--run`` was given.

    Returns ``None``; returns early if the wizard answers cannot be written to
    the configuration file.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--run",
        action="store_true",
        default=False,
        help="Run the autoscaler after wizard finished",
    )
    arg_parser.add_argument(
        "--remote",
        action="store_true",
        default=False,
        help="Run the autoscaler as a service, launch on the `services` queue",
    )
    arg_parser.add_argument(
        "--config-file",
        type=Path,
        default=Path("aws_autoscaler.yaml"),
        help="Configuration file name",
    )
    args = arg_parser.parse_args()

    if running_remotely():
        # No interactive input when running remotely: start from the defaults
        conf = default_config
    else:
        banner = (
            "AWS Autoscaler setup wizard\n"
            "---------------------------\n"
            "Follow the wizard to configure your AWS auto-scaler service.\n"
            "Once completed, you will be able to view and change the configuration in the clearml-server web UI.\n"
            "It means there is no need to worry about typos or mistakes :)\n"
        )
        print(banner)

        config_path = args.config_file
        # The user is only asked when the file actually exists (short-circuit)
        reuse_existing_file = config_path.exists() and input_bool(
            "Load configurations from config file '{}' [Y/n]? ".format(config_path),
            default=True,
        )

        if reuse_existing_file:
            with config_path.open("r") as stream:
                conf = yaml.load(stream, Loader=yaml.SafeLoader)
        else:
            wizard_configurations, wizard_hyper_params = run_wizard()
            conf = {
                "hyper_params": wizard_hyper_params,
                "configurations": wizard_configurations,
            }
            # Store the answers in the config file; give up if that is not possible
            # noinspection PyBroadException
            try:
                with config_path.open("w+") as stream:
                    yaml.safe_dump(conf, stream)
            except Exception:
                print(
                    "Error! Could not write configuration file at: {}".format(
                        config_path
                    )
                )
                return

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    scaler_task = Task.init(
        project_name="DevOps",
        task_name="AWS Auto-Scaler",
        task_type=Task.TaskTypes.service,
    )
    scaler_task.connect(conf['hyper_params'])

    # Merge whatever is already stored under "General" on the task into the
    # local configuration, then write the merged result back to the task.
    general_section = conf['configurations']
    stored_general_text = scaler_task.get_configuration_object(name="General") or "{}"
    general_section.update(json.loads(stored_general_text))
    scaler_task.set_configuration_object(
        name="General",
        config_text=json.dumps(general_section, indent=2),
    )

    if args.remote or args.run:
        print(
            "Running AWS auto-scaler as a service\nExecution log {}".format(
                scaler_task.get_output_log_web_page()
            )
        )

    if args.remote:
        # if we are running remotely enqueue this run, and leave the process
        # the clearml-agent services will pick it up and execute it for us.
        scaler_task.execute_remotely(queue_name='services')

    # Both objects are built from the plain configuration dictionary
    driver = AWSDriver.from_config(conf)
    scaler_config = ScalerConfig.from_config(conf)
    autoscaler = AutoScaler(scaler_config, driver)

    if running_remotely() or args.run:
        autoscaler.start()
|
|
|
|
|
|
|
|
|
|
|
|
def run_wizard():
    # type: () -> Tuple[dict, dict]
    """Interactively collect the auto-scaler settings from the user.

    The wizard asks, in this order, for: the AWS credentials and region, the
    git credentials, the default docker image, one or more instance type
    definitions, an extra bash script and extra clearml.conf content, the
    queues with the instance types (and maximum instance count) attached to
    each of them, and finally the idle-time and polling-interval settings.

    The answers are stored in fresh copies of the two ``default_config``
    sections, so the module-level defaults are not modified by running the
    wizard.

    :return: A ``(configurations, hyper_params)`` tuple of dictionaries with
        the same keys as ``default_config['configurations']`` and
        ``default_config['hyper_params']``.
    """
    # Copy the defaults instead of mutating the shared module-level dictionaries.
    # A shallow copy is enough: only top-level keys are (re)assigned below.
    hyper_params = dict(default_config['hyper_params'])
    configurations = dict(default_config['configurations'])

    hyper_params['cloud_credentials_key'] = get_input("AWS Access Key ID", required=True)
    hyper_params['cloud_credentials_secret'] = get_input(
        "AWS Secret Access Key", required=True
    )
    hyper_params['cloud_credentials_region'] = get_input(
        "AWS region name",
        "[us-east-1]",
        default='us-east-1')
    # get GIT User/Pass for cloning
    print(
        "\nGIT credentials:"
        "\nEnter GIT username for repository cloning (leave blank for SSH key authentication): [] ",
        end="",
    )
    git_user = input()
    if git_user.strip():
        print("Enter password for user '{}': ".format(git_user), end="")
        git_pass = input()
        # NOTE(review): the password is read with input() and echoed back in
        # clear text below - confirm this is acceptable for the target users.
        print(
            "Git repository cloning will be using user={} password={}".format(
                git_user, git_pass
            )
        )
    else:
        # Blank user name: no user/password is stored
        git_user = ''
        git_pass = ''

    hyper_params['git_user'] = git_user
    hyper_params['git_pass'] = git_pass

    hyper_params['default_docker_image'] = get_input(
        "default docker image/parameters",
        "to use [{}]".format(DEFAULT_DOCKER_IMAGE),
        default=DEFAULT_DOCKER_IMAGE,
        new_line=True,
    )
    print("\nConfigure the machine types for the auto-scaler:")
    print("------------------------------------------------")
    resource_configurations = {}
    while True:
        # The dictionary entries are evaluated top to bottom, which defines
        # the order in which the questions are presented to the user.
        a_resource = {
            "instance_type": get_input(
                "Amazon instance type",
                "['g4dn.4xlarge']",
                question='Select',
                default="g4dn.4xlarge",
            ),
            "is_spot": input_bool(
                "Use spot instances? [y/N]"
            ),
            "availability_zone": get_input(
                "availability zone",
                "['us-east-1b']",
                question='Select',
                default="us-east-1b",
            ),
            "ami_id": get_input(
                "the Amazon Machine Image id",
                "['ami-04c0416d6bd8e4b1f']",
                question='Select',
                default="ami-04c0416d6bd8e4b1f",
            ),
            "ebs_device_name": get_input(
                "the Amazon EBS device",
                "['/dev/sda1']",
                default="/dev/sda1",
            ),
            "ebs_volume_size": input_int(
                "the Amazon EBS volume size",
                "(in GiB) [100]",
                default=100,
            ),
            "ebs_volume_type": get_input(
                "the Amazon EBS volume type",
                "['gp3']",
                default="gp3",
            ),
            "key_name": get_input(
                "the Amazon Key Pair name",
            ),
            "security_group_ids": input_list(
                "Amazon Security Group ID",
            ),
        }

        # Keep asking until the user picks a name that is not taken yet
        while True:
            resource_name = get_input(
                "a name for this instance type",
                "(used in the budget section) For example 'aws4gpu'",
                question='Select',
                required=True,
            )
            if resource_name in resource_configurations:
                print("\tError: instance type '{}' already used!".format(resource_name))
                continue
            break
        resource_configurations[resource_name] = a_resource

        if not input_bool("\nDefine another instance type? [y/N]"):
            break

    configurations['resource_configurations'] = resource_configurations

    configurations['extra_vm_bash_script'], num_lines_bash_script = multiline_input(
        "\nEnter any pre-execution bash script to be executed on the newly created instances []"
    )
    print("Entered {} lines of pre-execution bash script".format(num_lines_bash_script))

    configurations['extra_clearml_conf'], num_lines_clearml_conf = multiline_input(
        "\nEnter anything you'd like to include in your clearml.conf file []"
    )
    print("Entered {} extra lines for clearml.conf file".format(num_lines_clearml_conf))

    print("\nDefine the machines budget:")
    print("-----------------------------")
    # Snapshot of the names defined above; entries added later by the renaming
    # logic are intentionally not part of this list.
    resource_configurations_names = list(configurations['resource_configurations'].keys())
    # queue name -> list of (instance type name, max number of instances)
    queues = defaultdict(list)
    while True:
        # Pick a queue name that was not used before
        while True:
            queue_name = get_input("a queue name (for example: 'aws_4gpu_machines')", question='Select', required=True)
            if queue_name in queues:
                print("\tError: queue name '{}' already used!".format(queue_name))
                continue
            break

        # Attach one or more instance types to the selected queue
        while True:
            valid_instances = [k for k in resource_configurations_names
                               if k not in (q[0] for q in queues[queue_name])]
            while True:
                queue_type = get_input(
                    "an instance type to attach to the queue",
                    "{}".format(valid_instances),
                    question="Select",
                    required=True,
                )
                if queue_type not in configurations['resource_configurations']:
                    print("\tError: instance type '{}' not in predefined instances {}!".format(
                        queue_type, resource_configurations_names))
                    continue

                if queue_type in (q[0] for q in queues[queue_name]):
                    print("\tError: instance type '{}' already in {}!".format(
                        queue_type, queue_name))
                    continue

                # An instance type already attached to some queue is duplicated
                # under a new, queue-specific name before being attached again.
                if queue_type in [q[0] for q in chain.from_iterable(queues.values())]:
                    queue_type_new = '{}_{}'.format(queue_type, queue_name)
                    print("\tInstance type '{}' already used, renaming instance to {}".format(
                        queue_type, queue_type_new))
                    configurations['resource_configurations'][queue_type_new] = \
                        dict(**configurations['resource_configurations'][queue_type])
                    queue_type = queue_type_new

                    # make sure the renamed name is not reused
                    if queue_type in (q[0] for q in queues[queue_name]):
                        print("\tError: instance type '{}' already in {}!".format(
                            queue_type, queue_name))
                        continue

                break

            max_instances = input_int(
                "maximum number of '{}' instances to spin simultaneously (example: 3)".format(queue_type),
                required=True
            )

            queues[queue_name].append((queue_type, max_instances))
            # Stop asking once every known instance type is attached to this queue
            valid_instances = [k for k in configurations['resource_configurations'].keys()
                               if k not in (q[0] for q in queues[queue_name])]
            if not valid_instances:
                break

            if not input_bool("Do you wish to add another instance type to queue? [y/N]: "):
                break
        if not input_bool("\nAdd another queue? [y/N]"):
            break
    # Store a plain dict rather than the defaultdict
    configurations['queues'] = dict(queues)

    hyper_params['max_idle_time_min'] = input_int(
        "maximum idle time",
        "for the auto-scaler to spin down an instance (in minutes) [15]",
        default=15,
        new_line=True,
    )
    hyper_params['polling_interval_time_min'] = input_int(
        "instances polling interval", "for the auto-scaler (in minutes) [5]", default=5,
    )

    return configurations, hyper_params
|
2020-07-10 22:32:51 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Run main() only when this file is executed as a script (not when imported)
if __name__ == "__main__":
    main()
|