Add AWS dynamic cluster management service

This commit is contained in:
allegroai 2019-12-24 23:09:26 +02:00
parent 966b14f914
commit 901c4be9ae

View File

@ -228,7 +228,8 @@
" \"\"\"\n",
" Creates a new worker for trains.\n",
" First, create an instance in the cloud and install some required packages.\n",
" Then, define trains-agent environment variables and run trains-agent for the specified queue.\n",
" Then, define trains-agent environment variables and run \n",
" trains-agent for the specified queue.\n",
" NOTE: - Will wait until instance is running\n",
" - This implementation assumes the instance image already has docker installed\n",
"\n",
@ -244,8 +245,9 @@
" instance_type=resource_conf[\"instance_type\"],\n",
" )\n",
"\n",
" # user_data script will automatically run when the instance is started. it will install the required packages\n",
" # for trains-agent configure it using environment variables and run trains-agent on the required queue\n",
" # user_data script will automatically run when the instance is started. \n",
" # It will install the required packages for trains-agent, configure it using \n",
" # environment variables, and run trains-agent on the required queue\n",
" user_data = \"\"\"#!/bin/bash \n",
" sudo apt-get update\n",
" sudo apt-get install -y python3-dev\n",
@ -346,8 +348,8 @@
" \"\"\"\n",
" Destroys the cloud instance.\n",
"\n",
" :param instance_id: Cloud instance ID to be destroyed (currently, only AWS EC2 is supported)\n",
" :type instance_id: str\n",
" :param str instance_id: Cloud instance ID to be destroyed \n",
" (currently, only AWS EC2 is supported)\n",
" \"\"\"\n",
" try:\n",
" boto3.resource(\"ec2\", region_name=CLOUD_CREDENTIALS_REGION).instances.filter(\n",
@ -378,15 +380,16 @@
" 2. Check if there are enough idle workers available for those tasks.\n",
" 3. In case more instances are required, and we haven't reached max instances allowed,\n",
" create the required instances with regards to the maximum number defined in QUEUES\n",
" Choose which instance to create according to their order QUEUES. Won't create more instances\n",
" if maximum number defined has already reached.\n",
" - spin down instances according to their idle time. instance which is idle for more than MAX_IDLE_TIME_MIN\n",
" minutes would be removed.\n",
" Choose which instance to create according to their order in QUEUES. Won't create \n",
" more instances if maximum number defined has already been reached.\n",
" - spin down instances according to their idle time. An instance which is idle for \n",
" more than MAX_IDLE_TIME_MIN minutes will be removed.\n",
" \"\"\"\n",
"\n",
" # Internal definitions\n",
" workers_prefix = \"dynamic_aws\"\n",
" # Worker's id in trains would be composed from prefix, name, instance_type and cloud_id separated by ';'\n",
" # Worker's id in trains would be composed from:\n",
" # prefix, name, instance_type and cloud_id separated by ':'\n",
" workers_pattern = re.compile(\n",
" r\"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)\"\n",
" )\n",
@ -469,7 +472,8 @@
" # skip resource types that might be needed\n",
" if resources in required_idle_resources:\n",
" continue\n",
" # Remove from both aws and trains all instances that are idle for longer than MAX_IDLE_TIME_MIN\n",
" # Remove from both aws and trains all instances that are \n",
" # idle for longer than MAX_IDLE_TIME_MIN\n",
" if time() - timestamp > MAX_IDLE_TIME_MIN * 60.0:\n",
" cloud_id = workers_pattern.match(worker.id)[\"cloud_id\"]\n",
" spin_down_worker(cloud_id)\n",