mirror of
https://github.com/clearml/clearml-agent
synced 2025-01-31 00:56:53 +00:00
Add AWS dynamic cluster management service
This commit is contained in:
parent
966b14f914
commit
901c4be9ae
@ -228,7 +228,8 @@
|
||||
" \"\"\"\n",
|
||||
" Creates a new worker for trains.\n",
|
||||
" First, create an instance in the cloud and install some required packages.\n",
|
||||
" Then, define trains-agent environment variables and run trains-agent for the specified queue.\n",
|
||||
" Then, define trains-agent environment variables and run \n",
|
||||
" trains-agent for the specified queue.\n",
|
||||
" NOTE: - Will wait until instance is running\n",
|
||||
" - This implementation assumes the instance image already has docker installed\n",
|
||||
"\n",
|
||||
@ -244,8 +245,9 @@
|
||||
" instance_type=resource_conf[\"instance_type\"],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # user_data script will automatically run when the instance is started. it will install the required packages\n",
|
||||
" # for trains-agent configure it using environment variables and run trains-agent on the required queue\n",
|
||||
" # user_data script will automatically run when the instance is started. \n",
|
||||
" # It will install the required packages for trains-agent, configure it using \n",
|
||||
" # environment variables and run trains-agent on the required queue\n",
|
||||
" user_data = \"\"\"#!/bin/bash \n",
|
||||
" sudo apt-get update\n",
|
||||
" sudo apt-get install -y python3-dev\n",
|
||||
@ -346,8 +348,8 @@
|
||||
" \"\"\"\n",
|
||||
" Destroys the cloud instance.\n",
|
||||
"\n",
|
||||
" :param instance_id: Cloud instance ID to be destroyed (currently, only AWS EC2 is supported)\n",
|
||||
" :type instance_id: str\n",
|
||||
" :param str instance_id: Cloud instance ID to be destroyed \n",
|
||||
" (currently, only AWS EC2 is supported)\n",
|
||||
" \"\"\"\n",
|
||||
" try:\n",
|
||||
" boto3.resource(\"ec2\", region_name=CLOUD_CREDENTIALS_REGION).instances.filter(\n",
|
||||
@ -378,15 +380,16 @@
|
||||
" 2. Check if there are enough idle workers available for those tasks.\n",
|
||||
" 3. In case more instances are required, and we haven't reached max instances allowed,\n",
|
||||
" create the required instances with regards to the maximum number defined in QUEUES\n",
|
||||
" Choose which instance to create according to their order QUEUES. Won't create more instances\n",
|
||||
" if maximum number defined has already reached.\n",
|
||||
" - spin down instances according to their idle time. instance which is idle for more than MAX_IDLE_TIME_MIN\n",
|
||||
" minutes would be removed.\n",
|
||||
" Choose which instance to create according to their order in QUEUES. Won't create \n",
|
||||
" more instances if maximum number defined has already been reached.\n",
|
||||
" - spin down instances according to their idle time. instance which is idle for \n",
|
||||
" more than MAX_IDLE_TIME_MIN minutes would be removed.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Internal definitions\n",
|
||||
" workers_prefix = \"dynamic_aws\"\n",
|
||||
" # Worker's id in trains would be composed from prefix, name, instance_type and cloud_id separated by ';'\n",
|
||||
" # Worker's id in trains would be composed from:\n",
|
||||
" # prefix, name, instance_type and cloud_id separated by ':'\n",
|
||||
" workers_pattern = re.compile(\n",
|
||||
" r\"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)\"\n",
|
||||
" )\n",
|
||||
@ -469,7 +472,8 @@
|
||||
" # skip resource types that might be needed\n",
|
||||
" if resources in required_idle_resources:\n",
|
||||
" continue\n",
|
||||
" # Remove from both aws and trains all instances that are idle for longer than MAX_IDLE_TIME_MIN\n",
|
||||
" # Remove from both aws and trains all instances that are \n",
|
||||
" # idle for longer than MAX_IDLE_TIME_MIN\n",
|
||||
" if time() - timestamp > MAX_IDLE_TIME_MIN * 60.0:\n",
|
||||
" cloud_id = workers_pattern.match(worker.id)[\"cloud_id\"]\n",
|
||||
" spin_down_worker(cloud_id)\n",
|
||||
|
Loading…
Reference in New Issue
Block a user