Add auto-terminate, increase the polling interval, and add a default docker image in the AWS dynamic cluster management service

This commit is contained in:
allegroai 2020-01-08 12:27:40 +02:00
parent facbee0005
commit 23668a403a

View File

@ -166,7 +166,11 @@
"# echo \"This is the second line\"\n", "# echo \"This is the second line\"\n",
"# \"\"\"\n", "# \"\"\"\n",
"EXTRA_BASH_SCRIPT = \"\"\"\n", "EXTRA_BASH_SCRIPT = \"\"\"\n",
"\"\"\"" "\"\"\"\n",
"\n",
"# Default docker image for trains-agent when running in docker mode (requires docker v19.03 or above).\n",
"# Leave empty to run trains-agent in non-docker mode.\n",
"DEFAULT_DOCKER_IMAGE = \"nvidia/cuda\""
] ]
}, },
{ {
@ -180,7 +184,8 @@
"# maximum idle time in minutes, after which the instance will be shutdown\n", "# maximum idle time in minutes, after which the instance will be shutdown\n",
"MAX_IDLE_TIME_MIN = 15\n", "MAX_IDLE_TIME_MIN = 15\n",
"# polling interval in minutes\n", "# polling interval in minutes\n",
"POLLING_INTERVAL_MIN = 2.0" "# increase this value if bash commands were added to EXTRA_BASH_SCRIPT\n",
"POLLING_INTERVAL_MIN = 5.0"
] ]
}, },
{ {
@ -284,7 +289,9 @@
" export TRAINS_API_SECRET_KEY='{secret_key}'\n", " export TRAINS_API_SECRET_KEY='{secret_key}'\n",
" screen\n", " screen\n",
" {bash_script}\n", " {bash_script}\n",
" python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' --docker\"\"\".format(\n", " python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' {docker}\n",
" shutdown\n",
" \"\"\".format(\n",
" api_server=TRAINS_SERVER_API_SERVER,\n", " api_server=TRAINS_SERVER_API_SERVER,\n",
" web_server=TRAINS_SERVER_WEB_SERVER,\n", " web_server=TRAINS_SERVER_WEB_SERVER,\n",
" files_server=TRAINS_SERVER_FILES_SERVER,\n", " files_server=TRAINS_SERVER_FILES_SERVER,\n",
@ -295,7 +302,8 @@
" git_user=TRAINS_GIT_USER,\n", " git_user=TRAINS_GIT_USER,\n",
" git_pass=TRAINS_GIT_PASS,\n", " git_pass=TRAINS_GIT_PASS,\n",
" trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n", " trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n",
" bash_script=EXTRA_BASH_SCRIPT\n", " bash_script=EXTRA_BASH_SCRIPT,\n",
" docker=\"--docker '{}'\".format(DEFAULT_DOCKER_IMAGE) if DEFAULT_DOCKER_IMAGE else \"\"\n",
" )\n", " )\n",
"\n", "\n",
" ec2 = boto3.client(\n", " ec2 = boto3.client(\n",
@ -344,6 +352,7 @@
" MaxCount=1,\n", " MaxCount=1,\n",
" InstanceType=resource_conf[\"instance_type\"],\n", " InstanceType=resource_conf[\"instance_type\"],\n",
" UserData=user_data,\n", " UserData=user_data,\n",
" InstanceInitiatedShutdownBehavior='terminate',\n",
" BlockDeviceMappings=[\n", " BlockDeviceMappings=[\n",
" {\n", " {\n",
" \"DeviceName\": resource_conf[\"ebs_device_name\"],\n", " \"DeviceName\": resource_conf[\"ebs_device_name\"],\n",