mirror of
https://github.com/clearml/clearml-agent
synced 2025-02-07 13:26:08 +00:00
Add auto-terminate, an increased polling interval, and a default docker image to the AWS dynamic cluster management service
This commit is contained in:
parent
facbee0005
commit
23668a403a
@ -166,7 +166,11 @@
|
|||||||
"# echo \"This is the second line\"\n",
|
"# echo \"This is the second line\"\n",
|
||||||
"# \"\"\"\n",
|
"# \"\"\"\n",
|
||||||
"EXTRA_BASH_SCRIPT = \"\"\"\n",
|
"EXTRA_BASH_SCRIPT = \"\"\"\n",
|
||||||
"\"\"\""
|
"\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"# Default docker for trains-agent when running in docker mode (requires docker v19.03 and above). \n",
|
||||||
|
"# Leave empty to run trains-agent in non-docker mode.\n",
|
||||||
|
"DEFAULT_DOCKER_IMAGE = \"nvidia/cuda\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -180,7 +184,8 @@
|
|||||||
"# maximum idle time in minutes, after which the instance will be shutdown\n",
|
"# maximum idle time in minutes, after which the instance will be shutdown\n",
|
||||||
"MAX_IDLE_TIME_MIN = 15\n",
|
"MAX_IDLE_TIME_MIN = 15\n",
|
||||||
"# polling interval in minutes\n",
|
"# polling interval in minutes\n",
|
||||||
"POLLING_INTERVAL_MIN = 2.0"
|
"# make sure to increase in case bash commands were added in EXTRA_BASH_SCRIPT\n",
|
||||||
|
"POLLING_INTERVAL_MIN = 5.0"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -284,7 +289,9 @@
|
|||||||
" export TRAINS_API_SECRET_KEY='{secret_key}'\n",
|
" export TRAINS_API_SECRET_KEY='{secret_key}'\n",
|
||||||
" screen\n",
|
" screen\n",
|
||||||
" {bash_script}\n",
|
" {bash_script}\n",
|
||||||
" python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' --docker\"\"\".format(\n",
|
" python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' {docker}\n",
|
||||||
|
" shutdown\n",
|
||||||
|
" \"\"\".format(\n",
|
||||||
" api_server=TRAINS_SERVER_API_SERVER,\n",
|
" api_server=TRAINS_SERVER_API_SERVER,\n",
|
||||||
" web_server=TRAINS_SERVER_WEB_SERVER,\n",
|
" web_server=TRAINS_SERVER_WEB_SERVER,\n",
|
||||||
" files_server=TRAINS_SERVER_FILES_SERVER,\n",
|
" files_server=TRAINS_SERVER_FILES_SERVER,\n",
|
||||||
@ -295,7 +302,8 @@
|
|||||||
" git_user=TRAINS_GIT_USER,\n",
|
" git_user=TRAINS_GIT_USER,\n",
|
||||||
" git_pass=TRAINS_GIT_PASS,\n",
|
" git_pass=TRAINS_GIT_PASS,\n",
|
||||||
" trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n",
|
" trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n",
|
||||||
" bash_script=EXTRA_BASH_SCRIPT\n",
|
" bash_script=EXTRA_BASH_SCRIPT,\n",
|
||||||
|
" docker=\"--docker '{}'\".format(DEFAULT_DOCKER_IMAGE) if DEFAULT_DOCKER_IMAGE else \"\"\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" ec2 = boto3.client(\n",
|
" ec2 = boto3.client(\n",
|
||||||
@ -344,6 +352,7 @@
|
|||||||
" MaxCount=1,\n",
|
" MaxCount=1,\n",
|
||||||
" InstanceType=resource_conf[\"instance_type\"],\n",
|
" InstanceType=resource_conf[\"instance_type\"],\n",
|
||||||
" UserData=user_data,\n",
|
" UserData=user_data,\n",
|
||||||
|
" InstanceInitiatedShutdownBehavior='terminate',\n",
|
||||||
" BlockDeviceMappings=[\n",
|
" BlockDeviceMappings=[\n",
|
||||||
" {\n",
|
" {\n",
|
||||||
" \"DeviceName\": resource_conf[\"ebs_device_name\"],\n",
|
" \"DeviceName\": resource_conf[\"ebs_device_name\"],\n",
|
||||||
|
Loading…
Reference in New Issue
Block a user