From 23668a403a7edb33b0f6f4d26578a03cb67c49d2 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 8 Jan 2020 12:27:40 +0200 Subject: [PATCH] Add auto terminate, increased polling interval and default docker image in AWS dynamic cluster management service --- examples/dynamic_cloud_cluster.ipynb | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/dynamic_cloud_cluster.ipynb b/examples/dynamic_cloud_cluster.ipynb index ab1bc57..380e076 100644 --- a/examples/dynamic_cloud_cluster.ipynb +++ b/examples/dynamic_cloud_cluster.ipynb @@ -166,7 +166,11 @@ "# echo \"This is the second line\"\n", "# \"\"\"\n", "EXTRA_BASH_SCRIPT = \"\"\"\n", - "\"\"\"" + "\"\"\"\n", + "\n", + "# Default docker for trains-agent when running in docker mode (requires docker v19.03 and above). \n", + "# Leave empty to run trains-agent in non-docker mode.\n", + "DEFAULT_DOCKER_IMAGE = \"nvidia/cuda\"" ] }, { @@ -180,7 +184,8 @@ "# maximum idle time in minutes, after which the instance will be shutdown\n", "MAX_IDLE_TIME_MIN = 15\n", "# polling interval in minutes\n", - "POLLING_INTERVAL_MIN = 2.0" + "# make sure to increase in case bash commands were added in EXTRA_BASH_SCRIPT\n", + "POLLING_INTERVAL_MIN = 5.0" ] }, { @@ -284,7 +289,9 @@ " export TRAINS_API_SECRET_KEY='{secret_key}'\n", " screen\n", " {bash_script}\n", - " python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' --docker\"\"\".format(\n", + " python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' {docker}\n", + " shutdown\n", + " \"\"\".format(\n", " api_server=TRAINS_SERVER_API_SERVER,\n", " web_server=TRAINS_SERVER_WEB_SERVER,\n", " files_server=TRAINS_SERVER_FILES_SERVER,\n", @@ -295,7 +302,8 @@ " git_user=TRAINS_GIT_USER,\n", " git_pass=TRAINS_GIT_PASS,\n", " trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n", - " bash_script=EXTRA_BASH_SCRIPT\n", + " bash_script=EXTRA_BASH_SCRIPT,\n", + " docker=\"--docker '{}'\".format(DEFAULT_DOCKER_IMAGE) if DEFAULT_DOCKER_IMAGE else \"\"\n", " )\n", "\n", " ec2 = boto3.client(\n", @@ -344,6 +352,7 @@ " MaxCount=1,\n", " InstanceType=resource_conf[\"instance_type\"],\n", " UserData=user_data,\n", + " InstanceInitiatedShutdownBehavior='terminate',\n", " BlockDeviceMappings=[\n", " {\n", " \"DeviceName\": resource_conf[\"ebs_device_name\"],\n",