Add extra agent configuration and bash script for the AWS dynamic cluster management service

This commit is contained in:
allegroai 2020-01-06 15:26:55 +02:00
parent 119ecaa2e3
commit c486cfd09f

View File

@ -153,7 +153,20 @@
"# Git User/Pass to be used by trains-agent,\n", "# Git User/Pass to be used by trains-agent,\n",
"# leave empty if image already contains git ssh-key\n", "# leave empty if image already contains git ssh-key\n",
"TRAINS_GIT_USER = \"\"\n", "TRAINS_GIT_USER = \"\"\n",
"TRAINS_GIT_PASS = \"\"" "TRAINS_GIT_PASS = \"\"\n",
"\n",
"# Additional fields for trains.conf file created on the remote instance\n",
"# for example: 'agent.default_docker.image: \"nvidia/cuda:10.0-cudnn7-runtime\"'\n",
"EXTRA_TRAINS_CONF = \"\"\"\n",
"\"\"\"\n",
"\n",
"# Bash script to run on instances before running trains-agent\n",
"# Example: \"\"\"\n",
"# echo \"This is the first line\"\n",
"# echo \"This is the second line\"\n",
"# \"\"\"\n",
"EXTRA_BASH_SCRIPT = \"\"\"\n",
"\"\"\""
] ]
}, },
{ {
@ -207,7 +220,10 @@
" print(\n", " print(\n",
" \"Error: at least one resource name is used in multiple queues. \"\n", " \"Error: at least one resource name is used in multiple queues. \"\n",
" \"A resource name can only appear in a single queue definition.\"\n", " \"A resource name can only appear in a single queue definition.\"\n",
" )" " )\n",
"\n",
"# Encode EXTRA_TRAINS_CONF for later bash script usage\n",
"EXTRA_TRAINS_CONF_ENCODED = \"\\\\\\\"\".join(EXTRA_TRAINS_CONF.split(\"\\\"\"))"
] ]
}, },
{ {
@ -258,6 +274,7 @@
" sudo python3 -m pip install trains-agent\n", " sudo python3 -m pip install trains-agent\n",
" echo 'agent.git_user=\\\"{git_user}\\\"' >> /root/trains.conf\n", " echo 'agent.git_user=\\\"{git_user}\\\"' >> /root/trains.conf\n",
" echo 'agent.git_pass=\\\"{git_pass}\\\"' >> /root/trains.conf\n", " echo 'agent.git_pass=\\\"{git_pass}\\\"' >> /root/trains.conf\n",
" echo {trains_conf} >> /root/trains.conf\n",
" export TRAINS_API_HOST={api_server}\n", " export TRAINS_API_HOST={api_server}\n",
" export TRAINS_WEB_HOST={web_server}\n", " export TRAINS_WEB_HOST={web_server}\n",
" export TRAINS_FILES_HOST={files_server}\n", " export TRAINS_FILES_HOST={files_server}\n",
@ -266,6 +283,7 @@
" export TRAINS_API_ACCESS_KEY='{access_key}'\n", " export TRAINS_API_ACCESS_KEY='{access_key}'\n",
" export TRAINS_API_SECRET_KEY='{secret_key}'\n", " export TRAINS_API_SECRET_KEY='{secret_key}'\n",
" screen\n", " screen\n",
" {bash_script}\n",
" python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' --docker\"\"\".format(\n", " python3 -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' --docker\"\"\".format(\n",
" api_server=TRAINS_SERVER_API_SERVER,\n", " api_server=TRAINS_SERVER_API_SERVER,\n",
" web_server=TRAINS_SERVER_WEB_SERVER,\n", " web_server=TRAINS_SERVER_WEB_SERVER,\n",
@ -276,6 +294,8 @@
" queue=queue_name,\n", " queue=queue_name,\n",
" git_user=TRAINS_GIT_USER,\n", " git_user=TRAINS_GIT_USER,\n",
" git_pass=TRAINS_GIT_PASS,\n", " git_pass=TRAINS_GIT_PASS,\n",
" trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n",
" bash_script=EXTRA_BASH_SCRIPT\n",
" )\n", " )\n",
"\n", "\n",
" ec2 = boto3.client(\n", " ec2 = boto3.client(\n",