mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Merge branch 'allegroai:master' into master
This commit is contained in:
commit
1caf7b104f
@ -26,6 +26,9 @@
|
|||||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||||
# The default is the python executing the clearml_agent
|
# The default is the python executing the clearml_agent
|
||||||
python_binary: ""
|
python_binary: ""
|
||||||
|
# ignore any requested python version (Default: False, if a Task was using a
|
||||||
|
# specific python version and the system supports multiple python the agent will use the requested python version)
|
||||||
|
# ignore_requested_python_version: true
|
||||||
|
|
||||||
# select python package manager:
|
# select python package manager:
|
||||||
# currently supported pip and conda
|
# currently supported pip and conda
|
||||||
@ -182,4 +185,16 @@
|
|||||||
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||||
# cuda_version: 10.1
|
# cuda_version: 10.1
|
||||||
# cudnn_version: 7.6
|
# cudnn_version: 7.6
|
||||||
|
|
||||||
|
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||||
|
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||||
|
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||||
|
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||||
|
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||||
|
# docker command, set:
|
||||||
|
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||||
|
hide_docker_command_env_vars {
|
||||||
|
enabled: true
|
||||||
|
extra_keys: []
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -111,7 +111,8 @@ class Session(TokenManager):
|
|||||||
self._logger = logger
|
self._logger = logger
|
||||||
|
|
||||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key)
|
default=(self.config.get("api.credentials.access_key", None) or self.default_key),
|
||||||
|
value_cb=lambda key, value: logger.info("Using environment access key {}={}".format(key, value))
|
||||||
)
|
)
|
||||||
if not self.access_key:
|
if not self.access_key:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -119,7 +120,8 @@ class Session(TokenManager):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret)
|
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
|
||||||
|
value_cb=lambda key, value: logger.info("Using environment secret key {}=********".format(key))
|
||||||
)
|
)
|
||||||
if not self.secret_key:
|
if not self.secret_key:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -64,8 +64,8 @@ class Entry(object):
|
|||||||
converter = self.default_conversions().get(self.type, self.type)
|
converter = self.default_conversions().get(self.type, self.type)
|
||||||
return converter(value)
|
return converter(value)
|
||||||
|
|
||||||
def get_pair(self, default=NotSet, converter=None):
|
def get_pair(self, default=NotSet, converter=None, value_cb=None):
|
||||||
# type: (Any, Converter) -> Optional[Tuple[Text, Any]]
|
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]]
|
||||||
for key in self.keys:
|
for key in self.keys:
|
||||||
value = self._get(key)
|
value = self._get(key)
|
||||||
if value is NotSet:
|
if value is NotSet:
|
||||||
@ -75,13 +75,20 @@ class Entry(object):
|
|||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
self.error("invalid value {key}={value}: {ex}".format(**locals()))
|
self.error("invalid value {key}={value}: {ex}".format(**locals()))
|
||||||
break
|
break
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
if value_cb:
|
||||||
|
value_cb(key, value)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
return key, value
|
return key, value
|
||||||
|
|
||||||
result = self.default if default is NotSet else default
|
result = self.default if default is NotSet else default
|
||||||
return self.key, result
|
return self.key, result
|
||||||
|
|
||||||
def get(self, default=NotSet, converter=None):
|
def get(self, default=NotSet, converter=None, value_cb=None):
|
||||||
# type: (Any, Converter) -> Optional[Any]
|
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any]
|
||||||
return self.get_pair(default=default, converter=converter)[1]
|
return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1]
|
||||||
|
|
||||||
def set(self, value):
|
def set(self, value):
|
||||||
# type: (Any, Any) -> (Text, Any)
|
# type: (Any, Any) -> (Text, Any)
|
||||||
|
@ -11,6 +11,7 @@ import subprocess
|
|||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import traceback
|
import traceback
|
||||||
|
import shlex
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from copy import deepcopy, copy
|
from copy import deepcopy, copy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -19,7 +20,7 @@ from functools import partial, cmp_to_key
|
|||||||
from itertools import chain
|
from itertools import chain
|
||||||
from tempfile import mkdtemp, NamedTemporaryFile
|
from tempfile import mkdtemp, NamedTemporaryFile
|
||||||
from time import sleep, time
|
from time import sleep, time
|
||||||
from typing import Text, Optional, Any, Tuple
|
from typing import Text, Optional, Any, Tuple, List
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
import psutil
|
import psutil
|
||||||
@ -43,7 +44,14 @@ from clearml_agent.definitions import (
|
|||||||
ENV_DOCKER_HOST_MOUNT,
|
ENV_DOCKER_HOST_MOUNT,
|
||||||
ENV_TASK_EXTRA_PYTHON_PATH,
|
ENV_TASK_EXTRA_PYTHON_PATH,
|
||||||
ENV_AGENT_GIT_USER,
|
ENV_AGENT_GIT_USER,
|
||||||
ENV_AGENT_GIT_PASS, ENV_WORKER_ID, ENV_DOCKER_SKIP_GPUS_FLAG, )
|
ENV_AGENT_GIT_PASS,
|
||||||
|
ENV_WORKER_ID,
|
||||||
|
ENV_DOCKER_SKIP_GPUS_FLAG,
|
||||||
|
ENV_AGENT_SECRET_KEY,
|
||||||
|
ENV_AWS_SECRET_KEY,
|
||||||
|
ENV_AZURE_ACCOUNT_KEY,
|
||||||
|
ENV_AGENT_DISABLE_SSH_MOUNT,
|
||||||
|
)
|
||||||
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
|
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
|
||||||
from clearml_agent.errors import APIError, CommandFailedError, Sigterm
|
from clearml_agent.errors import APIError, CommandFailedError, Sigterm
|
||||||
from clearml_agent.helper.base import (
|
from clearml_agent.helper.base import (
|
||||||
@ -67,7 +75,9 @@ from clearml_agent.helper.base import (
|
|||||||
get_python_path,
|
get_python_path,
|
||||||
is_linux_platform,
|
is_linux_platform,
|
||||||
rm_file,
|
rm_file,
|
||||||
add_python_path, safe_remove_tree, )
|
add_python_path,
|
||||||
|
safe_remove_tree,
|
||||||
|
)
|
||||||
from clearml_agent.helper.console import ensure_text, print_text, decode_binary_lines
|
from clearml_agent.helper.console import ensure_text, print_text, decode_binary_lines
|
||||||
from clearml_agent.helper.os.daemonize import daemonize_process
|
from clearml_agent.helper.os.daemonize import daemonize_process
|
||||||
from clearml_agent.helper.package.base import PackageManager
|
from clearml_agent.helper.package.base import PackageManager
|
||||||
@ -90,7 +100,10 @@ from clearml_agent.helper.process import (
|
|||||||
get_bash_output,
|
get_bash_output,
|
||||||
shutdown_docker_process,
|
shutdown_docker_process,
|
||||||
get_docker_id,
|
get_docker_id,
|
||||||
commit_docker, terminate_process, check_if_command_exists,
|
commit_docker,
|
||||||
|
terminate_process,
|
||||||
|
check_if_command_exists,
|
||||||
|
terminate_all_child_processes,
|
||||||
)
|
)
|
||||||
from clearml_agent.helper.package.priority_req import PriorityPackageRequirement, PackageCollectorRequirement
|
from clearml_agent.helper.package.priority_req import PriorityPackageRequirement, PackageCollectorRequirement
|
||||||
from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fix_package_import_diff_patch
|
from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fix_package_import_diff_patch
|
||||||
@ -187,7 +200,7 @@ class LiteralScriptManager(object):
|
|||||||
location = location or (repo_info and repo_info.root)
|
location = location or (repo_info and repo_info.root)
|
||||||
if not location:
|
if not location:
|
||||||
location = Path(self.venv_folder, "code")
|
location = Path(self.venv_folder, "code")
|
||||||
location.mkdir(exist_ok=True)
|
location.mkdir(exist_ok=True, parents=True)
|
||||||
log.debug("selected execution directory: %s", location)
|
log.debug("selected execution directory: %s", location)
|
||||||
return Text(location), self.write(task, location, execution.entry_point)
|
return Text(location), self.write(task, location, execution.entry_point)
|
||||||
|
|
||||||
@ -221,6 +234,9 @@ def get_task(session, task_id, *args, **kwargs):
|
|||||||
|
|
||||||
|
|
||||||
def get_task_container(session, task_id):
|
def get_task_container(session, task_id):
|
||||||
|
"""
|
||||||
|
Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''}
|
||||||
|
"""
|
||||||
if session.check_min_api_version("2.13"):
|
if session.check_min_api_version("2.13"):
|
||||||
result = session.send_request(
|
result = session.send_request(
|
||||||
service='tasks',
|
service='tasks',
|
||||||
@ -233,12 +249,12 @@ def get_task_container(session, task_id):
|
|||||||
try:
|
try:
|
||||||
container = result.json()['data']['tasks'][0]['container'] if result.ok else {}
|
container = result.json()['data']['tasks'][0]['container'] if result.ok else {}
|
||||||
if container.get('arguments'):
|
if container.get('arguments'):
|
||||||
container['arguments'] = str(container.get('arguments')).split(' ')
|
container['arguments'] = shlex.split(str(container.get('arguments')).strip())
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
container = {}
|
container = {}
|
||||||
else:
|
else:
|
||||||
response = get_task(session, task_id, only_fields=["execution.docker_cmd"])
|
response = get_task(session, task_id, only_fields=["execution.docker_cmd"])
|
||||||
task_docker_cmd_parts = str(response.execution.docker_cmd or '').strip().split(' ')
|
task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip())
|
||||||
try:
|
try:
|
||||||
container = dict(
|
container = dict(
|
||||||
container=task_docker_cmd_parts[0],
|
container=task_docker_cmd_parts[0],
|
||||||
@ -251,11 +267,14 @@ def get_task_container(session, task_id):
|
|||||||
|
|
||||||
|
|
||||||
def set_task_container(session, task_id, docker_image=None, docker_arguments=None, docker_bash_script=None):
|
def set_task_container(session, task_id, docker_image=None, docker_arguments=None, docker_bash_script=None):
|
||||||
|
if docker_arguments and isinstance(docker_arguments, str):
|
||||||
|
docker_arguments = [docker_arguments]
|
||||||
|
|
||||||
if session.check_min_api_version("2.13"):
|
if session.check_min_api_version("2.13"):
|
||||||
container = dict(
|
container = dict(
|
||||||
image=docker_image or None,
|
image=docker_image or '',
|
||||||
arguments=' '.join(docker_arguments) if docker_arguments else None,
|
arguments=' '.join(docker_arguments) if docker_arguments else '',
|
||||||
setup_shell_script=docker_bash_script or None,
|
setup_shell_script=docker_bash_script or '',
|
||||||
)
|
)
|
||||||
result = session.send_request(
|
result = session.send_request(
|
||||||
service='tasks',
|
service='tasks',
|
||||||
@ -614,10 +633,13 @@ class Worker(ServiceCommandSection):
|
|||||||
'--standalone-mode' if self._standalone_mode else '',
|
'--standalone-mode' if self._standalone_mode else '',
|
||||||
task_id)
|
task_id)
|
||||||
|
|
||||||
# send the actual used command line to the backend
|
display_docker_command = self._sanitize_docker_command(full_docker_cmd)
|
||||||
self.send_logs(task_id=task_id, lines=['Executing: {}\n'.format(full_docker_cmd)], level="INFO")
|
|
||||||
|
# send the actual used command line to the backend
|
||||||
|
self.send_logs(task_id=task_id, lines=['Executing: {}\n'.format(display_docker_command)], level="INFO")
|
||||||
|
|
||||||
|
cmd = Argv(*full_docker_cmd, display_argv=display_docker_command)
|
||||||
|
|
||||||
cmd = Argv(*full_docker_cmd)
|
|
||||||
print('Running Docker:\n{}\n'.format(str(cmd)))
|
print('Running Docker:\n{}\n'.format(str(cmd)))
|
||||||
else:
|
else:
|
||||||
cmd = worker_args.get_argv_for_command("execute") + (
|
cmd = worker_args.get_argv_for_command("execute") + (
|
||||||
@ -685,6 +707,9 @@ class Worker(ServiceCommandSection):
|
|||||||
shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(task_id))
|
shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(task_id))
|
||||||
safe_remove_file(temp_stdout_name)
|
safe_remove_file(temp_stdout_name)
|
||||||
safe_remove_file(temp_stderr_name)
|
safe_remove_file(temp_stderr_name)
|
||||||
|
if self._services_mode and status == ExitStatus.interrupted:
|
||||||
|
# unregister this worker, it was killed
|
||||||
|
self._unregister()
|
||||||
|
|
||||||
def run_tasks_loop(self, queues, worker_params, priority_order=True, gpu_indexes=None, gpu_queues=None):
|
def run_tasks_loop(self, queues, worker_params, priority_order=True, gpu_indexes=None, gpu_queues=None):
|
||||||
"""
|
"""
|
||||||
@ -719,6 +744,7 @@ class Worker(ServiceCommandSection):
|
|||||||
|
|
||||||
# get current running instances
|
# get current running instances
|
||||||
available_gpus = None
|
available_gpus = None
|
||||||
|
dynamic_gpus_worker_id = None
|
||||||
if gpu_indexes and gpu_queues:
|
if gpu_indexes and gpu_queues:
|
||||||
available_gpus, gpu_queues = self._setup_dynamic_gpus(gpu_queues)
|
available_gpus, gpu_queues = self._setup_dynamic_gpus(gpu_queues)
|
||||||
# multi instance support
|
# multi instance support
|
||||||
@ -812,7 +838,7 @@ class Worker(ServiceCommandSection):
|
|||||||
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
|
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
|
||||||
|
|
||||||
org_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES')
|
org_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES')
|
||||||
worker_id = self.worker_id
|
dynamic_gpus_worker_id = self.worker_id
|
||||||
# the following is only executed in dynamic gpus mode
|
# the following is only executed in dynamic gpus mode
|
||||||
if gpu_queues and gpu_queues.get(queue):
|
if gpu_queues and gpu_queues.get(queue):
|
||||||
# pick the first available GPUs
|
# pick the first available GPUs
|
||||||
@ -836,7 +862,7 @@ class Worker(ServiceCommandSection):
|
|||||||
self.run_one_task(queue, task_id, worker_params)
|
self.run_one_task(queue, task_id, worker_params)
|
||||||
|
|
||||||
if gpu_queues:
|
if gpu_queues:
|
||||||
self.worker_id = worker_id
|
self.worker_id = dynamic_gpus_worker_id
|
||||||
os.environ['CUDA_VISIBLE_DEVICES'] = \
|
os.environ['CUDA_VISIBLE_DEVICES'] = \
|
||||||
os.environ['NVIDIA_VISIBLE_DEVICES'] = org_gpus
|
os.environ['NVIDIA_VISIBLE_DEVICES'] = org_gpus
|
||||||
|
|
||||||
@ -864,23 +890,23 @@ class Worker(ServiceCommandSection):
|
|||||||
if shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(t_id)):
|
if shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(t_id)):
|
||||||
self.handle_task_termination(task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped)
|
self.handle_task_termination(task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped)
|
||||||
else:
|
else:
|
||||||
|
# if we are in dynamic gpus / services mode,
|
||||||
|
# we should send termination signal to all child processes
|
||||||
|
if self._services_mode:
|
||||||
|
terminate_all_child_processes(timeout=20, include_parent=False)
|
||||||
|
|
||||||
# if we are here, just kill all sub processes
|
# if we are here, just kill all sub processes
|
||||||
kill_all_child_processes()
|
kill_all_child_processes()
|
||||||
for t_id in set(list_task_gpus_ids.values()):
|
|
||||||
# check if Task is running,
|
# unregister dynamic GPU worker, if we were terminated while setting up a Task
|
||||||
task_info = get_task(
|
if dynamic_gpus_worker_id:
|
||||||
self._session, t_id, only_fields=["status"]
|
self.worker_id = dynamic_gpus_worker_id
|
||||||
)
|
self._unregister()
|
||||||
# this is a bit risky we might have rerun it again after it already completed
|
|
||||||
# basically we are not removing completed tasks from the list, hence the issue
|
|
||||||
if str(task_info.status) == "in_progress":
|
|
||||||
self.handle_task_termination(
|
|
||||||
task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped)
|
|
||||||
|
|
||||||
def _dynamic_gpu_get_available(self, gpu_indexes):
|
def _dynamic_gpu_get_available(self, gpu_indexes):
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
response = self._session.send_api(workers_api.GetAllRequest(last_seen=60))
|
response = self._session.send_api(workers_api.GetAllRequest(last_seen=600))
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -1360,6 +1386,7 @@ class Worker(ServiceCommandSection):
|
|||||||
service_mode_internal_agent_started = None
|
service_mode_internal_agent_started = None
|
||||||
stopping = False
|
stopping = False
|
||||||
status = None
|
status = None
|
||||||
|
process = None
|
||||||
try:
|
try:
|
||||||
_last_machine_update_ts = time()
|
_last_machine_update_ts = time()
|
||||||
stop_reason = None
|
stop_reason = None
|
||||||
@ -1396,7 +1423,7 @@ class Worker(ServiceCommandSection):
|
|||||||
|
|
||||||
# get diff from previous poll
|
# get diff from previous poll
|
||||||
printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count)
|
printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count)
|
||||||
if self._services_mode and not stopping and not status:
|
if self._services_mode and not stopping and status is None:
|
||||||
# if the internal agent started, we stop logging, it will take over logging.
|
# if the internal agent started, we stop logging, it will take over logging.
|
||||||
# if the internal agent started running the task itself, it will return status==0,
|
# if the internal agent started running the task itself, it will return status==0,
|
||||||
# then we can quit the monitoring loop of this process
|
# then we can quit the monitoring loop of this process
|
||||||
@ -1416,6 +1443,8 @@ class Worker(ServiceCommandSection):
|
|||||||
status = ex.returncode
|
status = ex.returncode
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# so someone else will catch us
|
# so someone else will catch us
|
||||||
|
if process:
|
||||||
|
kill_all_child_processes(process.pid)
|
||||||
raise
|
raise
|
||||||
except Exception:
|
except Exception:
|
||||||
# we should not get here, but better safe than sorry
|
# we should not get here, but better safe than sorry
|
||||||
@ -1427,6 +1456,10 @@ class Worker(ServiceCommandSection):
|
|||||||
stop_reason = TaskStopReason.exception
|
stop_reason = TaskStopReason.exception
|
||||||
status = -1
|
status = -1
|
||||||
|
|
||||||
|
# full cleanup (just in case)
|
||||||
|
if process:
|
||||||
|
kill_all_child_processes(process.pid)
|
||||||
|
|
||||||
# if running in services mode, keep the file open
|
# if running in services mode, keep the file open
|
||||||
# in case the docker was so quick it started and finished, check the stop reason
|
# in case the docker was so quick it started and finished, check the stop reason
|
||||||
if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
|
if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
|
||||||
@ -1792,15 +1825,16 @@ class Worker(ServiceCommandSection):
|
|||||||
debug=self._session.debug_mode,
|
debug=self._session.debug_mode,
|
||||||
trace=self._session.trace,
|
trace=self._session.trace,
|
||||||
)
|
)
|
||||||
self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
|
try:
|
||||||
self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
|
self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
|
||||||
|
self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
|
||||||
|
finally:
|
||||||
|
self.stop_monitor()
|
||||||
|
self._unregister()
|
||||||
|
|
||||||
self.stop_monitor()
|
if full_monitoring and self.temp_config_path:
|
||||||
self._unregister()
|
safe_remove_file(self._session.config_file)
|
||||||
|
Singleton.close_pid_file()
|
||||||
if full_monitoring and self.temp_config_path:
|
|
||||||
safe_remove_file(self._session.config_file)
|
|
||||||
Singleton.close_pid_file()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
self._session.print_configuration()
|
self._session.print_configuration()
|
||||||
@ -1908,7 +1942,6 @@ class Worker(ServiceCommandSection):
|
|||||||
if current_task.script.binary and current_task.script.binary.startswith('python') and \
|
if current_task.script.binary and current_task.script.binary.startswith('python') and \
|
||||||
execution.entry_point and execution.entry_point.split()[0].strip() == '-m':
|
execution.entry_point and execution.entry_point.split()[0].strip() == '-m':
|
||||||
# we need to split it
|
# we need to split it
|
||||||
import shlex
|
|
||||||
extra.extend(shlex.split(execution.entry_point))
|
extra.extend(shlex.split(execution.entry_point))
|
||||||
else:
|
else:
|
||||||
extra.append(execution.entry_point)
|
extra.append(execution.entry_point)
|
||||||
@ -2693,8 +2726,11 @@ class Worker(ServiceCommandSection):
|
|||||||
if temp_config.get("agent.venvs_cache.path", None):
|
if temp_config.get("agent.venvs_cache.path", None):
|
||||||
temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache')
|
temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache')
|
||||||
|
|
||||||
self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
|
if ENV_AGENT_DISABLE_SSH_MOUNT.get():
|
||||||
self._temp_cleanup_list.append(self._host_ssh_cache)
|
self._host_ssh_cache = None
|
||||||
|
else:
|
||||||
|
self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
|
||||||
|
self._temp_cleanup_list.append(self._host_ssh_cache)
|
||||||
|
|
||||||
return temp_config, partial(self._get_docker_config_cmd, temp_config=temp_config)
|
return temp_config, partial(self._get_docker_config_cmd, temp_config=temp_config)
|
||||||
|
|
||||||
@ -2716,24 +2752,31 @@ class Worker(ServiceCommandSection):
|
|||||||
"agent.docker_pip_cache", '~/.clearml/pip-cache'))).expanduser().as_posix()
|
"agent.docker_pip_cache", '~/.clearml/pip-cache'))).expanduser().as_posix()
|
||||||
|
|
||||||
# make sure all folders are valid
|
# make sure all folders are valid
|
||||||
Path(host_apt_cache).mkdir(parents=True, exist_ok=True)
|
if host_apt_cache:
|
||||||
Path(host_pip_cache).mkdir(parents=True, exist_ok=True)
|
Path(host_apt_cache).mkdir(parents=True, exist_ok=True)
|
||||||
Path(host_cache).mkdir(parents=True, exist_ok=True)
|
if host_pip_cache:
|
||||||
Path(host_pip_dl).mkdir(parents=True, exist_ok=True)
|
Path(host_pip_cache).mkdir(parents=True, exist_ok=True)
|
||||||
Path(host_vcs_cache).mkdir(parents=True, exist_ok=True)
|
if host_cache:
|
||||||
Path(host_ssh_cache).mkdir(parents=True, exist_ok=True)
|
Path(host_cache).mkdir(parents=True, exist_ok=True)
|
||||||
|
if host_pip_dl:
|
||||||
|
Path(host_pip_dl).mkdir(parents=True, exist_ok=True)
|
||||||
|
if host_vcs_cache:
|
||||||
|
Path(host_vcs_cache).mkdir(parents=True, exist_ok=True)
|
||||||
|
if host_ssh_cache:
|
||||||
|
Path(host_ssh_cache).mkdir(parents=True, exist_ok=True)
|
||||||
if host_venvs_cache:
|
if host_venvs_cache:
|
||||||
Path(host_venvs_cache).mkdir(parents=True, exist_ok=True)
|
Path(host_venvs_cache).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# copy the .ssh folder to a temp folder, to be mapped into docker
|
if host_ssh_cache:
|
||||||
# noinspection PyBroadException
|
# copy the .ssh folder to a temp folder, to be mapped into docker
|
||||||
try:
|
# noinspection PyBroadException
|
||||||
if Path(host_ssh_cache).is_dir():
|
try:
|
||||||
shutil.rmtree(host_ssh_cache, ignore_errors=True)
|
if Path(host_ssh_cache).is_dir():
|
||||||
shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache)
|
shutil.rmtree(host_ssh_cache, ignore_errors=True)
|
||||||
except Exception:
|
shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache)
|
||||||
host_ssh_cache = None
|
except Exception:
|
||||||
self.log.warning('Failed creating temporary copy of ~/.ssh for git credential')
|
host_ssh_cache = None
|
||||||
|
self.log.warning('Failed creating temporary copy of ~/.ssh for git credential')
|
||||||
|
|
||||||
# check if the .git credentials exist:
|
# check if the .git credentials exist:
|
||||||
try:
|
try:
|
||||||
@ -3080,7 +3123,7 @@ class Worker(ServiceCommandSection):
|
|||||||
warning('Could not terminate process pid={}'.format(pid))
|
warning('Could not terminate process pid={}'.format(pid))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# wither we have a match for the worker_id or we just pick the first one, and kill it.
|
# either we have a match for the worker_id or we just pick the first one, and kill it.
|
||||||
if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
|
if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
|
||||||
# this is us kill it
|
# this is us kill it
|
||||||
print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))
|
print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))
|
||||||
@ -3143,6 +3186,33 @@ class Worker(ServiceCommandSection):
|
|||||||
queue_ids.append(q_id)
|
queue_ids.append(q_id)
|
||||||
return queue_ids
|
return queue_ids
|
||||||
|
|
||||||
|
def _sanitize_docker_command(self, docker_command):
|
||||||
|
# type: (List[str]) -> List[str]
|
||||||
|
if not self._session.config.get('agent.hide_docker_command_env_vars.enabled', False):
|
||||||
|
return docker_command
|
||||||
|
|
||||||
|
keys = set(self._session.config.get('agent.hide_docker_command_env_vars.extra_keys', []))
|
||||||
|
keys.update(
|
||||||
|
ENV_AGENT_GIT_PASS.vars,
|
||||||
|
ENV_AGENT_SECRET_KEY.vars,
|
||||||
|
ENV_AWS_SECRET_KEY.vars,
|
||||||
|
ENV_AZURE_ACCOUNT_KEY.vars
|
||||||
|
)
|
||||||
|
|
||||||
|
result = docker_command[:]
|
||||||
|
for i, item in enumerate(docker_command):
|
||||||
|
try:
|
||||||
|
if item not in ("-e", "--env"):
|
||||||
|
continue
|
||||||
|
key, sep, _ = result[i + 1].partition("=")
|
||||||
|
if key not in keys or not sep:
|
||||||
|
continue
|
||||||
|
result[i + 1] = "{}={}".format(key, "********")
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pass
|
pass
|
||||||
|
@ -62,6 +62,10 @@ class EnvironmentConfig(object):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||||
|
ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
|
||||||
|
ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")
|
||||||
|
|
||||||
ENVIRONMENT_CONFIG = {
|
ENVIRONMENT_CONFIG = {
|
||||||
"api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
|
"api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
|
||||||
"api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
|
"api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
|
||||||
@ -69,9 +73,7 @@ ENVIRONMENT_CONFIG = {
|
|||||||
"api.credentials.access_key": EnvironmentConfig(
|
"api.credentials.access_key": EnvironmentConfig(
|
||||||
"CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
|
"CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
|
||||||
),
|
),
|
||||||
"api.credentials.secret_key": EnvironmentConfig(
|
"api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
|
||||||
"CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY",
|
|
||||||
),
|
|
||||||
"agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
|
"agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
|
||||||
"agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
|
"agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
|
||||||
"agent.cuda_version": EnvironmentConfig(
|
"agent.cuda_version": EnvironmentConfig(
|
||||||
@ -84,10 +86,10 @@ ENVIRONMENT_CONFIG = {
|
|||||||
names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
|
names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
|
||||||
),
|
),
|
||||||
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
|
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
|
||||||
"sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
|
"sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
|
||||||
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
|
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
|
||||||
"sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
|
"sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
|
||||||
'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")},
|
'account_key': ENV_AZURE_ACCOUNT_KEY},
|
||||||
"sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
|
"sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -132,6 +134,7 @@ ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', '
|
|||||||
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
|
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
|
||||||
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
|
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
|
||||||
ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
|
ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
|
||||||
|
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool)
|
||||||
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
|
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
|
||||||
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
|
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
|
||||||
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
|
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
|
||||||
|
@ -32,9 +32,9 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
K8S_DEFAULT_NAMESPACE = "clearml"
|
K8S_DEFAULT_NAMESPACE = "clearml"
|
||||||
|
|
||||||
KUBECTL_APPLY_CMD = "kubectl apply -f"
|
KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
|
||||||
|
|
||||||
KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
|
KUBECTL_RUN_CMD = "kubectl run clearml-id-{task_id} " \
|
||||||
"--image {docker_image} " \
|
"--image {docker_image} " \
|
||||||
"--restart=Never " \
|
"--restart=Never " \
|
||||||
"--namespace={namespace}"
|
"--namespace={namespace}"
|
||||||
@ -95,6 +95,7 @@ class K8sIntegration(Worker):
|
|||||||
clearml_conf_file=None,
|
clearml_conf_file=None,
|
||||||
extra_bash_init_script=None,
|
extra_bash_init_script=None,
|
||||||
namespace=None,
|
namespace=None,
|
||||||
|
max_pods_limit=None,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@ -122,6 +123,7 @@ class K8sIntegration(Worker):
|
|||||||
:param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional)
|
:param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional)
|
||||||
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
|
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
|
||||||
:param str namespace: K8S namespace to be used when creating the new pods (default: clearml)
|
:param str namespace: K8S namespace to be used when creating the new pods (default: clearml)
|
||||||
|
:param int max_pods_limit: Maximum number of pods that K8S glue can run at the same time
|
||||||
"""
|
"""
|
||||||
super(K8sIntegration, self).__init__()
|
super(K8sIntegration, self).__init__()
|
||||||
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
|
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
|
||||||
@ -147,6 +149,7 @@ class K8sIntegration(Worker):
|
|||||||
self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE
|
self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE
|
||||||
self.pod_limits = []
|
self.pod_limits = []
|
||||||
self.pod_requests = []
|
self.pod_requests = []
|
||||||
|
self.max_pods_limit = max_pods_limit if not self.ports_mode else None
|
||||||
if overrides_yaml:
|
if overrides_yaml:
|
||||||
with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
|
with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
|
||||||
overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
||||||
@ -304,20 +307,22 @@ class K8sIntegration(Worker):
|
|||||||
except Exception:
|
except Exception:
|
||||||
queue_name = 'k8s'
|
queue_name = 'k8s'
|
||||||
|
|
||||||
# conform queue name to k8s standards
|
|
||||||
safe_queue_name = queue_name.lower().strip()
|
|
||||||
safe_queue_name = re.sub(r'\W+', '', safe_queue_name).replace('_', '').replace('-', '')
|
|
||||||
|
|
||||||
# Search for a free pod number
|
# Search for a free pod number
|
||||||
pod_count = 0
|
pod_count = 0
|
||||||
pod_number = self.base_pod_num
|
pod_number = self.base_pod_num
|
||||||
while self.ports_mode:
|
while self.ports_mode or self.max_pods_limit:
|
||||||
pod_number = self.base_pod_num + pod_count
|
pod_number = self.base_pod_num + pod_count
|
||||||
kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
|
if self.ports_mode:
|
||||||
pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
|
kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
|
||||||
agent_label=self.AGENT_LABEL,
|
pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
|
||||||
namespace=self.namespace,
|
agent_label=self.AGENT_LABEL,
|
||||||
)
|
namespace=self.namespace,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
|
||||||
|
agent_label=self.AGENT_LABEL,
|
||||||
|
namespace=self.namespace,
|
||||||
|
)
|
||||||
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
output, error = process.communicate()
|
output, error = process.communicate()
|
||||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||||
@ -326,21 +331,47 @@ class K8sIntegration(Worker):
|
|||||||
if not output:
|
if not output:
|
||||||
# No such pod exist so we can use the pod_number we found
|
# No such pod exist so we can use the pod_number we found
|
||||||
break
|
break
|
||||||
if pod_count >= self.num_of_services - 1:
|
|
||||||
# All pod numbers are taken, exit
|
if self.max_pods_limit:
|
||||||
|
try:
|
||||||
|
current_pod_count = len(json.loads(output).get("items", []))
|
||||||
|
except (ValueError, TypeError) as ex:
|
||||||
|
self.log.warning(
|
||||||
|
"K8S Glue pods monitor: Failed parsing kubectl output:\n{}\ntask '{}' "
|
||||||
|
"will be enqueued back to queue '{}'\nEx: {}".format(
|
||||||
|
output, task_id, queue, ex
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self._session.api_client.tasks.reset(task_id)
|
||||||
|
self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
|
||||||
|
return
|
||||||
|
max_count = self.max_pods_limit
|
||||||
|
else:
|
||||||
|
current_pod_count = pod_count
|
||||||
|
max_count = self.num_of_services - 1
|
||||||
|
|
||||||
|
if current_pod_count >= max_count:
|
||||||
|
# All pods are taken, exit
|
||||||
|
self.log.debug(
|
||||||
|
"kubectl last result: {}\n{}".format(error, output))
|
||||||
self.log.warning(
|
self.log.warning(
|
||||||
"kubectl last result: {}\n{}\nAll k8s services are in use, task '{}' "
|
"All k8s services are in use, task '{}' "
|
||||||
"will be enqueued back to queue '{}'".format(
|
"will be enqueued back to queue '{}'".format(
|
||||||
error, output, task_id, queue
|
task_id, queue
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
self._session.api_client.tasks.reset(task_id)
|
self._session.api_client.tasks.reset(task_id)
|
||||||
self._session.api_client.tasks.enqueue(
|
self._session.api_client.tasks.enqueue(
|
||||||
task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
|
task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
|
||||||
return
|
return
|
||||||
|
elif self.max_pods_limit:
|
||||||
|
# max pods limit hasn't reached yet, so we can create the pod
|
||||||
|
break
|
||||||
pod_count += 1
|
pod_count += 1
|
||||||
|
|
||||||
labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
|
labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
|
||||||
|
labels.append("clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)))
|
||||||
|
labels.append("clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name)))
|
||||||
|
|
||||||
if self.ports_mode:
|
if self.ports_mode:
|
||||||
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
|
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
|
||||||
@ -351,13 +382,13 @@ class K8sIntegration(Worker):
|
|||||||
output, error = self._kubectl_apply(
|
output, error = self._kubectl_apply(
|
||||||
create_clearml_conf=create_clearml_conf,
|
create_clearml_conf=create_clearml_conf,
|
||||||
labels=labels, docker_image=docker_image, docker_args=docker_args,
|
labels=labels, docker_image=docker_image, docker_args=docker_args,
|
||||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
task_id=task_id, queue=queue)
|
||||||
else:
|
else:
|
||||||
output, error = self._kubectl_run(
|
output, error = self._kubectl_run(
|
||||||
create_clearml_conf=create_clearml_conf,
|
create_clearml_conf=create_clearml_conf,
|
||||||
labels=labels, docker_image=docker_cmd,
|
labels=labels, docker_image=docker_cmd,
|
||||||
task_data=task_data,
|
task_data=task_data,
|
||||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
task_id=task_id, queue=queue)
|
||||||
|
|
||||||
error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
|
error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
|
||||||
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
|
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
|
||||||
@ -404,15 +435,16 @@ class K8sIntegration(Worker):
|
|||||||
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
|
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
|
||||||
return {'env': kube_args} if kube_args else {}
|
return {'env': kube_args} if kube_args else {}
|
||||||
|
|
||||||
def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id, queue_name):
|
def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id):
|
||||||
template = deepcopy(self.template_dict)
|
template = deepcopy(self.template_dict)
|
||||||
template.setdefault('apiVersion', 'v1')
|
template.setdefault('apiVersion', 'v1')
|
||||||
template['kind'] = 'Pod'
|
template['kind'] = 'Pod'
|
||||||
template.setdefault('metadata', {})
|
template.setdefault('metadata', {})
|
||||||
name = 'clearml-{queue}-id-{task_id}'.format(queue=queue_name, task_id=task_id)
|
name = 'clearml-id-{task_id}'.format(task_id=task_id)
|
||||||
template['metadata']['name'] = name
|
template['metadata']['name'] = name
|
||||||
template.setdefault('spec', {})
|
template.setdefault('spec', {})
|
||||||
template['spec'].setdefault('containers', [])
|
template['spec'].setdefault('containers', [])
|
||||||
|
template['spec'].setdefault('restartPolicy', 'Never')
|
||||||
if labels:
|
if labels:
|
||||||
labels_dict = dict(pair.split('=', 1) for pair in labels)
|
labels_dict = dict(pair.split('=', 1) for pair in labels)
|
||||||
template['metadata'].setdefault('labels', {})
|
template['metadata'].setdefault('labels', {})
|
||||||
@ -474,12 +506,11 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
return output, error
|
return output, error
|
||||||
|
|
||||||
def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id, queue_name):
|
def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id):
|
||||||
if callable(self.kubectl_cmd):
|
if callable(self.kubectl_cmd):
|
||||||
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name)
|
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
|
||||||
else:
|
else:
|
||||||
kubectl_cmd = self.kubectl_cmd.format(
|
kubectl_cmd = self.kubectl_cmd.format(
|
||||||
queue_name=queue_name,
|
|
||||||
task_id=task_id,
|
task_id=task_id,
|
||||||
docker_image=docker_image,
|
docker_image=docker_image,
|
||||||
queue_id=queue,
|
queue_id=queue,
|
||||||
@ -607,3 +638,13 @@ class K8sIntegration(Worker):
|
|||||||
return merge_dicts(
|
return merge_dicts(
|
||||||
c1, c2, custom_merge_func=merge_env
|
c1, c2, custom_merge_func=merge_env
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _safe_k8s_label_value(value):
|
||||||
|
""" Conform string to k8s standards for a label value """
|
||||||
|
value = value.lower().strip()
|
||||||
|
value = re.sub(r'^[^A-Za-z0-9]+', '', value) # strip leading non-alphanumeric chars
|
||||||
|
value = re.sub(r'[^A-Za-z0-9]+$', '', value) # strip trailing non-alphanumeric chars
|
||||||
|
value = re.sub(r'\W+', '-', value) # allow only word chars (this removed "." which is supported, but nvm)
|
||||||
|
value = re.sub(r'-+', '-', value) # don't leave messy "--" after replacing previous chars
|
||||||
|
return value[:63]
|
||||||
|
@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
|
|||||||
return output if not strip or not output else output.strip()
|
return output if not strip or not output else output.strip()
|
||||||
|
|
||||||
|
|
||||||
def terminate_process(pid, timeout=10.):
|
def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
proc = psutil.Process(pid)
|
proc = psutil.Process(pid)
|
||||||
|
children = proc.children(recursive=True) if include_children else []
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
cnt = 0
|
cnt = 0
|
||||||
while proc.is_running() and cnt < timeout:
|
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||||
sleep(1.)
|
sleep(1.)
|
||||||
cnt += 1
|
cnt += 1
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
|
|
||||||
|
# terminate children
|
||||||
|
for c in children:
|
||||||
|
c.terminate()
|
||||||
|
|
||||||
cnt = 0
|
cnt = 0
|
||||||
while proc.is_running() and cnt < timeout:
|
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||||
sleep(1.)
|
sleep(1.)
|
||||||
cnt += 1
|
cnt += 1
|
||||||
|
|
||||||
|
# kill children
|
||||||
|
for c in children:
|
||||||
|
c.kill()
|
||||||
|
|
||||||
proc.kill()
|
proc.kill()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10.):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def kill_all_child_processes(pid=None):
|
def kill_all_child_processes(pid=None, include_parent=True):
|
||||||
# get current process if pid not provided
|
# get current process if pid not provided
|
||||||
include_parent = True
|
|
||||||
if not pid:
|
if not pid:
|
||||||
pid = os.getpid()
|
pid = os.getpid()
|
||||||
include_parent = False
|
include_parent = False
|
||||||
@ -84,6 +94,23 @@ def kill_all_child_processes(pid=None):
|
|||||||
parent.kill()
|
parent.kill()
|
||||||
|
|
||||||
|
|
||||||
|
def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
|
||||||
|
# get current process if pid not provided
|
||||||
|
if not pid:
|
||||||
|
pid = os.getpid()
|
||||||
|
include_parent = False
|
||||||
|
try:
|
||||||
|
parent = psutil.Process(pid)
|
||||||
|
except psutil.Error:
|
||||||
|
# could not find parent process id
|
||||||
|
return
|
||||||
|
for child in parent.children(recursive=False):
|
||||||
|
print('Terminating child process {}'.format(child.pid))
|
||||||
|
terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
|
||||||
|
if include_parent:
|
||||||
|
terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
|
||||||
|
|
||||||
|
|
||||||
def get_docker_id(docker_cmd_contains):
|
def get_docker_id(docker_cmd_contains):
|
||||||
try:
|
try:
|
||||||
containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
|
containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
|
||||||
@ -194,6 +221,7 @@ class Argv(Executable):
|
|||||||
"""
|
"""
|
||||||
self.argv = argv
|
self.argv = argv
|
||||||
self._log = kwargs.pop("log", None)
|
self._log = kwargs.pop("log", None)
|
||||||
|
self._display_argv = kwargs.pop("display_argv", argv)
|
||||||
if not self._log:
|
if not self._log:
|
||||||
self._log = logging.getLogger(__name__)
|
self._log = logging.getLogger(__name__)
|
||||||
self._log.propagate = False
|
self._log.propagate = False
|
||||||
@ -218,10 +246,10 @@ class Argv(Executable):
|
|||||||
return self.argv
|
return self.argv
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<Argv{}>".format(self.argv)
|
return "<Argv{}>".format(self._display_argv)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "Executing: {}".format(self.argv)
|
return "Executing: {}".format(self._display_argv)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
if is_windows_platform():
|
if is_windows_platform():
|
||||||
|
@ -591,7 +591,7 @@ def clone_repository_cached(session, execution, destination):
|
|||||||
# mock lock
|
# mock lock
|
||||||
repo_lock = Lock()
|
repo_lock = Lock()
|
||||||
repo_lock_timeout_sec = 300
|
repo_lock_timeout_sec = 300
|
||||||
repo_url = execution.repository # type: str
|
repo_url = execution.repository or '' # type: str
|
||||||
parsed_url = furl(repo_url)
|
parsed_url = furl(repo_url)
|
||||||
no_password_url = parsed_url.copy().remove(password=True).url
|
no_password_url = parsed_url.copy().remove(password=True).url
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = '0.17.2'
|
__version__ = '1.0.1rc1'
|
||||||
|
@ -42,6 +42,9 @@ agent {
|
|||||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||||
# The default is the python executing the clearml_agent
|
# The default is the python executing the clearml_agent
|
||||||
python_binary: ""
|
python_binary: ""
|
||||||
|
# ignore any requested python version (Default: False, if a Task was using a
|
||||||
|
# specific python version and the system supports multiple python the agent will use the requested python version)
|
||||||
|
# ignore_requested_python_version: true
|
||||||
|
|
||||||
# select python package manager:
|
# select python package manager:
|
||||||
# currently supported pip and conda
|
# currently supported pip and conda
|
||||||
@ -107,11 +110,12 @@ agent {
|
|||||||
path: ~/.clearml/vcs-cache
|
path: ~/.clearml/vcs-cache
|
||||||
},
|
},
|
||||||
|
|
||||||
|
# DEPRECATED: please use `venvs_cache` and set `venvs_cache.path`
|
||||||
# use venv-update in order to accelerate python virtual environment building
|
# use venv-update in order to accelerate python virtual environment building
|
||||||
# Still in beta, turned off by default
|
# Still in beta, turned off by default
|
||||||
venv_update: {
|
# venv_update: {
|
||||||
enabled: false,
|
# enabled: false,
|
||||||
},
|
# },
|
||||||
|
|
||||||
# cached folder for specific python package download (mostly pytorch versions)
|
# cached folder for specific python package download (mostly pytorch versions)
|
||||||
pip_download_cache {
|
pip_download_cache {
|
||||||
|
@ -10,12 +10,15 @@ from clearml_agent.glue.k8s import K8sIntegration
|
|||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
|
group = parser.add_mutually_exclusive_group()
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--queue", type=str, help="Queue to pull tasks from"
|
"--queue", type=str, help="Queue to pull tasks from"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
group.add_argument(
|
||||||
"--ports-mode", action='store_true', default=False,
|
"--ports-mode", action='store_true', default=False,
|
||||||
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
|
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
|
||||||
|
"Should not be used with max-pods"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-of-services", type=int, default=20,
|
"--num-of-services", type=int, default=20,
|
||||||
@ -57,6 +60,11 @@ def parse_args():
|
|||||||
"--namespace", type=str,
|
"--namespace", type=str,
|
||||||
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
|
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
|
||||||
)
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--max-pods", type=int,
|
||||||
|
help="Limit the maximum number of pods that this service can run at the same time."
|
||||||
|
"Should not be used with ports-mode"
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -77,7 +85,7 @@ def main():
|
|||||||
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
|
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
|
||||||
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
||||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
||||||
namespace=args.namespace,
|
namespace=args.namespace, max_pods_limit=args.max_pods or None,
|
||||||
)
|
)
|
||||||
k8s.k8s_daemon(args.queue)
|
k8s.k8s_daemon(args.queue)
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ pyhocon>=0.3.38,<0.4.0
|
|||||||
pyparsing>=2.0.3,<2.5.0
|
pyparsing>=2.0.3,<2.5.0
|
||||||
python-dateutil>=2.4.2,<2.9.0
|
python-dateutil>=2.4.2,<2.9.0
|
||||||
pyjwt>=1.6.4,<2.1.0
|
pyjwt>=1.6.4,<2.1.0
|
||||||
PyYAML>=3.12,<5.4.0
|
PyYAML>=3.12,<5.5.0
|
||||||
requests>=2.20.0,<2.26.0
|
requests>=2.20.0,<2.26.0
|
||||||
six>=1.11.0,<1.16.0
|
six>=1.11.0,<1.16.0
|
||||||
typing>=3.6.4,<3.8.0
|
typing>=3.6.4,<3.8.0
|
||||||
|
Loading…
Reference in New Issue
Block a user