Compare commits

...

4 Commits

4 changed files with 39 additions and 23 deletions

View File

@@ -235,7 +235,8 @@
docker_internal_mounts {
sdk_cache: "/clearml_agent_cache"
apt_cache: "/var/cache/apt/archives"
ssh_folder: "/root/.ssh"
ssh_folder: "~/.ssh"
ssh_ro_folder: "/.ssh"
pip_cache: "/root/.cache/pip"
poetry_cache: "/root/.cache/pypoetry"
vcs_cache: "/root/.clearml/vcs-cache"

View File

@@ -347,7 +347,7 @@ class ServiceCommandSection(BaseCommandSection):
except AttributeError:
raise NameResolutionError('Name resolution unavailable for {}'.format(service))
request = request_cls.from_dict(dict(name=name, only_fields=['name', 'id']))
request = request_cls.from_dict(dict(name=re.escape(name), only_fields=['name', 'id']))
# from_dict will ignore unrecognised keyword arguments - not all GetAll's have only_fields
response = getattr(self._session.send_api(request), service)
matches = [db_object for db_object in response if name.lower() == db_object.name.lower()]

View File

@@ -3503,8 +3503,16 @@ class Worker(ServiceCommandSection):
shutil.rmtree(host_ssh_cache, ignore_errors=True)
shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache)
except Exception:
host_ssh_cache = None
self.log.warning('Failed creating temporary copy of ~/.ssh for git credential')
# if we failed to copy / delete, let's see if we
self.log.warning('Failed creating temporary copy of ~/.ssh for git credential, '
'creating a new temp folder')
# noinspection PyBroadException
try:
host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache)
except Exception:
self.log.warning('Failed creating temporary copy of ~/.ssh for git credential, removing mount!')
host_ssh_cache = None
# check if the .git credentials exist:
try:
@@ -3534,6 +3542,7 @@ class Worker(ServiceCommandSection):
mounted_vcs_cache = temp_config.get("agent.vcs_cache.path")
mounted_venvs_cache = temp_config.get("agent.venvs_cache.path", "")
mount_ssh = temp_config.get("agent.docker_internal_mounts.ssh_folder", None)
mount_ssh_ro = temp_config.get("agent.docker_internal_mounts.ssh_ro_folder", None)
mount_apt_cache = temp_config.get("agent.docker_internal_mounts.apt_cache", None)
mount_pip_cache = temp_config.get("agent.docker_internal_mounts.pip_cache", None)
mount_poetry_cache = temp_config.get("agent.docker_internal_mounts.poetry_cache", None)
@@ -3565,6 +3574,7 @@ class Worker(ServiceCommandSection):
preprocess_bash_script=preprocess_bash_script,
install_opencv_libs=install_opencv_libs,
mount_ssh=mount_ssh,
mount_ssh_ro=mount_ssh_ro,
mount_apt_cache=mount_apt_cache,
mount_pip_cache=mount_pip_cache,
mount_poetry_cache=mount_poetry_cache,
@@ -3618,7 +3628,7 @@ class Worker(ServiceCommandSection):
auth_token=None,
worker_tags=None,
name=None,
mount_ssh=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
mount_ssh=None, mount_ssh_ro=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
env_task_id=None,
):
self.debug("Constructing docker command", context="docker")
@@ -3761,6 +3771,12 @@ class Worker(ServiceCommandSection):
# clearml-agent{specify_version}
clearml_agent_wheel = 'clearml-agent{specify_version}'.format(specify_version=specify_version)
mount_ssh = mount_ssh or '/root/.ssh'
mount_ssh_ro = mount_ssh_ro or "{}_ro".format(mount_ssh.rstrip("/"))
mount_apt_cache = mount_apt_cache or '/var/cache/apt/archives'
mount_pip_cache = mount_pip_cache or '/root/.cache/pip'
mount_poetry_cache = mount_poetry_cache or '/root/.cache/pypoetry'
if not standalone_mode:
if not bash_script:
# Find the highest python version installed, or install from apt-get
@@ -3771,6 +3787,7 @@ class Worker(ServiceCommandSection):
"export DEBIAN_FRONTEND=noninteractive",
"export CLEARML_APT_INSTALL=\"$CLEARML_APT_INSTALL{}\"".format(
' libsm6 libxext6 libxrender-dev libglib2.0-0' if install_opencv_libs else ""),
"cp -Rf {mount_ssh_ro} -T {mount_ssh}" if host_ssh_cache else "",
"[ ! -z $(which git) ] || export CLEARML_APT_INSTALL=\"$CLEARML_APT_INSTALL git\"",
"declare LOCAL_PYTHON",
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which {python_single_digit}.$i && " +
@@ -3798,7 +3815,9 @@ class Worker(ServiceCommandSection):
"$LOCAL_PYTHON -m pip install -U {clearml_agent_wheel} ; ").format(
python_single_digit=python_version.split('.')[0],
python=python_version, pip_version=PackageManager.get_pip_version(),
clearml_agent_wheel=clearml_agent_wheel)
clearml_agent_wheel=clearml_agent_wheel,
mount_ssh_ro=mount_ssh_ro, mount_ssh=mount_ssh,
)
if host_git_credentials:
for git_credentials in host_git_credentials:
@@ -3810,11 +3829,6 @@ class Worker(ServiceCommandSection):
for line in docker_bash_setup_script.split('\n') if line.strip()) + \
' ; '
mount_ssh = mount_ssh or '/root/.ssh'
mount_apt_cache = mount_apt_cache or '/var/cache/apt/archives'
mount_pip_cache = mount_pip_cache or '/root/.cache/pip'
mount_poetry_cache = mount_poetry_cache or '/root/.cache/pypoetry'
self.debug(
"Adding mounts: host_ssh_cache={}, host_apt_cache={}, host_pip_cache={}, host_poetry_cache={}, "
"host_pip_dl={}, host_cache={}, host_vcs_cache={}, host_venvs_cache={}".format(
@@ -3828,7 +3842,7 @@ class Worker(ServiceCommandSection):
(['--name', name] if name else []) +
['-v', conf_file+':'+DOCKER_ROOT_CONF_FILE] +
['-e', "CLEARML_CONFIG_FILE={}".format(DOCKER_ROOT_CONF_FILE)] +
(['-v', host_ssh_cache+':'+mount_ssh] if host_ssh_cache else []) +
(['-v', host_ssh_cache+':'+mount_ssh_ro] if host_ssh_cache else []) +
(['-v', host_apt_cache+':'+mount_apt_cache] if host_apt_cache else []) +
(['-v', host_pip_cache+':'+mount_pip_cache] if host_pip_cache else []) +
(['-v', host_poetry_cache + ':'+mount_poetry_cache] if host_poetry_cache else []) +

View File

@@ -130,6 +130,7 @@ class K8sIntegration(Worker):
"""
super(K8sIntegration, self).__init__()
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
self.k8s_pending_queue_id = None
self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
# Always do system packages, because by we will be running inside a docker
@@ -394,17 +395,17 @@ class K8sIntegration(Worker):
# push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
try:
print('Pushing task {} into temporary pending queue'.format(task_id))
res = self._session.api_client.tasks.stop(task_id, force=True)
_ = self._session.api_client.tasks.stop(task_id, force=True)
res = self._session.api_client.tasks.enqueue(
task_id,
queue=self.k8s_pending_queue_name,
queue=self.k8s_pending_queue_id,
status_reason='k8s pending scheduler',
)
if res.meta.result_code != 200:
raise Exception(res.meta.result_msg)
except Exception as e:
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
task_id, self.k8s_pending_queue_name, e))
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
return
container = get_task_container(self._session, task_id)
@@ -765,13 +766,13 @@ class K8sIntegration(Worker):
events_service = self.get_service(Events)
# make sure we have a k8s pending queue
# noinspection PyBroadException
try:
self._session.api_client.queues.create(self.k8s_pending_queue_name)
except Exception:
pass
# get queue id
self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")
if not self.k8s_pending_queue_id:
resolved_ids = self._resolve_queue_names([self.k8s_pending_queue_name], create_if_missing=True)
if not resolved_ids:
raise ValueError(
"Failed resolving or creating k8s pending queue {}".format(self.k8s_pending_queue_name)
)
self.k8s_pending_queue_id = resolved_ids[0]
_last_machine_update_ts = 0
while True: