Mirror of https://github.com/clearml/clearml-agent, synced 2025-06-26 18:16:15 +00:00

Compare commits (20 commits):

| SHA1 |
|---|
| 42450dcbc4 |
| ef47225d41 |
| e61accefb9 |
| 5c1543d112 |
| 7ff6aee20c |
| 37ea381d98 |
| 67fc884895 |
| 1e3646b57c |
| ba2db4e727 |
| 077148be00 |
| 594ee5842e |
| a69766bd8b |
| 857a750eb1 |
| 26aa50f1b5 |
| 8b4f1eefc2 |
| 97c2e21dcc |
| 918dd39b87 |
| 7776e906c4 |
| 1bf865ec08 |
| 3f1ce847dc |

Combined diff for this range:
README.md:

```diff
@@ -197,7 +197,7 @@ with `--cpu-only`).
 If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for
 the `clearml-agent` <br>
-If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for
+If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES="none"`, no gpu will be allocated for
 the `clearml-agent`
 
 Example: spin two agents, one per gpu on the same machine:
```
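Since the two rules above are easy to conflate, here is a minimal standalone sketch (my own illustration, not agent code) of the GPU allocation behavior the README describes:

```python
import os

def allocated_gpus(cpu_only=False):
    # Sketch of the README rule, not the agent's implementation:
    # --cpu-only or NVIDIA_VISIBLE_DEVICES="none" -> no GPUs;
    # variable unset -> all GPUs; otherwise a comma-separated id list.
    visible = os.environ.get("NVIDIA_VISIBLE_DEVICES")
    if cpu_only or visible == "none":
        return []
    if visible is None or visible == "all":
        return "all"
    return [g.strip() for g in visible.split(",") if g.strip()]

print(allocated_gpus())  # "all" when the variable is unset
```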
docs/clearml.conf:

```diff
@@ -137,6 +137,12 @@
         },
 
         translate_ssh: true,
+
+        # set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
+        # default is false, automatically mounts ~/.ssh
+        # Must be set to True if using "clearml-session" with this agent!
+        # disable_ssh_mount: false
+
         # reload configuration file every daemon execution
         reload_config: false,
 
```
```diff
@@ -235,7 +241,8 @@
     docker_internal_mounts {
         sdk_cache: "/clearml_agent_cache"
         apt_cache: "/var/cache/apt/archives"
-        ssh_folder: "/root/.ssh"
+        ssh_folder: "~/.ssh"
+        ssh_ro_folder: "/.ssh"
         pip_cache: "/root/.cache/pip"
         poetry_cache: "/root/.cache/pypoetry"
         vcs_cache: "/root/.clearml/vcs-cache"
```
Default API config (http section):

```diff
@@ -28,6 +28,9 @@
 
     pool_maxsize: 512
     pool_connections: 512
+
+    # Override the default http method, use "put" if working behind GCP load balancer (default: "get")
+    # default_method: "get"
 }
 
 auth {
```
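The new `default_method` option mirrors the `CLEARML_API_DEFAULT_REQ_METHOD` environment variable that `request.py` validates below; per the config comment, `"put"` is the value to use behind a GCP load balancer. A standalone sketch of the same validation (illustrative, not the library code):

```python
import os

# Illustrative restatement of the validation added in request.py below.
method = os.environ.get("CLEARML_API_DEFAULT_REQ_METHOD", "get")
if method.upper() not in ("GET", "POST", "PUT"):
    raise ValueError("CLEARML_API_DEFAULT_REQ_METHOD must be 'get', 'post' or 'put'")
print("API requests will default to HTTP {}".format(method.upper()))
```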
clearml_agent/backend_api/session/request.py:

```diff
@@ -8,13 +8,14 @@ from .datamodel import DataModel
 from .defs import ENV_API_DEFAULT_REQ_METHOD
 
 
-if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
+if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST", "PUT"):
     raise ValueError(
         "CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get' or 'post' (any case is allowed)."
     )
 
 
 class Request(ApiModel):
+    def_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
     _method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
 
     def __init__(self, **kwargs):
```
clearml_agent/backend_api/session/session.py:

```diff
@@ -14,8 +14,9 @@ from requests.auth import HTTPBasicAuth
 from six.moves.urllib.parse import urlparse, urlunparse
 
 from .callresult import CallResult
-from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
-    ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
+from .defs import (
+    ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
+    ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD, )
 from .request import Request, BatchRequest
 from .token_manager import TokenManager
 from ..config import load
```
```diff
@@ -110,6 +111,19 @@ class Session(TokenManager):
         self._logger = logger
         self.__auth_token = None
 
+        if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
+            # Make sure we update the config object, so we pass it into the new containers when we map them
+            self.config["api.http.default_method"] = ENV_API_DEFAULT_REQ_METHOD.get()
+            # notice the default setting of Request.def_method are already set by the OS environment
+        elif self.config.get("api.http.default_method", None):
+            def_method = str(self.config.get("api.http.default_method", None)).strip()
+            if def_method.upper() not in ("GET", "POST", "PUT"):
+                raise ValueError(
+                    "api.http.default_method variable must be 'get' or 'post' (any case is allowed)."
+                )
+            Request.def_method = def_method
+            Request._method = Request.def_method
+
         if ENV_AUTH_TOKEN.get(
             value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
         ):
```
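Taken together, the resolution order is: environment variable first (copied back into the config so spawned containers inherit it), then `api.http.default_method` from the config, else `"get"`. A compact sketch of that precedence (names simplified for illustration):

```python
import os

def resolve_default_method(config):
    # Environment variable wins and is written back into the config,
    # mirroring the Session.__init__ logic in the hunk above.
    env = os.environ.get("CLEARML_API_DEFAULT_REQ_METHOD")
    if env:
        config["api.http.default_method"] = env
        return env
    method = str(config.get("api.http.default_method", "get")).strip()
    if method.upper() not in ("GET", "POST", "PUT"):
        raise ValueError("api.http.default_method must be 'get', 'post' or 'put'")
    return method

print(resolve_default_method({"api.http.default_method": "put"}))  # put
```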
```diff
@@ -251,7 +265,7 @@ class Session(TokenManager):
         service,
         action,
         version=None,
-        method="get",
+        method=Request.def_method,
         headers=None,
         auth=None,
         data=None,
```
```diff
@@ -328,7 +342,7 @@ class Session(TokenManager):
         service,
         action,
         version=None,
-        method="get",
+        method=Request.def_method,
         headers=None,
         data=None,
         json=None,
```
```diff
@@ -371,7 +385,7 @@ class Session(TokenManager):
         headers=None,
         data=None,
         json=None,
-        method="get",
+        method=Request.def_method,
     ):
         """
         Send a raw batch API request. Batch requests always use application/json-lines content type.
```
```diff
@@ -615,7 +629,7 @@ class Session(TokenManager):
         try:
             data = {"expiration_sec": exp} if exp else {}
             res = self._send_request(
-                method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
+                method=Request.def_method,
                 service="auth",
                 action="login",
                 auth=auth,
```
resolve_default_container helper:

```diff
@@ -1,6 +1,8 @@
 import json
 import re
+import shlex
 
+from clearml_agent.backend_api.session import Request
 from clearml_agent.helper.package.requirements import (
     RequirementsManager, MarkerRequirement,
     compare_version_rules, )
```
```diff
@@ -26,7 +28,7 @@ def resolve_default_container(session, task_id, container_config):
                  'script.repository', 'script.branch',
                  'project', 'container'],
              'search_hidden': True},
-        method='get',
+        method=Request.def_method,
         async_enable=False,
     )
     try:
```
```diff
@@ -53,7 +55,7 @@ def resolve_default_container(session, task_id, container_config):
             'id': [task_info.get('project')],
             'only_fields': ['name'],
         },
-        method='get',
+        method=Request.def_method,
         async_enable=False,
     )
     try:
```
clearml_agent/commands/worker.py:

```diff
@@ -38,7 +38,7 @@ from clearml_agent.backend_api.services import auth as auth_api
 from clearml_agent.backend_api.services import queues as queues_api
 from clearml_agent.backend_api.services import tasks as tasks_api
 from clearml_agent.backend_api.services import workers as workers_api
-from clearml_agent.backend_api.session import CallResult
+from clearml_agent.backend_api.session import CallResult, Request
 from clearml_agent.backend_api.session.defs import (
     ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
     ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
```
```diff
@@ -71,6 +71,8 @@ from clearml_agent.definitions import (
     ENV_AGENT_SKIP_PYTHON_ENV_INSTALL,
     WORKING_STANDALONE_DIR,
     ENV_DEBUG_INFO,
+    ENV_CHILD_AGENTS_COUNT_CMD,
+    ENV_DOCKER_ARGS_FILTERS,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import (
```
```diff
@@ -272,7 +274,7 @@ def get_task(session, task_id, **kwargs):
         action='get_all',
         version='2.14',
         json={"id": [task_id], "search_hidden": True, **kwargs},
-        method='get',
+        method=Request.def_method,
         async_enable=False,
     )
     result = CallResult.from_result(
```
```diff
@@ -304,7 +306,7 @@ def get_next_task(session, queue, get_task_info=False):
         action='get_next_task',
         version='2.14',
         json=request,
-        method='get',
+        method=Request.def_method,
         async_enable=False,
     )
     if not result.ok:
```
```diff
@@ -325,7 +327,7 @@ def get_task_container(session, task_id):
         action='get_all',
         version='2.14',
         json={'id': [task_id], 'only_fields': ['container'], 'search_hidden': True},
-        method='get',
+        method=Request.def_method,
         async_enable=False,
     )
     try:
```
```diff
@@ -366,7 +368,7 @@ def set_task_container(session, task_id, docker_image=None, docker_arguments=Non
         action='edit',
         version='2.13',
         json={'task': task_id, 'container': container, 'force': True},
-        method='get',
+        method=Request.def_method,
         async_enable=False,
     )
     return result.ok
```
```diff
@@ -685,6 +687,16 @@ class Worker(ServiceCommandSection):
         # str - not supported, version string indicates last server version
         self._runtime_props_support = None
 
+        # allow docker sanitization, needs backend support
+        if ENV_DOCKER_ARGS_FILTERS.get():
+            self._docker_args_filters = \
+                [re.compile(f) for f in shlex.split(ENV_DOCKER_ARGS_FILTERS.get())]
+        elif self._session.config.get('agent.docker_args_filters', None):
+            self._docker_args_filters = \
+                [re.compile(f) for f in self._session.config.get('agent.docker_args_filters', [])]
+        else:
+            self._docker_args_filters = []
+
     @classmethod
     def _verify_command_states(cls, kwargs):
         """
```
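`CLEARML_AGENT_DOCKER_ARGS_FILTERS` is read as a whitespace-separated list of regular expressions, hence the `shlex.split` before `re.compile`. For example (value illustrative):

```python
import re
import shlex

# Example value; each token becomes one compiled filter.
env_value = '^--env$ ^-e$'
filters = [re.compile(f) for f in shlex.split(env_value)]
print([f.pattern for f in filters])  # ['^--env$', '^-e$']
```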
```diff
@@ -1379,6 +1391,9 @@ class Worker(ServiceCommandSection):
 
         self._session.print_configuration()
 
+    def resolve_daemon_queue_names(self, queues, create_if_missing=False):
+        return self._resolve_queue_names(queues=queues, create_if_missing=create_if_missing)
+
     def daemon(self, queues, log_level, foreground=False, docker=False, detached=False, order_fairness=False, **kwargs):
         self._apply_extra_configuration()
```
```diff
@@ -1421,7 +1436,7 @@ class Worker(ServiceCommandSection):
 
         # if we do not need to create queues, make sure they are valid
         # match previous behaviour when we validated queue names before everything else
-        queues = self._resolve_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
+        queues = self.resolve_daemon_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
 
         queues_info = [
             q.to_dict()
```
```diff
@@ -3265,6 +3280,11 @@ class Worker(ServiceCommandSection):
             first_time=first_time,
         )
 
+        # print message so users know they can enable cache
+        if not self.package_api.is_cached_enabled():
+            print('::: Python virtual environment cache is disabled. '
+                  'To accelerate spin-up time set `agent.venvs_cache.path=~/.clearml/venvs-cache` :::\n')
+
         # check if we have a cached folder
         if cached_requirements and not skip_pip_venv_install and self.package_api.get_cached_venv(
             requirements=cached_requirements,
```
```diff
@@ -3435,7 +3455,7 @@ class Worker(ServiceCommandSection):
                 '-v', '{}:{}'.format(ENV_SSH_AUTH_SOCK.get(), ENV_SSH_AUTH_SOCK.get()),
                 '-e', ssh_auth_sock_env,
             ]
-        elif ENV_AGENT_DISABLE_SSH_MOUNT.get():
+        elif ENV_AGENT_DISABLE_SSH_MOUNT.get() or self._session.config.get("agent.disable_ssh_mount", None):
             self._host_ssh_cache = None
         else:
             self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
```
```diff
@@ -3542,6 +3562,7 @@ class Worker(ServiceCommandSection):
         mounted_vcs_cache = temp_config.get("agent.vcs_cache.path")
         mounted_venvs_cache = temp_config.get("agent.venvs_cache.path", "")
         mount_ssh = temp_config.get("agent.docker_internal_mounts.ssh_folder", None)
+        mount_ssh_ro = temp_config.get("agent.docker_internal_mounts.ssh_ro_folder", None)
         mount_apt_cache = temp_config.get("agent.docker_internal_mounts.apt_cache", None)
         mount_pip_cache = temp_config.get("agent.docker_internal_mounts.pip_cache", None)
         mount_poetry_cache = temp_config.get("agent.docker_internal_mounts.poetry_cache", None)
```
```diff
@@ -3573,6 +3594,7 @@ class Worker(ServiceCommandSection):
             preprocess_bash_script=preprocess_bash_script,
             install_opencv_libs=install_opencv_libs,
             mount_ssh=mount_ssh,
+            mount_ssh_ro=mount_ssh_ro,
             mount_apt_cache=mount_apt_cache,
             mount_pip_cache=mount_pip_cache,
             mount_poetry_cache=mount_poetry_cache,
```
```diff
@@ -3584,15 +3606,11 @@ class Worker(ServiceCommandSection):
     def _get_child_agents_count_for_worker(self):
         """Get the amount of running child agents. In case of any error return 0"""
         parent_worker_label = self._parent_worker_label.format(self.worker_id)
-        cmd = [
-            'docker',
-            'ps',
-            '--filter',
-            'label={}'.format(parent_worker_label),
-            '--format',
-            # get some fields for debugging
-            '{"ID":"{{ .ID }}", "Image": "{{ .Image }}", "Names":"{{ .Names }}", "Labels":"{{ .Labels }}"}'
-        ]
+
+        default_cmd = 'docker ps --filter label={parent_worker_label} --format {{{{.ID}}}}'
+        child_agents_cmd = ENV_CHILD_AGENTS_COUNT_CMD.get() or default_cmd
+
+        cmd = shlex.split(child_agents_cmd.format(parent_worker_label=parent_worker_label))
         try:
             output = Argv(*cmd).get_output(
                 stderr=subprocess.STDOUT
```
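The hard-coded `docker ps` argv becomes a format string, overridable via `CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD`; the quadrupled braces survive `str.format` as the literal `{{.ID}}` Go template that `docker ps --format` expects. A quick check of the escaping (label value is illustrative):

```python
import shlex

default_cmd = 'docker ps --filter label={parent_worker_label} --format {{{{.ID}}}}'
cmd = shlex.split(default_cmd.format(parent_worker_label='parent-worker=my-worker:0'))
print(cmd)
# ['docker', 'ps', '--filter', 'label=parent-worker=my-worker:0', '--format', '{{.ID}}']
```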
```diff
@@ -3603,6 +3621,31 @@ class Worker(ServiceCommandSection):
 
         return len(output.splitlines()) if output else 0
 
+    def _filter_docker_args(self, docker_args):
+        # type: (List[str]) -> List[str]
+        """
+        Filter docker args matching specific flags.
+        Supports list of Regular expressions, e.g self._docker_args_filters = ["^--env$", "^-e$"]
+
+        :argument docker_args: List of docker argument strings (flags and values)
+        """
+        # if no filtering, do nothing
+        if not docker_args or not self._docker_args_filters:
+            return docker_args
+
+        args = docker_args[:]
+        results = []
+        while args:
+            cmd = args.pop(0).strip()
+            if any(f.match(cmd) for f in self._docker_args_filters):
+                results.append(cmd)
+                if "=" not in cmd and args and not args[0].startswith("-"):
+                    try:
+                        results.append(args.pop(0).strip())
+                    except IndexError:
+                        pass
+        return results
+
     def _get_docker_cmd(
             self,
             worker_id, parent_worker_id,
```
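`_filter_docker_args` acts as an allow-list: a flag is kept only when it matches one of the compiled filters, and the token after a bare flag (no `=`, next token not starting with `-`) is kept as its value. Re-running the same logic standalone with example inputs:

```python
import re

filters = [re.compile(f) for f in ("^--env$", "^-e$")]  # example filters
args = ["--env", "AWS_REGION=us-east-1", "-v", "/tmp:/tmp", "-e", "FOO=bar"]

results, rest = [], args[:]
while rest:
    token = rest.pop(0).strip()
    if any(f.match(token) for f in filters):
        results.append(token)
        if "=" not in token and rest and not rest[0].startswith("-"):
            results.append(rest.pop(0).strip())

print(results)  # ['--env', 'AWS_REGION=us-east-1', '-e', 'FOO=bar']; '-v /tmp:/tmp' is dropped
```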
```diff
@@ -3626,7 +3669,7 @@ class Worker(ServiceCommandSection):
             auth_token=None,
             worker_tags=None,
             name=None,
-            mount_ssh=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
+            mount_ssh=None, mount_ssh_ro=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
             env_task_id=None,
     ):
         self.debug("Constructing docker command", context="docker")
```
```diff
@@ -3656,6 +3699,7 @@ class Worker(ServiceCommandSection):
         if docker_arguments:
             docker_arguments = list(docker_arguments) \
                 if isinstance(docker_arguments, (list, tuple)) else [docker_arguments]
+            docker_arguments = self._filter_docker_args(docker_arguments)
             base_cmd += [a for a in docker_arguments if a]
 
         if extra_docker_arguments:
```
```diff
@@ -3770,7 +3814,7 @@ class Worker(ServiceCommandSection):
         clearml_agent_wheel = 'clearml-agent{specify_version}'.format(specify_version=specify_version)
 
         mount_ssh = mount_ssh or '/root/.ssh'
-        mount_ssh_ro = "{}_ro".format(mount_ssh.rstrip("/"))
+        mount_ssh_ro = mount_ssh_ro or "{}_ro".format(mount_ssh.rstrip("/"))
         mount_apt_cache = mount_apt_cache or '/var/cache/apt/archives'
         mount_pip_cache = mount_pip_cache or '/root/.cache/pip'
         mount_poetry_cache = mount_poetry_cache or '/root/.cache/pypoetry'
```
clearml_agent/definitions.py:

```diff
@@ -149,6 +149,8 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEAR
 ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
 ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
 ENV_DEBUG_INFO = EnvironmentConfig('CLEARML_AGENT_DEBUG_INFO')
+ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig('CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD')
+ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig('CLEARML_AGENT_DOCKER_ARGS_FILTERS')
 
 ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
 """
```
clearml_agent/glue/k8s.py:

```diff
@@ -11,6 +11,7 @@ import subprocess
 import tempfile
 from copy import deepcopy
 from pathlib import Path
+from pprint import pformat
 from threading import Thread
 from time import sleep
 from typing import Text, List, Callable, Any, Collection, Optional, Union
```
```diff
@@ -26,8 +27,8 @@ from clearml_agent.helper.dicts import merge_dicts
 from clearml_agent.helper.process import get_bash_output
 from clearml_agent.helper.resource_monitor import ResourceMonitor
 from clearml_agent.interface.base import ObjectID
 
-from .definitions import ENV_START_AGENT_SCRIPT_PATH
+from clearml_agent.backend_api.session import Request
+from clearml_agent.glue.definitions import ENV_START_AGENT_SCRIPT_PATH
 
-
 class K8sIntegration(Worker):
```
```diff
@@ -75,8 +76,8 @@ class K8sIntegration(Worker):
         "export LOCAL_PYTHON=$(which python3.$i) && break ; done",
         "[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
         "[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
-        "$LOCAL_PYTHON -m pip install clearml-agent",
         "{extra_bash_init_cmd}",
+        "$LOCAL_PYTHON -m pip install clearml-agent",
         "{extra_docker_bash_script}",
         "$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
     ]
```
```diff
@@ -130,6 +131,7 @@ class K8sIntegration(Worker):
         """
         super(K8sIntegration, self).__init__()
         self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
+        self.k8s_pending_queue_id = None
         self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
         self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
         # Always do system packages, because by we will be running inside a docker
```
```diff
@@ -297,7 +299,7 @@ class K8sIntegration(Worker):
             service='tasks',
             action='update',
             json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
-            method='get',
+            method=Request.def_method,
             async_enable=False,
         )
         if not result.ok:
```
```diff
@@ -394,17 +396,17 @@ class K8sIntegration(Worker):
         # push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
         try:
             print('Pushing task {} into temporary pending queue'.format(task_id))
-            res = self._session.api_client.tasks.stop(task_id, force=True)
+            _ = self._session.api_client.tasks.stop(task_id, force=True)
             res = self._session.api_client.tasks.enqueue(
                 task_id,
-                queue=self.k8s_pending_queue_name,
+                queue=self.k8s_pending_queue_id,
                 status_reason='k8s pending scheduler',
             )
             if res.meta.result_code != 200:
                 raise Exception(res.meta.result_msg)
         except Exception as e:
-            self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
-                task_id, self.k8s_pending_queue_name, e))
+            self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
+                task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
             return
 
         container = get_task_container(self._session, task_id)
```
```diff
@@ -679,6 +681,8 @@ class K8sIntegration(Worker):
         with open(yaml_file, 'wt') as f:
             yaml.dump(template, f)
 
+        self.log.debug("Applying template:\n{}".format(pformat(template, indent=2)))
+
         kubectl_cmd = self.KUBECTL_APPLY_CMD.format(
             task_id=task_id,
             docker_image=docker_image,
```
```diff
@@ -765,13 +769,13 @@ class K8sIntegration(Worker):
         events_service = self.get_service(Events)
 
         # make sure we have a k8s pending queue
-        # noinspection PyBroadException
-        try:
-            self._session.api_client.queues.create(self.k8s_pending_queue_name)
-        except Exception:
-            pass
-        # get queue id
-        self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")
+        if not self.k8s_pending_queue_id:
+            resolved_ids = self._resolve_queue_names([self.k8s_pending_queue_name], create_if_missing=True)
+            if not resolved_ids:
+                raise ValueError(
+                    "Failed resolving or creating k8s pending queue {}".format(self.k8s_pending_queue_name)
+                )
+            self.k8s_pending_queue_id = resolved_ids[0]
 
         _last_machine_update_ts = 0
         while True:
```
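Instead of re-creating the pending queue on every daemon loop, the glue now resolves (or creates) it once and caches the id. A minimal sketch of that pattern, with the resolver stubbed out for illustration:

```python
def resolve_or_create_queue_id(name, resolver, cached_id=None):
    # Resolve once, then reuse the cached id on later calls.
    if cached_id:
        return cached_id
    ids = resolver([name], create_if_missing=True)
    if not ids:
        raise ValueError("Failed resolving or creating k8s pending queue {}".format(name))
    return ids[0]

# Stub resolver standing in for the agent's _resolve_queue_names:
print(resolve_or_create_queue_id("k8s_scheduler", lambda names, create_if_missing: ["queue-id-123"]))
```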
clearml_agent/helper/package/base.py (PackageManager):

```diff
@@ -213,6 +213,13 @@ class PackageManager(object):
             return
         return self._get_cache_manager().get_last_copied_entry()
 
+    def is_cached_enabled(self):
+        if not self._cache_manager:
+            cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
+            if not cache_folder:
+                return False
+        return True
+
     @classmethod
     def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
         # type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
```
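`is_cached_enabled()` returns False only when there is neither a live cache manager nor a configured cache folder; it is what gates the venv-cache hint printed from `worker.py` above. A simplified restatement with the config lookup reduced to a parameter:

```python
def is_cached_enabled(cache_manager, cache_folder):
    # False only when no cache manager exists and no folder is configured.
    if not cache_manager and not cache_folder:
        return False
    return True

print(is_cached_enabled(None, None))                      # False -> the hint is printed
print(is_cached_enabled(None, "~/.clearml/venvs-cache"))  # True  -> venv cache in use
```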
clearml_agent/helper/package/pytorch.py:

```diff
@@ -53,17 +53,16 @@ class PytorchWheel(object):
     python = attr.ib(type=str, converter=lambda x: str(x).replace(".", ""))
     torch_version = attr.ib(type=str, converter=fix_version)
 
-    url_template = (
-        "http://download.pytorch.org/whl/"
-        "{0.cuda_version}/torch-{0.torch_version}-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
-    )
+    url_template_prefix = "http://download.pytorch.org/whl/"
+    url_template = "{0.cuda_version}/torch-{0.torch_version}" \
+                   "-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
 
     def __attrs_post_init__(self):
         self.unicode = "u" if self.python.startswith("2") else ""
 
     def make_url(self):
         # type: () -> Text
-        return self.url_template.format(self)
+        return (self.url_template_prefix + self.url_template).format(self)
```
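Splitting the URL into `url_template_prefix` plus `url_template` lets a config override swap just the mirror prefix (the override hooks land in the next hunk). Recomposing the URL with illustrative field values:

```python
url_template_prefix = "http://download.pytorch.org/whl/"
url_template = ("{0.cuda_version}/torch-{0.torch_version}"
                "-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl")

class Wheel:  # stand-in for PytorchWheel with example attribute values
    cuda_version = "cu113"
    torch_version = "1.12.1"
    python = "39"
    unicode = ""
    os_name = "linux_x86_64"

print((url_template_prefix + url_template).format(Wheel()))
# http://download.pytorch.org/whl/cu113/torch-1.12.1-cp39-cp39m-linux_x86_64.whl
```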
```diff
@@ -183,6 +182,19 @@ class PytorchRequirement(SimpleSubstitution):
         self._fix_setuptools = None
         self.exceptions = []
         self._original_req = []
+        # allow override pytorch lookup pages
+        if self.config.get("agent.package_manager.torch_page", None):
+            SimplePytorchRequirement.page_lookup_template = \
+                self.config.get("agent.package_manager.torch_page", None)
+        if self.config.get("agent.package_manager.torch_nightly_page", None):
+            SimplePytorchRequirement.nightly_page_lookup_template = \
+                self.config.get("agent.package_manager.torch_nightly_page", None)
+        if self.config.get("agent.package_manager.torch_url_template_prefix", None):
+            PytorchWheel.url_template_prefix = \
+                self.config.get("agent.package_manager.torch_url_template_prefix", None)
+        if self.config.get("agent.package_manager.torch_url_template", None):
+            PytorchWheel.url_template = \
+                self.config.get("agent.package_manager.torch_url_template", None)
 
     def _init_python_ver_cuda_ver(self):
         if self.cuda is None:
```
clearml_agent/helper/resource_monitor.py:

```diff
@@ -92,9 +92,10 @@ class ResourceMonitor(object):
         # None means no filtering, report all gpus
         self._active_gpus = None
         try:
-            active_gpus = Session.get_nvidia_visible_env() or ""
-            if active_gpus:
-                self._active_gpus = [g.strip() for g in active_gpus.split(',')]
+            active_gpus = Session.get_nvidia_visible_env()
+            # None means no filtering, report all gpus
+            if active_gpus and active_gpus != "all":
+                self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
         except Exception:
             pass
```
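The rewrite makes `NVIDIA_VISIBLE_DEVICES=all` behave like an unset variable (report every GPU) instead of being treated as a literal device list; the same parsing, runnable standalone:

```python
def active_gpus(visible):
    # None result means "no filtering, report all GPUs".
    if visible and visible != "all":
        return [g.strip() for g in str(visible).split(",")]
    return None

print(active_gpus("0,1"))  # ['0', '1']
print(active_gpus("all"))  # None
print(active_gpus(None))   # None
```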
clearml_agent/session.py:

```diff
@@ -288,7 +288,7 @@ class Session(_Session):
     def get(self, service, action, version=None, headers=None,
             data=None, json=None, async_enable=False, **kwargs):
         return self._manual_request(service=service, action=action,
-                                    version=version, method="get", headers=headers,
+                                    version=version, method=Request.def_method, headers=headers,
                                     data=data, async_enable=async_enable,
                                     json=json or kwargs)
```
```diff
@@ -299,7 +299,7 @@ class Session(_Session):
                                     data=data, async_enable=async_enable,
                                     json=json or kwargs)
 
-    def _manual_request(self, service, action, version=None, method="get", headers=None,
+    def _manual_request(self, service, action, version=None, method=Request.def_method, headers=None,
                         data=None, json=None, async_enable=False, **kwargs):
 
         res = self.send_request(service=service, action=action,
```
clearml_agent/version.py:

```diff
@@ -1 +1 @@
-__version__ = '1.3.0'
+__version__ = '1.4.1'
```
docs/clearml.conf:

```diff
@@ -136,6 +136,12 @@ agent {
         },
 
         translate_ssh: true,
+
+        # set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
+        # default is false, automatically mounts ~/.ssh
+        # Must be set to True if using "clearml-session" with this agent!
+        # disable_ssh_mount: false
+
         # reload configuration file every daemon execution
         reload_config: false,
 
```