Compare commits


20 Commits

Author SHA1 Message Date
allegroai
42450dcbc4 Update clearml.conf 2022-10-07 15:33:19 +03:00
allegroai
ef47225d41 Version bump to v1.4.1 2022-10-07 15:27:49 +03:00
allegroai
e61accefb9 PEP8 + refactor 2022-10-07 15:26:31 +03:00
allegroai
5c1543d112 Add agent.disable_ssh_mount configuration option (same as CLEARML_AGENT_DISABLE_SSH_MOUNT env var) 2022-10-07 15:24:39 +03:00
allegroai
7ff6aee20c Add warning if venv cache is disabled 2022-10-07 15:23:10 +03:00
allegroai
37ea381d98 Add support for docker args filters 2022-10-07 15:22:42 +03:00
allegroai
67fc884895 Fix --gpus all not reporting GPU stats on worker machine 2022-10-07 15:22:13 +03:00
allegroai
1e3646b57c Fix docker command for monitoring child agents 2022-10-07 15:21:32 +03:00
allegroai
ba2db4e727 Version bump to v1.4.0 2022-09-29 18:21:04 +03:00
allegroai
077148be00 version bump 2022-09-16 17:29:42 +03:00
allegroai
594ee5842e Allow overriding the pytorch lookup page: "agent.package_manager.torch_page / torch_nightly_page / torch_url_template_prefix" 2022-09-15 20:16:41 +03:00
allegroai
a69766bd8b Add CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD to allow overriding child agent count command in k8s 2022-09-15 20:16:01 +03:00
allegroai
857a750eb1 Fix GCP load balancer not forwarding GET request body; allow changing the default request method to PUT/POST/GET (see api.http.default_method or CLEARML_API_DEFAULT_REQ_METHOD) 2022-09-15 20:15:42 +03:00
allegroai
26aa50f1b5 Fix k8s glue extra_bash_init_cmd location in initial bash script 2022-09-02 23:50:03 +03:00
allegroai
8b4f1eefc2 Add more debug printouts in k8s glue 2022-09-02 23:49:28 +03:00
allegroai
97c2e21dcc Fix k8s pending queue resolution possibly creating a queue with a UUID name 2022-09-02 23:49:28 +03:00
allegroai
918dd39b87 Add docker ssh_ro_folder (default: "/.ssh"), change docker ssh_folder default to "~/.ssh" 2022-09-02 23:49:27 +03:00
allegroai
7776e906c4 Fix second .ssh temp mount failing if the container changes the files inside 2022-09-02 23:49:27 +03:00
allegroai
1bf865ec08 Fix name not escaped as regex (all services "get_all" use regex for name) 2022-09-02 23:49:27 +03:00
Luca Cerone
3f1ce847dc Fixed documentation (#117)
* Fixed documentation

* Update README.md
2022-09-01 17:18:48 +03:00
15 changed files with 158 additions and 55 deletions

View File

@@ -197,7 +197,7 @@ with `--cpu-only`).
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for
the `clearml-agent` <br>
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES="none"`, no gpu will be allocated for
the `clearml-agent`
Example: spin two agents, one per gpu on the same machine:
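
As a quick illustration of the visibility rule above (a hypothetical helper, not the README's two-agent example, which is not shown in this hunk):

    import os

    def gpus_visible_to_agent(cpu_only=False):
        # --cpu-only or NVIDIA_VISIBLE_DEVICES="none" -> no GPUs for the agent;
        # NVIDIA_VISIBLE_DEVICES unset -> all GPUs; otherwise the listed ids
        visible = os.environ.get("NVIDIA_VISIBLE_DEVICES")
        if cpu_only or visible == "none":
            return []
        if visible is None or visible == "all":
            return "all"
        return [g.strip() for g in visible.split(",") if g.strip()]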

View File

@@ -137,6 +137,12 @@
},
translate_ssh: true,
# set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
# default is false, automatically mounts ~/.ssh
# Must be set to True if using "clearml-session" with this agent!
# disable_ssh_mount: false
# reload configuration file every daemon execution
reload_config: false,
@@ -235,7 +241,8 @@
docker_internal_mounts {
sdk_cache: "/clearml_agent_cache"
apt_cache: "/var/cache/apt/archives"
ssh_folder: "/root/.ssh"
ssh_folder: "~/.ssh"
ssh_ro_folder: "/.ssh"
pip_cache: "/root/.cache/pip"
poetry_cache: "/root/.cache/pypoetry"
vcs_cache: "/root/.clearml/vcs-cache"
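
A rough sketch of how these keys could drive the ssh mount (hypothetical helper with a dict-based config; the real agent resolves them through its configuration object):

    def build_ssh_mount_args(config, host_ssh_cache):
        # config is assumed to be a flat dict keyed by the dotted option names above
        if config.get("agent.disable_ssh_mount"):
            return []  # setting CLEARML_AGENT_DISABLE_SSH_MOUNT has the same effect
        ssh_ro = config.get("agent.docker_internal_mounts.ssh_ro_folder", "/.ssh")
        # the host copy is mounted to the read-only location; the writable ssh_folder
        # is expected to be populated inside the container, so changes made in the
        # container no longer break the temporary host mount
        return ["-v", "{}:{}".format(host_ssh_cache, ssh_ro)]

    print(build_ssh_mount_args({}, "/tmp/clearml_agent.ssh.example"))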

View File

@@ -28,6 +28,9 @@
pool_maxsize: 512
pool_connections: 512
# Override the default http method, use "put" if working behind GCP load balancer (default: "get")
# default_method: "get"
}
auth {

View File

@@ -8,13 +8,14 @@ from .datamodel import DataModel
from .defs import ENV_API_DEFAULT_REQ_METHOD
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST", "PUT"):
raise ValueError(
"CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get' or 'post' (any case is allowed)."
)
class Request(ApiModel):
def_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
def __init__(self, **kwargs):

View File

@@ -14,8 +14,9 @@ from requests.auth import HTTPBasicAuth
from six.moves.urllib.parse import urlparse, urlunparse
from .callresult import CallResult
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
from .defs import (
ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD, )
from .request import Request, BatchRequest
from .token_manager import TokenManager
from ..config import load
@@ -110,6 +111,19 @@ class Session(TokenManager):
self._logger = logger
self.__auth_token = None
if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
# Make sure we update the config object, so we pass it into the new containers when we map them
self.config["api.http.default_method"] = ENV_API_DEFAULT_REQ_METHOD.get()
# notice the default setting of Request.def_method are already set by the OS environment
elif self.config.get("api.http.default_method", None):
def_method = str(self.config.get("api.http.default_method", None)).strip()
if def_method.upper() not in ("GET", "POST", "PUT"):
raise ValueError(
"api.http.default_method variable must be 'get' or 'post' (any case is allowed)."
)
Request.def_method = def_method
Request._method = Request.def_method
if ENV_AUTH_TOKEN.get(
value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
):
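
In practice the precedence implemented above is: the CLEARML_API_DEFAULT_REQ_METHOD environment variable wins, then the api.http.default_method config key, then the built-in "get". A condensed stand-alone sketch (resolve_default_method is not a real function in the codebase):

    import os

    def resolve_default_method(config):
        method = (os.environ.get("CLEARML_API_DEFAULT_REQ_METHOD")
                  or config.get("api.http.default_method")
                  or "get").strip()
        if method.upper() not in ("GET", "POST", "PUT"):
            raise ValueError("default request method must be 'get', 'post' or 'put' (any case)")
        return method

    print(resolve_default_method({"api.http.default_method": "put"}))  # e.g. behind a GCP load balancer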
@@ -251,7 +265,7 @@ class Session(TokenManager):
service,
action,
version=None,
method="get",
method=Request.def_method,
headers=None,
auth=None,
data=None,
@@ -328,7 +342,7 @@ class Session(TokenManager):
service,
action,
version=None,
method="get",
method=Request.def_method,
headers=None,
data=None,
json=None,
@@ -371,7 +385,7 @@ class Session(TokenManager):
headers=None,
data=None,
json=None,
method="get",
method=Request.def_method,
):
"""
Send a raw batch API request. Batch requests always use application/json-lines content type.
@@ -615,7 +629,7 @@ class Session(TokenManager):
try:
data = {"expiration_sec": exp} if exp else {}
res = self._send_request(
method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
method=Request.def_method,
service="auth",
action="login",
auth=auth,

View File

@@ -1,6 +1,8 @@
import json
import re
import shlex
from clearml_agent.backend_api.session import Request
from clearml_agent.helper.package.requirements import (
RequirementsManager, MarkerRequirement,
compare_version_rules, )
@@ -26,7 +28,7 @@ def resolve_default_container(session, task_id, container_config):
'script.repository', 'script.branch',
'project', 'container'],
'search_hidden': True},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
@@ -53,7 +55,7 @@ def resolve_default_container(session, task_id, container_config):
'id': [task_info.get('project')],
'only_fields': ['name'],
},
method='get',
method=Request.def_method,
async_enable=False,
)
try:

View File

@@ -38,7 +38,7 @@ from clearml_agent.backend_api.services import auth as auth_api
from clearml_agent.backend_api.services import queues as queues_api
from clearml_agent.backend_api.services import tasks as tasks_api
from clearml_agent.backend_api.services import workers as workers_api
from clearml_agent.backend_api.session import CallResult
from clearml_agent.backend_api.session import CallResult, Request
from clearml_agent.backend_api.session.defs import (
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
@@ -71,6 +71,8 @@ from clearml_agent.definitions import (
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL,
WORKING_STANDALONE_DIR,
ENV_DEBUG_INFO,
ENV_CHILD_AGENTS_COUNT_CMD,
ENV_DOCKER_ARGS_FILTERS,
)
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
from clearml_agent.errors import (
@@ -272,7 +274,7 @@ def get_task(session, task_id, **kwargs):
action='get_all',
version='2.14',
json={"id": [task_id], "search_hidden": True, **kwargs},
method='get',
method=Request.def_method,
async_enable=False,
)
result = CallResult.from_result(
@@ -304,7 +306,7 @@ def get_next_task(session, queue, get_task_info=False):
action='get_next_task',
version='2.14',
json=request,
method='get',
method=Request.def_method,
async_enable=False,
)
if not result.ok:
@@ -325,7 +327,7 @@ def get_task_container(session, task_id):
action='get_all',
version='2.14',
json={'id': [task_id], 'only_fields': ['container'], 'search_hidden': True},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
@@ -366,7 +368,7 @@ def set_task_container(session, task_id, docker_image=None, docker_arguments=Non
action='edit',
version='2.13',
json={'task': task_id, 'container': container, 'force': True},
method='get',
method=Request.def_method,
async_enable=False,
)
return result.ok
@@ -685,6 +687,16 @@ class Worker(ServiceCommandSection):
# str - not supported, version string indicates last server version
self._runtime_props_support = None
# allow docker sanitization, needs backend support
if ENV_DOCKER_ARGS_FILTERS.get():
self._docker_args_filters = \
[re.compile(f) for f in shlex.split(ENV_DOCKER_ARGS_FILTERS.get())]
elif self._session.config.get('agent.docker_args_filters', None):
self._docker_args_filters = \
[re.compile(f) for f in self._session.config.get('agent.docker_args_filters', [])]
else:
self._docker_args_filters = []
@classmethod
def _verify_command_states(cls, kwargs):
"""
@@ -1379,6 +1391,9 @@ class Worker(ServiceCommandSection):
self._session.print_configuration()
def resolve_daemon_queue_names(self, queues, create_if_missing=False):
return self._resolve_queue_names(queues=queues, create_if_missing=create_if_missing)
def daemon(self, queues, log_level, foreground=False, docker=False, detached=False, order_fairness=False, **kwargs):
self._apply_extra_configuration()
@@ -1421,7 +1436,7 @@ class Worker(ServiceCommandSection):
# if we do not need to create queues, make sure they are valid
# match previous behaviour when we validated queue names before everything else
queues = self._resolve_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
queues = self.resolve_daemon_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
queues_info = [
q.to_dict()
@@ -3265,6 +3280,11 @@ class Worker(ServiceCommandSection):
first_time=first_time,
)
# print message so users know they can enable cache
if not self.package_api.is_cached_enabled():
print('::: Python virtual environment cache is disabled. '
'To accelerate spin-up time set `agent.venvs_cache.path=~/.clearml/venvs-cache` :::\n')
# check if we have a cached folder
if cached_requirements and not skip_pip_venv_install and self.package_api.get_cached_venv(
requirements=cached_requirements,
@@ -3435,7 +3455,7 @@ class Worker(ServiceCommandSection):
'-v', '{}:{}'.format(ENV_SSH_AUTH_SOCK.get(), ENV_SSH_AUTH_SOCK.get()),
'-e', ssh_auth_sock_env,
]
elif ENV_AGENT_DISABLE_SSH_MOUNT.get():
elif ENV_AGENT_DISABLE_SSH_MOUNT.get() or self._session.config.get("agent.disable_ssh_mount", None):
self._host_ssh_cache = None
else:
self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
@@ -3542,6 +3562,7 @@ class Worker(ServiceCommandSection):
mounted_vcs_cache = temp_config.get("agent.vcs_cache.path")
mounted_venvs_cache = temp_config.get("agent.venvs_cache.path", "")
mount_ssh = temp_config.get("agent.docker_internal_mounts.ssh_folder", None)
mount_ssh_ro = temp_config.get("agent.docker_internal_mounts.ssh_ro_folder", None)
mount_apt_cache = temp_config.get("agent.docker_internal_mounts.apt_cache", None)
mount_pip_cache = temp_config.get("agent.docker_internal_mounts.pip_cache", None)
mount_poetry_cache = temp_config.get("agent.docker_internal_mounts.poetry_cache", None)
@@ -3573,6 +3594,7 @@ class Worker(ServiceCommandSection):
preprocess_bash_script=preprocess_bash_script,
install_opencv_libs=install_opencv_libs,
mount_ssh=mount_ssh,
mount_ssh_ro=mount_ssh_ro,
mount_apt_cache=mount_apt_cache,
mount_pip_cache=mount_pip_cache,
mount_poetry_cache=mount_poetry_cache,
@@ -3584,15 +3606,11 @@ class Worker(ServiceCommandSection):
def _get_child_agents_count_for_worker(self):
"""Get the amount of running child agents. In case of any error return 0"""
parent_worker_label = self._parent_worker_label.format(self.worker_id)
cmd = [
'docker',
'ps',
'--filter',
'label={}'.format(parent_worker_label),
'--format',
# get some fields for debugging
'{"ID":"{{ .ID }}", "Image": "{{ .Image }}", "Names":"{{ .Names }}", "Labels":"{{ .Labels }}"}'
]
default_cmd = 'docker ps --filter label={parent_worker_label} --format {{{{.ID}}}}'
child_agents_cmd = ENV_CHILD_AGENTS_COUNT_CMD.get() or default_cmd
cmd = shlex.split(child_agents_cmd.format(parent_worker_label=parent_worker_label))
try:
output = Argv(*cmd).get_output(
stderr=subprocess.STDOUT
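
For reference, the override can be tried outside the agent (requires a local docker; the label value below is made up, the template is the default from the hunk above):

    import shlex
    import subprocess

    # CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD may replace this template; it receives {parent_worker_label}
    template = "docker ps --filter label={parent_worker_label} --format {{{{.ID}}}}"
    cmd = shlex.split(template.format(parent_worker_label="clearml-agent-parent-worker=example:0"))
    output = subprocess.run(cmd, capture_output=True, text=True).stdout
    print(len(output.splitlines()) if output else 0)  # number of running child agent containers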
@@ -3603,6 +3621,31 @@ class Worker(ServiceCommandSection):
return len(output.splitlines()) if output else 0
def _filter_docker_args(self, docker_args):
# type: (List[str]) -> List[str]
"""
Filter docker args matching specific flags.
Supports list of Regular expressions, e.g self._docker_args_filters = ["^--env$", "^-e$"]
:argument docker_args: List of docker argument strings (flags and values)
"""
# if no filtering, do nothing
if not docker_args or not self._docker_args_filters:
return docker_args
args = docker_args[:]
results = []
while args:
cmd = args.pop(0).strip()
if any(f.match(cmd) for f in self._docker_args_filters):
results.append(cmd)
if "=" not in cmd and args and not args[0].startswith("-"):
try:
results.append(args.pop(0).strip())
except IndexError:
pass
return results
def _get_docker_cmd(
self,
worker_id, parent_worker_id,
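
The filter can be exercised on its own; the sketch below reuses the regex semantics of _filter_docker_args with made-up filter expressions and docker arguments (in the real agent the expressions come from CLEARML_AGENT_DOCKER_ARGS_FILTERS or agent.docker_args_filters):

    import re
    import shlex

    # e.g. CLEARML_AGENT_DOCKER_ARGS_FILTERS="^--env$ ^-e$" (space separated, shlex rules)
    filters = [re.compile(f) for f in shlex.split("^--env$ ^-e$")]

    def filter_docker_args(docker_args, filters):
        # keep only the flags matching one of the filters, plus their following value
        if not docker_args or not filters:
            return docker_args
        args = list(docker_args)
        results = []
        while args:
            cmd = args.pop(0).strip()
            if any(f.match(cmd) for f in filters):
                results.append(cmd)
                if "=" not in cmd and args and not args[0].startswith("-"):
                    results.append(args.pop(0).strip())
        return results

    print(filter_docker_args(["-e", "FOO=1", "--network", "host", "--env", "BAR=2"], filters))
    # ['-e', 'FOO=1', '--env', 'BAR=2']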
@@ -3626,7 +3669,7 @@ class Worker(ServiceCommandSection):
auth_token=None,
worker_tags=None,
name=None,
mount_ssh=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
mount_ssh=None, mount_ssh_ro=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
env_task_id=None,
):
self.debug("Constructing docker command", context="docker")
@@ -3656,6 +3699,7 @@ class Worker(ServiceCommandSection):
if docker_arguments:
docker_arguments = list(docker_arguments) \
if isinstance(docker_arguments, (list, tuple)) else [docker_arguments]
docker_arguments = self._filter_docker_args(docker_arguments)
base_cmd += [a for a in docker_arguments if a]
if extra_docker_arguments:
@@ -3770,7 +3814,7 @@ class Worker(ServiceCommandSection):
clearml_agent_wheel = 'clearml-agent{specify_version}'.format(specify_version=specify_version)
mount_ssh = mount_ssh or '/root/.ssh'
mount_ssh_ro = "{}_ro".format(mount_ssh.rstrip("/"))
mount_ssh_ro = mount_ssh_ro or "{}_ro".format(mount_ssh.rstrip("/"))
mount_apt_cache = mount_apt_cache or '/var/cache/apt/archives'
mount_pip_cache = mount_pip_cache or '/root/.cache/pip'
mount_poetry_cache = mount_poetry_cache or '/root/.cache/pypoetry'

View File

@@ -149,6 +149,8 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEAR
ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
ENV_DEBUG_INFO = EnvironmentConfig('CLEARML_AGENT_DEBUG_INFO')
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig('CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD')
ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig('CLEARML_AGENT_DOCKER_ARGS_FILTERS')
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
"""

View File

@@ -11,6 +11,7 @@ import subprocess
import tempfile
from copy import deepcopy
from pathlib import Path
from pprint import pformat
from threading import Thread
from time import sleep
from typing import Text, List, Callable, Any, Collection, Optional, Union
@@ -26,8 +27,8 @@ from clearml_agent.helper.dicts import merge_dicts
from clearml_agent.helper.process import get_bash_output
from clearml_agent.helper.resource_monitor import ResourceMonitor
from clearml_agent.interface.base import ObjectID
from .definitions import ENV_START_AGENT_SCRIPT_PATH
from clearml_agent.backend_api.session import Request
from clearml_agent.glue.definitions import ENV_START_AGENT_SCRIPT_PATH
class K8sIntegration(Worker):
@@ -75,8 +76,8 @@ class K8sIntegration(Worker):
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
"$LOCAL_PYTHON -m pip install clearml-agent",
"{extra_bash_init_cmd}",
"$LOCAL_PYTHON -m pip install clearml-agent",
"{extra_docker_bash_script}",
"$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
]
@@ -130,6 +131,7 @@ class K8sIntegration(Worker):
"""
super(K8sIntegration, self).__init__()
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
self.k8s_pending_queue_id = None
self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
# Always do system packages, because by we will be running inside a docker
@@ -297,7 +299,7 @@ class K8sIntegration(Worker):
service='tasks',
action='update',
json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
method='get',
method=Request.def_method,
async_enable=False,
)
if not result.ok:
@@ -394,17 +396,17 @@ class K8sIntegration(Worker):
# push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
try:
print('Pushing task {} into temporary pending queue'.format(task_id))
res = self._session.api_client.tasks.stop(task_id, force=True)
_ = self._session.api_client.tasks.stop(task_id, force=True)
res = self._session.api_client.tasks.enqueue(
task_id,
queue=self.k8s_pending_queue_name,
queue=self.k8s_pending_queue_id,
status_reason='k8s pending scheduler',
)
if res.meta.result_code != 200:
raise Exception(res.meta.result_msg)
except Exception as e:
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
task_id, self.k8s_pending_queue_name, e))
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
return
container = get_task_container(self._session, task_id)
@@ -679,6 +681,8 @@ class K8sIntegration(Worker):
with open(yaml_file, 'wt') as f:
yaml.dump(template, f)
self.log.debug("Applying template:\n{}".format(pformat(template, indent=2)))
kubectl_cmd = self.KUBECTL_APPLY_CMD.format(
task_id=task_id,
docker_image=docker_image,
@@ -765,13 +769,13 @@ class K8sIntegration(Worker):
events_service = self.get_service(Events)
# make sure we have a k8s pending queue
# noinspection PyBroadException
try:
self._session.api_client.queues.create(self.k8s_pending_queue_name)
except Exception:
pass
# get queue id
self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")
if not self.k8s_pending_queue_id:
resolved_ids = self._resolve_queue_names([self.k8s_pending_queue_name], create_if_missing=True)
if not resolved_ids:
raise ValueError(
"Failed resolving or creating k8s pending queue {}".format(self.k8s_pending_queue_name)
)
self.k8s_pending_queue_id = resolved_ids[0]
_last_machine_update_ts = 0
while True:

View File

@@ -213,6 +213,13 @@ class PackageManager(object):
return
return self._get_cache_manager().get_last_copied_entry()
def is_cached_enabled(self):
if not self._cache_manager:
cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
if not cache_folder:
return False
return True
@classmethod
def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
# type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
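
The venv-cache warning added to worker.py keys off this check; a simplified stand-in (plain dict and os.environ instead of the real config and EnvironmentConfig objects):

    import os

    def venv_cache_enabled(config):
        cache_folder = os.environ.get("CLEARML_AGENT_VENV_CACHE_PATH") or config.get("agent.venvs_cache.path")
        return bool(cache_folder)

    if not venv_cache_enabled({}):
        print("::: Python virtual environment cache is disabled. "
              "To accelerate spin-up time set `agent.venvs_cache.path=~/.clearml/venvs-cache` :::")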

View File

@@ -53,17 +53,16 @@ class PytorchWheel(object):
python = attr.ib(type=str, converter=lambda x: str(x).replace(".", ""))
torch_version = attr.ib(type=str, converter=fix_version)
url_template = (
"http://download.pytorch.org/whl/"
"{0.cuda_version}/torch-{0.torch_version}-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
)
url_template_prefix = "http://download.pytorch.org/whl/"
url_template = "{0.cuda_version}/torch-{0.torch_version}" \
"-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
def __attrs_post_init__(self):
self.unicode = "u" if self.python.startswith("2") else ""
def make_url(self):
# type: () -> Text
return self.url_template.format(self)
return (self.url_template_prefix + self.url_template).format(self)
class PytorchResolutionError(FatalSpecsResolutionError):
@@ -183,6 +182,19 @@ class PytorchRequirement(SimpleSubstitution):
self._fix_setuptools = None
self.exceptions = []
self._original_req = []
# allow override pytorch lookup pages
if self.config.get("agent.package_manager.torch_page", None):
SimplePytorchRequirement.page_lookup_template = \
self.config.get("agent.package_manager.torch_page", None)
if self.config.get("agent.package_manager.torch_nightly_page", None):
SimplePytorchRequirement.nightly_page_lookup_template = \
self.config.get("agent.package_manager.torch_nightly_page", None)
if self.config.get("agent.package_manager.torch_url_template_prefix", None):
PytorchWheel.url_template_prefix = \
self.config.get("agent.package_manager.torch_url_template_prefix", None)
if self.config.get("agent.package_manager.torch_url_template", None):
PytorchWheel.url_template = \
self.config.get("agent.package_manager.torch_url_template", None)
def _init_python_ver_cuda_ver(self):
if self.cuda is None:
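
To make the prefix/template split explicit, the composed URL can be previewed as below (values are illustrative only; the real fields come from the PytorchWheel attrs class, and agent.package_manager.torch_url_template_prefix replaces just the prefix part):

    url_template_prefix = "http://download.pytorch.org/whl/"
    url_template = ("{cuda_version}/torch-{torch_version}"
                    "-cp{python}-cp{python}m{unicode}-{os_name}.whl")

    print((url_template_prefix + url_template).format(
        cuda_version="cu113", torch_version="1.12.1", python="38",
        unicode="", os_name="linux_x86_64"))
    # http://download.pytorch.org/whl/cu113/torch-1.12.1-cp38-cp38m-linux_x86_64.whl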

View File

@@ -92,9 +92,10 @@ class ResourceMonitor(object):
# None means no filtering, report all gpus
self._active_gpus = None
try:
active_gpus = Session.get_nvidia_visible_env() or ""
if active_gpus:
self._active_gpus = [g.strip() for g in active_gpus.split(',')]
active_gpus = Session.get_nvidia_visible_env()
# None means no filtering, report all gpus
if active_gpus and active_gpus != "all":
self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
except Exception:
pass
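
The corrected parsing in isolation (illustrative helper, not the ResourceMonitor method itself):

    def parse_active_gpus(visible_devices):
        # None/"" or "all" -> no filtering, report every GPU (the --gpus all fix)
        if not visible_devices or visible_devices == "all":
            return None
        return [g.strip() for g in str(visible_devices).split(",")]

    print(parse_active_gpus("all"))   # None
    print(parse_active_gpus("0,1"))   # ['0', '1']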

View File

@@ -288,7 +288,7 @@ class Session(_Session):
def get(self, service, action, version=None, headers=None,
data=None, json=None, async_enable=False, **kwargs):
return self._manual_request(service=service, action=action,
version=version, method="get", headers=headers,
version=version, method=Request.def_method, headers=headers,
data=data, async_enable=async_enable,
json=json or kwargs)
@@ -299,7 +299,7 @@ class Session(_Session):
data=data, async_enable=async_enable,
json=json or kwargs)
def _manual_request(self, service, action, version=None, method="get", headers=None,
def _manual_request(self, service, action, version=None, method=Request.def_method, headers=None,
data=None, json=None, async_enable=False, **kwargs):
res = self.send_request(service=service, action=action,

View File

@@ -1 +1 @@
__version__ = '1.3.0'
__version__ = '1.4.1'

View File

@@ -136,6 +136,12 @@ agent {
},
translate_ssh: true,
# set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
# default is false, automatically mounts ~/.ssh
# Must be set to True if using "clearml-session" with this agent!
# disable_ssh_mount: false
# reload configuration file every daemon execution
reload_config: false,