Compare commits


12 Commits

Author SHA1 Message Date
allegroai
ba2db4e727 Version bump to v1.4.0 2022-09-29 18:21:04 +03:00
allegroai
077148be00 version bump 2022-09-16 17:29:42 +03:00
allegroai
594ee5842e Allow overriding pytorch lookup page: "agent.package_manager.torch_page / torch_nightly_page / torch_url_template_prefix" 2022-09-15 20:16:41 +03:00
allegroai
a69766bd8b Add CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD to allow overriding child agent count command in k8s 2022-09-15 20:16:01 +03:00
allegroai
857a750eb1 Fix GCP load balancer not forwarding GET request body; allow changing the default request method to PUT/POST/GET, see api.http.default_method or CLEARML_API_DEFAULT_REQ_METHOD 2022-09-15 20:15:42 +03:00
allegroai
26aa50f1b5 Fix k8s glue extra_bash_init_cmd location in initial bash script 2022-09-02 23:50:03 +03:00
allegroai
8b4f1eefc2 Add more debug printouts in k8s glue 2022-09-02 23:49:28 +03:00
allegroai
97c2e21dcc Fix resolving the k8s pending queue possibly creating a queue with a UUID name 2022-09-02 23:49:28 +03:00
allegroai
918dd39b87 Add docker ssh_ro_folder (default: "/.ssh"), change docker ssh_folder (default: "~/.ssh") 2022-09-02 23:49:27 +03:00
allegroai
7776e906c4 Fix second .ssh temp mount failing if the container changes the files inside 2022-09-02 23:49:27 +03:00
allegroai
1bf865ec08 Fix name not escaped as regex (all services "get_all" use regex for name) 2022-09-02 23:49:27 +03:00
Luca Cerone
3f1ce847dc Fixed documentation (#117)
* Fixed documentation

* Update README.md
2022-09-01 17:18:48 +03:00
11 changed files with 75 additions and 37 deletions

View File

@@ -197,7 +197,7 @@ with `--cpu-only`).
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for
the `clearml-agent` <br>
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES="none"`, no gpu will be allocated for
the `clearml-agent`
Example: spin two agents, one per gpu on the same machine:

View File

@@ -28,6 +28,9 @@
pool_maxsize: 512
pool_connections: 512
# Override the default http method, use "put" if working behind GCP load balancer (default: "get")
# default_method: "get"
}
auth {
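The new `default_method` key (and the matching CLEARML_API_DEFAULT_REQ_METHOD environment variable from the commit above) controls the HTTP verb used by API calls that would otherwise default to GET. A minimal sketch of the environment-variable route; the surrounding script is hypothetical:

```python
import os

# Hedged sketch: make ClearML API calls that normally default to GET use PUT
# instead, e.g. when a GCP load balancer drops GET request bodies.
# Equivalent to setting default_method: "put" under api.http in clearml.conf.
os.environ["CLEARML_API_DEFAULT_REQ_METHOD"] = "put"
```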

View File

@@ -8,13 +8,14 @@ from .datamodel import DataModel
from .defs import ENV_API_DEFAULT_REQ_METHOD
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST", "PUT"):
raise ValueError(
"CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get' or 'post' (any case is allowed)."
)
class Request(ApiModel):
def_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
def __init__(self, **kwargs):
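A standalone sketch of the validation the diff adds, assuming the same three allowed verbs; the real code reads the value through ENV_API_DEFAULT_REQ_METHOD rather than os.environ directly:

```python
import os

def resolve_default_method(default="get"):
    # Mirror of the check above: only GET, POST and PUT are accepted, any case.
    value = os.environ.get("CLEARML_API_DEFAULT_REQ_METHOD", default)
    if value.upper() not in ("GET", "POST", "PUT"):
        raise ValueError(
            "CLEARML_API_DEFAULT_REQ_METHOD environment variable must be "
            "'get', 'post' or 'put' (any case is allowed)."
        )
    return value

print(resolve_default_method())  # "get" unless the environment overrides it
```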

View File

@@ -14,8 +14,9 @@ from requests.auth import HTTPBasicAuth
from six.moves.urllib.parse import urlparse, urlunparse
from .callresult import CallResult
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
from .defs import (
ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD, )
from .request import Request, BatchRequest
from .token_manager import TokenManager
from ..config import load
@@ -110,6 +111,19 @@ class Session(TokenManager):
self._logger = logger
self.__auth_token = None
if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
# Make sure we update the config object, so we pass it into the new containers when we map them
self.config["api.http.default_method"] = ENV_API_DEFAULT_REQ_METHOD.get()
# notice the default setting of Request.def_method are already set by the OS environment
elif self.config.get("api.http.default_method", None):
def_method = str(self.config.get("api.http.default_method", None)).strip()
if def_method.upper() not in ("GET", "POST", "PUT"):
raise ValueError(
"api.http.default_method variable must be 'get' or 'post' (any case is allowed)."
)
Request.def_method = def_method
Request._method = Request.def_method
if ENV_AUTH_TOKEN.get(
value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
):
@@ -251,7 +265,7 @@ class Session(TokenManager):
service,
action,
version=None,
method="get",
method=Request.def_method,
headers=None,
auth=None,
data=None,
@@ -328,7 +342,7 @@ class Session(TokenManager):
service,
action,
version=None,
method="get",
method=Request.def_method,
headers=None,
data=None,
json=None,
@@ -371,7 +385,7 @@ class Session(TokenManager):
headers=None,
data=None,
json=None,
method="get",
method=Request.def_method,
):
"""
Send a raw batch API request. Batch requests always use application/json-lines content type.
@@ -615,7 +629,7 @@ class Session(TokenManager):
try:
data = {"expiration_sec": exp} if exp else {}
res = self._send_request(
method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
method=Request.def_method,
service="auth",
action="login",
auth=auth,
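The constructor change above gives the environment variable priority over the configuration file and pushes the result into Request.def_method so later calls pick it up. A minimal sketch of that precedence, using a plain dict in place of the agent's config object and a stand-in for Request:

```python
class _Req:
    # stand-in for clearml_agent.backend_api.session.Request
    def_method = "get"

def apply_default_method(config, env_value, request_cls=_Req):
    if env_value:
        # the class default already came from the same environment variable;
        # only keep the config in sync so spawned containers inherit the override
        config["api.http.default_method"] = env_value
    elif config.get("api.http.default_method"):
        def_method = str(config["api.http.default_method"]).strip()
        if def_method.upper() not in ("GET", "POST", "PUT"):
            raise ValueError("api.http.default_method must be 'get', 'post' or 'put'")
        request_cls.def_method = def_method

cfg = {"api.http.default_method": "put"}
apply_default_method(cfg, env_value=None)
print(_Req.def_method)  # -> "put"
```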

View File

@@ -1,6 +1,8 @@
import json
import re
import shlex
from clearml_agent.backend_api.session import Request
from clearml_agent.helper.package.requirements import (
RequirementsManager, MarkerRequirement,
compare_version_rules, )
@@ -26,7 +28,7 @@ def resolve_default_container(session, task_id, container_config):
'script.repository', 'script.branch',
'project', 'container'],
'search_hidden': True},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
@@ -53,7 +55,7 @@ def resolve_default_container(session, task_id, container_config):
'id': [task_info.get('project')],
'only_fields': ['name'],
},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
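With the hard-coded method='get' gone, helpers like the one above honor the configured default. A hedged usage sketch, assuming an already-initialized clearml_agent session and an existing task id; the helper name is invented for the example:

```python
from clearml_agent.backend_api.session import Request

def fetch_task_container(session, task_id):
    # session: an initialized clearml_agent Session; task_id: an existing task UUID.
    result = session.send_request(
        service="tasks",
        action="get_all",
        version="2.14",
        json={"id": [task_id], "only_fields": ["container"], "search_hidden": True},
        method=Request.def_method,  # "get" by default, "put"/"post" when overridden
        async_enable=False,
    )
    return result.json().get("data", {}).get("tasks", []) if result.ok else []
```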

View File

@@ -38,7 +38,7 @@ from clearml_agent.backend_api.services import auth as auth_api
from clearml_agent.backend_api.services import queues as queues_api
from clearml_agent.backend_api.services import tasks as tasks_api
from clearml_agent.backend_api.services import workers as workers_api
from clearml_agent.backend_api.session import CallResult
from clearml_agent.backend_api.session import CallResult, Request
from clearml_agent.backend_api.session.defs import (
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
@@ -71,6 +71,7 @@ from clearml_agent.definitions import (
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL,
WORKING_STANDALONE_DIR,
ENV_DEBUG_INFO,
ENV_CHILD_AGENTS_COUNT_CMD,
)
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
from clearml_agent.errors import (
@@ -272,7 +273,7 @@ def get_task(session, task_id, **kwargs):
action='get_all',
version='2.14',
json={"id": [task_id], "search_hidden": True, **kwargs},
method='get',
method=Request.def_method,
async_enable=False,
)
result = CallResult.from_result(
@@ -304,7 +305,7 @@ def get_next_task(session, queue, get_task_info=False):
action='get_next_task',
version='2.14',
json=request,
method='get',
method=Request.def_method,
async_enable=False,
)
if not result.ok:
@@ -325,7 +326,7 @@ def get_task_container(session, task_id):
action='get_all',
version='2.14',
json={'id': [task_id], 'only_fields': ['container'], 'search_hidden': True},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
@@ -366,7 +367,7 @@ def set_task_container(session, task_id, docker_image=None, docker_arguments=Non
action='edit',
version='2.13',
json={'task': task_id, 'container': container, 'force': True},
method='get',
method=Request.def_method,
async_enable=False,
)
return result.ok
@@ -1379,6 +1380,9 @@ class Worker(ServiceCommandSection):
self._session.print_configuration()
def resolve_daemon_queue_names(self, queues, create_if_missing=False):
return self._resolve_queue_names(queues=queues, create_if_missing=create_if_missing)
def daemon(self, queues, log_level, foreground=False, docker=False, detached=False, order_fairness=False, **kwargs):
self._apply_extra_configuration()
@@ -1421,7 +1425,7 @@ class Worker(ServiceCommandSection):
# if we do not need to create queues, make sure they are valid
# match previous behaviour when we validated queue names before everything else
queues = self._resolve_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
queues = self.resolve_daemon_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
queues_info = [
q.to_dict()
@@ -3586,15 +3590,13 @@ class Worker(ServiceCommandSection):
def _get_child_agents_count_for_worker(self):
"""Get the amount of running child agents. In case of any error return 0"""
parent_worker_label = self._parent_worker_label.format(self.worker_id)
cmd = [
'docker',
'ps',
'--filter',
'label={}'.format(parent_worker_label),
'--format',
# get some fields for debugging
'{"ID":"{{ .ID }}", "Image": "{{ .Image }}", "Names":"{{ .Names }}", "Labels":"{{ .Labels }}"}'
]
default_cmd = 'docker ps --filter label={parent_worker_label} --format ' \
'{{"ID":"{{{{ .ID }}}}", "Image": "{{{{ .Image }}}}", ' \
'"Names":"{{{{ .Names }}}}", "Labels":"{{{{ .Labels }}}}"}}'
child_agents_cmd = ENV_CHILD_AGENTS_COUNT_CMD.get() or default_cmd
cmd = shlex.split(child_agents_cmd.format(parent_worker_label=parent_worker_label))
try:
output = Argv(*cmd).get_output(
stderr=subprocess.STDOUT
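The listing command is now a template that CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD can replace wholesale; the glue formats it with the parent worker label and splits it with shlex. A small sketch of that expansion, with a made-up label value:

```python
import shlex

default_cmd = 'docker ps --filter label={parent_worker_label} --format ' \
              '{{"ID":"{{{{ .ID }}}}", "Image": "{{{{ .Image }}}}", ' \
              '"Names":"{{{{ .Names }}}}", "Labels":"{{{{ .Labels }}}}"}}'

# The agent passes its real parent-worker label here; this value is illustrative.
cmd = shlex.split(default_cmd.format(parent_worker_label="clearml-parent-worker=agent-0"))
print(cmd[:4])  # ['docker', 'ps', '--filter', 'label=clearml-parent-worker=agent-0']
```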

View File

@@ -149,6 +149,7 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEAR
ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
ENV_DEBUG_INFO = EnvironmentConfig('CLEARML_AGENT_DEBUG_INFO')
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig('CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD')
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
"""

View File

@@ -11,6 +11,7 @@ import subprocess
import tempfile
from copy import deepcopy
from pathlib import Path
from pprint import pformat
from threading import Thread
from time import sleep
from typing import Text, List, Callable, Any, Collection, Optional, Union
@@ -26,8 +27,8 @@ from clearml_agent.helper.dicts import merge_dicts
from clearml_agent.helper.process import get_bash_output
from clearml_agent.helper.resource_monitor import ResourceMonitor
from clearml_agent.interface.base import ObjectID
from .definitions import ENV_START_AGENT_SCRIPT_PATH
from clearml_agent.backend_api.session import Request
from clearml_agent.glue.definitions import ENV_START_AGENT_SCRIPT_PATH
class K8sIntegration(Worker):
@@ -75,8 +76,8 @@ class K8sIntegration(Worker):
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
"$LOCAL_PYTHON -m pip install clearml-agent",
"{extra_bash_init_cmd}",
"$LOCAL_PYTHON -m pip install clearml-agent",
"{extra_docker_bash_script}",
"$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
]
@@ -298,7 +299,7 @@ class K8sIntegration(Worker):
service='tasks',
action='update',
json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
method='get',
method=Request.def_method,
async_enable=False,
)
if not result.ok:
@@ -680,6 +681,8 @@ class K8sIntegration(Worker):
with open(yaml_file, 'wt') as f:
yaml.dump(template, f)
self.log.debug("Applying template:\n{}".format(pformat(template, indent=2)))
kubectl_cmd = self.KUBECTL_APPLY_CMD.format(
task_id=task_id,
docker_image=docker_image,

View File

@@ -53,17 +53,16 @@ class PytorchWheel(object):
python = attr.ib(type=str, converter=lambda x: str(x).replace(".", ""))
torch_version = attr.ib(type=str, converter=fix_version)
url_template = (
"http://download.pytorch.org/whl/"
"{0.cuda_version}/torch-{0.torch_version}-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
)
url_template_prefix = "http://download.pytorch.org/whl/"
url_template = "{0.cuda_version}/torch-{0.torch_version}" \
"-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
def __attrs_post_init__(self):
self.unicode = "u" if self.python.startswith("2") else ""
def make_url(self):
# type: () -> Text
return self.url_template.format(self)
return (self.url_template_prefix + self.url_template).format(self)
class PytorchResolutionError(FatalSpecsResolutionError):
@@ -183,6 +182,19 @@ class PytorchRequirement(SimpleSubstitution):
self._fix_setuptools = None
self.exceptions = []
self._original_req = []
# allow override pytorch lookup pages
if self.config.get("agent.package_manager.torch_page", None):
SimplePytorchRequirement.page_lookup_template = \
self.config.get("agent.package_manager.torch_page", None)
if self.config.get("agent.package_manager.torch_nightly_page", None):
SimplePytorchRequirement.nightly_page_lookup_template = \
self.config.get("agent.package_manager.torch_nightly_page", None)
if self.config.get("agent.package_manager.torch_url_template_prefix", None):
PytorchWheel.url_template_prefix = \
self.config.get("agent.package_manager.torch_url_template_prefix", None)
if self.config.get("agent.package_manager.torch_url_template", None):
PytorchWheel.url_template = \
self.config.get("agent.package_manager.torch_url_template", None)
def _init_python_ver_cuda_ver(self):
if self.cuda is None:
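The wheel URL template is now split into a prefix and a path so that agent.package_manager.torch_url_template_prefix (or torch_url_template) can point at a mirror. A minimal stand-in class, not the library's actual one, showing how make_url() reassembles the pieces; all version values are examples:

```python
class WheelSketch:
    url_template_prefix = "http://download.pytorch.org/whl/"
    url_template = ("{0.cuda_version}/torch-{0.torch_version}"
                    "-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl")

    def __init__(self, cuda_version, torch_version, python, os_name):
        self.cuda_version = cuda_version
        self.torch_version = torch_version
        self.python = python
        self.os_name = os_name
        self.unicode = "u" if python.startswith("2") else ""

    def make_url(self):
        # a config override would simply replace url_template_prefix on the class
        return (self.url_template_prefix + self.url_template).format(self)

print(WheelSketch("cu113", "1.12.1", "38", "linux_x86_64").make_url())
# http://download.pytorch.org/whl/cu113/torch-1.12.1-cp38-cp38m-linux_x86_64.whl
```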

View File

@@ -288,7 +288,7 @@ class Session(_Session):
def get(self, service, action, version=None, headers=None,
data=None, json=None, async_enable=False, **kwargs):
return self._manual_request(service=service, action=action,
version=version, method="get", headers=headers,
version=version, method=Request.def_method, headers=headers,
data=data, async_enable=async_enable,
json=json or kwargs)
@@ -299,7 +299,7 @@ class Session(_Session):
data=data, async_enable=async_enable,
json=json or kwargs)
def _manual_request(self, service, action, version=None, method="get", headers=None,
def _manual_request(self, service, action, version=None, method=Request.def_method, headers=None,
data=None, json=None, async_enable=False, **kwargs):
res = self.send_request(service=service, action=action,
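After this change the convenience wrappers follow Request.def_method as well. A hedged usage sketch, assuming an initialized interface Session; the wrapper function name is invented for the example:

```python
def list_default_queues(session):
    # Keyword arguments beyond the named parameters are folded into the request
    # body (json=json or kwargs), so this issues queues.get_all using the
    # configured default HTTP method instead of a hard-coded GET.
    return session.get(service="queues", action="get_all", name="default")
```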

View File

@@ -1 +1 @@
__version__ = '1.3.0'
__version__ = '1.4.0'