Add support for setting mode on files applied by the agent

Fix docker container backwards compatibility for API <2.13
Fix default docker match rules resolver (used incorrect field "container" instead of "image"). Remove "container" (image) match rule option from default docker image resolver
2025-06-26 18:16:15 +00:00 · 2023-07-04 14:37:58 +03:00 · 2023-07-04 14:37:18 +03:00 · 2023-07-04 14:35:54 +03:00 · 2023-07-04 14:34:43 +03:00 · 2023-07-03 11:08:59 +03:00
15 changed files with 270 additions and 91 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,6 @@ build/
 dist/
 *.egg-info

+# VSCode
+.vscode
+
--- a/clearml_agent/backend_api/config/default/agent.conf
+++ b/clearml_agent/backend_api/config/default/agent.conf
@@ -80,6 +80,16 @@
        # additional artifact repositories to use when installing python packages
        # extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]

+        # control the pytorch wheel resolving algorithm, options are: "pip", "direct"
+        # "pip" (default): would automatically detect the cuda version, and supply pip with the correct
+        # extra-index-url, based on pytorch.org tables
+        # "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
+        # and matching the automatically detected cuda version with the required pytorch wheel.
+        # if the exact cuda version is not found for the required pytorch wheel, it will try
+        # a lower cuda version until a match is found
+        #
+        # pytorch_resolve: "pip"
+
        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", "defaults", ]

@@ -88,19 +98,23 @@
        # force_repo_requirements_txt: false

        # set the priority packages to be installed before the rest of the required packages
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # priority_packages: ["cython", "numpy", "setuptools", ]

        # set the optional priority packages to be installed before the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        priority_optional_packages: ["pygobject", ]

        # set the post packages to be installed after all the rest of the required packages
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # post_packages: ["horovod", ]

        # set the optional post packages to be installed after all the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # post_optional_packages: []

        # set to True to support torch nightly build installation,
@@ -259,10 +273,15 @@

    # Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
    # Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
+    # Custom variables may be specified using the docker_container_name_format_fields option.
    # Note: resulting name must start with an alphanumeric character and
    #       continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
    # docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"

+    # Specify custom variables for the docker_container_name_format option using a mapping of variable name
+    # to a (nested) task field (using "." as a task field separator, digits specify array index)
+    # docker_container_name_format_fields: { foo: "bar.moo" }
+
    # Apply top-level environment section from configuration into os.environ
    apply_environment: true
    # Top-level environment section is in the form of:
@@ -369,13 +388,6 @@
    #                 }
    #             },
    #             {
-    #                 "image": "better_container:tag",
-    #                 "arguments": "",
-    #                 "match": {
-    #                     "container": "replace_me_please"
-    #                 }
-    #             },
-    #             {
    #                 "image": "another_container:tag",
    #                 "arguments": "",
    #                 "match": {
--- a/clearml_agent/backend_api/session/defs.py
+++ b/clearml_agent/backend_api/session/defs.py
@@ -20,6 +20,7 @@ ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool,
 ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
    'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
 )
+ENV_FORCE_MAX_API_VERSION = EnvEntry("CLEARML_AGENT_FORCE_MAX_API_VERSION", type=str)

 """
 Experimental option to set the request method for all API requests and auth login.
--- a/clearml_agent/backend_api/session/session.py
+++ b/clearml_agent/backend_api/session/session.py
@@ -16,11 +16,11 @@ from requests.auth import HTTPBasicAuth
 from six.moves.urllib.parse import urlparse, urlunparse

 from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
-
 from .callresult import CallResult
 from .defs import (
    ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
-    ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD, )
+    ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD,
+    ENV_FORCE_MAX_API_VERSION)
 from .request import Request, BatchRequest
 from .token_manager import TokenManager
 from ..config import load
@@ -28,7 +28,6 @@ from ..utils import get_http_session_with_retry, urllib_log_warning_setup
 from ...backend_config.environment import backward_compatibility_support
 from ...version import __version__

-
 sys_random = SystemRandom()


@@ -64,6 +63,7 @@ class Session(TokenManager):
    default_files = "https://demofiles.demo.clear.ml"
    default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
    default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
+    force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()

    # TODO: add requests.codes.gateway_timeout once we support async commits
    _retry_codes = [
@@ -199,6 +199,10 @@ class Session(TokenManager):
        # notice: this is across the board warning omission
        urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)

+        if self.force_max_api_version and self.check_min_api_version(self.force_max_api_version):
+            print("Using forced API version {}".format(self.force_max_api_version))
+            Session.max_api_version = Session.api_version = str(self.force_max_api_version)
+
    def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
        # type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
        http_retries_config = http_retries_config or self.config.get(
--- a/clearml_agent/backend_config/utils.py
+++ b/clearml_agent/backend_config/utils.py
@@ -52,6 +52,7 @@ def apply_files(config):
        target_fmt = data.get("target_format", "string")
        overwrite = bool(data.get("overwrite", True))
        contents = data.get("contents")
+        mode = data.get("mode")

        target = Path(expanduser(expandvars(path)))

@@ -110,3 +111,14 @@ def apply_files(config):
        except Exception as ex:
            print("Skipped [{}]: failed saving file {} ({})".format(key, target, ex))
            continue
+
+        try:
+            if mode:
+                if isinstance(mode, int):
+                    mode = int(str(mode), 8)
+                else:
+                    mode = int(mode, 8)
+                target.chmod(mode)
+        except Exception as ex:
+            print("Skipped [{}]: failed setting mode {} for {} ({})".format(key, mode, target, ex))
+            continue
--- a/clearml_agent/commands/config.py
+++ b/clearml_agent/commands/config.py
@@ -44,7 +44,7 @@ def main():

    if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
        print('Configuration file already exists: {}'.format(str(conf_file)))
-        print('Leaving setup, feel free to edit the configuration file.')
+        print('Leaving setup. If you\'ve previously initialized the ClearML SDK on this machine, manually add an \'agent\' section to this file.')
        return

    print(description, end='')
--- a/clearml_agent/commands/resolver.py
+++ b/clearml_agent/commands/resolver.py
@@ -109,15 +109,15 @@ def resolve_default_container(session, task_id, container_config):
                    match.get('script.binary', None), entry))
                continue

-        if match.get('container', None):
-            # noinspection PyBroadException
-            try:
-                if not re.search(match.get('container', None), requested_container.get('image', '')):
-                    continue
-            except Exception:
-                print('Failed parsing regular expression \"{}\" in rule: {}'.format(
-                    match.get('container', None), entry))
-                continue
+        # if match.get('image', None):
+        #     # noinspection PyBroadException
+        #     try:
+        #         if not re.search(match.get('image', None), requested_container.get('image', '')):
+        #             continue
+        #     except Exception:
+        #         print('Failed parsing regular expression \"{}\" in rule: {}'.format(
+        #             match.get('image', None), entry))
+        #         continue

        matched = True
        for req_section in ['script.requirements.pip', 'script.requirements.conda']:
@@ -156,8 +156,8 @@ def resolve_default_container(session, task_id, container_config):
            break

        if matched:
-            if not container_config.get('container'):
-                container_config['container'] = entry.get('image', None)
+            if not container_config.get('image'):
+                container_config['image'] = entry.get('image', None)
            if not container_config.get('arguments'):
                container_config['arguments'] = entry.get('arguments', None)
                container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -73,6 +73,8 @@ from clearml_agent.definitions import (
    ENV_FORCE_SYSTEM_SITE_PACKAGES,
    ENV_SERVICES_DOCKER_RESTART,
    ENV_CONFIG_BC_IN_STANDALONE,
+    ENV_FORCE_DOCKER_AGENT_REPO,
+    ENV_EXTRA_DOCKER_LABELS,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import (
@@ -318,6 +320,37 @@ def get_next_task(session, queue, get_task_info=False):
    return data


+def get_task_fields(session, task_id, fields: list, log=None) -> dict:
+    """
+    Returns dict mapping each requested task field path ("." separated, digit parts are array indexes)
+    to the value found for it in the task, or an empty dict if the values could not be obtained
+    """
+    result = session.send_request(
+        service='tasks',
+        action='get_all',
+        json={'id': [task_id], 'only_fields': list(fields), 'search_hidden': True},
+        method=Request.def_method,
+        async_enable=False,
+    )
+    # noinspection PyBroadException
+    try:
+        results = {}
+        result = result.json()['data']['tasks'][0]
+        for field in fields:
+            cur = result
+            for part in field.split("."):
+                if part.isdigit():
+                    cur = cur[int(part)]
+                else:
+                    cur = cur.get(part, {})
+            results[field] = cur
+        return results
+    except Exception as ex:
+        if log:
+            log.error("Failed obtaining values for task fields {}: {}".format(fields, ex))
+    return {}
+
+
 def get_task_container(session, task_id):
    """
    Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''}
@@ -339,16 +372,19 @@ def get_task_container(session, task_id):
            container = {}
    else:
        response = get_task(session, task_id, only_fields=["execution.docker_cmd"])
-        task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip())
-        try:
-            container = dict(
-                container=task_docker_cmd_parts[0],
-                arguments=task_docker_cmd_parts[1:] if len(task_docker_cmd_parts[0]) > 1 else ''
-            )
-        except (ValueError, TypeError):
-            container = {}
+        container = {}
+        if response.execution:
+            task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip())
+            if task_docker_cmd_parts:
+                try:
+                    container = dict(
+                        image=task_docker_cmd_parts[0],
+                        arguments=task_docker_cmd_parts[1:] if len(task_docker_cmd_parts[0]) > 1 else ''
+                    )
+                except (ValueError, TypeError):
+                    pass

-    if (not container or not container.get('container')) and session.check_min_api_version("2.13"):
+    if (not container or not container.get('image')) and session.check_min_api_version("2.13"):
        container = resolve_default_container(session=session, task_id=task_id, container_config=container)

    return container
@@ -889,11 +925,21 @@ class Worker(ServiceCommandSection):

            name_format = self._session.config.get('agent.docker_container_name_format', None)
            if name_format:
+                custom_fields = {}
+                name_format_fields = self._session.config.get('agent.docker_container_name_format_fields', None)
+                if name_format_fields:
+                    field_values = get_task_fields(task_session, task_id, name_format_fields.values(), log=self.log)
+                    custom_fields = {
+                        k: field_values.get(v)
+                        for k, v in name_format_fields.items()
+                    }
+
                try:
                    name = name_format.format(
                        task_id=re.sub(r'[^a-zA-Z0-9._-]', '-', task_id),
                        worker_id=re.sub(r'[^a-zA-Z0-9._-]', '-', worker_id),
-                        rand_string="".join(sys_random.choice(string.ascii_lowercase) for _ in range(32))
+                        rand_string="".join(sys_random.choice(string.ascii_lowercase) for _ in range(32)),
+                        **custom_fields,
                    )
                except Exception as ex:
                    print("Warning: failed generating docker container name: {}".format(ex))
@@ -3848,6 +3894,10 @@ class Worker(ServiceCommandSection):
        base_cmd += ['-l', self._worker_label.format(worker_id)]
        base_cmd += ['-l', self._parent_worker_label.format(parent_worker_id)]

+        extra_labels = ENV_EXTRA_DOCKER_LABELS.get()
+        for label in (extra_labels or []):
+            base_cmd += ['-l', label]
+
        self.debug("Command: {}".format(base_cmd), context="docker")

        # check if running inside a kubernetes
@@ -3930,6 +3980,7 @@ class Worker(ServiceCommandSection):
        # if we are running a RC version, install the same version in the docker
        # because the default latest, will be a release version (not RC)
        specify_version = ''
+        # noinspection PyBroadException
        try:
            from clearml_agent.version import __version__
            _version_parts = __version__.split('.')
@@ -3938,13 +3989,15 @@ class Worker(ServiceCommandSection):
        except:
            pass

+        force_agent_repo = ENV_FORCE_DOCKER_AGENT_REPO.get()
+
        if os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'):
            local_wheel = os.path.expanduser(os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'))
            docker_wheel = '/tmp/{}'.format(basename(local_wheel))
            base_cmd += ['-v', local_wheel + ':' + docker_wheel]
            clearml_agent_wheel = '\"{}\"'.format(docker_wheel)
-        elif os.environ.get('FORCE_CLEARML_AGENT_REPO'):
-            clearml_agent_wheel = os.environ.get('FORCE_CLEARML_AGENT_REPO')
+        elif force_agent_repo:
+            clearml_agent_wheel = force_agent_repo
        else:
            # clearml-agent{specify_version}
            clearml_agent_wheel = 'clearml-agent{specify_version}'.format(specify_version=specify_version)
--- a/clearml_agent/definitions.py
+++ b/clearml_agent/definitions.py
@@ -173,6 +173,7 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig(
 )
 ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH")
 ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list)
+ENV_EXTRA_DOCKER_LABELS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_LABELS", type=list)
 ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
 ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
 ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
@@ -180,6 +181,8 @@ ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV
 ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
 """ Maintain backwards compatible configuration when launching in standalone mode """

+ENV_FORCE_DOCKER_AGENT_REPO = EnvironmentConfig("FORCE_CLEARML_AGENT_REPO", "CLEARML_AGENT_DOCKER_AGENT_REPO")
+
 ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
 """
    Specify a restart value for a services agent task containers.
--- a/clearml_agent/glue/k8s.py
+++ b/clearml_agent/glue/k8s.py
@@ -9,7 +9,7 @@ import os
 import re
 import subprocess
 import tempfile
-from collections import defaultdict
+from collections import defaultdict, namedtuple
 from copy import deepcopy
 from pathlib import Path
 from pprint import pformat
@@ -42,6 +42,7 @@ class K8sIntegration(Worker):

    K8S_DEFAULT_NAMESPACE = "clearml"
    AGENT_LABEL = "CLEARML=agent"
+    QUEUE_LABEL = "clearml-agent-queue"

    KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"

@@ -408,34 +409,50 @@ class K8sIntegration(Worker):

        return self._agent_label

-    def _get_used_pods(self):
-        # type: () -> Tuple[int, Set[str]]
-        # noinspection PyBroadException
+    RunningPod = namedtuple("RunningPod", "name queue namespace")
+
+    def _get_running_pods(self):
        try:
            kubectl_cmd = self.get_kubectl_command(
                "get pods",
-                output="jsonpath=\"{range .items[*]}{.metadata.name}{' '}{.metadata.namespace}{'\\n'}{end}\""
+                output="jsonpath=\"{{range .items[*]}}{{.metadata.name}}{{' '}}{{.metadata.namespace}}{{' '}}"
+                       "{{.metadata.labels.{}}}{{'\\n'}}{{end}}\"".format(self.QUEUE_LABEL)
            )
            self.log.debug("Getting used pods: {}".format(kubectl_cmd))
            output = stringify_bash_output(get_bash_output(kubectl_cmd, raise_error=True))

            if not output:
                # No such pod exist so we can use the pod_number we found
-                return 0, set([])
+                return []

            try:
-                items = output.splitlines()
-                current_pod_count = len(items)
-                namespaces = {item.rpartition(" ")[-1] for item in items}
-                self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
-            except (KeyError, ValueError, TypeError, AttributeError) as ex:
-                print("Failed parsing used pods command response for cleanup: {}".format(ex))
-                return -1, set([])
+                return [
+                    self.RunningPod(
+                        name=parts[0],
+                        namespace=parts[1],
+                        queue=parts[2]
+                    )
+                    for parts in (line.split(" ") for line in output.splitlines())
+                ]
+            except Exception as ex:
+                raise Exception("Failed parsing used pods command response for cleanup: {}".format(ex))
+        except Exception as ex:
+            raise Exception('Failed obtaining used pods information: {}'.format(ex))

+    def _get_used_pods(self):
+        # type: () -> Tuple[int, Set[str]]
+        # noinspection PyBroadException
+        try:
+            items = self._get_running_pods()
+            if not items:
+                return 0, set([])
+            current_pod_count = len(items)
+            namespaces = {item.namespace for item in items}
+            self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
            return current_pod_count, namespaces
        except Exception as ex:
-            print('Failed obtaining used pods information: {}'.format(ex))
-            return -2, set([])
+            self.log.debug("Failed getting used pods: {}".format(ex))
+            return -1, set([])  # negative count: running pods could not be listed

    def _is_same_tenant(self, task_session):
        if not task_session or task_session is self._session:
@@ -657,8 +674,8 @@ class K8sIntegration(Worker):
    def _get_pod_labels(self, queue, queue_name):
        return [
            self._get_agent_label(),
-            "clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)),
-            "clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name))
+            "{}={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue)),
+            "{}-name={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue_name))
        ]

    def _get_docker_args(self, docker_args, flags, target=None, convert=None):
--- a/clearml_agent/helper/package/pytorch.py
+++ b/clearml_agent/helper/package/pytorch.py
@@ -310,6 +310,12 @@ class PytorchRequirement(SimpleSubstitution):
            # yes this is for linux python 2.7 support, this is the only python 2.7 we support...
            if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
                continue
+
+            # check if this an actual match
+            if not req.compare_version(v) or \
+                    (last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
+                continue
+
            # update the closest matched version (from above)
            if not closest_v:
                closest_v = v
@@ -318,10 +324,6 @@ class PytorchRequirement(SimpleSubstitution):
                    SimpleVersion.compare_versions(
                        version_a=v, op='>=', version_b=req.specs[0][1], num_parts=3):
                closest_v = v
-            # check if this an actual match
-            if not req.compare_version(v) or \
-                    (last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
-                continue

            url = '/'.join(torch_url.split('/')[:-1] + l.split('/'))
            last_v = v
@@ -475,6 +477,23 @@ class PytorchRequirement(SimpleSubstitution):
        return self.match_version(req, base).replace(" ", "\n")

    def replace(self, req):
+        # we first try to resolve things ourselves because pytorch pip is not always picking the correct
+        # versions from their pip repository
+
+        resolve_algorithm = str(self.config.get("agent.package_manager.pytorch_resolve", "pip")).lower()
+        if resolve_algorithm == "direct":
+            # noinspection PyBroadException
+            try:
+                new_req = self._replace(req)
+                if new_req:
+                    self._original_req.append((req, new_req))
+                return new_req
+            except Exception:
+                pass
+        elif resolve_algorithm not in ("direct", "pip"):
+            print("Warning: `agent.package_manager.pytorch_resolve={}` "
+                  "unrecognized, default to `pip`".format(resolve_algorithm))
+
        # check if package is already installed with system packages
        self.validate_python_version()

@@ -566,6 +585,19 @@ class PytorchRequirement(SimpleSubstitution):
        :param list_of_requirements: {'pip': ['a==1.0', ]}
        :return: {'pip': ['a==1.0', ]}
        """
+        def build_specific_version_req(a_line, a_name, a_new_req):
+            try:
+                r = Requirement.parse(a_line)
+                wheel_parts = r.uri.split("/")[-1].split('-')
+                version = str(wheel_parts[1].split('%')[0].split('+')[0])
+                new_r = Requirement.parse("{} == {} # {}".format(a_name, version, str(a_new_req)))
+                if new_r.line:
+                    # great it worked!
+                    return new_r.line
+            except:  # noqa
+                pass
+            return None
+
        if not self._original_req:
            return list_of_requirements
        try:
@@ -589,9 +621,18 @@ class PytorchRequirement(SimpleSubstitution):
                                    if req.local_file:
                                        lines[i] = '{}'.format(str(new_req))
                                    else:
-                                        lines[i] = '{} # {}'.format(str(req), str(new_req))
+                                        # try to rebuild requirements with specific version:
+                                        new_line = build_specific_version_req(line, req.req.name, new_req)
+                                        if new_line:
+                                            lines[i] = new_line
+                                        else:
+                                            lines[i] = '{} # {}'.format(str(req), str(new_req))
                            else:
-                                lines[i] = '{} # {}'.format(line, str(new_req))
+                                new_line = build_specific_version_req(line, req.req.name, new_req)
+                                if new_line:
+                                    lines[i] = new_line
+                                else:
+                                    lines[i] = '{} # {}'.format(line, str(new_req))
                            break
        except:
            pass
--- a/clearml_agent/helper/package/requirements.py
+++ b/clearml_agent/helper/package/requirements.py
@@ -240,6 +240,23 @@ class SimpleVersion:
        if not version_b:
            return True

+        # remove trailing "*" in both
+        if "*" in version_a:
+            ignore_sub_versions = True
+            while version_a.endswith(".*"):
+                version_a = version_a[:-2]
+            if version_a == "*":
+                version_a = ""
+            num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )
+
+        if "*" in version_b:
+            ignore_sub_versions = True
+            while version_b.endswith(".*"):
+                version_b = version_b[:-2]
+            if version_b == "*":
+                version_b = ""
+            num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )
+
        if not num_parts:
            num_parts = max(len(version_a.split('.')), len(version_b.split('.')), )

--- a/clearml_agent/helper/resource_monitor.py
+++ b/clearml_agent/helper/resource_monitor.py
@@ -139,42 +139,45 @@ class ResourceMonitor(object):
    def _daemon(self):
        seconds_since_started = 0
        reported = 0
-        while True:
-            last_report = time()
-            current_report_frequency = (
-                self._report_frequency if reported != 0 else self._first_report_sec
-            )
-            while (time() - last_report) < current_report_frequency:
-                # wait for self._sample_frequency seconds, if event set quit
-                if self._exit_event.wait(1 / self._sample_frequency):
-                    return
-                # noinspection PyBroadException
-                try:
-                    self._update_readouts()
-                except Exception as ex:
-                    log.warning("failed getting machine stats: %s", report_error(ex))
-                    self._failure()
+        try:
+            while True:
+                last_report = time()
+                current_report_frequency = (
+                    self._report_frequency if reported != 0 else self._first_report_sec
+                )
+                while (time() - last_report) < current_report_frequency:
+                    # wait for self._sample_frequency seconds, if event set quit
+                    if self._exit_event.wait(1 / self._sample_frequency):
+                        return
+                    # noinspection PyBroadException
+                    try:
+                        self._update_readouts()
+                    except Exception as ex:
+                        log.warning("failed getting machine stats: %s", report_error(ex))
+                        self._failure()

-            seconds_since_started += int(round(time() - last_report))
-            # check if we do not report any metric (so it means the last iteration will not be changed)
+                seconds_since_started += int(round(time() - last_report))
+                # check if we do not report any metric (so it means the last iteration will not be changed)

-            # if we do not have last_iteration, we just use seconds as iteration
+                # if we do not have last_iteration, we just use seconds as iteration

-            # start reporting only when we figured out, if this is seconds based, or iterations based
-            average_readouts = self._get_average_readouts()
-            stats = {
-                # 3 points after the dot
-                key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
-                for key, value in average_readouts.items()
-            }
+                # start reporting only when we figured out, if this is seconds based, or iterations based
+                average_readouts = self._get_average_readouts()
+                stats = {
+                    # 3 points after the dot
+                    key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
+                    for key, value in average_readouts.items()
+                }

-            # send actual report
-            if self.send_report(stats):
-                # clear readouts if this is update was sent
-                self._clear_readouts()
+                # send actual report
+                if self.send_report(stats):
+                    # clear readouts if this is update was sent
+                    self._clear_readouts()

-            # count reported iterations
-            reported += 1
+                # count reported iterations
+                reported += 1
+        except Exception as ex:
+            log.exception("Error reporting monitoring info: %s", str(ex))

    def _update_readouts(self):
        readouts = self._machine_stats()
--- a/clearml_agent/version.py
+++ b/clearml_agent/version.py
@@ -1 +1 @@
-__version__ = '1.5.2'
+__version__ = '1.5.3rc3'
--- a/docs/clearml.conf
+++ b/docs/clearml.conf
@@ -93,25 +93,39 @@ agent {
        # extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
        extra_index_url: []

+        # control the pytorch wheel resolving algorithm, options are: "pip", "direct"
+        # "pip" (default): would automatically detect the cuda version, and supply pip with the correct
+        # extra-index-url, based on pytorch.org tables
+        # "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
+        # and matching the automatically detected cuda version with the required pytorch wheel.
+        # if the exact cuda version is not found for the required pytorch wheel, it will try
+        # a lower cuda version until a match is found
+        #
+        # pytorch_resolve: "pip"
+
        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", "defaults", ]
        # conda_full_env_update: false
        # conda_env_as_base_docker: false

        # set the priority packages to be installed before the rest of the required packages
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # priority_packages: ["cython", "numpy", "setuptools", ]

        # set the optional priority packages to be installed before the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # priority_optional_packages: ["pygobject", ]

        # set the post packages to be installed after all the rest of the required packages
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # post_packages: ["horovod", ]

        # set the optional post packages to be installed after all the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
+        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # post_optional_packages: []

        # set to True to support torch nightly build installation,
@@ -228,8 +242,6 @@ agent {
        #                 # no repository matching required
        #                 repository: ""
        #             }
-        #             # no container image matching required (allow to replace one requested container with another)
-        #             container: ""
        #             # no repository matching required
        #             project: ""
        #         }
@@ -469,6 +481,7 @@ sdk {
 #  target_format: format used to encode contents before writing into the target file. Supported values are json,
 #                 yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
 #  overwrite: overwrite the target file in case it exists. Default is true.
+#  mode: set the file mode after writing. Use an integer value or a string (e.g. 600 / 777 etc.)
 #
 # Example:
 #   files {
Author	SHA1	Message	Date
allegroai	ccf752c4e4	Add support for setting mode on files applied by the agent	2023-07-04 14:37:58 +03:00
allegroai	3ed63e2154	Fix docker container backwards compatibility for API <2.13 Fix default docker match rules resolver (used incorrect field "container" instead of "image") Remove "container" (image) match rule option from default docker image resolver	2023-07-04 14:37:18 +03:00
allegroai	a535f93cd6	Add support for CLEARML_AGENT_FORCE_MAX_API_VERSION for testing	2023-07-04 14:35:54 +03:00
allegroai	b380ec54c6	Improve config file comments	2023-07-04 14:34:43 +03:00
allegroai	a1274299ce	Add support for CLEARML_AGENT_EXTRA_DOCKER_LABELS env var	2023-07-03 11:08:59 +03:00
allegroai	c77224af68	Add support for task field injection into container docker name	2023-07-03 11:07:12 +03:00
allegroai	95dadca45c	Refactor k8s glue running/used pods getter	2023-05-21 22:56:12 +03:00
allegroai	685918fd9b	Version bump to v1.5.3rc3	2023-05-21 22:54:38 +03:00
allegroai	bc85ddf78d	Fix pytorch direct resolve replacing wheel link with directly installed version	2023-05-21 22:53:51 +03:00
allegroai	5b5fb0b8a6	Add `agent.package_manager.pytorch_resolve` configuration setting with `pip` or `direct` values. `pip` sets extra index based on cuda and lets pip resolve, `direct` is the previous parsing algorithm that does the matching and downloading (default `pip`)	2023-05-21 22:53:11 +03:00
allegroai	fec0ce1756	Better message for agent init when an existing clearml.conf is found	2023-05-21 22:51:11 +03:00
allegroai	1e09b88b7a	Add alias `CLEARML_AGENT_DOCKER_AGENT_REPO` env var for the `FORCE_CLEARML_AGENT_REPO` env var	2023-05-21 22:50:01 +03:00
allegroai	b6ca0fa6a5	Print error on resource monitor failure	2023-05-11 16:18:11 +03:00