Compare commits


40 Commits

Author SHA1 Message Date
clearml
3273f76b46 Version bump to v1.9.2 2024-10-28 18:33:04 +02:00
clearml
9af0f9fe41 Fix: ensure the reload method exists in the config object before using it 2024-10-28 18:12:22 +02:00
clearml
205cd47cb9 Fix: use req_token_expiration_sec when creating a task session, instead of the default value 2024-10-28 18:11:42 +02:00
clearml
0ff428bb96 Fix: report index not advancing in resource monitoring, which caused all but one GPU to go unreported 2024-10-28 18:11:00 +02:00
Matteo Destro
bf8d9c96e9 Handle OSError when checking for is_file (#215) 2024-10-13 10:08:03 +03:00
allegroai
a88487ff25 Add support for pip legacy resolver for versions specified in the agent.package_manager.pip_legacy_resolver configuration option
Add skip existing packages
2024-09-22 22:36:06 +03:00
Jake Henning
785e22dc87 Version bump to v1.9.1 2024-09-02 01:04:49 +03:00
Jake Henning
6a2b778d53 Add default pip version support for Python 3.12 2024-09-02 01:03:52 +03:00
allegroai
b2c3702830 Version bump to v1.9.0 2024-08-28 23:18:26 +03:00
allegroai
6302d43990 Add support for skipping container apt installs using CLEARML_AGENT_SKIP_CONTAINER_APT env var in k8s
Add runtime callback support for setting runtime properties per task in k8s
Fix remove task from pending queue and set to failed when kubectl apply fails
2024-08-27 23:01:27 +03:00
allegroai
760bbca74e Fix: a failed Task in services mode was logged as "User aborted" instead of failed; add Task reason string 2024-08-27 22:56:37 +03:00
allegroai
e63fd31420 Fix string format 2024-08-27 22:55:49 +03:00
allegroai
2ff9985db7 Add user ID to the vault loading print 2024-08-27 22:55:32 +03:00
allegroai
b8c762401b Fix: use same-state transition if supported by the server (instead of stopping the task before re-enqueuing) 2024-08-27 22:54:45 +03:00
allegroai
99e1e54f94 Add support for tasks containing only bash script or python module command 2024-08-27 22:53:14 +03:00
allegroai
a4d3b5bad6 Fix only set Task started status on node rank 0 2024-08-27 22:52:31 +03:00
allegroai
b21665ed6e Fix: do not cache the venv if the venv/python skip env var was set 2024-08-27 22:52:01 +03:00
Surya Teja
f877aa96e2 Update Docker base image to Ubuntu 22.04 and Kubectl to 1.29.3 (#201) 2024-07-29 18:41:50 +03:00
pollfly
f99344d194 Add queue priority info to CLI help (#211)
* add queue priority comment

* Add --order-fairness info

---------

Co-authored-by: Jake Henning <59198928+jkhenning@users.noreply.github.com>
2024-07-29 18:40:38 +03:00
allegroai
d9f2a1999a Fix: only send pip freeze update on RANK 0, and only update task status on exit on RANK 0 2024-07-29 17:40:24 +03:00
Valentin Schabschneider
79d0abe707 Add NO_DOCKER flag to clearml-agent-services entrypoint (#206) 2024-07-26 19:09:54 +03:00
allegroai
6213ef4c02 Add /bin/bash -c "command" support. Task binary should be set to /bin/bash and entry_point should be set to -c command 2024-07-24 18:00:13 +03:00
allegroai
aef6aa9fc8 Fix a rare race condition where popping an aborted Task from a queue did not set it to started before the watchdog killed it (does not happen in k8s/slurm) 2024-07-24 17:59:46 +03:00
allegroai
0bb267115b Add venvs_cache.path mount override for non-root containers (use: agent.docker_internal_mounts.venvs_cache) 2024-07-24 17:59:18 +03:00
allegroai
f89a92556f Fix check logger is not None 2024-07-24 17:55:02 +03:00
allegroai
8ba4d75e80 Add CLEARML_TASK_ID and auth token to pod env vars in original entrypoint flow 2024-07-24 17:47:48 +03:00
allegroai
edc333ba5f Add K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT to allow running images without overriding the entrypoint (useful for agents using prebuilt images in k8s) 2024-07-24 17:46:27 +03:00
allegroai
2f0553b873 Fix: CLEARML_MULTI_NODE_SINGLE_TASK should be read once, not on every reported line 2024-07-24 17:45:02 +03:00
allegroai
b2a4bf08ac Fix pass --docker only (i.e. no default container image) for --dynamic-gpus feature 2024-07-24 17:44:35 +03:00
allegroai
f18c6b809f Fix slurm multi-node rank detection 2024-07-24 17:44:05 +03:00
allegroai
cd5b4d2186 Add "-m module args" support in the script entry for standalone scripts; the standalone script is converted to "untitled.py" by default, or to the file specified in working_dir as <dir>:<target_file> (for example ".:standalone.py") 2024-07-24 17:43:21 +03:00
allegroai
5f1bab6711 Add default docker match_rules for enterprise users,
NOTICE: match_rules are ignored if `--docker container` is passed on the command line
2024-07-24 17:42:55 +03:00
allegroai
ab9b9db0c9 Add CLEARML_MULTI_NODE_SINGLE_TASK (values -1, 0, 1, 2) for easier multi-node single-Task workloads 2024-07-24 17:42:25 +03:00
allegroai
93df021108 Add support for .ipynb script entry files (installs nbconvert at runtime, converts the notebook to Python and executes the resulting script), including CLEARML_AGENT_FORCE_TASK_INIT patching of ipynb files (post Python conversion) 2024-07-24 17:41:59 +03:00
allegroai
700ae85de0 Fix file mode should be optional in configuration files section 2024-07-24 17:41:06 +03:00
allegroai
f367c5a571 Fix git fetch did not update new tags #209 2024-07-24 17:39:53 +03:00
allegroai
ebc5944b44 Fix setting tasks that someone just marked as aborted to started - only force a Task to started after dequeuing it, otherwise leave it as is 2024-07-24 17:39:26 +03:00
allegroai
8f41002845 Add task.script.binary /bin/bash support
Fix -m module $env to support parsing the $env before launching
2024-07-24 17:37:26 +03:00
allegroai
7e8670d57f Find the correct python version when using a pre-installed python environment 2024-07-21 14:10:38 +03:00
allegroai
77de343863 Use "venv" module if virtualenv is not supported 2024-07-19 13:18:07 +03:00
29 changed files with 1032 additions and 248 deletions

View File

@@ -66,7 +66,7 @@
type: pip,
# specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10'"],
pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10' and python_version <= '3.11'", ">=23,<24.3 ; python_version >= '3.12'"]
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
# poetry_version: "<2",
# poetry_install_extra_args: ["-v"]
@@ -80,6 +80,14 @@
# additional artifact repositories to use when installing python packages
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
# turn on the "--use-deprecated=legacy-resolver" flag for pip, to avoid package dependency version mismatch
# if any version restriction is matched, we add the "--use-deprecated=legacy-resolver" flag
# example: pip_legacy_resolver = [">=20.3,<24.3", ">99"]
# if pip==20.2 or pip==29.0 is installed we do nothing,
# if pip==21.1 or pip==101.1 is installed the flag is added
# disable the feature by passing an empty list
pip_legacy_resolver = [">=20.3,<24.3"]
# control the pytorch wheel resolving algorithm, options are: "pip", "direct", "none"
# Override with environment variable CLEARML_AGENT_PACKAGE_PYTORCH_RESOLVE
# "pip" (default): would automatically detect the cuda version, and supply pip with the correct
@@ -218,6 +226,76 @@
# optional arguments to pass to docker image
# arguments: ["--ipc=host", ]
# Choose the default docker based on the Task properties,
# Notice: Enterprise feature, ignored otherwise
# Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project'
# Notice: Matching is done via regular expression, for example "^searchme$" will match exactly "searchme" string
"match_rules": [
{
"image": "python:3.6-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.6$",
},
}
},
{
"image": "python:3.7-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.7$",
},
}
},
{
"image": "python:3.8-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.8$",
},
}
},
{
"image": "python:3.9-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.9$",
},
}
},
{
"image": "python:3.10-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.10$",
},
}
},
{
"image": "python:3.11-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.11$",
},
}
},
{
"image": "python:3.12-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.12$",
},
}
},
]
}
# set the OS environments based on the Task's Environment section before launching the Task process.
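
As an aside on how such a match_rules entry behaves: every value under "match" is a regular expression tested against the corresponding Task field, so a rule fires only when all of its patterns match. A minimal, hypothetical sketch (the real lookup is resolve_default_container, shown later in this diff):

    import re

    def rule_matches(rule, task_script):
        # every regex in the rule's match.script section must match the Task's field
        return all(
            re.search(pattern, task_script.get(field) or "")
            for field, pattern in rule["match"]["script"].items()
        )

    rule = {"image": "python:3.12-bullseye",
            "match": {"script": {"binary": "python3.12$"}}}
    print(rule_matches(rule, {"binary": "/usr/bin/python3.12"}))  # True
    print(rule_matches(rule, {"binary": "/usr/bin/python3.10"}))  # False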
@@ -289,6 +367,7 @@
pip_cache: "/root/.cache/pip"
poetry_cache: "/root/.cache/pypoetry"
vcs_cache: "/root/.clearml/vcs-cache"
venvs_cache: "/root/.clearml/venvs-cache"
venv_build: "~/.clearml/venvs-builds"
pip_download: "/root/.clearml/pip-download-cache"
}

View File

@@ -22,6 +22,9 @@ ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
)
ENV_FORCE_MAX_API_VERSION = EnvEntry("CLEARML_AGENT_FORCE_MAX_API_VERSION", type=str)
# values are 0/None (task per node), 1/2 (multi-node reporting, colored console), -1 (only report rank 0 node)
ENV_MULTI_NODE_SINGLE_TASK = EnvEntry("CLEARML_MULTI_NODE_SINGLE_TASK", type=int, default=None)
"""
Experimental option to set the request method for all API requests and auth login.

View File

@@ -64,6 +64,8 @@ class Session(TokenManager):
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()
server_version = "1.0.0"
user_id = None
# TODO: add requests.codes.gateway_timeout once we support async commits
_retry_codes = [
@@ -191,6 +193,8 @@ class Session(TokenManager):
Session.api_version = str(api_version)
Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
Session.server_version = token_dict.get('server_version', self.server_version)
Session.user_id = (token_dict.get("identity") or {}).get("user") or ""
except (jwt.DecodeError, ValueError):
pass
@@ -256,8 +260,9 @@ class Session(TokenManager):
def parse(vault):
# noinspection PyBroadException
try:
print("Loaded {} vault: {}".format(
print("Loaded {} vault{}: {}".format(
vault.get("scope", ""),
"" if not self.user_id else " for user {}".format(self.user_id),
(vault.get("description", None) or "")[:50] or vault.get("id", ""))
)
d = vault.get("data", None)
@@ -341,11 +346,12 @@ class Session(TokenManager):
if self._propagate_exceptions_on_send:
raise
sleep_time = sys_random.uniform(*self._request_exception_retry_timeout)
-self._logger.error(
-"{} exception sending {} {}: {} (retrying in {:.1f}sec)".format(
-type(ex).__name__, method.upper(), url, str(ex), sleep_time
+if self._logger:
+self._logger.error(
+"{} exception sending {} {}: {} (retrying in {:.1f}sec)".format(
+type(ex).__name__, method.upper(), url, str(ex), sleep_time
+)
+)
)
time.sleep(sleep_time)
continue
@@ -364,11 +370,12 @@ class Session(TokenManager):
res.status_code == requests.codes.service_unavailable
and self.config.get("api.http.wait_on_maintenance_forever", True)
):
-self._logger.warning(
-"Service unavailable: {} is undergoing maintenance, retrying...".format(
-host
+if self._logger:
+self._logger.warning(
+"Service unavailable: {} is undergoing maintenance, retrying...".format(
+host
+)
+)
)
continue
break
self._session_requests += 1
@@ -649,11 +656,14 @@ class Session(TokenManager):
"""
Return True if Session.api_version is greater or equal >= to min_api_version
"""
-def version_tuple(v):
-v = tuple(map(int, (v.split("."))))
-return v + (0,) * max(0, 3 - len(v))
return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
+@classmethod
+def check_min_server_version(cls, min_server_version):
+"""
+Return True if Session.server_version is greater or equal >= to min_server_version
+"""
+return version_tuple(cls.server_version) >= version_tuple(str(min_server_version))
def _do_refresh_token(self, current_token, exp=None):
""" TokenManager abstract method implementation.
Here we ignore the old token and simply obtain a new token.
@@ -731,3 +741,8 @@ class Session(TokenManager):
def propagate_exceptions_on_send(self, value):
# type: (bool) -> None
self._propagate_exceptions_on_send = value
+def version_tuple(v):
+v = tuple(map(int, (v.split("."))))
+return v + (0,) * max(0, 3 - len(v))
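
A quick worked example of the helper above: version strings are padded to three components, so partial versions compare sanely as plain tuples.

    assert version_tuple("1.9") == (1, 9, 0)                  # padded to three parts
    assert version_tuple("3.22.3") == (3, 22, 3)
    assert version_tuple("3.22.3") >= version_tuple("3.22")   # plain tuple comparison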

View File

@@ -53,7 +53,7 @@ def apply_files(config):
target_fmt = data.get("target_format", "string")
overwrite = bool(data.get("overwrite", True))
contents = data.get("contents")
mode = data.get("mode")
mode = data.get("mode", None)
target = Path(expanduser(expandvars(path)))

View File

@@ -1,14 +1,16 @@
import json
import re
import shlex
from copy import copy
from clearml_agent.backend_api.session import Request
from clearml_agent.helper.docker_args import DockerArgsSanitizer
from clearml_agent.helper.package.requirements import (
RequirementsManager, MarkerRequirement,
compare_version_rules, )
-def resolve_default_container(session, task_id, container_config):
+def resolve_default_container(session, task_id, container_config, ignore_match_rules=False):
container_lookup = session.config.get('agent.default_docker.match_rules', None)
if not session.check_min_api_version("2.13") or not container_lookup:
return container_config
@@ -17,6 +19,12 @@ def resolve_default_container(session, task_id, container_config):
try:
session.verify_feature_set('advanced')
except ValueError:
# ignoring matching rules only supported in higher tiers
return container_config
if ignore_match_rules:
print("INFO: default docker command line override, ignoring default docker container match rules")
# ignoring matching rules only supported in higher tiers
return container_config
result = session.send_request(
@@ -159,9 +167,10 @@ def resolve_default_container(session, task_id, container_config):
if not container_config.get('image'):
container_config['image'] = entry.get('image', None)
if not container_config.get('arguments'):
-container_config['arguments'] = entry.get('arguments', None)
-container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
-print('Matching default container with rule:\n{}'.format(json.dumps(entry)))
+container_config['arguments'] = entry.get('arguments', None) or ''
+if isinstance(container_config.get('arguments'), str):
+container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
+print('INFO: Matching default container with rule:\n{}'.format(json.dumps(entry)))
return container_config
return container_config
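
The shlex.split normalization above turns a rule's docker-argument string into an argv-style list, for example:

    import shlex
    shlex.split("--ipc=host --privileged")  # -> ['--ipc=host', '--privileged']
    shlex.split('-e "A=b c"')               # -> ['-e', 'A=b c'] (quoting is honored)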

File diff suppressed because it is too large

View File

@@ -167,6 +167,7 @@ ENV_AGENT_GIT_USER = EnvironmentConfig("CLEARML_AGENT_GIT_USER", "TRAINS_AGENT_G
ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS")
ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST")
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool)
ENV_AGENT_DEBUG_GET_NEXT_TASK = EnvironmentConfig("CLEARML_AGENT_DEBUG_GET_NEXT_TASK", type=bool)
ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK")
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER")
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH")

View File

@@ -12,3 +12,9 @@ ENV_POD_MONITOR_LOG_BATCH_SIZE = EnvEntry("K8S_GLUE_POD_MONITOR_LOG_BATCH_SIZE",
ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION = EnvEntry(
"K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", default=False, converter=bool
)
ENV_POD_USE_IMAGE_ENTRYPOINT = EnvEntry("K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT", default=False, converter=bool)
"""
Do not inject a cmd and args to the container's image when building the k8s template (depend on the built-in image
entrypoint)
"""

View File

@@ -25,6 +25,7 @@ from clearml_agent.definitions import (
ENV_AGENT_GIT_USER,
ENV_AGENT_GIT_PASS,
ENV_FORCE_SYSTEM_SITE_PACKAGES,
ENV_AGENT_DEBUG_GET_NEXT_TASK,
)
from clearml_agent.errors import APIError, UsageError
from clearml_agent.glue.errors import GetPodCountError
@@ -40,6 +41,7 @@ from clearml_agent.glue.definitions import (
ENV_START_AGENT_SCRIPT_PATH,
ENV_DEFAULT_EXECUTION_AGENT_ARGS,
ENV_POD_AGENT_INSTALL_ARGS,
ENV_POD_USE_IMAGE_ENTRYPOINT,
)
@@ -67,16 +69,23 @@ class K8sIntegration(Worker):
'echo "ldconfig" >> /etc/profile',
"/usr/sbin/sshd -p {port}"]
-CONTAINER_BASH_SCRIPT = [
+_CONTAINER_APT_SCRIPT_SECTION = [
"export DEBIAN_FRONTEND='noninteractive'",
"echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
"chown -R root /root/.cache/pip",
"apt-get update",
"apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
+]
+CONTAINER_BASH_SCRIPT = [
+*(
+'[ ! -z "$CLEARML_AGENT_SKIP_CONTAINER_APT" ] || {}'.format(line)
+for line in _CONTAINER_APT_SCRIPT_SECTION
+),
"declare LOCAL_PYTHON",
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which python3.$i && python3.$i -m pip --version && "
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
'[ ! -z "$CLEARML_AGENT_SKIP_CONTAINER_APT" ] || [ ! -z "$LOCAL_PYTHON" ] || apt-get install -y python3-pip',
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
"{extra_bash_init_cmd}",
"[ ! -z $CLEARML_AGENT_NO_UPDATE ] || $LOCAL_PYTHON -m pip install clearml-agent{agent_install_args}",
@@ -98,6 +107,7 @@ class K8sIntegration(Worker):
num_of_services=20,
base_pod_num=1,
user_props_cb=None,
runtime_cb=None,
overrides_yaml=None,
template_yaml=None,
clearml_conf_file=None,
@@ -125,6 +135,7 @@ class K8sIntegration(Worker):
:param callable user_props_cb: An Optional callable allowing additional user properties to be specified
when scheduling a task to run in a pod. Callable can receive an optional pod number and should return
a dictionary of user properties (name and value). Signature is [[Optional[int]], Dict[str,str]]
:param callable runtime_cb: An Optional callable allowing additional task runtime to be specified (see user_props_cb)
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
:param str template_yaml: YAML file containing the template for the pod (optional).
If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
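
Based on the docstring above, a runtime_cb callable might look like the following (hypothetical example; per the code added below, the returned dictionary is merged into the task's runtime section):

    def my_runtime_cb(pod_number=None):
        # extra runtime properties for the task scheduled on this pod
        return {"scheduled_pod_number": str(pod_number or 0)}

    # k8s = K8sIntegration(..., runtime_cb=my_runtime_cb)  # passed like user_props_cb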
@@ -159,6 +170,7 @@ class K8sIntegration(Worker):
self.base_pod_num = base_pod_num
self._edit_hyperparams_support = None
self._user_props_cb = user_props_cb
self._runtime_cb = runtime_cb
self.conf_file_content = None
self.overrides_json_string = None
self.template_dict = None
@@ -192,6 +204,14 @@ class K8sIntegration(Worker):
self._min_cleanup_interval_per_ns_sec = 1.0
self._last_pod_cleanup_per_ns = defaultdict(lambda: 0.)
self._server_supports_same_state_transition = (
self._session.feature_set != "basic" and self._session.check_min_server_version("3.22.3")
)
@property
def agent_label(self):
return self._get_agent_label()
def _create_daemon_instance(self, cls_, **kwargs):
return cls_(agent=self, **kwargs)
@@ -424,6 +444,9 @@ class K8sIntegration(Worker):
""" Called when a resource (pod/job) was applied """
pass
def ports_mode_supported_for_task(self, task_id: str, task_data):
return self.ports_mode
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_session=None, **_):
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
session = task_session or self._session
@@ -433,7 +456,9 @@ class K8sIntegration(Worker):
if self._is_same_tenant(task_session):
try:
print('Pushing task {} into temporary pending queue'.format(task_id))
-_ = session.api_client.tasks.stop(task_id, force=True, status_reason="moving to k8s pending queue")
+if not self._server_supports_same_state_transition:
+_ = session.api_client.tasks.stop(task_id, force=True, status_reason="moving to k8s pending queue")
# Just make sure to clean up in case the task is stuck in the queue (known issue)
self._session.api_client.queues.remove_task(
@@ -493,8 +518,10 @@ class K8sIntegration(Worker):
)
)
-if self.ports_mode:
+ports_mode = False
+if self.ports_mode_supported_for_task(task_id, task_data):
print("Kubernetes looking for available pod to use")
+ports_mode = True
# noinspection PyBroadException
try:
@@ -505,12 +532,12 @@ class K8sIntegration(Worker):
# Search for a free pod number
pod_count = 0
pod_number = self.base_pod_num
-while self.ports_mode or self.max_pods_limit:
+while ports_mode or self.max_pods_limit:
pod_number = self.base_pod_num + pod_count
try:
items_count = self._get_pod_count(
-extra_labels=[self.limit_pod_label.format(pod_number=pod_number)] if self.ports_mode else None,
+extra_labels=[self.limit_pod_label.format(pod_number=pod_number)] if ports_mode else None,
msg="Looking for a free pod/port"
)
except GetPodCountError:
@@ -560,11 +587,11 @@ class K8sIntegration(Worker):
break
pod_count += 1
-labels = self._get_pod_labels(queue, queue_name)
-if self.ports_mode:
+labels = self._get_pod_labels(queue, queue_name, task_data)
+if ports_mode:
labels.append(self.limit_pod_label.format(pod_number=pod_number))
-if self.ports_mode:
+if ports_mode:
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
else:
print("Kubernetes scheduling task id={}".format(task_id))
@@ -595,6 +622,7 @@ class K8sIntegration(Worker):
task_id=task_id,
queue=queue,
namespace=namespace,
task_token=task_session.token.encode("ascii") if task_session else None,
)
print('kubectl output:\n{}\n{}'.format(error, output))
@@ -602,6 +630,14 @@ class K8sIntegration(Worker):
send_log = "Running kubectl encountered an error: {}".format(error)
self.log.error(send_log)
self.send_logs(task_id, send_log.splitlines())
# Make sure to remove the task from our k8s pending queue
self._session.api_client.queues.remove_task(
task=task_id,
queue=self.k8s_pending_queue_id,
)
# Set task as failed
session.api_client.tasks.failed(task_id, force=True)
return
if pod_name:
@@ -609,25 +645,41 @@ class K8sIntegration(Worker):
resource_name=pod_name, namespace=namespace, task_id=task_id, session=session
)
+self.set_task_info(
+task_id=task_id, task_session=task_session, queue_name=queue_name, ports_mode=ports_mode,
+pod_number=pod_number, pod_count=pod_count, task_data=task_data
+)
+def set_task_info(
+self, task_id: str, task_session, task_data, queue_name: str, ports_mode: bool, pod_number, pod_count
+):
user_props = {"k8s-queue": str(queue_name)}
-if self.ports_mode:
-user_props.update(
-{
-"k8s-pod-number": pod_number,
-"k8s-pod-label": labels[0],
-"k8s-internal-pod-count": pod_count,
-"k8s-agent": self._get_agent_label(),
-}
-)
+runtime = {}
+if ports_mode:
+agent_label = self._get_agent_label()
+user_props.update({
+"k8s-pod-number": pod_number,
+"k8s-pod-label": agent_label, # backwards-compatibility / legacy
+"k8s-internal-pod-count": pod_count,
+"k8s-agent": agent_label,
+})
if self._user_props_cb:
# noinspection PyBroadException
try:
-custom_props = self._user_props_cb(pod_number) if self.ports_mode else self._user_props_cb()
+custom_props = self._user_props_cb(pod_number) if ports_mode else self._user_props_cb()
user_props.update(custom_props)
except Exception:
pass
+if self._runtime_cb:
+# noinspection PyBroadException
+try:
+custom_runtime = self._runtime_cb(pod_number) if ports_mode else self._runtime_cb()
+runtime.update(custom_runtime)
+except Exception:
+pass
if user_props:
self._set_task_user_properties(
task_id=task_id,
@@ -635,7 +687,38 @@ class K8sIntegration(Worker):
**user_props
)
-def _get_pod_labels(self, queue, queue_name):
+if runtime:
+task_runtime = self._get_task_runtime(task_id) or {}
+task_runtime.update(runtime)
+try:
+res = task_session.send_request(
+service='tasks', action='edit', method=Request.def_method,
+json={
+"task": task_id, "force": True, "runtime": task_runtime
+},
+)
+if not res.ok:
+raise Exception("failed setting runtime property")
+except Exception as ex:
+print("WARNING: failed setting custom runtime properties for task '{}': {}".format(task_id, ex))
+def _get_task_runtime(self, task_id) -> Optional[dict]:
+try:
+res = self._session.send_request(
+service='tasks', action='get_by_id', method=Request.def_method,
+json={"task": task_id, "only_fields": ["runtime"]},
+)
+if not res.ok:
+raise ValueError(f"request returned {res.status_code}")
+data = res.json().get("data")
+if not data or "task" not in data:
+raise ValueError("empty data in result")
+return data["task"].get("runtime", {})
+except Exception as ex:
+print(f"ERROR: Failed getting runtime properties for task {task_id}: {ex}")
+def _get_pod_labels(self, queue, queue_name, task_data):
return [
self._get_agent_label(),
"{}={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue)),
@@ -673,7 +756,7 @@ class K8sIntegration(Worker):
def _create_template_container(
self, pod_name: str, task_id: str, docker_image: str, docker_args: List[str],
-docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str
+docker_bash: str, clearml_conf_create_script: List[str], task_worker_id: str, task_token: str = None
) -> dict:
container = self._get_docker_args(
docker_args,
@@ -682,16 +765,32 @@ class K8sIntegration(Worker):
convert=lambda env: {'name': env.partition("=")[0], 'value': env.partition("=")[2]},
)
-# Set worker ID
-env_vars = container.get('env', [])
-found_worker_id = False
-for entry in env_vars:
-if entry.get('name') == 'CLEARML_WORKER_ID':
-entry['name'] = task_worker_id
-found_worker_id = True
-if not found_worker_id:
-container['env'] = env_vars + [{'name': 'CLEARML_WORKER_ID', 'value': task_worker_id}]
+def add_or_update_env_var(name, value):
+env_vars = container.get('env', [])
+for entry in env_vars:
+if entry.get('name') == name:
+entry['value'] = value
+break
+else:
+container['env'] = env_vars + [{'name': name, 'value': value}]
+# Set worker ID
+add_or_update_env_var('CLEARML_WORKER_ID', task_worker_id)
+if ENV_POD_USE_IMAGE_ENTRYPOINT.get():
+# Don't add a cmd and args, just the image
+# Add the task ID and token since we need them (they're usually in the init script passed to us)
+add_or_update_env_var('CLEARML_TASK_ID', task_id)
+if task_token:
+# TODO: find a way to base64 encode the token
+add_or_update_env_var('CLEARML_AUTH_TOKEN', task_token)
return self._merge_containers(
container, dict(name=pod_name, image=docker_image)
)
# Create bash script for container and
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
else self.container_bash_script
@@ -740,7 +839,8 @@ class K8sIntegration(Worker):
task_id,
namespace,
template,
-pod_number=None
+pod_number=None,
+task_token=None,
):
if "apiVersion" not in template:
template["apiVersion"] = "batch/v1" if self.using_jobs else "v1"
@@ -771,7 +871,7 @@ class K8sIntegration(Worker):
spec.setdefault('backoffLimit', 0)
spec_template = spec.setdefault('template', {})
if labels:
-# Place same labels fro any pod spawned by the job
+# Place same labels for any pod spawned by the job
place_labels(spec_template.setdefault('metadata', {}))
spec = spec_template.setdefault('spec', {})
@@ -788,7 +888,8 @@ class K8sIntegration(Worker):
docker_args=docker_args,
docker_bash=docker_bash,
clearml_conf_create_script=clearml_conf_create_script,
-task_worker_id=task_worker_id
+task_worker_id=task_worker_id,
+task_token=task_token,
)
if containers:
@@ -935,7 +1036,7 @@ class K8sIntegration(Worker):
result = self._session.get(
service='tasks',
action='get_all',
json={"id": task_ids, "status": ["in_progress", "queued"], "only_fields": ["id", "status"]},
json={"id": task_ids, "status": ["in_progress", "queued"], "only_fields": ["id", "status", "status_reason"]},
method=Request.def_method,
)
tasks_to_abort = result["tasks"]
@@ -945,9 +1046,13 @@ class K8sIntegration(Worker):
for task in tasks_to_abort:
task_id = task.get("id")
status = task.get("status")
status_reason = (task.get("status_reason") or "").lower()
if not task_id or not status:
self.log.warning('Failed getting task information: id={}, status={}'.format(task_id, status))
continue
if status == "queued" and "pushed back by policy manager" in status_reason:
# Task was pushed back to policy queue by policy manager, don't touch it
continue
try:
if status == "queued":
self._session.get(
@@ -981,6 +1086,9 @@ class K8sIntegration(Worker):
return deleted_pods
def check_if_suspended(self) -> bool:
pass
def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
"""
:summary: Pull and run tasks from queues.
@@ -992,6 +1100,8 @@ class K8sIntegration(Worker):
:param worker_params: Worker command line arguments
:type worker_params: ``clearml_agent.helper.process.WorkerParams``
"""
# print("debug> running tasks loop")
events_service = self.get_service(Events)
# make sure we have a k8s pending queue
@@ -1023,12 +1133,19 @@ class K8sIntegration(Worker):
continue
# iterate over queues (priority style, queues[0] is highest)
# print("debug> iterating over queues")
for queue in queues:
# delete old completed / failed pods
self._cleanup_old_pods(namespaces, extra_msg="Cleanup cycle {cmd}")
if self.check_if_suspended():
print("Agent is suspended, sleeping for {:.1f} seconds".format(self._polling_interval))
sleep(self._polling_interval)
break
# get next task in queue
try:
# print(f"debug> getting tasks for queue {queue}")
response = self._get_next_task(queue=queue, get_task_info=self._impersonate_as_task_owner)
except Exception as e:
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))

View File

@@ -543,6 +543,36 @@ def convert_cuda_version_to_int_10_base_str(cuda_version):
return str(int(float(cuda_version)*10))
def get_python_version(python_executable, log=None):
from clearml_agent.helper.process import Argv
try:
output = Argv(python_executable, "--version").get_output(
stderr=subprocess.STDOUT
)
except subprocess.CalledProcessError as ex:
# Windows returns 9009 code and suggests to install Python from Windows Store
if is_windows_platform() and ex.returncode == 9009:
if log:
log.debug("version not found: {}".format(ex))
else:
if log:
log.warning("error getting %s version: %s", python_executable, ex)
return None
except FileNotFoundError as ex:
if log:
log.debug("version not found: {}".format(ex))
return None
match = re.search(r"Python ({}(?:\.\d+)*)".format(r"\d+"), output)
if match:
if log:
log.debug("Found: {}".format(python_executable))
# only return major.minor version
return ".".join(str(match.group(1)).split(".")[:2])
return None
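
Hypothetical usage of the helper above, assuming a python3.11 binary on PATH:

    get_python_version("python3.11")      # -> "3.11" (only major.minor is returned)
    get_python_version("missing-python")  # -> None (FileNotFoundError is swallowed)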
class NonStrictAttrs(object):
@classmethod

View File

@@ -69,7 +69,7 @@ def or_(*converters, **kwargs):
return wrapper
def strtobool (val):
def strtobool(val):
"""Convert a string representation of truth to true (1) or false (0).
True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values

View File

@@ -29,9 +29,12 @@ class PackageManager(object):
_config_cache_max_entries = 'agent.venvs_cache.max_entries'
_config_cache_free_space_threshold = 'agent.venvs_cache.free_space_threshold_gb'
_config_cache_lock_timeout = 'agent.venvs_cache.lock_timeout'
_config_pip_legacy_resolver = 'agent.package_manager.pip_legacy_resolver'
def __init__(self):
self._cache_manager = None
self._existing_packages = []
self._base_install_flags = []
@abc.abstractproperty
def bin(self):
@@ -79,6 +82,23 @@ class PackageManager(object):
# type: (Iterable[Text]) -> None
pass
def add_extra_install_flags(self, extra_flags): # type: (List[str]) -> None
if extra_flags:
extra_flags = [
e for e in extra_flags if e not in list(self._base_install_flags)
]
self._base_install_flags = list(self._base_install_flags) + list(extra_flags)
def remove_extra_install_flags(self, extra_flags): # type: (List[str]) -> bool
if extra_flags:
_base_install_flags = [
e for e in self._base_install_flags if e not in list(extra_flags)
]
if self._base_install_flags != _base_install_flags:
self._base_install_flags = _base_install_flags
return True
return False
def upgrade_pip(self):
result = self._install(
*select_for_platform(
@@ -87,19 +107,58 @@ class PackageManager(object):
),
"--upgrade"
)
-packages = self.run_with_env(('list',), output=True).splitlines()
-# p.split is ('pip', 'x.y.z')
-pip = [p.split() for p in packages if len(p.split()) == 2 and p.split()[0] == 'pip']
-if pip:
# noinspection PyBroadException
+packages = (self.freeze(freeze_full_environment=True) or dict()).get("pip")
+if packages:
+from clearml_agent.helper.package.requirements import RequirementsManager
+from .requirements import MarkerRequirement, SimpleVersion
+# store existing packages so that we can check if we can skip preinstalled packages
+# we will only check "@ file" "@ vcs" for exact match
+self._existing_packages = RequirementsManager.parse_requirements_section_to_marker_requirements(
+packages, skip_local_file_validation=True)
try:
-from .requirements import MarkerRequirement
-pip = pip[0][1].split('.')
-MarkerRequirement.pip_new_version = bool(int(pip[0]) >= 20)
-except Exception:
-pass
+pip_pkg = next(p for p in self._existing_packages if p.name == "pip")
+except StopIteration:
+pip_pkg = None
+# check if we need to list the pip version as well
+if pip_pkg:
+MarkerRequirement.pip_new_version = SimpleVersion.compare_versions(pip_pkg.version, ">=", "20")
+# add --use-deprecated=legacy-resolver to pip install to avoid mismatched packages issues
+self._add_legacy_resolver_flag(pip_pkg.version)
return result
def _add_legacy_resolver_flag(self, pip_pkg_version):
if not self.session.config.get(self._config_pip_legacy_resolver, None):
return
from .requirements import SimpleVersion
match_versions = self.session.config.get(self._config_pip_legacy_resolver)
matched = False
for rule in match_versions:
matched = False
# make sure we match all the parts of the rule
for a_version in rule.split(","):
o, v = SimpleVersion.split_op_version(a_version.strip())
matched = SimpleVersion.compare_versions(pip_pkg_version, o, v)
if not matched:
break
# if the rule is fully matched we have a match
if matched:
break
legacy_resolver_flags = ["--use-deprecated=legacy-resolver"]
if matched:
print("INFO: Using legacy resolver for PIP to avoid inconsistency with package versions!")
self.add_extra_install_flags(legacy_resolver_flags)
elif self.remove_extra_install_flags(legacy_resolver_flags):
print("INFO: removing pip legacy resolver!")
def get_python_command(self, extra=()):
# type: (...) -> Executable
return Argv(self.bin, *extra)
@@ -149,6 +208,18 @@ class PackageManager(object):
return False
except Exception:
return False
try:
from .requirements import Requirement, MarkerRequirement
req = MarkerRequirement(Requirement.parse(package_name))
# if pip was part of the requirements, make sure we update the flags
# add --use-deprecated=legacy-resolver to pip install to avoid mismatched packages issues
if req.name == "pip" and req.version:
PackageManager._selected_manager._add_legacy_resolver_flag(req.version)
except Exception as e:
print("WARNING: Error while parsing pip version legacy [{}]".format(e))
return True
@classmethod
@@ -219,6 +290,8 @@ class PackageManager(object):
if not self._get_cache_manager():
return
print('Adding venv into cache: {}'.format(source_folder))
try:
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
return self._get_cache_manager().add_entry(

View File

@@ -97,7 +97,7 @@ class SystemPip(PackageManager):
return Argv(self.bin, '-m', 'pip', '--disable-pip-version-check', *command)
def install_flags(self):
-indices_args = tuple(
+base_args = tuple(self._base_install_flags or []) + tuple(
chain.from_iterable(('--extra-index-url', x) for x in PIP_EXTRA_INDICES)
)
@@ -105,7 +105,7 @@ class SystemPip(PackageManager):
ENV_PIP_EXTRA_INSTALL_FLAGS.get() or \
self.session.config.get("agent.package_manager.extra_pip_install_flags", None)
-return (indices_args + tuple(extra_pip_flags)) if extra_pip_flags else indices_args
+return (base_args + tuple(extra_pip_flags)) if extra_pip_flags else base_args
def download_flags(self):
indices_args = tuple(

View File

@@ -37,7 +37,9 @@ class VirtualenvPip(SystemPip, PackageManager):
def load_requirements(self, requirements):
if isinstance(requirements, dict) and requirements.get("pip"):
requirements["pip"] = self.requirements_manager.replace(requirements["pip"])
requirements["pip"] = self.requirements_manager.replace(
requirements["pip"], existing_packages=self._existing_packages
)
super(VirtualenvPip, self).load_requirements(requirements)
self.requirements_manager.post_install(self.session, package_manager=self)
@@ -64,9 +66,18 @@ class VirtualenvPip(SystemPip, PackageManager):
Only valid if instantiated with path.
Use self.python as self.bin does not exist.
"""
-self.session.command(
-self.python, "-m", "virtualenv", self.path, *self.create_flags()
-).check_call()
+# noinspection PyBroadException
+try:
+self.session.command(
+self.python, "-m", "virtualenv", self.path, *self.create_flags()
+).check_call()
+except Exception as ex:
+# let's try with std library instead
+print("WARNING: virtualenv call failed: {}\n INFO: Creating virtual environment with venv".format(ex))
+self.session.command(
+self.python, "-m", "venv", self.path, *self.create_flags()
+).check_call()
return self
def remove(self):

View File

@@ -53,12 +53,18 @@ class PriorityPackageRequirement(SimpleSubstitution):
if not self._replaced_packages:
return list_of_requirements
# we assume that both pip & setup tools are not in list_of_requirements, and we need to add them
if "pip" in self._replaced_packages:
full_freeze = PackageManager.out_of_scope_freeze(freeze_full_environment=True)
-# now let's look for pip
-pips = [line for line in full_freeze.get("pip", []) if line.split("==")[0] == "pip"]
-if pips and "pip" in list_of_requirements:
-list_of_requirements["pip"] = [pips[0]] + list_of_requirements["pip"]
+if not full_freeze:
+if "pip" in list_of_requirements:
+list_of_requirements["pip"] = [self._replaced_packages["pip"]] + list_of_requirements["pip"]
+else:
+# now let's look for pip
+pips = [line for line in full_freeze.get("pip", []) if str(line.split("==")[0]).strip() == "pip"]
+if pips and "pip" in list_of_requirements:
+list_of_requirements["pip"] = [pips[0]] + list_of_requirements["pip"]
if "setuptools" in self._replaced_packages:
try:
@@ -87,6 +93,20 @@ class PriorityPackageRequirement(SimpleSubstitution):
return list_of_requirements
class CachedPackageRequirement(PriorityPackageRequirement):
name = ("setuptools", "pip", )
optional_package_names = tuple()
def replace(self, req):
"""
Put the requirement in the list for later conversion
:raises: ValueError if version is pre-release
"""
self._replaced_packages[req.name] = req.line
return Text(req)
class PackageCollectorRequirement(SimpleSubstitution):
"""
This RequirementSubstitution class will allow you to have multiple instances of the same

View File

@@ -19,7 +19,7 @@ import logging
from clearml_agent.definitions import PIP_EXTRA_INDICES
from clearml_agent.helper.base import (
warning, is_conda, which, join_lines, is_windows_platform,
-convert_cuda_version_to_int_10_base_str, )
+convert_cuda_version_to_int_10_base_str, dump_yaml, )
from clearml_agent.helper.process import Argv, PathLike
from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
from clearml_agent.session import Session, normalize_cuda_version
@@ -94,6 +94,12 @@ class MarkerRequirement(object):
def __repr__(self):
return '{self.__class__.__name__}[{self}]'.format(self=self)
def __eq__(self, other):
return isinstance(other, MarkerRequirement) and str(self) == str(other)
def __hash__(self):
return str(self).__hash__()
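
These __eq__/__hash__ overrides are what let RequirementsManager.replace (further down in this diff) use a plain set intersection to detect pre-installed packages. A minimal sketch of the idea, with a stand-in class:

    class Req:  # stand-in for MarkerRequirement
        def __init__(self, line):
            self.line = line
        def __str__(self):
            return self.line
        def __eq__(self, other):
            return isinstance(other, Req) and str(self) == str(other)
        def __hash__(self):
            return str(self).__hash__()

    requested = [Req("numpy==1.24.0"), Req("pandas==2.0.1")]
    preinstalled = [Req("numpy==1.24.0")]
    print({str(r) for r in set(requested) & set(preinstalled)})  # {'numpy==1.24.0'}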
def format_specs(self, num_parts=None, max_num_parts=None):
max_num_parts = max_num_parts or num_parts
if max_num_parts is None or not self.specs:
@@ -116,6 +122,10 @@ class MarkerRequirement(object):
def specs(self): # type: () -> List[Tuple[Text, Text]]
return self.req.specs
@property
def version(self): # type: () -> Text
return self.specs[0][1] if self.specs else ""
@specs.setter
def specs(self, value): # type: (List[Tuple[Text, Text]]) -> None
self.req.specs = value
@@ -143,6 +153,8 @@ class MarkerRequirement(object):
If the requested version is 1.2 the self.spec should be 1.2*
etc.
usage: it returns the value of the following comparison: requested_version "op" self.version
:param str requested_version:
:param str op: '==', '>', '>=', '<=', '<', '~='
:param int num_parts: number of parts to compare
@@ -152,7 +164,7 @@ class MarkerRequirement(object):
if not self.specs:
return True
-version = self.specs[0][1]
+version = self.version
op = (op or self.specs[0][0]).strip()
return SimpleVersion.compare_versions(
@@ -170,11 +182,21 @@ class MarkerRequirement(object):
self.req.local_file = False
return True
-def validate_local_file_ref(self):
+def is_local_package_ref(self):
# if local file does not exist, remove the reference to it
if self.vcs or self.editable or self.path or not self.local_file or not self.name or \
not self.uri or not self.uri.startswith("file://"):
return False
return True
def is_vcs_ref(self):
return bool(self.vcs)
def validate_local_file_ref(self):
# if local file does not exist, remove the reference to it
if not self.is_local_package_ref():
return
local_path = Path(self.uri[len("file://"):])
if not local_path.exists():
local_path = Path(unquote(self.uri)[len("file://"):])
@@ -221,6 +243,19 @@ class SimpleVersion:
_local_version_separators = re.compile(r"[\._-]")
_regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
@classmethod
def split_op_version(cls, line):
"""
Split a string in the form of ">=1.2.3" into a (op, version), i.e. (">=", "1.2.3")
Notice: if called with only a version string (e.g. "1.2.3"), the default operator is "=="
which means you get ("==", "1.2.3")
:param line: string examples: "<=0.1.2"
:return: tuple of (op, version) example ("<=", "0.1.2")
"""
match = r"\s*([>=<~!]*)\s*(\S*)\s*"
groups = re.match(match, line).groups()
return groups[0] or "==", groups[1]
@classmethod
def compare_versions(cls, version_a, op, version_b, ignore_sub_versions=True, num_parts=3):
"""
@@ -624,14 +659,54 @@ class RequirementsManager(object):
return handler.replace(req)
return None
-def replace(self, requirements): # type: (Text) -> Text
+def replace(
+self,
+requirements, # type: Text
+existing_packages=None, # type: List[MarkerRequirement]
+pkg_skip_existing_local=True, # type: bool
+pkg_skip_existing_vcs=True, # type: bool
+pkg_skip_existing=True, # type: bool
+): # type: (...) -> Text
parsed_requirements = self.parse_requirements_section_to_marker_requirements(
-requirements=requirements, cwd=self._cwd)
+requirements=requirements, cwd=self._cwd, skip_local_file_validation=True)
if parsed_requirements and existing_packages:
skipped_packages = None
if pkg_skip_existing:
skipped_packages = set(parsed_requirements) & set(existing_packages)
elif pkg_skip_existing_local or pkg_skip_existing_vcs:
existing_packages = [
p for p in existing_packages if (
(pkg_skip_existing_local and p.is_local_package_ref()) or
(pkg_skip_existing_vcs and p.is_vcs_ref())
)
]
skipped_packages = set(parsed_requirements) & set(existing_packages)
if skipped_packages:
# maintain order
num_skipped_packages = len(parsed_requirements)
parsed_requirements = [p for p in parsed_requirements if p not in skipped_packages]
num_skipped_packages -= len(parsed_requirements)
print("Skipping {} pre-installed packages:\n{}Remaining {} additional packages to install".format(
num_skipped_packages,
dump_yaml(sorted([str(p) for p in skipped_packages])),
len(parsed_requirements)
))
# nothing to install!
if not parsed_requirements:
return ""
# sanity check
if not parsed_requirements:
# return the original requirements just in case
return requirements
# remove local file reference that do not exist
for p in parsed_requirements:
p.validate_local_file_ref()
def replace_one(i, req):
# type: (int, MarkerRequirement) -> Optional[Text]
try:
@@ -805,7 +880,7 @@ class RequirementsManager(object):
normalize_cuda_version(cudnn_version or 0))
@staticmethod
-def parse_requirements_section_to_marker_requirements(requirements, cwd=None):
+def parse_requirements_section_to_marker_requirements(requirements, cwd=None, skip_local_file_validation=False):
def safe_parse(req_str):
# noinspection PyBroadException
try:
@@ -815,7 +890,8 @@ class RequirementsManager(object):
def create_req(x):
r = MarkerRequirement(x)
-r.validate_local_file_ref()
+if not skip_local_file_validation:
+r.validate_local_file_ref()
return r
if not requirements:

View File

@@ -598,7 +598,7 @@ class Git(VCS):
def pull(self):
self._set_ssh_url()
self.call("fetch", "--all", "--recurse-submodules", cwd=self.location)
self.call("fetch", "--all", "--tags", "--recurse-submodules", cwd=self.location)
def _git_pass_auth_wrapper(self, func, *args, **kwargs):
try:
@@ -936,7 +936,7 @@ def _locate_future_import(lines):
def patch_add_task_init_call(local_filename):
-if not local_filename or not Path(local_filename).is_file():
+if not local_filename or not Path(local_filename).is_file() or not str(local_filename).lower().endswith(".py"):
return
idx_a = 0

View File

@@ -401,6 +401,7 @@ class ResourceMonitor(object):
fractions = self._fractions_handler.fractions
stats["gpu_fraction_{}".format(report_index)] = \
(fractions[i] if i < len(fractions) else fractions[-1]) if fractions else 1.0
report_index += 1
except Exception as ex:
# something happened and we can't use gpu stats,
@@ -438,6 +439,7 @@ class ResourceMonitor(object):
class GpuFractionsHandler:
_number_re = re.compile(r"^clear\.ml/fraction(-\d+)?$")
_mig_re = re.compile(r"^nvidia\.com/mig-(?P<compute>[0-9]+)g\.(?P<memory>[0-9]+)gb$")
_frac_gpu_injector_re = re.compile(r"^clearml-injector/fraction$")
_gpu_name_to_memory_gb = {
"A30": 24,
@@ -514,10 +516,14 @@ class GpuFractionsHandler:
return 0
@classmethod
-def encode_fractions(cls, limits: dict) -> str:
-if any(cls._number_re.match(x) for x in (limits or {})):
-return ",".join(str(v) for k, v in sorted(limits.items()) if cls._number_re.match(k))
-return ",".join(("{}:{}".format(k, v) for k, v in (limits or {}).items() if cls._mig_re.match(k)))
+def encode_fractions(cls, limits: dict, annotations: dict) -> str:
+if limits:
+if any(cls._number_re.match(x) for x in (limits or {})):
+return ",".join(str(v) for k, v in sorted(limits.items()) if cls._number_re.match(k))
+return ",".join(("{}:{}".format(k, v) for k, v in (limits or {}).items() if cls._mig_re.match(k)))
+elif annotations:
+if any(cls._frac_gpu_injector_re.match(x) for x in (annotations or {})):
+return ",".join(str(v) for k, v in sorted(annotations.items()) if cls._frac_gpu_injector_re.match(k))
@staticmethod
def decode_fractions(fractions: str) -> Union[List[float], Dict[str, int]]:
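
Worked examples for the encodings handled above (illustrative inputs only):

    # numeric fraction labels, joined in key order:
    #   encode_fractions({"clear.ml/fraction-0": "0.50", "clear.ml/fraction-1": "0.25"}, {})
    #     -> "0.50,0.25"
    # MIG-style limits:
    #   encode_fractions({"nvidia.com/mig-3g.20gb": 2}, {}) -> "nvidia.com/mig-3g.20gb:2"
    # injector annotations, consulted only when no limits are set:
    #   encode_fractions({}, {"clearml-injector/fraction": "0.5"}) -> "0.5"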

View File

@@ -67,7 +67,10 @@ DAEMON_ARGS = dict({
'group': 'Docker support',
},
'--queue': {
-'help': 'Queue ID(s)/Name(s) to pull tasks from (\'default\' queue)',
+'help': 'Queue ID(s)/Name(s) to pull tasks from (\'default\' queue).'
+' Note that the queue list order determines priority, with the first listed queue having the'
+' highest priority. To change this behavior, use --order-fairness to pull from each queue in a'
+' round-robin order',
'nargs': '+',
'default': tuple(),
'dest': 'queues',
@@ -112,8 +115,11 @@ DAEMON_ARGS = dict({
'--dynamic-gpus': {
'help': 'Allow to dynamically allocate gpus based on queue properties, '
'configure with \'--queue <queue_name>=<num_gpus>\'.'
-' Example: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 single_gpu=1\''
-' Example Opportunistic: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 max_quad_gpus=1-4 \'',
+' Example: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 single_gpu=1\'.'
+' Example Opportunistic: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 max_quad_gpus=1-4\'.'
+' Note that the queue list order determines priority, with the first listed queue having the'
+' highest priority. To change this behavior, use --order-fairness to pull from each queue in a'
+' round-robin order',
'action': 'store_true',
},
'--uptime': {

View File

@@ -1 +1 @@
-__version__ = '1.8.1'
+__version__ = '1.9.2'

View File

@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:22.04
USER root
WORKDIR /root

View File

@@ -4,7 +4,8 @@ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip
unzip awscliv2.zip
./aws/install
-curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/kubectl
+curl -o kubectl https://s3.us-west-2.amazonaws.com/amazon-eks/1.29.3/2024-04-19/bin/linux/amd64/kubectl
#curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/kubectl
#curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin

View File

@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:22.04
USER root
WORKDIR /root

View File

@@ -1,6 +1,6 @@
#!/bin/bash
-curl -LO https://dl.k8s.io/release/v1.21.0/bin/linux/amd64/kubectl
+curl -LO https://dl.k8s.io/release/v1.29.3/bin/linux/amd64/kubectl
install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

View File

@@ -20,7 +20,7 @@ FROM python:${TAG} as target
WORKDIR /app
-ARG KUBECTL_VERSION=1.24.0
+ARG KUBECTL_VERSION=1.29.3
# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8

View File

@@ -2,7 +2,7 @@ ARG TAG=3.7.17-slim-bullseye
FROM python:${TAG} as target
-ARG KUBECTL_VERSION=1.22.4
+ARG KUBECTL_VERSION=1.29.3
WORKDIR /app

View File

@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:22.04
USER root
WORKDIR /root

View File

@@ -33,4 +33,10 @@ if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
fi
fi
-clearml-agent daemon $DAEMON_OPTIONS --queue $QUEUES --docker "${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}" --cpu-only ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
+DOCKER_ARGS="--docker \"${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}\""
+if [ -n "$CLEARML_AGENT_NO_DOCKER" ]; then
+DOCKER_ARGS=""
+fi
+clearml-agent daemon $DAEMON_OPTIONS --queue $QUEUES $DOCKER_ARGS --cpu-only ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}

View File

@@ -308,6 +308,7 @@ agent {
# pip_cache: "/root/.cache/pip"
# poetry_cache: "/root/.cache/pypoetry"
# vcs_cache: "/root/.clearml/vcs-cache"
# venvs_cache: "/root/.clearml/venvs-cache"
# venv_build: "~/.clearml/venvs-builds"
# pip_download: "/root/.clearml/pip-download-cache"
# }