Add custom build script support

Add extra configurations when starting daemon. Propagate token to docker in case credentials are not available.
2025-06-26 18:16:15 +00:00 · 2022-03-15 10:04:25 +02:00 · 2022-03-15 10:04:25 +02:00 · 531e514003
commit 531e514003
parent 2cd9e706c8
13 changed files with 504 additions and 178 deletions
--- a/clearml_agent/main.py
+++ b/clearml_agent/main.py
@ -12,7 +12,7 @@ from clearml_agent.definitions import FileBuffering, CONFIG_FILE
 from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
 from clearml_agent.helper.process import ExitStatus
 from . import interface, session, definitions, commands
-from .errors import ConfigFileNotFound, Sigterm, APIError
+from .errors import ConfigFileNotFound, Sigterm, APIError, CustomBuildScriptFailed
 from .helper.trace import PackageTrace
 from .interface import get_parser
@ -44,6 +44,8 @@ def run_command(parser, args, command_name):
        debug = command._session.debug_mode
        func = getattr(command, command_name)
        return func(**args_dict)
    except CustomBuildScriptFailed as e:
        command_class.exit(e.message, e.errno)
    except ConfigFileNotFound:
        message = 'Cannot find configuration file in "{}".\n' \
                  'To create a configuration file, run:\n' \
--- a/clearml_agent/backend_api/config/default/agent.conf
+++ b/clearml_agent/backend_api/config/default/agent.conf
@ -35,6 +35,11 @@
    # default false, only the working directory will be added to the PYTHONPATH
    # force_git_root_python_path: false
    # in docker mode, if container's entrypoint automatically activated a virtual environment
    # use the activated virtual environment and install everything there
    # set to False to disable, and always create a new venv inheriting from the system_site_packages
    # docker_use_activated_venv: true
    # select python package manager:
    # currently supported: pip, conda and poetry
    # if "pip" or "conda" are used, the agent installs the required packages
@ -269,4 +274,34 @@
    #       target_format: json
    #     }
    #   }
    # Specifies a custom environment setup script to be executed instead of installing a virtual environment.
    # If provided, this script is executed following Git cloning. The script command may include environment
    # variables, which are expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
    # The script can also be specified using the CLEARML_AGENT_CUSTOM_BUILD_SCRIPT environment variable.
    #
    # When running the script, the following environment variables will be set:
    # - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
    #  contents in JSON format
    # - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
    # - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
    # - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
    # - CLEARML_GIT_ROOT: path to the cloned Git repository
    # - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
    #  this file must be in the following JSON format:
    #      ```json
    #      {
    #        "binary": "/absolute/path/to/python-executable",
    #        "entry_point": "/absolute/path/to/task-entrypoint-script",
    #        "working_dir": "/absolute/path/to/task-working/dir"
    #      }
    #      ```
    #  If provided, the agent will use these instead of the predefined task script section to execute the task and will
    #  skip virtual environment creation.
    #
    # In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
    # In case the custom script is specified but does not exist, or if the custom script does not write valid content
    # into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
    # standard flow.
    custom_build_script: ""
 }
--- a/clearml_agent/backend_api/session/defs.py
+++ b/clearml_agent/backend_api/session/defs.py
@ -15,6 +15,7 @@ ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT
 ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
 ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
 ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
 ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
 ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
    'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
 )
--- a/clearml_agent/backend_api/session/session.py
+++ b/clearml_agent/backend_api/session/session.py
@ -206,7 +206,7 @@ class Session(TokenManager):
                http_retries_config = dict(**http_retries_config)
                http_retries_config['connect'] = connect_retries
-        return http_retries_config, get_http_session_with_retry(**http_retries_config)
+        return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)
    def load_vaults(self):
        if not self.check_min_api_version("2.15") or self.feature_set == "basic":
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@ -39,7 +39,9 @@ from clearml_agent.backend_api.services import queues as queues_api
 from clearml_agent.backend_api.services import tasks as tasks_api
 from clearml_agent.backend_api.services import workers as workers_api
 from clearml_agent.backend_api.session import CallResult
-from clearml_agent.backend_api.session.defs import ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION
+from clearml_agent.backend_api.session.defs import (
    ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
    ENV_VENV_CONFIGURED, )
 from clearml_agent.backend_config.defs import UptimeConf
 from clearml_agent.backend_config.utils import apply_environment, apply_files
 from clearml_agent.commands.base import resolve_names, ServiceCommandSection
@ -65,10 +67,17 @@ from clearml_agent.definitions import (
    ENV_SSH_AUTH_SOCK,
    ENV_AGENT_SKIP_PIP_VENV_INSTALL,
    ENV_EXTRA_DOCKER_ARGS,
    ENV_CUSTOM_BUILD_SCRIPT, ENV_AGENT_SKIP_PYTHON_ENV_INSTALL, WORKING_STANDALONE_DIR,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
-from clearml_agent.errors import APIError, CommandFailedError, Sigterm
+from clearml_agent.errors import (
    APIError,
    CommandFailedError,
    Sigterm,
    SkippedCustomBuildScript,
    CustomBuildScriptFailed,
 )
 from clearml_agent.helper.base import (
    return_list,
    print_parameters,
@ -218,7 +227,7 @@ class LiteralScriptManager(object):
            location = None
        location = location or (repo_info and repo_info.root)
        if not location:
-            location = Path(self.venv_folder, "code")
+            location = Path(self.venv_folder, WORKING_STANDALONE_DIR)
            location.mkdir(exist_ok=True, parents=True)
        log.debug("selected execution directory: %s", location)
        return Text(location), self.write(task, location, execution.entry_point)
@ -698,6 +707,9 @@ class Worker(ServiceCommandSection):
            )
            if self._impersonate_as_task_owner:
                docker_params["auth_token"] = task_session.token
            elif self._session.access_key is None or self._session.secret_key is None:
                # We're using a token right now
                docker_params["auth_token"] = self._session.token
            if self._worker_tags:
                docker_params["worker_tags"] = self._worker_tags
            if self._services_mode:
@ -720,7 +732,7 @@ class Worker(ServiceCommandSection):
                    else:
                        print("Warning: generated docker container name is invalid: {}".format(name))
-            full_docker_cmd = self.docker_image_func(**docker_params)
+            full_docker_cmd = self.docker_image_func(env_task_id=task_id, **docker_params)
            # if we are using the default docker, update back the Task:
            if default_docker:
@ -1258,6 +1270,7 @@ class Worker(ServiceCommandSection):
        self._session.print_configuration()
    def daemon(self, queues, log_level, foreground=False, docker=False, detached=False, order_fairness=False, **kwargs):
        self._apply_extra_configuration()
        # check that we have docker command if we need it
        if docker not in (False, None) and not check_if_command_exists("docker"):
@ -1292,8 +1305,12 @@ class Worker(ServiceCommandSection):
        # We are not running a daemon we are killing one.
        # find the pid send termination signal and leave
-        if kwargs.get('stop', False):
+        if kwargs.get('stop', False) is not False:
-            return 1 if not self._kill_daemon(dynamic_gpus=dynamic_gpus) else 0
+            return_code = 0
            for worker_id in kwargs.get('stop') or [None]:
                if not self._kill_daemon(dynamic_gpus=dynamic_gpus, worker_id=worker_id):
                    return_code = 1
            return return_code
        # if we do not need to create queues, make sure they are valid
        # match previous behaviour when we validated queue names before everything else
@ -1772,11 +1789,19 @@ class Worker(ServiceCommandSection):
                             "ERROR! Failed applying git diff, see diff above.".format(diff))
    def _apply_extra_configuration(self):
        # store a few things we updated in runtime (TODO: we should list theme somewhere)
        agent_config = self._session.config["agent"].copy()
        agent_config_keys = ["cuda_version", "cudnn_version", "default_python", "worker_id", "debug"]
        try:
            self._session.load_vaults()
        except Exception as ex:
            print("Error: failed applying extra configuration: {}".format(ex))
        # merge back
        for restore_key in agent_config_keys:
            if restore_key in agent_config:
                self._session.config["agent"][restore_key] = agent_config[restore_key]
        config = self._session.config
        default = config.get("agent.apply_environment", False)
        if ENV_ENABLE_ENV_CONFIG_SECTION.get(default=default):
@ -1829,13 +1854,7 @@ class Worker(ServiceCommandSection):
                requirements = None
        if not python_version:
-            try:
+            python_version = self._get_task_python_version(current_task)
                python_version = current_task.script.binary
                python_version = python_version.split('/')[-1].replace('python', '')
                # if we can cast it, we are good
                python_version = '{:.1f}'.format(float(python_version))
            except:
                python_version = None
        venv_folder, requirements_manager, is_cached = self.install_virtualenv(
            venv_dir=target, requested_python_version=python_version, execution_info=execution,
@ -1985,6 +2004,16 @@ class Worker(ServiceCommandSection):
        return
    def _get_task_python_version(self, task):
        # noinspection PyBroadException
        try:
            python_ver = task.script.binary
            python_ver = python_ver.split('/')[-1].replace('python', '')
            # if we can cast it, we are good
            return '{:.1f}'.format(float(python_ver))
        except Exception:
            pass
    @resolve_names
    def execute(
        self,
@ -2097,85 +2126,140 @@ class Worker(ServiceCommandSection):
        execution = self.get_execution_info(current_task)
-        if self._session.config.get("agent.package_manager.force_repo_requirements_txt", False):
+        python_ver = self._get_task_python_version(current_task)
-            requirements = None
+
-            print("[package_manager.force_repo_requirements_txt=true] "
+        freeze = None
-                  "Skipping requirements, using repository \"requirements.txt\" ")
+        repo_info = None
-        else:
+        script_dir = ""
        venv_folder = ""
        custom_build_script = self._session.config.get("agent.custom_build_script", "") or ENV_CUSTOM_BUILD_SCRIPT.get()
        if custom_build_script:
            try:
-                requirements = current_task.script.requirements
+                venv_folder = Path(self._session.config["agent.venvs_dir"], python_ver or "3")
-            except AttributeError:
+                venv_folder = Path(os.path.expanduser(os.path.expandvars(venv_folder.as_posix())))
                directory, vcs, repo_info = self.get_repo_info(
                    execution, current_task, str(venv_folder)
                )
                binary, entry_point, working_dir = self.run_custom_build_script(
                    custom_build_script,
                    current_task,
                    execution,
                    venv_folder=venv_folder,
                    git_root=vcs.location,
                )
                execution.entry_point = str(entry_point)
                execution.working_dir = str(working_dir)
                script_dir = str(working_dir)
                self.package_api = VirtualenvPip(
                    session=self._session,
                    interpreter=str(binary),
                    python=str(binary),
                    requirements_manager=RequirementsManager(self._session),
                    execution_info=execution,
                    path=venv_folder,
                )
                self.global_package_api = SystemPip(
                    session=self._session,
                    interpreter=str(binary),
                )
            except SkippedCustomBuildScript as ex:
                print("Warning: {}".format(str(ex)))
                custom_build_script = None
        if not custom_build_script:
            if self._session.config.get("agent.package_manager.force_repo_requirements_txt", False):
                requirements = None
                print("[package_manager.force_repo_requirements_txt=true] "
                      "Skipping requirements, using repository \"requirements.txt\" ")
            else:
                try:
                    requirements = current_task.script.requirements
                except AttributeError:
                    requirements = None
-        try:
+            alternative_code_folder = None
-            python_ver = current_task.script.binary
+            if ENV_AGENT_SKIP_PYTHON_ENV_INSTALL.get():
-            python_ver = python_ver.split('/')[-1].replace('python', '')
+                venv_folder, requirements_manager, is_cached = None, None, False
-            # if we can cast it, we are good
+                # we need to create a folder for the code to be dumped into
-            python_ver = '{:.1f}'.format(float(python_ver))
+                code_folder = self._session.config.get("agent.venvs_dir")
-        except:
+                code_folder = Path(os.path.expanduser(os.path.expandvars(code_folder)))
-            python_ver = None
+                # let's make sure it is clear from previous runs
                rm_tree(normalize_path(code_folder, WORKING_REPOSITORY_DIR))
                rm_tree(normalize_path(code_folder, WORKING_STANDALONE_DIR))
                if not code_folder.exists():
                    code_folder.mkdir(parents=True, exist_ok=True)
                alternative_code_folder = code_folder.as_posix()
            else:
                venv_folder, requirements_manager, is_cached = self.install_virtualenv(
                    standalone_mode=standalone_mode,
                    requested_python_version=python_ver,
                    execution_info=execution,
                    cached_requirements=requirements,
                )
-        venv_folder, requirements_manager, is_cached = self.install_virtualenv(
+                if not is_cached and not standalone_mode:
-            standalone_mode=standalone_mode,
+                    if self._default_pip:
-            requested_python_version=python_ver,
+                        self.package_api.install_packages(*self._default_pip)
            execution_info=execution,
            cached_requirements=requirements,
        )
-        if not is_cached and not standalone_mode:
+                    print("\n")
-            if self._default_pip:
+
-                self.package_api.install_packages(*self._default_pip)
+            # either use the venvs base folder for code or the cwd
            directory, vcs, repo_info = self.get_repo_info(
                execution, current_task, str(venv_folder or alternative_code_folder)
            )
            print("\n")
-        directory, vcs, repo_info = self.get_repo_info(
+            cwd = vcs.location if vcs and vcs.location else directory
            execution, current_task, venv_folder
        )
-        print("\n")
+            if not standalone_mode:
                if is_cached:
                    # reinstalling git / local packages
                    package_api = copy(self.package_api)
                    OnlyExternalRequirements.cwd = package_api.cwd = cwd
                    package_api.requirements_manager = self._get_requirements_manager(
                        base_interpreter=package_api.requirements_manager.get_interpreter(),
                        requirement_substitutions=[OnlyExternalRequirements]
                    )
                    # make sure we run the handlers
                    cached_requirements = \
                        {k: package_api.requirements_manager.replace(requirements[k] or '')
                         for k in requirements}
                    if str(cached_requirements.get('pip', '')).strip() \
                            or str(cached_requirements.get('conda', '')).strip():
                        package_api.load_requirements(cached_requirements)
                    # make sure we call the correct freeze
                    requirements_manager = package_api.requirements_manager
                elif requirements_manager:
                    self.install_requirements(
                        execution,
                        repo_info,
                        requirements_manager=requirements_manager,
                        cached_requirements=requirements,
                        cwd=cwd,
                    )
                elif not self.package_api:
                    # check if we have to manually configure package API, it will be readonly
                    self.package_api = SystemPip(session=self._session)
-        cwd = vcs.location if vcs and vcs.location else directory
+            # do not update the task packages if we are using conda,
            # it will most likely make the task environment unreproducible
            skip_freeze_update = self.is_conda and not self._session.config.get(
                "agent.package_manager.conda_full_env_update", False)
-        if not standalone_mode:
+            freeze = self.freeze_task_environment(
-            if is_cached:
+                task_id=current_task.id,
-                # reinstalling git / local packages
+                requirements_manager=requirements_manager,
-                package_api = copy(self.package_api)
+                add_venv_folder_cache=venv_folder,
-                OnlyExternalRequirements.cwd = package_api.cwd = cwd
+                execution_info=execution,
-                package_api.requirements_manager = self._get_requirements_manager(
+                update_requirements=not skip_freeze_update,
-                    base_interpreter=package_api.requirements_manager.get_interpreter(),
+            )
-                    requirement_substitutions=[OnlyExternalRequirements]
+            script_dir = (directory if isinstance(directory, Path) else Path(directory)).absolute().as_posix()
                )
                # make sure we run the handlers
                cached_requirements = \
                    {k: package_api.requirements_manager.replace(requirements[k] or '')
                     for k in requirements}
                if str(cached_requirements.get('pip', '')).strip() \
                        or str(cached_requirements.get('conda', '')).strip():
                    package_api.load_requirements(cached_requirements)
                # make sure we call the correct freeze
                requirements_manager = package_api.requirements_manager
            else:
                self.install_requirements(
                    execution,
                    repo_info,
                    requirements_manager=requirements_manager,
                    cached_requirements=requirements,
                    cwd=cwd,
                )
        # do not update the task packages if we are using conda,
        # it will most likely make the task environment unreproducible
        skip_freeze_update = self.is_conda and not self._session.config.get(
            "agent.package_manager.conda_full_env_update", False)
        freeze = self.freeze_task_environment(
            task_id=current_task.id,
            requirements_manager=requirements_manager,
            add_venv_folder_cache=venv_folder,
            execution_info=execution,
            update_requirements=not skip_freeze_update,
        )
        script_dir = (directory if isinstance(directory, Path) else Path(directory)).absolute().as_posix()
        # run code
        # print("Running task id [%s]:" % current_task.id)
@ -2185,7 +2269,9 @@ class Worker(ServiceCommandSection):
            extra.append(
                WorkerParams(optimization=optimization).get_optimization_flag()
            )
        # check if this is a module load, then load it.
        # noinspection PyBroadException
        try:
            if current_task.script.binary and current_task.script.binary.startswith('python') and \
                    execution.entry_point and execution.entry_point.split()[0].strip() == '-m':
@ -2193,7 +2279,7 @@ class Worker(ServiceCommandSection):
                extra.extend(shlex.split(execution.entry_point))
            else:
                extra.append(execution.entry_point)
-        except:
+        except Exception:
            extra.append(execution.entry_point)
        command = self.package_api.get_python_command(extra)
@ -2577,7 +2663,7 @@ class Worker(ServiceCommandSection):
                    python_version=getattr(self.package_api, 'python', ''),
                    cuda_version=self._session.config.get("agent.cuda_version"),
                    source_folder=add_venv_folder_cache,
-                    exclude_sub_folders=['task_repository', 'code'])
+                    exclude_sub_folders=[WORKING_REPOSITORY_DIR, WORKING_STANDALONE_DIR])
        # If do not update back requirements
        if not update_requirements:
@ -2852,28 +2938,122 @@ class Worker(ServiceCommandSection):
            )
        )
-    def install_virtualenv(
+    def run_custom_build_script(self, script, task, execution, venv_folder, git_root):
-            self,
+        # type: (str, tasks_api.Task, ExecutionInfo, Path, str)-> Tuple[Path, Path, Path]
            venv_dir=None,
            requested_python_version=None,
            standalone_mode=False,
            execution_info=None,
            cached_requirements=None,
    ):
        # type: (str, str, bool, ExecutionInfo, dict) -> Tuple[Path, RequirementsManager, bool]
        """
-        Install a new python virtual environment, removing the old one if exists
+        Run a custom env build script
-        If CLEARML_SKIP_PIP_VENV_INSTALL is set then an emtpy virtual env folder is created
+        :param script:
-        and package manager is configured to work with the global python interpreter (the interpreter
+        :return: A tuple containing:
-        path itself can be passed in this variable)
+            - a full path to a python executable
-        :return: virtualenv directory, requirements manager to use with task, True if there is a cached venv entry
+            - a new task entry_point (replacing the entry_point in the task's script section)
            - a new working directory (replacing the working_dir in the task's script section)
            - a requirements manager instance
        """
-        skip_pip_venv_install = ENV_AGENT_SKIP_PIP_VENV_INSTALL.get()
+        script = os.path.expanduser(os.path.expandvars(script))
        try:
            if not os.path.isfile(script):
                raise SkippedCustomBuildScript("Build script {} is not found".format(script))
        except OSError as ex:
            raise SkippedCustomBuildScript(str(ex))
        print("Running custom build script {}".format(script))
        script_output_file = NamedTemporaryFile(prefix="custom_build_script", suffix=".json", mode="wt", delete=False)
        os.environ["CLEARML_AGENT_CUSTOM_BUILD_SCRIPT"] = script
        os.environ["CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON"] = json.dumps(
            task.to_dict(), separators=(',', ':'), default=str
        )
        os.environ["CLEARML_CUSTOM_BUILD_OUTPUT"] = script_output_file.name
        os.environ["CLEARML_TASK_SCRIPT_ENTRY"] = execution.entry_point
        os.environ["CLEARML_TASK_WORKING_DIR"] = execution.working_dir
        os.environ["CLEARML_VENV_PATH"] = str(venv_folder)
        os.environ["CLEARML_GIT_ROOT"] = git_root
        try:
            subprocess.check_call([script])
        except subprocess.CalledProcessError as ex:
            raise CustomBuildScriptFailed(
                message="Custom build script failed with return code {}".format(ex.returncode),
                errno=ex.returncode
            )
        output = Path(script_output_file.name).read_text()
        if not output:
            raise SkippedCustomBuildScript("Build script {} is not found".format(script))
        try:
            output = json.loads(output)
            binary = Path(output["binary"])
            entry_point = Path(output["entry_point"])
            working_dir = Path(output["working_dir"])
        except ValueError as ex:
            raise SkippedCustomBuildScript(
                "Failed parsing build script output JSON ({}): {}".format(script_output_file.name, ex)
            )
        except KeyError as ex:
            raise SkippedCustomBuildScript("Build script output missing {} field".format(ex.args[0]))
        try:
            if not binary.is_file():
                raise SkippedCustomBuildScript(
                    "Invalid binary path returned from custom build script: {}".format(binary)
                )
            if not entry_point.is_file():
                raise SkippedCustomBuildScript(
                    "Invalid entrypoint path returned from custom build script: {}".format(entry_point)
                )
            if not working_dir.is_dir():
                raise SkippedCustomBuildScript(
                    "Invalid working dir returned from custom build script: {}".format(working_dir)
                )
        except OSError as ex:
            raise SkippedCustomBuildScript(str(ex))
        return binary, entry_point, working_dir
    def _get_skip_pip_venv_install(self, skip_pip_venv_install=None):
        """
        Resolve whether creating a new virtual environment should be skipped.

        :param skip_pip_venv_install: Explicit request; when None, the value of the
            ENV_AGENT_SKIP_PIP_VENV_INSTALL environment entry is used instead
        :return: One of:
            - a bool, when the requested value is a boolean literal accepted by ``strtobool``
            - the requested truthy value unchanged, when it is not a boolean literal
              (NOTE(review): presumably a python interpreter path - confirm against callers)
            - the located "python" executable (or True), when nothing was requested but an already
              activated virtual environment should be reused (see the conditions below)
            - otherwise the falsy value that was requested
        """
        if skip_pip_venv_install is None:
            skip_pip_venv_install = ENV_AGENT_SKIP_PIP_VENV_INSTALL.get()

        if skip_pip_venv_install:
            try:
                skip_pip_venv_install = bool(strtobool(skip_pip_venv_install))
            except (ValueError, AttributeError):
                # ValueError: the string is not a boolean literal, keep it as is.
                # AttributeError: presumably a non-string value (e.g. True) that strtobool cannot
                # parse - keep it as is instead of crashing.
                pass
        elif ENV_VENV_CONFIGURED.get() and ENV_DOCKER_IMAGE.get() and \
                self._session.config.get("agent.docker_use_activated_venv", True) and \
                self._session.config.get("agent.package_manager.system_site_packages", False):
            # if we are running inside a container, and virtual environment is already installed,
            # we should install directly into it, because we cannot inherit from the system packages
            skip_pip_venv_install = find_executable("python") or True

            # check if we are running inside a container:
            print(
                "Warning! Found python virtual environment [{}] already activated inside the container, "
                "installing packages into venv (pip does not support inherit/nested venv)".format(
                    skip_pip_venv_install if isinstance(skip_pip_venv_install, str) else ENV_VENV_CONFIGURED.get())
            )
        return skip_pip_venv_install
    def install_virtualenv(
        self,
        venv_dir=None,
        requested_python_version=None,
        standalone_mode=False,
        execution_info=None,
        cached_requirements=None,
    ):
        # type: (str, str, bool, ExecutionInfo, dict) -> Tuple[Path, RequirementsManager, bool]
        """
        Install a new python virtual environment, removing the old one if exists
        If skip_pip_venv_install is True or contains a string (or if CLEARML_SKIP_PIP_VENV_INSTALL is set)
        then an emtpy virtual env folder is created and package manager is configured to work with the global python
        interpreter (or using a custom interpreter if an interpreter path is passed in this variable)
        :return: virtualenv directory, requirements manager to use with task, True if there is a cached venv entry
        """
        skip_pip_venv_install = self._get_skip_pip_venv_install()
        if self._session.config.get("agent.ignore_requested_python_version", None):
            requested_python_version = ''
@ -2930,13 +3110,50 @@ class Worker(ServiceCommandSection):
            or not self.is_venv_update
        )
        if not standalone_mode:
            rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
            rm_tree(normalize_path(venv_dir, WORKING_STANDALONE_DIR))
        call_package_manager_create, requirements_manager = self._setup_package_api(
            executable_name=executable_name,
            executable_version_suffix=executable_version_suffix,
            venv_dir=venv_dir,
            execution_info=execution_info,
            standalone_mode=standalone_mode,
            skip_pip_venv_install=skip_pip_venv_install,
            first_time=first_time,
        )
        # check if we have a cached folder
        if cached_requirements and not skip_pip_venv_install and self.package_api.get_cached_venv(
            requirements=cached_requirements,
            docker_cmd=execution_info.docker_cmd if execution_info else None,
            python_version=self.package_api.python,
            cuda_version=self._session.config.get("agent.cuda_version"),
            destination_folder=Path(venv_dir)
        ):
            print('::: Using Cached environment {} :::'.format(self.package_api.get_last_used_entry_cache()))
            return venv_dir, requirements_manager, True
        # create the initial venv
        if not skip_pip_venv_install:
            if call_package_manager_create:
                self.package_api.create()
        else:
            if not venv_dir.exists():
                venv_dir.mkdir(parents=True, exist_ok=True)
        return venv_dir, requirements_manager, False
    def _setup_package_api(
        self, executable_name, executable_version_suffix, venv_dir, execution_info,
        standalone_mode, skip_pip_venv_install=False, first_time=False
    ):
        # type: (str, str, Path, ExecutionInfo, bool, bool, bool) -> Tuple[bool, RequirementsManager]
        requirements_manager = self._get_requirements_manager(
            base_interpreter=executable_name
        )
        if not standalone_mode:
            rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
        package_manager_params = dict(
            session=self._session,
            python=executable_version_suffix if self.is_conda else executable_name,
@ -2951,7 +3168,6 @@ class Worker(ServiceCommandSection):
        )
        call_package_manager_create = False
        if not self.is_conda:
            if standalone_mode or skip_pip_venv_install:
                # pip with standalone mode
@ -2959,7 +3175,10 @@ class Worker(ServiceCommandSection):
                if standalone_mode:
                    self.package_api = VirtualenvPip(**package_manager_params)
                else:
-                    self.package_api = self.global_package_api
+                    # we can change it, no one is going to use it anyhow
                    package_manager_params['path'] = None
                    package_manager_params['interpreter'] = executable_name
                    self.package_api = VirtualenvPip(**package_manager_params)
            else:
                if self.is_venv_update:
                    self.package_api = VenvUpdateAPI(
@ -2997,26 +3216,7 @@ class Worker(ServiceCommandSection):
                    venv_dir = new_venv_folder
                    self.package_api = get_conda(path=venv_dir)
-        # check if we have a cached folder
+        return call_package_manager_create, requirements_manager
        if cached_requirements and not skip_pip_venv_install and self.package_api.get_cached_venv(
            requirements=cached_requirements,
            docker_cmd=execution_info.docker_cmd if execution_info else None,
            python_version=package_manager_params['python'],
            cuda_version=self._session.config.get("agent.cuda_version"),
            destination_folder=Path(venv_dir)
        ):
            print('::: Using Cached environment {} :::'.format(self.package_api.get_last_used_entry_cache()))
            return venv_dir, requirements_manager, True
        # create the initial venv
        if not skip_pip_venv_install:
            if call_package_manager_create:
                self.package_api.create()
        else:
            if not venv_dir.exists():
                venv_dir.mkdir(parents=True, exist_ok=True)
        return venv_dir, requirements_manager, False
    def parse_requirements(self, reqs_file=None, overrides=None):
        os = None
@ -3266,6 +3466,7 @@ class Worker(ServiceCommandSection):
            worker_tags=None,
            name=None,
            mount_ssh=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
            env_task_id=None,
    ):
        docker = 'docker'
@ -3359,6 +3560,9 @@ class Worker(ServiceCommandSection):
        # update the docker image, so the system knows where it runs
        base_cmd += ['-e', 'CLEARML_DOCKER_IMAGE={} {}'.format(docker_image, ' '.join(docker_arguments or [])).strip()]
        if env_task_id:
            base_cmd += ['-e', 'CLEARML_TASK_ID={}'.format(env_task_id), ]
        if auth_token:
            # if auth token is passed then put it in the env var
            base_cmd += ['-e', '{}={}'.format(ENV_AGENT_AUTH_TOKEN.vars[0], auth_token)]
@ -3550,8 +3754,11 @@ class Worker(ServiceCommandSection):
        return command, script_dir
-    def _kill_daemon(self, dynamic_gpus=False):
+    def _kill_daemon(self, dynamic_gpus=False, worker_id=None):
-        worker_id, worker_name = self._generate_worker_id_name(dynamic_gpus=dynamic_gpus)
+        if not worker_id:
            worker_id, worker_name = self._generate_worker_id_name(dynamic_gpus=dynamic_gpus)
        else:
            worker_name = worker_id
        # Iterate over all running process
        for pid, uid, slot, file in sorted(Singleton.get_running_pids(), key=lambda x: x[1] or ''):
--- a/clearml_agent/definitions.py
+++ b/clearml_agent/definitions.py
@ -126,6 +126,7 @@ DEFAULT_VENV_UPDATE_URL = (
    "https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
 )
 WORKING_REPOSITORY_DIR = "task_repository"
 WORKING_STANDALONE_DIR = "code"
 DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
 PIP_EXTRA_INDICES = [
 ]
@ -134,6 +135,7 @@ ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAG
 ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
 ENV_WORKER_TAGS = EnvironmentConfig('CLEARML_WORKER_TAGS')
 ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PIP_VENV_INSTALL')
 ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL', type=bool)
 ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
 ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
 ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
@ -147,6 +149,38 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEAR
 ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
 ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
 ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
 """
    Specifies a custom environment setup script to be executed instead of installing a virtual environment.
    If provided, this script is executed following Git cloning. The script command may include environment variables,
    which will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
    The script can also be specified using the `agent.custom_build_script` configuration setting.
    When running the script, the following environment variables will be set:
    - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
     contents in JSON format
    - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
    - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
    - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
    - CLEARML_GIT_ROOT: path to the cloned Git repository
    - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
     this file must be in the following JSON format:
         ```json
         {
           "binary": "/absolute/path/to/python-executable",
           "entry_point": "/absolute/path/to/task-entrypoint-script",
           "working_dir": "/absolute/path/to/task-working/dir"
         }
         ```
     If provided, the agent will use these instead of the predefined task script section to execute the task and will
     skip virtual environment creation.
    In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
    In case the custom script is specified but does not exist, or if the custom script does not write valid content
    into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
    standard flow.
 """
 class FileBuffering(IntEnum):
    """
--- a/clearml_agent/errors.py
+++ b/clearml_agent/errors.py
@ -84,3 +84,13 @@ class MissingPackageError(CommandFailedError):
    def __str__(self):
        """
        Build the human readable error text: the concrete class name followed by a hint to
        pip-install the package named by ``self.name``.

        :return str: formatted message
        """
        return '{self.__class__.__name__}: ' \
               '"{self.name}" package is required. Please run "pip install {self.name}"'.format(self=self)
 class CustomBuildScriptFailed(CommandFailedError):
    """
    Command failure that additionally carries an ``errno`` value.

    All arguments other than ``errno`` are forwarded unchanged to CommandFailedError.
    NOTE(review): ``errno`` is presumably the exit code of the failed custom build script,
    to be reused as the agent's own exit code - confirm against the raise site.
    """
    def __init__(self, errno, *args, **kwargs):
        super(CustomBuildScriptFailed, self).__init__(*args, **kwargs)
        # stored after the base initializer ran, so this value is the one left on the instance
        self.errno = errno
 class SkippedCustomBuildScript(CommandFailedError):
    """
    Marker error type with no behavior of its own beyond CommandFailedError.

    NOTE(review): raise/catch sites are not visible here; presumably signals that the custom build
    script was not used and the standard environment setup should proceed - confirm.
    """
    pass
--- a/clearml_agent/helper/base.py
+++ b/clearml_agent/helper/base.py
@ -506,6 +506,38 @@ def is_conda(config):
    return config['agent.package_manager.type'].lower() == 'conda'
 def convert_cuda_version_to_float_single_digit_str(cuda_version):
    """
    Normalize a cuda version (string/float/int) to a "<major>.<minor>" string, e.g. "11.4".

    Accepted forms:
      - dotted versions ("11.4", "10.0.130"): only the major and minor parts are used
      - a bare number below 60 (11): treated as a major version -> "11.0"
      - a bare number of 60 or more (112): treated as major*10 + minor -> "11.2"
      - None / 0 / empty: "0.0"

    Notice: the result always has exactly one digit after the decimal point.

    :param cuda_version: cuda version as string, float or int (may be None)
    :return str: version formatted with a single decimal digit
    """
    version_text = str(cuda_version or 0)
    # integer value of every dot separated part (a patch part, if present, is parsed but not used)
    parts = [int(piece) for piece in version_text.split('.')]
    major = parts[0]
    has_minor = len(parts) > 1

    if not has_minor and major >= 60:
        # a bare value this large is already encoded as major*10 + minor
        times_ten = major
    else:
        times_ten = 10 * major
        if has_minor:
            # the minor part is read as a decimal fraction (".4" -> 0.4) and scaled by ten
            times_ten += float(".{:d}".format(parts[1])) * 10

    return "{:.1f}".format(float(times_ten) / 10.)
 def convert_cuda_version_to_int_10_base_str(cuda_version):
    """
    Normalize a cuda version (string/float/int) to its integer x10 form, e.g. "112" for cuda 11.2

    The value is first normalized with convert_cuda_version_to_float_single_digit_str(),
    then multiplied by ten and truncated to an integer.

    :param cuda_version: cuda version as string, float or int (may be None)
    :return str: the integer version, as a string
    """
    single_digit_version = convert_cuda_version_to_float_single_digit_str(cuda_version)
    times_ten = float(single_digit_version) * 10
    return str(int(times_ten))
 class NonStrictAttrs(object):
    @classmethod
--- a/clearml_agent/helper/package/conda_api.py
+++ b/clearml_agent/helper/package/conda_api.py
@ -19,7 +19,9 @@ from clearml_agent.external.requirements_parser import parse
 from clearml_agent.external.requirements_parser.requirement import Requirement
 from clearml_agent.errors import CommandFailedError
-from clearml_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo
+from clearml_agent.helper.base import (
    rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo,
    convert_cuda_version_to_float_single_digit_str, convert_cuda_version_to_int_10_base_str, )
 from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
 from clearml_agent.helper.package.requirements import SimpleVersion
 from clearml_agent.session import Session
@ -167,7 +169,7 @@ class CondaAPI(PackageManager):
                raise ValueError("Could not restore Conda environment, cannot find {}".format(
                    self.conda_pre_build_env_path))
-        output = Argv(
+        command = Argv(
            self.conda,
            "create",
            "--yes",
@ -175,7 +177,9 @@ class CondaAPI(PackageManager):
            "--prefix",
            self.path,
            "python={}".format(self.python),
-        ).get_output(stderr=DEVNULL)
+        )
        print('Executing Conda: {}'.format(command.serialize()))
        output = command.get_output(stderr=DEVNULL)
        match = re.search(
            r"\W*(.*activate) ({})".format(re.escape(str(self.path))), output
        )
@ -457,16 +461,8 @@ class CondaAPI(PackageManager):
            if not cuda_version:
                cuda_version = 0
            else:
-                cuda_version_full = str(cuda_version)
+                cuda_version_full = convert_cuda_version_to_float_single_digit_str(cuda_version)
-                # if we have patch version we parse it here
+                cuda_version = int(convert_cuda_version_to_int_10_base_str(cuda_version))
                cuda_version_parts = [int(v) for v in cuda_version.split('.')]
                if len(cuda_version_parts) > 1 or cuda_version_parts[0] < 60:
                    cuda_version = 10*cuda_version_parts[0]
                    if len(cuda_version_parts) > 1:
                        cuda_version += cuda_version_parts[1]
                else:
                    cuda_version = cuda_version_parts[0]
                    cuda_version_full = "{:.1f}".format(float(cuda_version)/10.)
        except Exception:
            cuda_version = 0
--- a/clearml_agent/helper/package/pip_api/venv.py
+++ b/clearml_agent/helper/package/pip_api/venv.py
@ -12,7 +12,7 @@ from ..requirements import RequirementsManager
 class VirtualenvPip(SystemPip, PackageManager):
    def __init__(self, session, python, requirements_manager, path, interpreter=None, execution_info=None, **kwargs):
-        # type: (Session, float, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
+        # type: (Session, str, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
        """
        Program interface to virtualenv pip.
        Must be given either path to virtualenv or source command.
@ -48,7 +48,7 @@ class VirtualenvPip(SystemPip, PackageManager):
        return Argv.conditional_flag(
            self.session.config["agent.package_manager.system_site_packages"],
            "--system-site-packages",
-        ) + ("--python", self._bin)
+        )
    def install_flags(self):
        """
@ -64,10 +64,6 @@ class VirtualenvPip(SystemPip, PackageManager):
        Only valid if instantiated with path.
        Use self.python as self.bin does not exist.
        """
        # Log virtualenv information to stdout
        self.session.command(
            self.python, "-m", "virtualenv", "--version"
        )
        self.session.command(
            self.python, "-m", "virtualenv", self.path, *self.create_flags()
        ).check_call()
--- a/clearml_agent/helper/package/pytorch.py
+++ b/clearml_agent/helper/package/pytorch.py
@ -174,36 +174,42 @@ class PytorchRequirement(SimpleSubstitution):
        self.log = self._session.get_logger(__name__)
        self.package_manager = self.config["agent.package_manager.type"].lower()
        self.os = os_name or self.get_platform()
-        self.cuda = "cuda{}".format(self.cuda_version).lower()
+        self.cuda = None
-        self.python_version_string = str(self.config["agent.default_python"])
+        self.python_version_string = None
-        self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
+        self.python_major_minor_str = None
-        if '.' not in self.python_major_minor_str:
+        self.python = None
-            raise PytorchResolutionError(
+        self.exceptions = []
                "invalid python version {!r} defined in configuration file, key 'agent.default_python': "
                "must have both major and minor parts of the version (for example: '3.7')".format(
                    self.python_version_string
                )
            )
        self.python = "python{}".format(self.python_major_minor_str)
        self.exceptions = [
            PytorchResolutionError(message)
            for message in (
                None,
                'cuda version "{}" is not supported'.format(self.cuda),
                'python version "{}" is not supported'.format(
                    self.python_version_string
                ),
            )
        ]
        try:
            self.validate_python_version()
        except PytorchResolutionError as e:
            self.log.warn("will not be able to install pytorch wheels: %s", e.args[0])
        self._original_req = []
    def _init_python_ver_cuda_ver(self):
        """
        Lazily fill in the cuda / python identifiers and the prepared resolution errors.

        Each of cuda, python_version_string, python_major_minor_str and python is computed only
        while it is still None, and exceptions only while it is empty, so values that were
        already set are left untouched.

        :raises PytorchResolutionError: if 'agent.default_python' has no minor version part
        """
        if self.cuda is None:
            cuda_tag = "cuda{}".format(self.cuda_version)
            self.cuda = cuda_tag.lower()

        if self.python_version_string is None:
            configured_python = self.config["agent.default_python"]
            self.python_version_string = str(configured_python)

        if self.python_major_minor_str is None:
            leading_parts = self.python_version_string.split('.')[:2]
            self.python_major_minor_str = '.'.join(leading_parts)
            # NOTE(review): the value is stored before it is validated, so after a failure here the
            # next call finds a non-None value and skips this check - confirm this is intended
            if '.' not in self.python_major_minor_str:
                message = (
                    "invalid python version {!r} defined in configuration file, key 'agent.default_python': "
                    "must have both major and minor parts of the version (for example: '3.7')"
                ).format(self.python_version_string)
                raise PytorchResolutionError(message)

        if self.python is None:
            self.python = "python{}".format(self.python_major_minor_str)

        if self.exceptions:
            return

        # three prepared errors: a message-less one, unsupported cuda, unsupported python
        self.exceptions = [
            PytorchResolutionError(None),
            PytorchResolutionError('cuda version "{}" is not supported'.format(self.cuda)),
            PytorchResolutionError('python version "{}" is not supported'.format(self.python_version_string)),
        ]
    @property
    def is_conda(self):
        """
        :return bool: True if the package manager type stored on this instance is "conda"
        """
        manager_type = self.package_manager
        return manager_type == "conda"
@ -216,6 +222,8 @@ class PytorchRequirement(SimpleSubstitution):
        """
        Make sure python version has both major and minor versions as required for choosing pytorch wheel
        """
        self._init_python_ver_cuda_ver()
        if self.is_pip and not self.python_major_minor_str:
            raise PytorchResolutionError(
                "invalid python version {!r} defined in configuration file, key 'agent.default_python': "
@ -294,6 +302,7 @@ class PytorchRequirement(SimpleSubstitution):
    def get_url_for_platform(self, req):
        # check if package is already installed with system packages
        self.validate_python_version()
        # noinspection PyBroadException
        try:
            if self.config.get("agent.package_manager.system_site_packages", None):
--- a/clearml_agent/helper/package/requirements.py
+++ b/clearml_agent/helper/package/requirements.py
@ -16,7 +16,9 @@ from pyhocon import ConfigTree
 import six
 import logging
 from clearml_agent.definitions import PIP_EXTRA_INDICES
-from clearml_agent.helper.base import warning, is_conda, which, join_lines, is_windows_platform
+from clearml_agent.helper.base import (
    warning, is_conda, which, join_lines, is_windows_platform,
    convert_cuda_version_to_int_10_base_str, )
 from clearml_agent.helper.process import Argv, PathLike
 from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
 from clearml_agent.session import Session, normalize_cuda_version
@ -474,7 +476,7 @@ class RequirementSubstitution(object):
    @property
    def cuda_version(self):
-        return self.config['agent.cuda_version']
+        return convert_cuda_version_to_int_10_base_str(self.config['agent.cuda_version'])
    @property
    def cudnn_version(self):
--- a/clearml_agent/interface/worker.py
+++ b/clearml_agent/interface/worker.py
@ -99,8 +99,10 @@ DAEMON_ARGS = dict({
        'aliases': ['-d'],
    },
    '--stop': {
-        'help': 'Stop the running agent (based on the same set of arguments)',
+        'help': 'Stop the running agent (based on the same set of arguments). '
-        'action': 'store_true',
+                'Optional: provide a list of specific local worker IDs to stop',
        'nargs': '*',
        'default': False,
    },
    '--dynamic-gpus': {
        'help': 'Allow to dynamically allocate gpus based on queue properties, '