Compare commits

...

5 Commits

8 changed files with 61 additions and 45 deletions

View File

@@ -86,7 +86,10 @@ def get_http_session_with_retry(
     session = requests.Session()

     if backoff_max is not None:
-        Retry.BACKOFF_MAX = backoff_max
+        if "BACKOFF_MAX" in vars(Retry):
+            Retry.BACKOFF_MAX = backoff_max
+        else:
+            Retry.DEFAULT_BACKOFF_MAX = backoff_max

     retry = Retry(
         total=total, connect=connect, read=read, redirect=redirect, status=status,
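
For context, newer urllib3 releases renamed the class attribute Retry.BACKOFF_MAX to Retry.DEFAULT_BACKOFF_MAX, which is what the new branch accounts for. A minimal standalone sketch of the same compatibility check, with an illustrative cap value:

    from urllib3.util.retry import Retry

    backoff_max = 120  # illustrative cap on retry backoff, in seconds
    if "BACKOFF_MAX" in vars(Retry):
        # older urllib3 still exposes the original class attribute
        Retry.BACKOFF_MAX = backoff_max
    else:
        # newer urllib3 renamed it to DEFAULT_BACKOFF_MAX
        Retry.DEFAULT_BACKOFF_MAX = backoff_max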

View File

@@ -297,6 +297,9 @@ class Config(object):
     def put(self, key, value):
         self._config.put(key, value)

+    def pop(self, key, default=None):
+        return self._config.pop(key, default=default)
+
     def to_dict(self):
         return self._config.as_plain_ordered_dict()
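
The new Config.pop() simply delegates to the underlying configuration container. A minimal sketch of the same delegation over a plain dict (class name, keys, and values below are illustrative, not the ClearML API):

    class ConfigSketch:
        def __init__(self, data=None):
            self._config = dict(data or {})

        def pop(self, key, default=None):
            # remove the key if present, otherwise return the default untouched
            return self._config.pop(key, default)

    cfg = ConfigSketch({"agent.package_manager.pip_version": "<20.2"})
    print(cfg.pop("agent.package_manager.pip_version"))            # "<20.2"
    print(cfg.pop("agent.package_manager.pip_version", "missing"))  # "missing"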

View File

@@ -40,6 +40,7 @@ from clearml_agent.backend_api.session import CallResult, Request
 from clearml_agent.backend_api.session.defs import (
     ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
     ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
+from clearml_agent.backend_config import Config
 from clearml_agent.backend_config.defs import UptimeConf
 from clearml_agent.backend_config.utils import apply_environment, apply_files
 from clearml_agent.backend_config.converters import text_to_int
@@ -71,6 +72,7 @@ from clearml_agent.definitions import (
     ENV_DOCKER_ARGS_FILTERS,
     ENV_FORCE_SYSTEM_SITE_PACKAGES,
     ENV_SERVICES_DOCKER_RESTART,
+    ENV_CONFIG_BC_IN_STANDALONE,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import (
@@ -3515,6 +3517,11 @@ class Worker(ServiceCommandSection):
         requirements_manager.translator.enabled = False
         print(requirements_manager.replace(contents))

+    def remove_non_backwards_compatible_entries(self, config: Config):
+        if not self._standalone_mode or not ENV_CONFIG_BC_IN_STANDALONE.get() or self._session.feature_set == "basic":
+            return
+        config.pop("agent.package_manager.pip_version")  # removed due to a breaking change in v1.5.1
+
     def get_docker_config_cmd(self, docker_args, clean_api_credentials=False):
         docker_image = str(ENV_DOCKER_IMAGE.get() or
                            self._session.config.get("agent.default_docker.image", "nvidia/cuda")) \
@@ -3537,6 +3544,7 @@ class Worker(ServiceCommandSection):
             DockerArgsSanitizer.sanitize_docker_command(self._session, self._docker_arguments) or ''))

         temp_config = deepcopy(self._session.config)
+        self.remove_non_backwards_compatible_entries(temp_config)
         mounted_cache_dir = temp_config.get(
             "agent.docker_internal_mounts.sdk_cache", self._docker_fixed_user_cache)
         mounted_pip_dl_dir = temp_config.get(
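
The new hook strips a configuration key that older agents cannot parse, but only from a deep copy of the session configuration, so the running agent keeps its own settings. A rough standalone sketch of that gating, in which a plain dict and os.getenv stand in for the ClearML Config and EnvironmentConfig objects and the feature-set check is omitted:

    import os
    from copy import deepcopy

    def remove_non_backwards_compatible_entries(config, standalone_mode):
        # only strip keys when running standalone with the BC flag enabled
        if not standalone_mode or not os.getenv("CLEARML_AGENT_STANDALONE_CONFIG_BC"):
            return
        # removed due to a breaking change in v1.5.1
        config.pop("agent.package_manager.pip_version", None)

    session_config = {
        "agent.package_manager.pip_version": "<20.2",   # illustrative value
        "agent.default_docker.image": "nvidia/cuda",
    }
    temp_config = deepcopy(session_config)  # never mutate the live session config
    remove_non_backwards_compatible_entries(temp_config, standalone_mode=True)
    print(temp_config)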

View File

@@ -177,6 +177,8 @@ ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
 ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
 ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
 ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV")
+ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
+""" Maintain backwards compatible configuration when launching in standalone mode """
 ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
 """

View File

@@ -92,21 +92,14 @@ class ExternalRequirements(SimpleSubstitution):
                 vcs_url = req_line[4:]
                 # reverse replace
                 vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
-                # remove ssh:// or git:// prefix for git detection and credentials
-                scheme = ''
-                full_vcs_url = vcs_url
-                if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
-                    scheme = 'ssh://'  # notice git:// is actually ssh://
-                    vcs_url = vcs_url[6:]
+                # notice git:// is actually ssh://
+                if vcs_url and vcs_url.startswith('git://'):
+                    vcs_url = vcs_url.replace('git://', 'ssh://', 1)
                 from ..repo import Git
-                vcs = Git(session=session, url=full_vcs_url, location=None, revision=None)
+                vcs = Git(session=session, url=vcs_url, location=None, revision=None)
                 vcs._set_ssh_url()
-                new_req_line = 'git+{}{}{}'.format(
-                    '' if scheme and '://' in vcs.url else scheme,
-                    vcs_url if session.config.get('agent.force_git_ssh_protocol', None) else vcs.url_with_auth,
-                    fragment
-                )
+                new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)

                 if new_req_line != req_line:
                     furl_line = furl(new_req_line)
                     print('Replacing original pip vcs \'{}\' with \'{}\''.format(
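
The net effect is that a git:// prefix on a pip VCS requirement is treated as ssh:// before credentials are injected. A self-contained sketch of that string handling, using a made-up requirement line and no clearml imports:

    req_line = "git+git://github.com/example/repo.git@main#egg=repo"  # illustrative
    fragment = "#egg=repo"

    vcs_url = req_line[4:]                                        # strip the leading "git+"
    vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]  # reverse replace: drop only the trailing fragment

    # notice git:// is actually ssh://
    if vcs_url and vcs_url.startswith('git://'):
        vcs_url = vcs_url.replace('git://', 'ssh://', 1)

    print(vcs_url)  # ssh://github.com/example/repo.git@main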

View File

@@ -320,6 +320,7 @@ class VCS(object):
                     self.url, new_url))
                 self.url = new_url
             return

+        # rewrite ssh URLs only if either ssh port or ssh user are forced in config
         if parsed_url.scheme == "ssh" and (
                 self.session.config.get('agent.force_git_ssh_port', None) or
@@ -334,6 +335,9 @@ class VCS(object):
print("Using SSH credentials - ssh url '{}' with ssh url '{}'".format(
self.url, new_url))
self.url = new_url
return
elif parsed_url.scheme == "ssh":
return
if not self.session.config.agent.translate_ssh:
return
@@ -343,7 +347,7 @@ class VCS(object):
                 (ENV_AGENT_GIT_PASS.get() or self.session.config.get('agent.git_pass', None)):
             # only apply to a specific domain (if requested)
             config_domain = \
-                ENV_AGENT_GIT_HOST.get() or self.session.config.get("git_host", None)
+                ENV_AGENT_GIT_HOST.get() or self.session.config.get("agent.git_host", None)
             if config_domain and config_domain != furl(self.url).host:
                 return

View File

@@ -139,42 +139,45 @@ class ResourceMonitor(object):
     def _daemon(self):
         seconds_since_started = 0
         reported = 0
-        while True:
-            last_report = time()
-            current_report_frequency = (
-                self._report_frequency if reported != 0 else self._first_report_sec
-            )
-            while (time() - last_report) < current_report_frequency:
-                # wait for self._sample_frequency seconds, if event set quit
-                if self._exit_event.wait(1 / self._sample_frequency):
-                    return
-                # noinspection PyBroadException
-                try:
-                    self._update_readouts()
-                except Exception as ex:
-                    log.warning("failed getting machine stats: %s", report_error(ex))
-                    self._failure()
+        try:
+            while True:
+                last_report = time()
+                current_report_frequency = (
+                    self._report_frequency if reported != 0 else self._first_report_sec
+                )
+                while (time() - last_report) < current_report_frequency:
+                    # wait for self._sample_frequency seconds, if event set quit
+                    if self._exit_event.wait(1 / self._sample_frequency):
+                        return
+                    # noinspection PyBroadException
+                    try:
+                        self._update_readouts()
+                    except Exception as ex:
+                        log.warning("failed getting machine stats: %s", report_error(ex))
+                        self._failure()

-            seconds_since_started += int(round(time() - last_report))
-            # check if we do not report any metric (so it means the last iteration will not be changed)
-            # if we do not have last_iteration, we just use seconds as iteration
+                seconds_since_started += int(round(time() - last_report))
+                # check if we do not report any metric (so it means the last iteration will not be changed)
+                # if we do not have last_iteration, we just use seconds as iteration

-            # start reporting only when we figured out, if this is seconds based, or iterations based
-            average_readouts = self._get_average_readouts()
-            stats = {
-                # 3 points after the dot
-                key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
-                for key, value in average_readouts.items()
-            }
+                # start reporting only when we figured out, if this is seconds based, or iterations based
+                average_readouts = self._get_average_readouts()
+                stats = {
+                    # 3 points after the dot
+                    key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
+                    for key, value in average_readouts.items()
+                }

-            # send actual report
-            if self.send_report(stats):
-                # clear readouts if this is update was sent
-                self._clear_readouts()
+                # send actual report
+                if self.send_report(stats):
+                    # clear readouts if this is update was sent
+                    self._clear_readouts()

-            # count reported iterations
-            reported += 1
+                # count reported iterations
+                reported += 1
+        except Exception as ex:
+            log.exception("Error reporting monitoring info: %s", str(ex))

     def _update_readouts(self):
         readouts = self._machine_stats()
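
The change wraps the whole reporting loop in try/except so an unexpected error is logged once rather than silently killing the monitoring thread. A compact, self-contained sketch of that pattern (class name, payload, and timings are illustrative):

    import logging
    import threading

    log = logging.getLogger("resource_monitor_sketch")

    class MonitorSketch:
        def __init__(self, report_frequency=5.0):
            self._report_frequency = report_frequency
            self._exit_event = threading.Event()

        def _daemon(self):
            try:
                while True:
                    # wait between reports; exit cleanly when the event is set
                    if self._exit_event.wait(self._report_frequency):
                        return
                    self.send_report({"cpu_usage": 12.3})  # hypothetical payload
            except Exception as ex:
                # log and leave the thread instead of dying with no trace
                log.exception("Error reporting monitoring info: %s", str(ex))

        def send_report(self, stats):
            print("reporting", stats)
            return True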

Binary file not shown (image; size reduced from 1.1 MiB to 1018 KiB).