Compare commits

...

5 Commits

8 changed files with 61 additions and 45 deletions

View File

@@ -86,7 +86,10 @@ def get_http_session_with_retry(
     session = requests.Session()

     if backoff_max is not None:
-        Retry.BACKOFF_MAX = backoff_max
+        if "BACKOFF_MAX" in vars(Retry):
+            Retry.BACKOFF_MAX = backoff_max
+        else:
+            Retry.DEFAULT_BACKOFF_MAX = backoff_max

     retry = Retry(
         total=total, connect=connect, read=read, redirect=redirect, status=status,
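
For context, newer urllib3 releases renamed the class attribute Retry.BACKOFF_MAX to Retry.DEFAULT_BACKOFF_MAX, which is what the new branch accounts for. A minimal standalone sketch of the same compatibility check, with an illustrative cap value:

    from urllib3.util.retry import Retry

    backoff_max = 120  # illustrative cap on retry backoff, in seconds
    if "BACKOFF_MAX" in vars(Retry):
        # older urllib3 still exposes the original class attribute
        Retry.BACKOFF_MAX = backoff_max
    else:
        # newer urllib3 renamed it to DEFAULT_BACKOFF_MAX
        Retry.DEFAULT_BACKOFF_MAX = backoff_max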

View File

@@ -297,6 +297,9 @@ class Config(object):
     def put(self, key, value):
         self._config.put(key, value)

+    def pop(self, key, default=None):
+        return self._config.pop(key, default=default)
+
     def to_dict(self):
         return self._config.as_plain_ordered_dict()
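
The new Config.pop() simply delegates to the underlying configuration container. A minimal sketch of the same delegation over a plain dict (class name, keys, and values below are illustrative, not the ClearML API):

    class ConfigSketch:
        def __init__(self, data=None):
            self._config = dict(data or {})

        def pop(self, key, default=None):
            # remove the key if present, otherwise return the default untouched
            return self._config.pop(key, default)

    cfg = ConfigSketch({"agent.package_manager.pip_version": "<20.2"})
    print(cfg.pop("agent.package_manager.pip_version"))            # "<20.2"
    print(cfg.pop("agent.package_manager.pip_version", "missing"))  # "missing"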

View File

@@ -40,6 +40,7 @@ from clearml_agent.backend_api.session import CallResult, Request
 from clearml_agent.backend_api.session.defs import (
     ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
     ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
+from clearml_agent.backend_config import Config
 from clearml_agent.backend_config.defs import UptimeConf
 from clearml_agent.backend_config.utils import apply_environment, apply_files
 from clearml_agent.backend_config.converters import text_to_int
@@ -71,6 +72,7 @@ from clearml_agent.definitions import (
     ENV_DOCKER_ARGS_FILTERS,
     ENV_FORCE_SYSTEM_SITE_PACKAGES,
     ENV_SERVICES_DOCKER_RESTART,
+    ENV_CONFIG_BC_IN_STANDALONE,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import (
@@ -3515,6 +3517,11 @@ class Worker(ServiceCommandSection):
         requirements_manager.translator.enabled = False
         print(requirements_manager.replace(contents))

+    def remove_non_backwards_compatible_entries(self, config: Config):
+        if not self._standalone_mode or not ENV_CONFIG_BC_IN_STANDALONE.get() or self._session.feature_set == "basic":
+            return
+        config.pop("agent.package_manager.pip_version")  # removed due to a breaking change in v1.5.1
+
     def get_docker_config_cmd(self, docker_args, clean_api_credentials=False):
         docker_image = str(ENV_DOCKER_IMAGE.get() or
                            self._session.config.get("agent.default_docker.image", "nvidia/cuda")) \
@@ -3537,6 +3544,7 @@ class Worker(ServiceCommandSection):
             DockerArgsSanitizer.sanitize_docker_command(self._session, self._docker_arguments) or ''))

         temp_config = deepcopy(self._session.config)
+        self.remove_non_backwards_compatible_entries(temp_config)
         mounted_cache_dir = temp_config.get(
             "agent.docker_internal_mounts.sdk_cache", self._docker_fixed_user_cache)
         mounted_pip_dl_dir = temp_config.get(
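
The new hook strips a configuration key that older agents cannot parse, but only from a deep copy of the session configuration, so the running agent keeps its own settings. A rough standalone sketch of that gating, in which a plain dict and os.getenv stand in for the ClearML Config and EnvironmentConfig objects and the feature-set check is omitted:

    import os
    from copy import deepcopy

    def remove_non_backwards_compatible_entries(config, standalone_mode):
        # only strip keys when running standalone with the BC flag enabled
        if not standalone_mode or not os.getenv("CLEARML_AGENT_STANDALONE_CONFIG_BC"):
            return
        # removed due to a breaking change in v1.5.1
        config.pop("agent.package_manager.pip_version", None)

    session_config = {
        "agent.package_manager.pip_version": "<20.2",   # illustrative value
        "agent.default_docker.image": "nvidia/cuda",
    }
    temp_config = deepcopy(session_config)  # never mutate the live session config
    remove_non_backwards_compatible_entries(temp_config, standalone_mode=True)
    print(temp_config)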

View File

@@ -177,6 +177,8 @@ ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
 ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
 ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
 ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV")
+ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
+""" Maintain backwards compatible configuration when launching in standalone mode """
 ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
 """

View File

@@ -92,21 +92,14 @@ class ExternalRequirements(SimpleSubstitution):
                 vcs_url = req_line[4:]
                 # reverse replace
                 vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
-                # remove ssh:// or git:// prefix for git detection and credentials
-                scheme = ''
-                full_vcs_url = vcs_url
-                if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
-                    scheme = 'ssh://'  # notice git:// is actually ssh://
-                    vcs_url = vcs_url[6:]
+                # notice git:// is actually ssh://
+                if vcs_url and vcs_url.startswith('git://'):
+                    vcs_url = vcs_url.replace('git://', 'ssh://', 1)
                 from ..repo import Git
-                vcs = Git(session=session, url=full_vcs_url, location=None, revision=None)
+                vcs = Git(session=session, url=vcs_url, location=None, revision=None)
                 vcs._set_ssh_url()
-                new_req_line = 'git+{}{}{}'.format(
-                    '' if scheme and '://' in vcs.url else scheme,
-                    vcs_url if session.config.get('agent.force_git_ssh_protocol', None) else vcs.url_with_auth,
-                    fragment
-                )
+                new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)

                 if new_req_line != req_line:
                     furl_line = furl(new_req_line)
                     print('Replacing original pip vcs \'{}\' with \'{}\''.format(
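
The net effect is that a git:// prefix on a pip VCS requirement is treated as ssh:// before credentials are injected. A self-contained sketch of that string handling, using a made-up requirement line and no clearml imports:

    req_line = "git+git://github.com/example/repo.git@main#egg=repo"  # illustrative
    fragment = "#egg=repo"

    vcs_url = req_line[4:]                                        # strip the leading "git+"
    vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]  # reverse replace: drop only the trailing fragment

    # notice git:// is actually ssh://
    if vcs_url and vcs_url.startswith('git://'):
        vcs_url = vcs_url.replace('git://', 'ssh://', 1)

    print(vcs_url)  # ssh://github.com/example/repo.git@main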

View File

@@ -320,6 +320,7 @@ class VCS(object):
                     self.url, new_url))
                 self.url = new_url
             return

+        # rewrite ssh URLs only if either ssh port or ssh user are forced in config
         if parsed_url.scheme == "ssh" and (
                 self.session.config.get('agent.force_git_ssh_port', None) or
@@ -334,6 +335,9 @@ class VCS(object):
print("Using SSH credentials - ssh url '{}' with ssh url '{}'".format(
self.url, new_url))
self.url = new_url
return
elif parsed_url.scheme == "ssh":
return
if not self.session.config.agent.translate_ssh:
return
@@ -343,7 +347,7 @@ class VCS(object):
                 (ENV_AGENT_GIT_PASS.get() or self.session.config.get('agent.git_pass', None)):
             # only apply to a specific domain (if requested)
             config_domain = \
-                ENV_AGENT_GIT_HOST.get() or self.session.config.get("git_host", None)
+                ENV_AGENT_GIT_HOST.get() or self.session.config.get("agent.git_host", None)
             if config_domain and config_domain != furl(self.url).host:
                 return

View File

@@ -139,42 +139,45 @@ class ResourceMonitor(object):
     def _daemon(self):
         seconds_since_started = 0
         reported = 0
-        while True:
-            last_report = time()
-            current_report_frequency = (
-                self._report_frequency if reported != 0 else self._first_report_sec
-            )
-            while (time() - last_report) < current_report_frequency:
-                # wait for self._sample_frequency seconds, if event set quit
-                if self._exit_event.wait(1 / self._sample_frequency):
-                    return
-                # noinspection PyBroadException
-                try:
-                    self._update_readouts()
-                except Exception as ex:
-                    log.warning("failed getting machine stats: %s", report_error(ex))
-                    self._failure()
+        try:
+            while True:
+                last_report = time()
+                current_report_frequency = (
+                    self._report_frequency if reported != 0 else self._first_report_sec
+                )
+                while (time() - last_report) < current_report_frequency:
+                    # wait for self._sample_frequency seconds, if event set quit
+                    if self._exit_event.wait(1 / self._sample_frequency):
+                        return
+                    # noinspection PyBroadException
+                    try:
+                        self._update_readouts()
+                    except Exception as ex:
+                        log.warning("failed getting machine stats: %s", report_error(ex))
+                        self._failure()

-            seconds_since_started += int(round(time() - last_report))
-            # check if we do not report any metric (so it means the last iteration will not be changed)
-            # if we do not have last_iteration, we just use seconds as iteration
+                seconds_since_started += int(round(time() - last_report))
+                # check if we do not report any metric (so it means the last iteration will not be changed)
+                # if we do not have last_iteration, we just use seconds as iteration

-            # start reporting only when we figured out, if this is seconds based, or iterations based
-            average_readouts = self._get_average_readouts()
-            stats = {
-                # 3 points after the dot
-                key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
-                for key, value in average_readouts.items()
-            }
+                # start reporting only when we figured out, if this is seconds based, or iterations based
+                average_readouts = self._get_average_readouts()
+                stats = {
+                    # 3 points after the dot
+                    key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
+                    for key, value in average_readouts.items()
+                }

-            # send actual report
-            if self.send_report(stats):
-                # clear readouts if this is update was sent
-                self._clear_readouts()
+                # send actual report
+                if self.send_report(stats):
+                    # clear readouts if this is update was sent
+                    self._clear_readouts()

-            # count reported iterations
-            reported += 1
+                # count reported iterations
+                reported += 1
+        except Exception as ex:
+            log.exception("Error reporting monitoring info: %s", str(ex))

     def _update_readouts(self):
         readouts = self._machine_stats()
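
The change wraps the whole reporting loop in try/except so an unexpected error is logged once rather than silently killing the monitoring thread. A compact, self-contained sketch of that pattern (class name, payload, and timings are illustrative):

    import logging
    import threading

    log = logging.getLogger("resource_monitor_sketch")

    class MonitorSketch:
        def __init__(self, report_frequency=5.0):
            self._report_frequency = report_frequency
            self._exit_event = threading.Event()

        def _daemon(self):
            try:
                while True:
                    # wait between reports; exit cleanly when the event is set
                    if self._exit_event.wait(self._report_frequency):
                        return
                    self.send_report({"cpu_usage": 12.3})  # hypothetical payload
            except Exception as ex:
                # log and leave the thread instead of dying with no trace
                log.exception("Error reporting monitoring info: %s", str(ex))

        def send_report(self, stats):
            print("reporting", stats)
            return True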

Binary file not shown (image; size reduced from 1.1 MiB to 1018 KiB).