mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b6ca0fa6a5 | ||
|
|
307ec9213e | ||
|
|
a78a25d966 | ||
|
|
ebb6231f5a | ||
|
|
e1d65cb280 |
@@ -86,7 +86,10 @@ def get_http_session_with_retry(
|
||||
session = requests.Session()
|
||||
|
||||
if backoff_max is not None:
|
||||
Retry.BACKOFF_MAX = backoff_max
|
||||
if "BACKOFF_MAX" in vars(Retry):
|
||||
Retry.BACKOFF_MAX = backoff_max
|
||||
else:
|
||||
Retry.DEFAULT_BACKOFF_MAX = backoff_max
|
||||
|
||||
retry = Retry(
|
||||
total=total, connect=connect, read=read, redirect=redirect, status=status,
|
||||
|
||||
@@ -297,6 +297,9 @@ class Config(object):
|
||||
def put(self, key, value):
|
||||
self._config.put(key, value)
|
||||
|
||||
def pop(self, key, default=None):
|
||||
return self._config.pop(key, default=default)
|
||||
|
||||
def to_dict(self):
|
||||
return self._config.as_plain_ordered_dict()
|
||||
|
||||
|
||||
@@ -40,6 +40,7 @@ from clearml_agent.backend_api.session import CallResult, Request
|
||||
from clearml_agent.backend_api.session.defs import (
|
||||
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
|
||||
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
|
||||
from clearml_agent.backend_config import Config
|
||||
from clearml_agent.backend_config.defs import UptimeConf
|
||||
from clearml_agent.backend_config.utils import apply_environment, apply_files
|
||||
from clearml_agent.backend_config.converters import text_to_int
|
||||
@@ -71,6 +72,7 @@ from clearml_agent.definitions import (
|
||||
ENV_DOCKER_ARGS_FILTERS,
|
||||
ENV_FORCE_SYSTEM_SITE_PACKAGES,
|
||||
ENV_SERVICES_DOCKER_RESTART,
|
||||
ENV_CONFIG_BC_IN_STANDALONE,
|
||||
)
|
||||
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
|
||||
from clearml_agent.errors import (
|
||||
@@ -3515,6 +3517,11 @@ class Worker(ServiceCommandSection):
|
||||
requirements_manager.translator.enabled = False
|
||||
print(requirements_manager.replace(contents))
|
||||
|
||||
def remove_non_backwards_compatible_entries(self, config: Config):
|
||||
if not self._standalone_mode or not ENV_CONFIG_BC_IN_STANDALONE.get() or self._session.feature_set == "basic":
|
||||
return
|
||||
config.pop("agent.package_manager.pip_version") # removed due to a breaking change in v1.5.1
|
||||
|
||||
def get_docker_config_cmd(self, docker_args, clean_api_credentials=False):
|
||||
docker_image = str(ENV_DOCKER_IMAGE.get() or
|
||||
self._session.config.get("agent.default_docker.image", "nvidia/cuda")) \
|
||||
@@ -3537,6 +3544,7 @@ class Worker(ServiceCommandSection):
|
||||
DockerArgsSanitizer.sanitize_docker_command(self._session, self._docker_arguments) or ''))
|
||||
|
||||
temp_config = deepcopy(self._session.config)
|
||||
self.remove_non_backwards_compatible_entries(temp_config)
|
||||
mounted_cache_dir = temp_config.get(
|
||||
"agent.docker_internal_mounts.sdk_cache", self._docker_fixed_user_cache)
|
||||
mounted_pip_dl_dir = temp_config.get(
|
||||
|
||||
@@ -177,6 +177,8 @@ ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
|
||||
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
|
||||
ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
|
||||
ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV")
|
||||
ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
|
||||
""" Maintain backwards compatible configuration when launching in standalone mode """
|
||||
|
||||
ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
|
||||
"""
|
||||
|
||||
@@ -92,21 +92,14 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
vcs_url = req_line[4:]
|
||||
# reverse replace
|
||||
vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
|
||||
# remove ssh:// or git:// prefix for git detection and credentials
|
||||
scheme = ''
|
||||
full_vcs_url = vcs_url
|
||||
if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
|
||||
scheme = 'ssh://' # notice git:// is actually ssh://
|
||||
vcs_url = vcs_url[6:]
|
||||
# notice git:// is actually ssh://
|
||||
if vcs_url and vcs_url.startswith('git://'):
|
||||
vcs_url = vcs_url.replace('git://', 'ssh://', 1)
|
||||
|
||||
from ..repo import Git
|
||||
vcs = Git(session=session, url=full_vcs_url, location=None, revision=None)
|
||||
vcs = Git(session=session, url=vcs_url, location=None, revision=None)
|
||||
vcs._set_ssh_url()
|
||||
new_req_line = 'git+{}{}{}'.format(
|
||||
'' if scheme and '://' in vcs.url else scheme,
|
||||
vcs_url if session.config.get('agent.force_git_ssh_protocol', None) else vcs.url_with_auth,
|
||||
fragment
|
||||
)
|
||||
new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)
|
||||
if new_req_line != req_line:
|
||||
furl_line = furl(new_req_line)
|
||||
print('Replacing original pip vcs \'{}\' with \'{}\''.format(
|
||||
|
||||
@@ -320,6 +320,7 @@ class VCS(object):
|
||||
self.url, new_url))
|
||||
self.url = new_url
|
||||
return
|
||||
|
||||
# rewrite ssh URLs only if either ssh port or ssh user are forced in config
|
||||
if parsed_url.scheme == "ssh" and (
|
||||
self.session.config.get('agent.force_git_ssh_port', None) or
|
||||
@@ -334,6 +335,9 @@ class VCS(object):
|
||||
print("Using SSH credentials - ssh url '{}' with ssh url '{}'".format(
|
||||
self.url, new_url))
|
||||
self.url = new_url
|
||||
return
|
||||
elif parsed_url.scheme == "ssh":
|
||||
return
|
||||
|
||||
if not self.session.config.agent.translate_ssh:
|
||||
return
|
||||
@@ -343,7 +347,7 @@ class VCS(object):
|
||||
(ENV_AGENT_GIT_PASS.get() or self.session.config.get('agent.git_pass', None)):
|
||||
# only apply to a specific domain (if requested)
|
||||
config_domain = \
|
||||
ENV_AGENT_GIT_HOST.get() or self.session.config.get("git_host", None)
|
||||
ENV_AGENT_GIT_HOST.get() or self.session.config.get("agent.git_host", None)
|
||||
if config_domain and config_domain != furl(self.url).host:
|
||||
return
|
||||
|
||||
|
||||
@@ -139,42 +139,45 @@ class ResourceMonitor(object):
|
||||
def _daemon(self):
|
||||
seconds_since_started = 0
|
||||
reported = 0
|
||||
while True:
|
||||
last_report = time()
|
||||
current_report_frequency = (
|
||||
self._report_frequency if reported != 0 else self._first_report_sec
|
||||
)
|
||||
while (time() - last_report) < current_report_frequency:
|
||||
# wait for self._sample_frequency seconds, if event set quit
|
||||
if self._exit_event.wait(1 / self._sample_frequency):
|
||||
return
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._update_readouts()
|
||||
except Exception as ex:
|
||||
log.warning("failed getting machine stats: %s", report_error(ex))
|
||||
self._failure()
|
||||
try:
|
||||
while True:
|
||||
last_report = time()
|
||||
current_report_frequency = (
|
||||
self._report_frequency if reported != 0 else self._first_report_sec
|
||||
)
|
||||
while (time() - last_report) < current_report_frequency:
|
||||
# wait for self._sample_frequency seconds, if event set quit
|
||||
if self._exit_event.wait(1 / self._sample_frequency):
|
||||
return
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._update_readouts()
|
||||
except Exception as ex:
|
||||
log.warning("failed getting machine stats: %s", report_error(ex))
|
||||
self._failure()
|
||||
|
||||
seconds_since_started += int(round(time() - last_report))
|
||||
# check if we do not report any metric (so it means the last iteration will not be changed)
|
||||
seconds_since_started += int(round(time() - last_report))
|
||||
# check if we do not report any metric (so it means the last iteration will not be changed)
|
||||
|
||||
# if we do not have last_iteration, we just use seconds as iteration
|
||||
# if we do not have last_iteration, we just use seconds as iteration
|
||||
|
||||
# start reporting only when we figured out, if this is seconds based, or iterations based
|
||||
average_readouts = self._get_average_readouts()
|
||||
stats = {
|
||||
# 3 points after the dot
|
||||
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
|
||||
for key, value in average_readouts.items()
|
||||
}
|
||||
# start reporting only when we figured out, if this is seconds based, or iterations based
|
||||
average_readouts = self._get_average_readouts()
|
||||
stats = {
|
||||
# 3 points after the dot
|
||||
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
|
||||
for key, value in average_readouts.items()
|
||||
}
|
||||
|
||||
# send actual report
|
||||
if self.send_report(stats):
|
||||
# clear readouts if this is update was sent
|
||||
self._clear_readouts()
|
||||
# send actual report
|
||||
if self.send_report(stats):
|
||||
# clear readouts if this is update was sent
|
||||
self._clear_readouts()
|
||||
|
||||
# count reported iterations
|
||||
reported += 1
|
||||
# count reported iterations
|
||||
reported += 1
|
||||
except Exception as ex:
|
||||
log.exception("Error reporting monitoring info: %s", str(ex))
|
||||
|
||||
def _update_readouts(self):
|
||||
readouts = self._machine_stats()
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 1.1 MiB After Width: | Height: | Size: 1018 KiB |
Reference in New Issue
Block a user