mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ccf752c4e4 | ||
|
|
3ed63e2154 | ||
|
|
a535f93cd6 | ||
|
|
b380ec54c6 | ||
|
|
a1274299ce | ||
|
|
c77224af68 | ||
|
|
95dadca45c | ||
|
|
685918fd9b | ||
|
|
bc85ddf78d | ||
|
|
5b5fb0b8a6 | ||
|
|
fec0ce1756 | ||
|
|
1e09b88b7a | ||
|
|
b6ca0fa6a5 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,3 +11,6 @@ build/
|
||||
dist/
|
||||
*.egg-info
|
||||
|
||||
# VSCode
|
||||
.vscode
|
||||
|
||||
|
||||
@@ -80,6 +80,16 @@
|
||||
# additional artifact repositories to use when installing python packages
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
|
||||
|
||||
# control the pytorch wheel resolving algorithm, options are: "pip", "direct"
|
||||
# "pip" (default): would automatically detect the cuda version, and supply pip with the correct
|
||||
# extra-index-url, based on pytorch.org tables
|
||||
# "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
|
||||
# and matching the automatically detected cuda version with the required pytorch wheel.
|
||||
# if the exact cuda version is not found for the required pytorch wheel, it will try
|
||||
# a lower cuda version until a match is found
|
||||
#
|
||||
# pytorch_resolve: "pip"
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||
|
||||
@@ -88,19 +98,23 @@
|
||||
# force_repo_requirements_txt: false
|
||||
|
||||
# set the priority packages to be installed before the rest of the required packages
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||
|
||||
# set the optional priority packages to be installed before the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
priority_optional_packages: ["pygobject", ]
|
||||
|
||||
# set the post packages to be installed after all the rest of the required packages
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# post_packages: ["horovod", ]
|
||||
|
||||
# set the optional post packages to be installed after all the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# post_optional_packages: []
|
||||
|
||||
# set to True to support torch nightly build installation,
|
||||
@@ -259,10 +273,15 @@
|
||||
|
||||
# Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
|
||||
# Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
|
||||
# Custom variables may be specified using the docker_container_name_format_fields option.
|
||||
# Note: resulting name must start with an alphanumeric character and
|
||||
# continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
|
||||
# docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
|
||||
|
||||
# Specify custom variables for the docker_container_name_format option using a mapping of variable name
|
||||
# to a (nested) task field (using "." as a task field separator, digits specify array index)
|
||||
# docker_container_name_format_fields: { foo: "bar.moo" }
|
||||
|
||||
# Apply top-level environment section from configuration into os.environ
|
||||
apply_environment: true
|
||||
# Top-level environment section is in the form of:
|
||||
@@ -369,13 +388,6 @@
|
||||
# }
|
||||
# },
|
||||
# {
|
||||
# "image": "better_container:tag",
|
||||
# "arguments": "",
|
||||
# "match": {
|
||||
# "container": "replace_me_please"
|
||||
# }
|
||||
# },
|
||||
# {
|
||||
# "image": "another_container:tag",
|
||||
# "arguments": "",
|
||||
# "match": {
|
||||
|
||||
@@ -20,6 +20,7 @@ ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool,
|
||||
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
||||
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
||||
)
|
||||
ENV_FORCE_MAX_API_VERSION = EnvEntry("CLEARML_AGENT_FORCE_MAX_API_VERSION", type=str)
|
||||
|
||||
"""
|
||||
Experimental option to set the request method for all API requests and auth login.
|
||||
|
||||
@@ -16,11 +16,11 @@ from requests.auth import HTTPBasicAuth
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
|
||||
from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
|
||||
|
||||
from .callresult import CallResult
|
||||
from .defs import (
|
||||
ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
|
||||
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD, )
|
||||
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD,
|
||||
ENV_FORCE_MAX_API_VERSION)
|
||||
from .request import Request, BatchRequest
|
||||
from .token_manager import TokenManager
|
||||
from ..config import load
|
||||
@@ -28,7 +28,6 @@ from ..utils import get_http_session_with_retry, urllib_log_warning_setup
|
||||
from ...backend_config.environment import backward_compatibility_support
|
||||
from ...version import __version__
|
||||
|
||||
|
||||
sys_random = SystemRandom()
|
||||
|
||||
|
||||
@@ -64,6 +63,7 @@ class Session(TokenManager):
|
||||
default_files = "https://demofiles.demo.clear.ml"
|
||||
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
|
||||
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
|
||||
force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()
|
||||
|
||||
# TODO: add requests.codes.gateway_timeout once we support async commits
|
||||
_retry_codes = [
|
||||
@@ -199,6 +199,10 @@ class Session(TokenManager):
|
||||
# notice: this is across the board warning omission
|
||||
urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)
|
||||
|
||||
if self.force_max_api_version and self.check_min_api_version(self.force_max_api_version):
|
||||
print("Using forced API version {}".format(self.force_max_api_version))
|
||||
Session.max_api_version = Session.api_version = str(self.force_max_api_version)
|
||||
|
||||
def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
|
||||
# type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
|
||||
http_retries_config = http_retries_config or self.config.get(
|
||||
|
||||
@@ -52,6 +52,7 @@ def apply_files(config):
|
||||
target_fmt = data.get("target_format", "string")
|
||||
overwrite = bool(data.get("overwrite", True))
|
||||
contents = data.get("contents")
|
||||
mode = data.get("mode")
|
||||
|
||||
target = Path(expanduser(expandvars(path)))
|
||||
|
||||
@@ -110,3 +111,14 @@ def apply_files(config):
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed saving file {} ({})".format(key, target, ex))
|
||||
continue
|
||||
|
||||
try:
|
||||
if mode:
|
||||
if isinstance(mode, int):
|
||||
mode = int(str(mode), 8)
|
||||
else:
|
||||
mode = int(mode, 8)
|
||||
target.chmod(mode)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed setting mode {} for {} ({})".format(key, mode, target, ex))
|
||||
continue
|
||||
|
||||
@@ -44,7 +44,7 @@ def main():
|
||||
|
||||
if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
|
||||
print('Configuration file already exists: {}'.format(str(conf_file)))
|
||||
print('Leaving setup, feel free to edit the configuration file.')
|
||||
print('Leaving setup. If you\'ve previously initialized the ClearML SDK on this machine, manually add an \'agent\' section to this file.')
|
||||
return
|
||||
|
||||
print(description, end='')
|
||||
|
||||
@@ -109,15 +109,15 @@ def resolve_default_container(session, task_id, container_config):
|
||||
match.get('script.binary', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('container', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('container', None), requested_container.get('image', '')):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('container', None), entry))
|
||||
continue
|
||||
# if match.get('image', None):
|
||||
# # noinspection PyBroadException
|
||||
# try:
|
||||
# if not re.search(match.get('image', None), requested_container.get('image', '')):
|
||||
# continue
|
||||
# except Exception:
|
||||
# print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
# match.get('image', None), entry))
|
||||
# continue
|
||||
|
||||
matched = True
|
||||
for req_section in ['script.requirements.pip', 'script.requirements.conda']:
|
||||
@@ -156,8 +156,8 @@ def resolve_default_container(session, task_id, container_config):
|
||||
break
|
||||
|
||||
if matched:
|
||||
if not container_config.get('container'):
|
||||
container_config['container'] = entry.get('image', None)
|
||||
if not container_config.get('image'):
|
||||
container_config['image'] = entry.get('image', None)
|
||||
if not container_config.get('arguments'):
|
||||
container_config['arguments'] = entry.get('arguments', None)
|
||||
container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
|
||||
|
||||
@@ -73,6 +73,8 @@ from clearml_agent.definitions import (
|
||||
ENV_FORCE_SYSTEM_SITE_PACKAGES,
|
||||
ENV_SERVICES_DOCKER_RESTART,
|
||||
ENV_CONFIG_BC_IN_STANDALONE,
|
||||
ENV_FORCE_DOCKER_AGENT_REPO,
|
||||
ENV_EXTRA_DOCKER_LABELS,
|
||||
)
|
||||
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
|
||||
from clearml_agent.errors import (
|
||||
@@ -318,6 +320,37 @@ def get_next_task(session, queue, get_task_info=False):
|
||||
return data
|
||||
|
||||
|
||||
def get_task_fields(session, task_id, fields: list, log=None) -> dict:
|
||||
"""
|
||||
Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''}
|
||||
"""
|
||||
result = session.send_request(
|
||||
service='tasks',
|
||||
action='get_all',
|
||||
json={'id': [task_id], 'only_fields': list(fields), 'search_hidden': True},
|
||||
method=Request.def_method,
|
||||
async_enable=False,
|
||||
)
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
results = {}
|
||||
result = result.json()['data']['tasks'][0]
|
||||
for field in fields:
|
||||
cur = result
|
||||
for part in field.split("."):
|
||||
if part.isdigit():
|
||||
cur = cur[part]
|
||||
else:
|
||||
cur = cur.get(part, {})
|
||||
results[field] = cur
|
||||
return results
|
||||
except Exception as ex:
|
||||
if log:
|
||||
log.error("Failed obtaining values for task fields {}: {}", fields, ex)
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def get_task_container(session, task_id):
|
||||
"""
|
||||
Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''}
|
||||
@@ -339,16 +372,19 @@ def get_task_container(session, task_id):
|
||||
container = {}
|
||||
else:
|
||||
response = get_task(session, task_id, only_fields=["execution.docker_cmd"])
|
||||
task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip())
|
||||
try:
|
||||
container = dict(
|
||||
container=task_docker_cmd_parts[0],
|
||||
arguments=task_docker_cmd_parts[1:] if len(task_docker_cmd_parts[0]) > 1 else ''
|
||||
)
|
||||
except (ValueError, TypeError):
|
||||
container = {}
|
||||
container = {}
|
||||
if response.execution:
|
||||
task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip())
|
||||
if task_docker_cmd_parts:
|
||||
try:
|
||||
container = dict(
|
||||
image=task_docker_cmd_parts[0],
|
||||
arguments=task_docker_cmd_parts[1:] if len(task_docker_cmd_parts[0]) > 1 else ''
|
||||
)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
if (not container or not container.get('container')) and session.check_min_api_version("2.13"):
|
||||
if (not container or not container.get('image')) and session.check_min_api_version("2.13"):
|
||||
container = resolve_default_container(session=session, task_id=task_id, container_config=container)
|
||||
|
||||
return container
|
||||
@@ -889,11 +925,21 @@ class Worker(ServiceCommandSection):
|
||||
|
||||
name_format = self._session.config.get('agent.docker_container_name_format', None)
|
||||
if name_format:
|
||||
custom_fields = {}
|
||||
name_format_fields = self._session.config.get('agent.docker_container_name_format_fields', None)
|
||||
if name_format_fields:
|
||||
field_values = get_task_fields(task_session, task_id, name_format_fields.values(), log=self.log)
|
||||
custom_fields = {
|
||||
k: field_values.get(v)
|
||||
for k, v in name_format_fields.items()
|
||||
}
|
||||
|
||||
try:
|
||||
name = name_format.format(
|
||||
task_id=re.sub(r'[^a-zA-Z0-9._-]', '-', task_id),
|
||||
worker_id=re.sub(r'[^a-zA-Z0-9._-]', '-', worker_id),
|
||||
rand_string="".join(sys_random.choice(string.ascii_lowercase) for _ in range(32))
|
||||
rand_string="".join(sys_random.choice(string.ascii_lowercase) for _ in range(32)),
|
||||
**custom_fields,
|
||||
)
|
||||
except Exception as ex:
|
||||
print("Warning: failed generating docker container name: {}".format(ex))
|
||||
@@ -3848,6 +3894,10 @@ class Worker(ServiceCommandSection):
|
||||
base_cmd += ['-l', self._worker_label.format(worker_id)]
|
||||
base_cmd += ['-l', self._parent_worker_label.format(parent_worker_id)]
|
||||
|
||||
extra_labels = ENV_EXTRA_DOCKER_LABELS.get()
|
||||
for label in (extra_labels or []):
|
||||
base_cmd += ['-l', label]
|
||||
|
||||
self.debug("Command: {}".format(base_cmd), context="docker")
|
||||
|
||||
# check if running inside a kubernetes
|
||||
@@ -3930,6 +3980,7 @@ class Worker(ServiceCommandSection):
|
||||
# if we are running a RC version, install the same version in the docker
|
||||
# because the default latest, will be a release version (not RC)
|
||||
specify_version = ''
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
from clearml_agent.version import __version__
|
||||
_version_parts = __version__.split('.')
|
||||
@@ -3938,13 +3989,15 @@ class Worker(ServiceCommandSection):
|
||||
except:
|
||||
pass
|
||||
|
||||
force_agent_repo = ENV_FORCE_DOCKER_AGENT_REPO.get()
|
||||
|
||||
if os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'):
|
||||
local_wheel = os.path.expanduser(os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'))
|
||||
docker_wheel = '/tmp/{}'.format(basename(local_wheel))
|
||||
base_cmd += ['-v', local_wheel + ':' + docker_wheel]
|
||||
clearml_agent_wheel = '\"{}\"'.format(docker_wheel)
|
||||
elif os.environ.get('FORCE_CLEARML_AGENT_REPO'):
|
||||
clearml_agent_wheel = os.environ.get('FORCE_CLEARML_AGENT_REPO')
|
||||
elif force_agent_repo:
|
||||
clearml_agent_wheel = force_agent_repo
|
||||
else:
|
||||
# clearml-agent{specify_version}
|
||||
clearml_agent_wheel = 'clearml-agent{specify_version}'.format(specify_version=specify_version)
|
||||
|
||||
@@ -173,6 +173,7 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig(
|
||||
)
|
||||
ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH")
|
||||
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list)
|
||||
ENV_EXTRA_DOCKER_LABELS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_LABELS", type=list)
|
||||
ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
|
||||
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
|
||||
ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
|
||||
@@ -180,6 +181,8 @@ ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV
|
||||
ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
|
||||
""" Maintain backwards compatible configuration when launching in standalone mode """
|
||||
|
||||
ENV_FORCE_DOCKER_AGENT_REPO = EnvironmentConfig("FORCE_CLEARML_AGENT_REPO", "CLEARML_AGENT_DOCKER_AGENT_REPO")
|
||||
|
||||
ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
|
||||
"""
|
||||
Specify a restart value for a services agent task containers.
|
||||
|
||||
@@ -9,7 +9,7 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from collections import defaultdict, namedtuple
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
@@ -42,6 +42,7 @@ class K8sIntegration(Worker):
|
||||
|
||||
K8S_DEFAULT_NAMESPACE = "clearml"
|
||||
AGENT_LABEL = "CLEARML=agent"
|
||||
QUEUE_LABEL = "clearml-agent-queue"
|
||||
|
||||
KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
|
||||
|
||||
@@ -408,34 +409,50 @@ class K8sIntegration(Worker):
|
||||
|
||||
return self._agent_label
|
||||
|
||||
def _get_used_pods(self):
|
||||
# type: () -> Tuple[int, Set[str]]
|
||||
# noinspection PyBroadException
|
||||
RunningPod = namedtuple("RunningPod", "name queue namespace")
|
||||
|
||||
def _get_running_pods(self):
|
||||
try:
|
||||
kubectl_cmd = self.get_kubectl_command(
|
||||
"get pods",
|
||||
output="jsonpath=\"{range .items[*]}{.metadata.name}{' '}{.metadata.namespace}{'\\n'}{end}\""
|
||||
output="jsonpath=\"{{range .items[*]}}{{.metadata.name}}{{' '}}{{.metadata.namespace}}{{' '}}"
|
||||
"{{.metadata.labels.{}}}{{'\\n'}}{{end}}\"".format(self.QUEUE_LABEL)
|
||||
)
|
||||
self.log.debug("Getting used pods: {}".format(kubectl_cmd))
|
||||
output = stringify_bash_output(get_bash_output(kubectl_cmd, raise_error=True))
|
||||
|
||||
if not output:
|
||||
# No such pod exist so we can use the pod_number we found
|
||||
return 0, set([])
|
||||
return []
|
||||
|
||||
try:
|
||||
items = output.splitlines()
|
||||
current_pod_count = len(items)
|
||||
namespaces = {item.rpartition(" ")[-1] for item in items}
|
||||
self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
|
||||
except (KeyError, ValueError, TypeError, AttributeError) as ex:
|
||||
print("Failed parsing used pods command response for cleanup: {}".format(ex))
|
||||
return -1, set([])
|
||||
return [
|
||||
self.RunningPod(
|
||||
name=parts[0],
|
||||
namespace=parts[1],
|
||||
queue=parts[2]
|
||||
)
|
||||
for parts in (line.split(" ") for line in output.splitlines())
|
||||
]
|
||||
except Exception as ex:
|
||||
raise Exception("Failed parsing used pods command response for cleanup: {}".format(ex))
|
||||
except Exception as ex:
|
||||
raise Exception('Failed obtaining used pods information: {}'.format(ex))
|
||||
|
||||
def _get_used_pods(self):
|
||||
# type: () -> Tuple[int, Set[str]]
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
items = self._get_running_pods()
|
||||
if not items:
|
||||
return 0, set([])
|
||||
current_pod_count = len(items)
|
||||
namespaces = {item.namespace for item in items}
|
||||
self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
|
||||
return current_pod_count, namespaces
|
||||
except Exception as ex:
|
||||
print('Failed obtaining used pods information: {}'.format(ex))
|
||||
return -2, set([])
|
||||
self.log.debug("Failed getting used pods: {}", ex)
|
||||
return -1, set([])
|
||||
|
||||
def _is_same_tenant(self, task_session):
|
||||
if not task_session or task_session is self._session:
|
||||
@@ -657,8 +674,8 @@ class K8sIntegration(Worker):
|
||||
def _get_pod_labels(self, queue, queue_name):
|
||||
return [
|
||||
self._get_agent_label(),
|
||||
"clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)),
|
||||
"clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name))
|
||||
"{}={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue)),
|
||||
"{}-name={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue_name))
|
||||
]
|
||||
|
||||
def _get_docker_args(self, docker_args, flags, target=None, convert=None):
|
||||
|
||||
@@ -310,6 +310,12 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
# yes this is for linux python 2.7 support, this is the only python 2.7 we support...
|
||||
if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
|
||||
continue
|
||||
|
||||
# check if this an actual match
|
||||
if not req.compare_version(v) or \
|
||||
(last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
|
||||
continue
|
||||
|
||||
# update the closest matched version (from above)
|
||||
if not closest_v:
|
||||
closest_v = v
|
||||
@@ -318,10 +324,6 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
SimpleVersion.compare_versions(
|
||||
version_a=v, op='>=', version_b=req.specs[0][1], num_parts=3):
|
||||
closest_v = v
|
||||
# check if this an actual match
|
||||
if not req.compare_version(v) or \
|
||||
(last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
|
||||
continue
|
||||
|
||||
url = '/'.join(torch_url.split('/')[:-1] + l.split('/'))
|
||||
last_v = v
|
||||
@@ -475,6 +477,23 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
return self.match_version(req, base).replace(" ", "\n")
|
||||
|
||||
def replace(self, req):
|
||||
# we first try to resolve things ourselves because pytorch pip is not always picking the correct
|
||||
# versions from their pip repository
|
||||
|
||||
resolve_algorithm = str(self.config.get("agent.package_manager.pytorch_resolve", "pip")).lower()
|
||||
if resolve_algorithm == "direct":
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
new_req = self._replace(req)
|
||||
if new_req:
|
||||
self._original_req.append((req, new_req))
|
||||
return new_req
|
||||
except Exception:
|
||||
pass
|
||||
elif resolve_algorithm not in ("direct", "pip"):
|
||||
print("Warning: `agent.package_manager.pytorch_resolve={}` "
|
||||
"unrecognized, default to `pip`".format(resolve_algorithm))
|
||||
|
||||
# check if package is already installed with system packages
|
||||
self.validate_python_version()
|
||||
|
||||
@@ -566,6 +585,19 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
:param list_of_requirements: {'pip': ['a==1.0', ]}
|
||||
:return: {'pip': ['a==1.0', ]}
|
||||
"""
|
||||
def build_specific_version_req(a_line, a_name, a_new_req):
|
||||
try:
|
||||
r = Requirement.parse(a_line)
|
||||
wheel_parts = r.uri.split("/")[-1].split('-')
|
||||
version = str(wheel_parts[1].split('%')[0].split('+')[0])
|
||||
new_r = Requirement.parse("{} == {} # {}".format(a_name, version, str(a_new_req)))
|
||||
if new_r.line:
|
||||
# great it worked!
|
||||
return new_r.line
|
||||
except: # noqa
|
||||
pass
|
||||
return None
|
||||
|
||||
if not self._original_req:
|
||||
return list_of_requirements
|
||||
try:
|
||||
@@ -589,9 +621,18 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
if req.local_file:
|
||||
lines[i] = '{}'.format(str(new_req))
|
||||
else:
|
||||
lines[i] = '{} # {}'.format(str(req), str(new_req))
|
||||
# try to rebuild requirements with specific version:
|
||||
new_line = build_specific_version_req(line, req.req.name, new_req)
|
||||
if new_line:
|
||||
lines[i] = new_line
|
||||
else:
|
||||
lines[i] = '{} # {}'.format(str(req), str(new_req))
|
||||
else:
|
||||
lines[i] = '{} # {}'.format(line, str(new_req))
|
||||
new_line = build_specific_version_req(line, req.req.name, new_req)
|
||||
if new_line:
|
||||
lines[i] = new_line
|
||||
else:
|
||||
lines[i] = '{} # {}'.format(line, str(new_req))
|
||||
break
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -240,6 +240,23 @@ class SimpleVersion:
|
||||
if not version_b:
|
||||
return True
|
||||
|
||||
# remove trailing "*" in both
|
||||
if "*" in version_a:
|
||||
ignore_sub_versions = True
|
||||
while version_a.endswith(".*"):
|
||||
version_a = version_a[:-2]
|
||||
if version_a == "*":
|
||||
version_a = ""
|
||||
num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )
|
||||
|
||||
if "*" in version_b:
|
||||
ignore_sub_versions = True
|
||||
while version_b.endswith(".*"):
|
||||
version_b = version_b[:-2]
|
||||
if version_b == "*":
|
||||
version_b = ""
|
||||
num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )
|
||||
|
||||
if not num_parts:
|
||||
num_parts = max(len(version_a.split('.')), len(version_b.split('.')), )
|
||||
|
||||
|
||||
@@ -139,42 +139,45 @@ class ResourceMonitor(object):
|
||||
def _daemon(self):
|
||||
seconds_since_started = 0
|
||||
reported = 0
|
||||
while True:
|
||||
last_report = time()
|
||||
current_report_frequency = (
|
||||
self._report_frequency if reported != 0 else self._first_report_sec
|
||||
)
|
||||
while (time() - last_report) < current_report_frequency:
|
||||
# wait for self._sample_frequency seconds, if event set quit
|
||||
if self._exit_event.wait(1 / self._sample_frequency):
|
||||
return
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._update_readouts()
|
||||
except Exception as ex:
|
||||
log.warning("failed getting machine stats: %s", report_error(ex))
|
||||
self._failure()
|
||||
try:
|
||||
while True:
|
||||
last_report = time()
|
||||
current_report_frequency = (
|
||||
self._report_frequency if reported != 0 else self._first_report_sec
|
||||
)
|
||||
while (time() - last_report) < current_report_frequency:
|
||||
# wait for self._sample_frequency seconds, if event set quit
|
||||
if self._exit_event.wait(1 / self._sample_frequency):
|
||||
return
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._update_readouts()
|
||||
except Exception as ex:
|
||||
log.warning("failed getting machine stats: %s", report_error(ex))
|
||||
self._failure()
|
||||
|
||||
seconds_since_started += int(round(time() - last_report))
|
||||
# check if we do not report any metric (so it means the last iteration will not be changed)
|
||||
seconds_since_started += int(round(time() - last_report))
|
||||
# check if we do not report any metric (so it means the last iteration will not be changed)
|
||||
|
||||
# if we do not have last_iteration, we just use seconds as iteration
|
||||
# if we do not have last_iteration, we just use seconds as iteration
|
||||
|
||||
# start reporting only when we figured out, if this is seconds based, or iterations based
|
||||
average_readouts = self._get_average_readouts()
|
||||
stats = {
|
||||
# 3 points after the dot
|
||||
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
|
||||
for key, value in average_readouts.items()
|
||||
}
|
||||
# start reporting only when we figured out, if this is seconds based, or iterations based
|
||||
average_readouts = self._get_average_readouts()
|
||||
stats = {
|
||||
# 3 points after the dot
|
||||
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
|
||||
for key, value in average_readouts.items()
|
||||
}
|
||||
|
||||
# send actual report
|
||||
if self.send_report(stats):
|
||||
# clear readouts if this is update was sent
|
||||
self._clear_readouts()
|
||||
# send actual report
|
||||
if self.send_report(stats):
|
||||
# clear readouts if this is update was sent
|
||||
self._clear_readouts()
|
||||
|
||||
# count reported iterations
|
||||
reported += 1
|
||||
# count reported iterations
|
||||
reported += 1
|
||||
except Exception as ex:
|
||||
log.exception("Error reporting monitoring info: %s", str(ex))
|
||||
|
||||
def _update_readouts(self):
|
||||
readouts = self._machine_stats()
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = '1.5.2'
|
||||
__version__ = '1.5.3rc3'
|
||||
|
||||
@@ -93,25 +93,39 @@ agent {
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
|
||||
extra_index_url: []
|
||||
|
||||
# control the pytorch wheel resolving algorithm, options are: "pip", "direct"
|
||||
# "pip" (default): would automatically detect the cuda version, and supply pip with the correct
|
||||
# extra-index-url, based on pytorch.org tables
|
||||
# "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
|
||||
# and matching the automatically detected cuda version with the required pytorch wheel.
|
||||
# if the exact cuda version is not found for the required pytorch wheel, it will try
|
||||
# a lower cuda version until a match is found
|
||||
#
|
||||
# pytorch_resolve: "pip"
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||
# conda_full_env_update: false
|
||||
# conda_env_as_base_docker: false
|
||||
|
||||
# set the priority packages to be installed before the rest of the required packages
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||
|
||||
# set the optional priority packages to be installed before the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# priority_optional_packages: ["pygobject", ]
|
||||
|
||||
# set the post packages to be installed after all the rest of the required packages
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# post_packages: ["horovod", ]
|
||||
|
||||
# set the optional post packages to be installed after all the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# post_optional_packages: []
|
||||
|
||||
# set to True to support torch nightly build installation,
|
||||
@@ -228,8 +242,6 @@ agent {
|
||||
# # no repository matching required
|
||||
# repository: ""
|
||||
# }
|
||||
# # no container image matching required (allow to replace one requested container with another)
|
||||
# container: ""
|
||||
# # no repository matching required
|
||||
# project: ""
|
||||
# }
|
||||
@@ -469,6 +481,7 @@ sdk {
|
||||
# target_format: format used to encode contents before writing into the target file. Supported values are json,
|
||||
# yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
|
||||
# overwrite: overwrite the target file in case it exists. Default is true.
|
||||
# mode: set the file mode after writing. use an integer value or a string (e.g. 600 / 777 etc.)
|
||||
#
|
||||
# Example:
|
||||
# files {
|
||||
|
||||
Reference in New Issue
Block a user