Compare commits

..

6 Commits

10 changed files with 132 additions and 27 deletions

3
.gitignore vendored
View File

@@ -11,3 +11,6 @@ build/
dist/
*.egg-info
# VSCode
.vscode

View File

@@ -80,6 +80,16 @@
# additional artifact repositories to use when installing python packages
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
# control the pytorch wheel resolving algorithm, options are: "pip", "direct"
# "pip" (default): would automatically detect the cuda version, and supply pip with the correct
# extra-index-url, based on pytorch.org tables
# "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
# and matching the automatically detected cuda version with the required pytorch wheel.
# if the exact cuda version is not found for the required pytorch wheel, it will try
# a lower cuda version until a match is found
#
# pytorch_resolve: "pip"
# additional conda channels to use when installing with conda package manager
conda_channels: ["pytorch", "conda-forge", "defaults", ]

View File

@@ -44,7 +44,7 @@ def main():
if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
print('Configuration file already exists: {}'.format(str(conf_file)))
print('Leaving setup, feel free to edit the configuration file.')
print('Leaving setup. If you\'ve previously initialized the ClearML SDK on this machine, manually add an \'agent\' section to this file.')
return
print(description, end='')

View File

@@ -73,6 +73,7 @@ from clearml_agent.definitions import (
ENV_FORCE_SYSTEM_SITE_PACKAGES,
ENV_SERVICES_DOCKER_RESTART,
ENV_CONFIG_BC_IN_STANDALONE,
ENV_FORCE_DOCKER_AGENT_REPO,
)
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
from clearml_agent.errors import (
@@ -3930,6 +3931,7 @@ class Worker(ServiceCommandSection):
# if we are running a RC version, install the same version in the docker
# because the default latest, will be a release version (not RC)
specify_version = ''
# noinspection PyBroadException
try:
from clearml_agent.version import __version__
_version_parts = __version__.split('.')
@@ -3938,13 +3940,15 @@ class Worker(ServiceCommandSection):
except:
pass
force_agent_repo = ENV_FORCE_DOCKER_AGENT_REPO.get()
if os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'):
local_wheel = os.path.expanduser(os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'))
docker_wheel = '/tmp/{}'.format(basename(local_wheel))
base_cmd += ['-v', local_wheel + ':' + docker_wheel]
clearml_agent_wheel = '\"{}\"'.format(docker_wheel)
elif os.environ.get('FORCE_CLEARML_AGENT_REPO'):
clearml_agent_wheel = os.environ.get('FORCE_CLEARML_AGENT_REPO')
elif force_agent_repo:
clearml_agent_wheel = force_agent_repo
else:
# clearml-agent{specify_version}
clearml_agent_wheel = 'clearml-agent{specify_version}'.format(specify_version=specify_version)

View File

@@ -180,6 +180,9 @@ ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV
ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
""" Maintain backwards compatible configuration when launching in standalone mode """
ENV_FORCE_DOCKER_AGENT_REPO = EnvironmentConfig("FORCE_CLEARML_AGENT_REPO", "CLEARML_AGENT_DOCKER_AGENT_REPO")
ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
"""
Specify a restart value for a services agent task containers.

View File

@@ -9,7 +9,7 @@ import os
import re
import subprocess
import tempfile
from collections import defaultdict
from collections import defaultdict, namedtuple
from copy import deepcopy
from pathlib import Path
from pprint import pformat
@@ -42,6 +42,7 @@ class K8sIntegration(Worker):
K8S_DEFAULT_NAMESPACE = "clearml"
AGENT_LABEL = "CLEARML=agent"
QUEUE_LABEL = "clearml-agent-queue"
KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
@@ -408,34 +409,50 @@ class K8sIntegration(Worker):
return self._agent_label
def _get_used_pods(self):
# type: () -> Tuple[int, Set[str]]
# noinspection PyBroadException
RunningPod = namedtuple("RunningPod", "name queue namespace")
def _get_running_pods(self):
try:
kubectl_cmd = self.get_kubectl_command(
"get pods",
output="jsonpath=\"{range .items[*]}{.metadata.name}{' '}{.metadata.namespace}{'\\n'}{end}\""
output="jsonpath=\"{{range .items[*]}}{{.metadata.name}}{{' '}}{{.metadata.namespace}}{{' '}}"
"{{.metadata.labels.{}}}{{'\\n'}}{{end}}\"".format(self.QUEUE_LABEL)
)
self.log.debug("Getting used pods: {}".format(kubectl_cmd))
output = stringify_bash_output(get_bash_output(kubectl_cmd, raise_error=True))
if not output:
# No such pod exist so we can use the pod_number we found
return 0, set([])
return []
try:
items = output.splitlines()
current_pod_count = len(items)
namespaces = {item.rpartition(" ")[-1] for item in items}
self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
except (KeyError, ValueError, TypeError, AttributeError) as ex:
print("Failed parsing used pods command response for cleanup: {}".format(ex))
return -1, set([])
return [
self.RunningPod(
name=parts[0],
namespace=parts[1],
queue=parts[2]
)
for parts in (line.split(" ") for line in output.splitlines())
]
except Exception as ex:
raise Exception("Failed parsing used pods command response for cleanup: {}".format(ex))
except Exception as ex:
raise Exception('Failed obtaining used pods information: {}'.format(ex))
def _get_used_pods(self):
# type: () -> Tuple[int, Set[str]]
# noinspection PyBroadException
try:
items = self._get_running_pods()
if not items:
return 0, set([])
current_pod_count = len(items)
namespaces = {item.namespace for item in items}
self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
return current_pod_count, namespaces
except Exception as ex:
print('Failed obtaining used pods information: {}'.format(ex))
return -2, set([])
self.log.debug("Failed getting used pods: {}", ex)
return -1, set([])
def _is_same_tenant(self, task_session):
if not task_session or task_session is self._session:
@@ -657,8 +674,8 @@ class K8sIntegration(Worker):
def _get_pod_labels(self, queue, queue_name):
return [
self._get_agent_label(),
"clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)),
"clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name))
"{}={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue)),
"{}-name={}".format(self.QUEUE_LABEL, self._safe_k8s_label_value(queue_name))
]
def _get_docker_args(self, docker_args, flags, target=None, convert=None):

View File

@@ -310,6 +310,12 @@ class PytorchRequirement(SimpleSubstitution):
# yes this is for linux python 2.7 support, this is the only python 2.7 we support...
if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
continue
# check if this an actual match
if not req.compare_version(v) or \
(last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
continue
# update the closest matched version (from above)
if not closest_v:
closest_v = v
@@ -318,10 +324,6 @@ class PytorchRequirement(SimpleSubstitution):
SimpleVersion.compare_versions(
version_a=v, op='>=', version_b=req.specs[0][1], num_parts=3):
closest_v = v
# check if this an actual match
if not req.compare_version(v) or \
(last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
continue
url = '/'.join(torch_url.split('/')[:-1] + l.split('/'))
last_v = v
@@ -475,6 +477,23 @@ class PytorchRequirement(SimpleSubstitution):
return self.match_version(req, base).replace(" ", "\n")
def replace(self, req):
# we first try to resolve things ourselves because pytorch pip is not always picking the correct
# versions from their pip repository
resolve_algorithm = str(self.config.get("agent.package_manager.pytorch_resolve", "pip")).lower()
if resolve_algorithm == "direct":
# noinspection PyBroadException
try:
new_req = self._replace(req)
if new_req:
self._original_req.append((req, new_req))
return new_req
except Exception:
pass
elif resolve_algorithm not in ("direct", "pip"):
print("Warning: `agent.package_manager.pytorch_resolve={}` "
"unrecognized, default to `pip`".format(resolve_algorithm))
# check if package is already installed with system packages
self.validate_python_version()
@@ -566,6 +585,19 @@ class PytorchRequirement(SimpleSubstitution):
:param list_of_requirements: {'pip': ['a==1.0', ]}
:return: {'pip': ['a==1.0', ]}
"""
def build_specific_version_req(a_line, a_name, a_new_req):
try:
r = Requirement.parse(a_line)
wheel_parts = r.uri.split("/")[-1].split('-')
version = str(wheel_parts[1].split('%')[0].split('+')[0])
new_r = Requirement.parse("{} == {} # {}".format(a_name, version, str(a_new_req)))
if new_r.line:
# great it worked!
return new_r.line
except: # noqa
pass
return None
if not self._original_req:
return list_of_requirements
try:
@@ -589,9 +621,18 @@ class PytorchRequirement(SimpleSubstitution):
if req.local_file:
lines[i] = '{}'.format(str(new_req))
else:
lines[i] = '{} # {}'.format(str(req), str(new_req))
# try to rebuild requirements with specific version:
new_line = build_specific_version_req(line, req.req.name, new_req)
if new_line:
lines[i] = new_line
else:
lines[i] = '{} # {}'.format(str(req), str(new_req))
else:
lines[i] = '{} # {}'.format(line, str(new_req))
new_line = build_specific_version_req(line, req.req.name, new_req)
if new_line:
lines[i] = new_line
else:
lines[i] = '{} # {}'.format(line, str(new_req))
break
except:
pass

View File

@@ -240,6 +240,23 @@ class SimpleVersion:
if not version_b:
return True
# remove trailing "*" in both
if "*" in version_a:
ignore_sub_versions = True
while version_a.endswith(".*"):
version_a = version_a[:-2]
if version_a == "*":
version_a = ""
num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )
if "*" in version_b:
ignore_sub_versions = True
while version_b.endswith(".*"):
version_b = version_b[:-2]
if version_b == "*":
version_b = ""
num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )
if not num_parts:
num_parts = max(len(version_a.split('.')), len(version_b.split('.')), )

View File

@@ -1 +1 @@
__version__ = '1.5.2'
__version__ = '1.5.3rc3'

View File

@@ -93,6 +93,16 @@ agent {
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
extra_index_url: []
# control the pytorch wheel resolving algorithm, options are: "pip", "direct"
# "pip" (default): would automatically detect the cuda version, and supply pip with the correct
# extra-index-url, based on pytorch.org tables
# "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
# and matching the automatically detected cuda version with the required pytorch wheel.
# if the exact cuda version is not found for the required pytorch wheel, it will try
# a lower cuda version until a match is found
#
# pytorch_resolve: "pip"
# additional conda channels to use when installing with conda package manager
conda_channels: ["pytorch", "conda-forge", "defaults", ]
# conda_full_env_update: false