Compare commits

...

13 Commits

Author SHA1 Message Date
allegroai
d9b9b4984b Version bump to v0.17.2 2021-03-04 20:12:50 +02:00
allegroai
8a46dc6b03 Update default_docker in docs 2021-03-04 20:07:34 +02:00
allegroai
205f9dd816 Fix k8s glue does not pass docker environment variables
Remove deprecated flags
2021-03-03 15:07:06 +02:00
allegroai
9dfa1294e2 Add agent.enable_task_env to set the OS environment based on the Environment section of the Task. 2021-02-28 19:47:44 +02:00
allegroai
f019905720 Fix venv cache support for local folders 2021-02-28 19:47:09 +02:00
allegroai
9c257858dd Fix venv cache support for local folders 2021-02-23 18:54:38 +02:00
allegroai
2006ab20dd Fix conda support for git+http links 2021-02-23 12:46:06 +02:00
allegroai
0caf31719c Fix venv caching always reinstalling git and local repositories 2021-02-23 12:45:34 +02:00
allegroai
5da7184276 Add agent.ignore_requested_python_version (control for multi python environments) 2021-02-23 12:45:00 +02:00
allegroai
50fccdab96 PEP8 2021-02-23 12:44:26 +02:00
allegroai
77d6ff6630 Fix docker mode without venvs cache dir 2021-02-17 00:04:07 +02:00
allegroai
99614702ea Add missing default configuration value 2021-02-17 00:03:42 +02:00
allegroai
58cb344ee6 Upgrade pynvml; add detection of the CUDA version at the driver level 2021-02-17 00:03:16 +02:00
12 changed files with 2062 additions and 73 deletions

View File

@@ -149,6 +149,9 @@
# arguments: ["--ipc=host", ]
}
# set the OS environments based on the Task's Environment section before launching the Task process.
enable_task_env: false
# set the initial bash script to execute at the startup of any docker.
# all lines will be executed regardless of their exit code.
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version

View File

@@ -12,7 +12,7 @@ import sys
import shutil
import traceback
from collections import defaultdict
from copy import deepcopy
from copy import deepcopy, copy
from datetime import datetime
from distutils.spawn import find_executable
from functools import partial, cmp_to_key
@@ -73,7 +73,7 @@ from clearml_agent.helper.os.daemonize import daemonize_process
from clearml_agent.helper.package.base import PackageManager
from clearml_agent.helper.package.conda_api import CondaAPI
from clearml_agent.helper.package.post_req import PostRequirement
from clearml_agent.helper.package.external_req import ExternalRequirements
from clearml_agent.helper.package.external_req import ExternalRequirements, OnlyExternalRequirements
from clearml_agent.helper.package.pip_api.system import SystemPip
from clearml_agent.helper.package.pip_api.venv import VirtualenvPip
from clearml_agent.helper.package.poetry_api import PoetryConfig, PoetryAPI
@@ -440,11 +440,12 @@ class Worker(ServiceCommandSection):
return kwargs
def _get_requirements_manager(self, os_override=None, base_interpreter=None):
def _get_requirements_manager(self, os_override=None, base_interpreter=None, requirement_substitutions=None):
requirements_manager = RequirementsManager(
self._session, base_interpreter=base_interpreter
)
for requirement_cls in self._requirement_substitutions:
requirement_substitutions = requirement_substitutions or self._requirement_substitutions
for requirement_cls in requirement_substitutions:
if os_override and issubclass(requirement_cls, PytorchRequirement):
requirement_cls = partial(requirement_cls, os_override=os_override)
requirements_manager.register(requirement_cls)
@@ -1468,7 +1469,19 @@ class Worker(ServiceCommandSection):
directory, vcs, repo_info = self.get_repo_info(execution, current_task, venv_folder.as_posix())
if not is_cached:
if is_cached:
# reinstalling git / local packages
package_api = copy(self.package_api)
package_api.requirements_manager = self._get_requirements_manager(
base_interpreter=package_api.requirements_manager.get_interpreter(),
requirement_substitutions=[OnlyExternalRequirements]
)
# make sure we run the handlers
cached_requirements = \
{k: package_api.requirements_manager.replace(requirements[k] or '')
for k in requirements}
package_api.load_requirements(cached_requirements)
else:
self.install_requirements(
execution,
repo_info,
@@ -1477,6 +1490,7 @@ class Worker(ServiceCommandSection):
cwd=vcs.location if vcs and vcs.location else directory,
package_api=self.global_package_api if install_globally else None,
)
freeze = self.freeze_task_environment(
task_id=task_id, requirements_manager=requirements_manager, update_requirements=False)
script_dir = directory
@@ -1721,7 +1735,23 @@ class Worker(ServiceCommandSection):
print("\n")
if not is_cached and not standalone_mode:
if is_cached and not standalone_mode:
# reinstalling git / local packages
package_api = copy(self.package_api)
package_api.requirements_manager = self._get_requirements_manager(
base_interpreter=package_api.requirements_manager.get_interpreter(),
requirement_substitutions=[OnlyExternalRequirements]
)
package_api.cwd = vcs.location if vcs and vcs.location else directory
# make sure we run the handlers
cached_requirements = \
{k: package_api.requirements_manager.replace(requirements[k] or '')
for k in requirements}
if str(cached_requirements.get('pip', '')).strip() \
or str(cached_requirements.get('conda', '')).strip():
package_api.load_requirements(cached_requirements)
elif not is_cached and not standalone_mode:
self.install_requirements(
execution,
repo_info,
@@ -1793,6 +1823,12 @@ class Worker(ServiceCommandSection):
if repo_info:
self._update_commit_id(current_task.id, execution, repo_info)
# get Task Environments and update the process
if self._session.config.get('agent.enable_task_env', None):
hyper_params = self._get_task_os_env(current_task)
if hyper_params:
os.environ.update(hyper_params)
# Add the script CWD to the python path
python_path = get_python_path(script_dir, execution.entry_point, self.package_api, is_conda_env=self.is_conda)
if ENV_TASK_EXTRA_PYTHON_PATH.get():
@@ -1870,6 +1906,20 @@ class Worker(ServiceCommandSection):
return 1 if exit_code is None else exit_code
def _get_task_os_env(self, current_task):
    # type: (Any) -> Optional[dict]
    """
    Fetch the Task's "Environment" hyper-parameter section as a dict of
    OS environment variables (name -> value), best effort.

    :param current_task: Task object; only its `id` is used here.
    :return: dict of {name: value} strings taken from the Task's
        "Environment" hyper-parameter section, or None if the server API
        is older than 2.9 or the request/parsing fails.
    """
    # the tasks.get_hyper_params endpoint requires server API >= 2.9
    if not self._session.check_min_api_version('2.9'):
        return None
    # noinspection PyBroadException
    try:
        hyper_params = self._session.get(
            service="tasks", action="get_hyper_params", tasks=[current_task.id])
        # keep only parameters whose section is "Environment"
        # NOTE(review): assumes response shape {'params': [{'hyperparams': [...]}]} - confirm against the API
        hyper_params = {
            str(p['name']): str(p['value'])
            for p in hyper_params['params'][0]['hyperparams'] if p['section'] == 'Environment'}
        return hyper_params
    except Exception:
        # best-effort lookup: any failure simply means no environment override
        return None
def set_docker_variables(self, docker):
temp_config, docker_image_func = self.get_docker_config_cmd(docker)
self.dump_config(self.temp_config_path, config=temp_config)
@@ -2334,9 +2384,14 @@ class Worker(ServiceCommandSection):
Install a new python virtual environment, removing the old one if exists
:return: virtualenv directory, requirements manager to use with task, True if there is a cached venv entry
"""
requested_python_version = requested_python_version or \
Text(self._session.config.get("agent.python_binary", None)) or \
Text(self._session.config.get("agent.default_python", None))
if self._session.config.get("agent.ignore_requested_python_version", None):
requested_python_version = ''
requested_python_version = \
requested_python_version or \
Text(self._session.config.get("agent.python_binary", None)) or \
Text(self._session.config.get("agent.default_python", None))
if self.is_conda:
executable_version_suffix = \
requested_python_version[max(requested_python_version.find('python'), 0):].replace('python', '')
@@ -2371,6 +2426,7 @@ class Worker(ServiceCommandSection):
if not standalone_mode:
rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
package_manager_params = dict(
session=self._session,
python=executable_version_suffix if self.is_conda else executable_name,
@@ -2501,7 +2557,7 @@ class Worker(ServiceCommandSection):
temp_config.put("agent.git_pass", (ENV_AGENT_GIT_PASS.get() or
self._session.config.get("agent.git_pass", None)))
if temp_config.get("agent.venvs_cache.path"):
if temp_config.get("agent.venvs_cache.path", None):
temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache')
self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
@@ -2518,7 +2574,7 @@ class Worker(ServiceCommandSection):
self._session.config["agent.vcs_cache.path"])).expanduser().as_posix()
host_venvs_cache = Path(os.path.expandvars(
self._session.config["agent.venvs_cache.path"])).expanduser().as_posix() \
if self._session.config.get("agent.venvs_cache.path") else None
if self._session.config.get("agent.venvs_cache.path", None) else None
host_ssh_cache = self._host_ssh_cache
host_apt_cache = Path(os.path.expandvars(self._session.config.get(
@@ -2571,7 +2627,7 @@ class Worker(ServiceCommandSection):
mounted_cache_dir = temp_config.get("sdk.storage.cache.default_base_dir")
mounted_pip_dl_dir = temp_config.get("agent.pip_download_cache.path")
mounted_vcs_cache = temp_config.get("agent.vcs_cache.path")
mounted_venvs_cache = temp_config.get("agent.venvs_cache.path")
mounted_venvs_cache = temp_config.get("agent.venvs_cache.path", "")
# Make sure we have created the configuration file for the executor
if not self.dump_config(self.temp_config_path, config=temp_config):

View File

@@ -16,7 +16,7 @@ def parse(reqstr):
filename = getattr(reqstr, 'name', None)
try:
# Python 2.x compatibility
if not isinstance(reqstr, basestring):
if not isinstance(reqstr, basestring): # noqa
reqstr = reqstr.read()
except NameError:
# Python 3.x only

View File

@@ -36,8 +36,7 @@ class K8sIntegration(Worker):
KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
"--image {docker_image} " \
"--restart=Never --replicas=1 " \
"--generator=run-pod/v1 " \
"--restart=Never " \
"--namespace={namespace}"
KUBECTL_DELETE_CMD = "kubectl delete pods " \
@@ -273,13 +272,13 @@ class K8sIntegration(Worker):
return
if task_data.execution.docker_cmd:
docker_parts = task_data.execution.docker_cmd
docker_cmd = task_data.execution.docker_cmd
else:
docker_parts = str(ENV_DOCKER_IMAGE.get() or
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
docker_cmd = str(ENV_DOCKER_IMAGE.get() or
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
# take the first part, this is the docker image name (not arguments)
docker_parts = docker_parts.split()
docker_parts = docker_cmd.split()
docker_image = docker_parts[0]
docker_args = docker_parts[1:] if len(docker_parts) > 1 else []
@@ -355,7 +354,7 @@ class K8sIntegration(Worker):
else:
output, error = self._kubectl_run(
create_clearml_conf=create_clearml_conf,
labels=labels, docker_image=docker_image,
labels=labels, docker_image=docker_cmd,
task_data=task_data,
task_id=task_id, queue=queue, queue_name=safe_queue_name)

View File

@@ -20,6 +20,7 @@ import platform
import sys
import time
from datetime import datetime
from typing import Optional
import psutil
from ..gpu import pynvml as N
@@ -390,3 +391,34 @@ def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
'''
return GPUStatCollection.new_query(shutdown=shutdown, per_process_stats=per_process_stats,
get_driver_info=get_driver_info)
def get_driver_cuda_version():
    # type: () -> Optional[str]
    """
    Detect the CUDA version supported by the installed NVIDIA driver, via NVML.

    :return: First three digits of the driver-reported CUDA version as a string
        (e.g. `110` is CUDA 11.0), or None if NVML could not be initialized or
        no version could be queried.
    """
    # noinspection PyBroadException
    try:
        N.nvmlInit()
    except BaseException:
        # no NVML / no NVIDIA driver available on this machine
        return None
    # noinspection PyBroadException
    try:
        cuda_version = str(N.nvmlSystemGetCudaDriverVersion())
    except BaseException:
        # fall back to the _v2 variant when the plain call is unavailable
        # noinspection PyBroadException
        try:
            cuda_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
        except BaseException:
            cuda_version = ''
    # best-effort shutdown: a failure here must not discard a detected version
    # (previously a failing nvmlShutdown() returned None even after success)
    # noinspection PyBroadException
    try:
        N.nvmlShutdown()
    except BaseException:
        pass
    return cuda_version[:3] if cuda_version else None

File diff suppressed because it is too large Load Diff

View File

@@ -505,6 +505,8 @@ class CondaAPI(PackageManager):
reqs.append(m)
# if we have a conda list, the rest should be installed with pip,
# this means any experiment that was executed with pip environment,
# will be installed using pip
if requirements.get('conda', None) is not None:
for r in requirements['pip']:
try:
@@ -518,7 +520,7 @@ class CondaAPI(PackageManager):
# skip over local files (we cannot change the version to a local file)
if m.local_file:
continue
m_name = m.name.lower()
m_name = (m.name or '').lower()
if m_name in conda_supported_req_names:
# this package is in the conda list,
# make sure that if we changed version and we match it in conda
@@ -555,7 +557,7 @@ class CondaAPI(PackageManager):
# conform conda packages (version/name)
for r in reqs:
# change _ to - in name but not the prefix _ (as this is conda prefix)
if not r.name.startswith('_') and not requirements.get('conda', None):
if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
r.name = r.name.replace('_', '-')
# remove .post from version numbers, it fails ~= version, and change == to ~=
if r.specs and r.specs[0]:

View File

@@ -17,6 +17,16 @@ class ExternalRequirements(SimpleSubstitution):
self.post_install_req_lookup = OrderedDict()
def match(self, req):
# match local folder building:
# noinspection PyBroadException
try:
if not req.name and req.req and not req.req.editable and not req.req.vcs and \
req.req.line and req.req.line.strip().split('#')[0] and \
not req.req.line.strip().split('#')[0].lower().endswith('.whl'):
return True
except Exception:
pass
# match both editable or code or unparsed
if not (not req.name or req.req and (req.req.editable or req.req.vcs)):
return False
@@ -104,3 +114,20 @@ class ExternalRequirements(SimpleSubstitution):
list_of_requirements[k] += [self.post_install_req_lookup.get(r, '')
for r in self.post_install_req_lookup.keys() if r in original_requirements]
return list_of_requirements
class OnlyExternalRequirements(ExternalRequirements):
    """
    Inverted matcher: matches every requirement that the base class does NOT
    consider external (i.e. not a git / local-folder / unparsed requirement)
    and replaces it with an empty line, so only external packages remain to
    be (re)installed - used on top of a cached virtual environment.
    """
    # NOTE: no __init__ override needed - the base class constructor is used as-is

    def match(self, req):
        # match exactly the complement of the base-class (external) matches
        return not super(OnlyExternalRequirements, self).match(req)

    def replace(self, req):
        """
        Replace a matched (non-external) requirement with an empty string,
        effectively skipping its installation.
        """
        # Do not store the skipped requirement - mark the package as skipped
        return Text('')

View File

@@ -17,6 +17,7 @@ import six
from clearml_agent.definitions import PIP_EXTRA_INDICES
from clearml_agent.helper.base import warning, is_conda, which, join_lines, is_windows_platform
from clearml_agent.helper.process import Argv, PathLike
from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
from clearml_agent.session import Session, normalize_cuda_version
from clearml_agent.external.requirements_parser import parse
from clearml_agent.external.requirements_parser.requirement import Requirement
@@ -446,6 +447,7 @@ class RequirementsManager(object):
'cu'+agent['cuda_version'] if self.found_cuda else 'cpu')
self.translator = RequirementsTranslator(session, interpreter=base_interpreter,
cache_dir=pip_cache_dir.as_posix())
self._base_interpreter = base_interpreter
def register(self, cls): # type: (Type[RequirementSubstitution]) -> None
self.handlers.append(cls(self._session))
@@ -529,6 +531,9 @@ class RequirementsManager(object):
pass
return requirements
def get_interpreter(self):
    """Return the base python interpreter this RequirementsManager was created with."""
    return self._base_interpreter
@staticmethod
def get_cuda_version(config): # type: (ConfigTree) -> (Text, Text)
# we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
@@ -537,6 +542,9 @@ class RequirementsManager(object):
if cuda_version and cudnn_version:
return normalize_cuda_version(cuda_version), normalize_cuda_version(cudnn_version)
if not cuda_version:
cuda_version = get_driver_cuda_version()
if not cuda_version and is_windows_platform():
try:
cuda_vers = [int(k.replace('CUDA_PATH_V', '').replace('_', '')) for k in os.environ.keys()
@@ -601,4 +609,3 @@ class RequirementsManager(object):
return (normalize_cuda_version(cuda_version or 0),
normalize_cuda_version(cudnn_version or 0))

View File

@@ -7,7 +7,7 @@ import re
import subprocess
import sys
from contextlib import contextmanager
from copy import deepcopy
from copy import copy
from distutils.spawn import find_executable
from itertools import chain, repeat, islice
from os.path import devnull
@@ -276,9 +276,9 @@ class CommandSequence(Executable):
self.commands = []
for c in commands:
if isinstance(c, CommandSequence):
self.commands.extend(deepcopy(c.commands))
self.commands.extend([copy(p) for p in c.commands])
elif isinstance(c, Argv):
self.commands.append(deepcopy(c))
self.commands.append(copy(c))
else:
self.commands.append(Argv(*c, log=self._log))

View File

@@ -1 +1 @@
__version__ = '0.17.1'
__version__ = '0.17.2'

View File

@@ -24,7 +24,9 @@ agent {
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
force_git_ssh_protocol: false
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
# force_git_ssh_port: ""
# force_git_ssh_port: 0
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
# force_git_ssh_user: git
# unique name of this worker, if None, created based on hostname:process_id
# Overridden with os environment: CLEARML_WORKER_NAME
@@ -139,12 +141,15 @@ agent {
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host"]
}
# set the OS environments based on the Task's Environment section before launching the Task process.
enable_task_env: false
# CUDA versions used for Conda setup & solving PyTorch wheel packages
# It should be detected automatically. Override with the OS environment variables CUDA_VERSION / CUDNN_VERSION
# cuda_version: 10.1