mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Compare commits
13 Commits
v0.17.2rc2
...
0.17.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d9b9b4984b | ||
|
|
8a46dc6b03 | ||
|
|
205f9dd816 | ||
|
|
9dfa1294e2 | ||
|
|
f019905720 | ||
|
|
9c257858dd | ||
|
|
2006ab20dd | ||
|
|
0caf31719c | ||
|
|
5da7184276 | ||
|
|
50fccdab96 | ||
|
|
77d6ff6630 | ||
|
|
99614702ea | ||
|
|
58cb344ee6 |
@@ -149,6 +149,9 @@
|
||||
# arguments: ["--ipc=host", ]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# set the initial bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||
|
||||
@@ -12,7 +12,7 @@ import sys
|
||||
import shutil
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from copy import deepcopy, copy
|
||||
from datetime import datetime
|
||||
from distutils.spawn import find_executable
|
||||
from functools import partial, cmp_to_key
|
||||
@@ -73,7 +73,7 @@ from clearml_agent.helper.os.daemonize import daemonize_process
|
||||
from clearml_agent.helper.package.base import PackageManager
|
||||
from clearml_agent.helper.package.conda_api import CondaAPI
|
||||
from clearml_agent.helper.package.post_req import PostRequirement
|
||||
from clearml_agent.helper.package.external_req import ExternalRequirements
|
||||
from clearml_agent.helper.package.external_req import ExternalRequirements, OnlyExternalRequirements
|
||||
from clearml_agent.helper.package.pip_api.system import SystemPip
|
||||
from clearml_agent.helper.package.pip_api.venv import VirtualenvPip
|
||||
from clearml_agent.helper.package.poetry_api import PoetryConfig, PoetryAPI
|
||||
@@ -440,11 +440,12 @@ class Worker(ServiceCommandSection):
|
||||
|
||||
return kwargs
|
||||
|
||||
def _get_requirements_manager(self, os_override=None, base_interpreter=None):
|
||||
def _get_requirements_manager(self, os_override=None, base_interpreter=None, requirement_substitutions=None):
|
||||
requirements_manager = RequirementsManager(
|
||||
self._session, base_interpreter=base_interpreter
|
||||
)
|
||||
for requirement_cls in self._requirement_substitutions:
|
||||
requirement_substitutions = requirement_substitutions or self._requirement_substitutions
|
||||
for requirement_cls in requirement_substitutions:
|
||||
if os_override and issubclass(requirement_cls, PytorchRequirement):
|
||||
requirement_cls = partial(requirement_cls, os_override=os_override)
|
||||
requirements_manager.register(requirement_cls)
|
||||
@@ -1468,7 +1469,19 @@ class Worker(ServiceCommandSection):
|
||||
|
||||
directory, vcs, repo_info = self.get_repo_info(execution, current_task, venv_folder.as_posix())
|
||||
|
||||
if not is_cached:
|
||||
if is_cached:
|
||||
# reinstalling git / local packages
|
||||
package_api = copy(self.package_api)
|
||||
package_api.requirements_manager = self._get_requirements_manager(
|
||||
base_interpreter=package_api.requirements_manager.get_interpreter(),
|
||||
requirement_substitutions=[OnlyExternalRequirements]
|
||||
)
|
||||
# make sure we run the handlers
|
||||
cached_requirements = \
|
||||
{k: package_api.requirements_manager.replace(requirements[k] or '')
|
||||
for k in requirements}
|
||||
package_api.load_requirements(cached_requirements)
|
||||
else:
|
||||
self.install_requirements(
|
||||
execution,
|
||||
repo_info,
|
||||
@@ -1477,6 +1490,7 @@ class Worker(ServiceCommandSection):
|
||||
cwd=vcs.location if vcs and vcs.location else directory,
|
||||
package_api=self.global_package_api if install_globally else None,
|
||||
)
|
||||
|
||||
freeze = self.freeze_task_environment(
|
||||
task_id=task_id, requirements_manager=requirements_manager, update_requirements=False)
|
||||
script_dir = directory
|
||||
@@ -1721,7 +1735,23 @@ class Worker(ServiceCommandSection):
|
||||
|
||||
print("\n")
|
||||
|
||||
if not is_cached and not standalone_mode:
|
||||
if is_cached and not standalone_mode:
|
||||
# reinstalling git / local packages
|
||||
package_api = copy(self.package_api)
|
||||
package_api.requirements_manager = self._get_requirements_manager(
|
||||
base_interpreter=package_api.requirements_manager.get_interpreter(),
|
||||
requirement_substitutions=[OnlyExternalRequirements]
|
||||
)
|
||||
package_api.cwd = vcs.location if vcs and vcs.location else directory
|
||||
# make sure we run the handlers
|
||||
cached_requirements = \
|
||||
{k: package_api.requirements_manager.replace(requirements[k] or '')
|
||||
for k in requirements}
|
||||
if str(cached_requirements.get('pip', '')).strip() \
|
||||
or str(cached_requirements.get('conda', '')).strip():
|
||||
package_api.load_requirements(cached_requirements)
|
||||
|
||||
elif not is_cached and not standalone_mode:
|
||||
self.install_requirements(
|
||||
execution,
|
||||
repo_info,
|
||||
@@ -1793,6 +1823,12 @@ class Worker(ServiceCommandSection):
|
||||
if repo_info:
|
||||
self._update_commit_id(current_task.id, execution, repo_info)
|
||||
|
||||
# get Task Environments and update the process
|
||||
if self._session.config.get('agent.enable_task_env', None):
|
||||
hyper_params = self._get_task_os_env(current_task)
|
||||
if hyper_params:
|
||||
os.environ.update(hyper_params)
|
||||
|
||||
# Add the script CWD to the python path
|
||||
python_path = get_python_path(script_dir, execution.entry_point, self.package_api, is_conda_env=self.is_conda)
|
||||
if ENV_TASK_EXTRA_PYTHON_PATH.get():
|
||||
@@ -1870,6 +1906,20 @@ class Worker(ServiceCommandSection):
|
||||
|
||||
return 1 if exit_code is None else exit_code
|
||||
|
||||
def _get_task_os_env(self, current_task):
|
||||
if not self._session.check_min_api_version('2.9'):
|
||||
return None
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
hyper_params = self._session.get(
|
||||
service="tasks", action="get_hyper_params", tasks=[current_task.id])
|
||||
hyper_params = {
|
||||
str(p['name']): str(p['value'])
|
||||
for p in hyper_params['params'][0]['hyperparams'] if p['section'] == 'Environment'}
|
||||
return hyper_params
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def set_docker_variables(self, docker):
|
||||
temp_config, docker_image_func = self.get_docker_config_cmd(docker)
|
||||
self.dump_config(self.temp_config_path, config=temp_config)
|
||||
@@ -2334,9 +2384,14 @@ class Worker(ServiceCommandSection):
|
||||
Install a new python virtual environment, removing the old one if exists
|
||||
:return: virtualenv directory, requirements manager to use with task, True if there is a cached venv entry
|
||||
"""
|
||||
requested_python_version = requested_python_version or \
|
||||
Text(self._session.config.get("agent.python_binary", None)) or \
|
||||
Text(self._session.config.get("agent.default_python", None))
|
||||
if self._session.config.get("agent.ignore_requested_python_version", None):
|
||||
requested_python_version = ''
|
||||
|
||||
requested_python_version = \
|
||||
requested_python_version or \
|
||||
Text(self._session.config.get("agent.python_binary", None)) or \
|
||||
Text(self._session.config.get("agent.default_python", None))
|
||||
|
||||
if self.is_conda:
|
||||
executable_version_suffix = \
|
||||
requested_python_version[max(requested_python_version.find('python'), 0):].replace('python', '')
|
||||
@@ -2371,6 +2426,7 @@ class Worker(ServiceCommandSection):
|
||||
|
||||
if not standalone_mode:
|
||||
rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
|
||||
|
||||
package_manager_params = dict(
|
||||
session=self._session,
|
||||
python=executable_version_suffix if self.is_conda else executable_name,
|
||||
@@ -2501,7 +2557,7 @@ class Worker(ServiceCommandSection):
|
||||
temp_config.put("agent.git_pass", (ENV_AGENT_GIT_PASS.get() or
|
||||
self._session.config.get("agent.git_pass", None)))
|
||||
|
||||
if temp_config.get("agent.venvs_cache.path"):
|
||||
if temp_config.get("agent.venvs_cache.path", None):
|
||||
temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache')
|
||||
|
||||
self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
|
||||
@@ -2518,7 +2574,7 @@ class Worker(ServiceCommandSection):
|
||||
self._session.config["agent.vcs_cache.path"])).expanduser().as_posix()
|
||||
host_venvs_cache = Path(os.path.expandvars(
|
||||
self._session.config["agent.venvs_cache.path"])).expanduser().as_posix() \
|
||||
if self._session.config.get("agent.venvs_cache.path") else None
|
||||
if self._session.config.get("agent.venvs_cache.path", None) else None
|
||||
host_ssh_cache = self._host_ssh_cache
|
||||
|
||||
host_apt_cache = Path(os.path.expandvars(self._session.config.get(
|
||||
@@ -2571,7 +2627,7 @@ class Worker(ServiceCommandSection):
|
||||
mounted_cache_dir = temp_config.get("sdk.storage.cache.default_base_dir")
|
||||
mounted_pip_dl_dir = temp_config.get("agent.pip_download_cache.path")
|
||||
mounted_vcs_cache = temp_config.get("agent.vcs_cache.path")
|
||||
mounted_venvs_cache = temp_config.get("agent.venvs_cache.path")
|
||||
mounted_venvs_cache = temp_config.get("agent.venvs_cache.path", "")
|
||||
|
||||
# Make sure we have created the configuration file for the executor
|
||||
if not self.dump_config(self.temp_config_path, config=temp_config):
|
||||
|
||||
@@ -16,7 +16,7 @@ def parse(reqstr):
|
||||
filename = getattr(reqstr, 'name', None)
|
||||
try:
|
||||
# Python 2.x compatibility
|
||||
if not isinstance(reqstr, basestring):
|
||||
if not isinstance(reqstr, basestring): # noqa
|
||||
reqstr = reqstr.read()
|
||||
except NameError:
|
||||
# Python 3.x only
|
||||
|
||||
@@ -36,8 +36,7 @@ class K8sIntegration(Worker):
|
||||
|
||||
KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
|
||||
"--image {docker_image} " \
|
||||
"--restart=Never --replicas=1 " \
|
||||
"--generator=run-pod/v1 " \
|
||||
"--restart=Never " \
|
||||
"--namespace={namespace}"
|
||||
|
||||
KUBECTL_DELETE_CMD = "kubectl delete pods " \
|
||||
@@ -273,13 +272,13 @@ class K8sIntegration(Worker):
|
||||
return
|
||||
|
||||
if task_data.execution.docker_cmd:
|
||||
docker_parts = task_data.execution.docker_cmd
|
||||
docker_cmd = task_data.execution.docker_cmd
|
||||
else:
|
||||
docker_parts = str(ENV_DOCKER_IMAGE.get() or
|
||||
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
|
||||
docker_cmd = str(ENV_DOCKER_IMAGE.get() or
|
||||
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
|
||||
|
||||
# take the first part, this is the docker image name (not arguments)
|
||||
docker_parts = docker_parts.split()
|
||||
docker_parts = docker_cmd.split()
|
||||
docker_image = docker_parts[0]
|
||||
docker_args = docker_parts[1:] if len(docker_parts) > 1 else []
|
||||
|
||||
@@ -355,7 +354,7 @@ class K8sIntegration(Worker):
|
||||
else:
|
||||
output, error = self._kubectl_run(
|
||||
create_clearml_conf=create_clearml_conf,
|
||||
labels=labels, docker_image=docker_image,
|
||||
labels=labels, docker_image=docker_cmd,
|
||||
task_data=task_data,
|
||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ import platform
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import psutil
|
||||
from ..gpu import pynvml as N
|
||||
@@ -390,3 +391,34 @@ def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||
'''
|
||||
return GPUStatCollection.new_query(shutdown=shutdown, per_process_stats=per_process_stats,
|
||||
get_driver_info=get_driver_info)
|
||||
|
||||
|
||||
def get_driver_cuda_version():
|
||||
# type: () -> Optional[str]
|
||||
"""
|
||||
:return: Return detected CUDA version from driver. On fail return value is None.
|
||||
Example: `110` is cuda version 11.0
|
||||
"""
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
N.nvmlInit()
|
||||
except BaseException:
|
||||
return None
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = str(N.nvmlSystemGetCudaDriverVersion())
|
||||
except BaseException:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
|
||||
except BaseException:
|
||||
cuda_version = ''
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
N.nvmlShutdown()
|
||||
except BaseException:
|
||||
return None
|
||||
|
||||
return cuda_version[:3] if cuda_version else None
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -505,6 +505,8 @@ class CondaAPI(PackageManager):
|
||||
reqs.append(m)
|
||||
|
||||
# if we have a conda list, the rest should be installed with pip,
|
||||
# this means any experiment that was executed with pip environment,
|
||||
# will be installed using pip
|
||||
if requirements.get('conda', None) is not None:
|
||||
for r in requirements['pip']:
|
||||
try:
|
||||
@@ -518,7 +520,7 @@ class CondaAPI(PackageManager):
|
||||
# skip over local files (we cannot change the version to a local file)
|
||||
if m.local_file:
|
||||
continue
|
||||
m_name = m.name.lower()
|
||||
m_name = (m.name or '').lower()
|
||||
if m_name in conda_supported_req_names:
|
||||
# this package is in the conda list,
|
||||
# make sure that if we changed version and we match it in conda
|
||||
@@ -555,7 +557,7 @@ class CondaAPI(PackageManager):
|
||||
# conform conda packages (version/name)
|
||||
for r in reqs:
|
||||
# change _ to - in name but not the prefix _ (as this is conda prefix)
|
||||
if not r.name.startswith('_') and not requirements.get('conda', None):
|
||||
if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
|
||||
r.name = r.name.replace('_', '-')
|
||||
# remove .post from version numbers, it fails ~= version, and change == to ~=
|
||||
if r.specs and r.specs[0]:
|
||||
|
||||
@@ -17,6 +17,16 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
self.post_install_req_lookup = OrderedDict()
|
||||
|
||||
def match(self, req):
|
||||
# match local folder building:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not req.name and req.req and not req.req.editable and not req.req.vcs and \
|
||||
req.req.line and req.req.line.strip().split('#')[0] and \
|
||||
not req.req.line.strip().split('#')[0].lower().endswith('.whl'):
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# match both editable or code or unparsed
|
||||
if not (not req.name or req.req and (req.req.editable or req.req.vcs)):
|
||||
return False
|
||||
@@ -104,3 +114,20 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
list_of_requirements[k] += [self.post_install_req_lookup.get(r, '')
|
||||
for r in self.post_install_req_lookup.keys() if r in original_requirements]
|
||||
return list_of_requirements
|
||||
|
||||
|
||||
class OnlyExternalRequirements(ExternalRequirements):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(OnlyExternalRequirements, self).__init__(*args, **kwargs)
|
||||
|
||||
def match(self, req):
|
||||
return not super(OnlyExternalRequirements, self).match(req)
|
||||
|
||||
def replace(self, req):
|
||||
"""
|
||||
Replace a requirement
|
||||
:raises: ValueError if version is pre-release
|
||||
"""
|
||||
# Do not store the skipped requirements
|
||||
# mark skip package
|
||||
return Text('')
|
||||
|
||||
@@ -17,6 +17,7 @@ import six
|
||||
from clearml_agent.definitions import PIP_EXTRA_INDICES
|
||||
from clearml_agent.helper.base import warning, is_conda, which, join_lines, is_windows_platform
|
||||
from clearml_agent.helper.process import Argv, PathLike
|
||||
from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
|
||||
from clearml_agent.session import Session, normalize_cuda_version
|
||||
from clearml_agent.external.requirements_parser import parse
|
||||
from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
@@ -446,6 +447,7 @@ class RequirementsManager(object):
|
||||
'cu'+agent['cuda_version'] if self.found_cuda else 'cpu')
|
||||
self.translator = RequirementsTranslator(session, interpreter=base_interpreter,
|
||||
cache_dir=pip_cache_dir.as_posix())
|
||||
self._base_interpreter = base_interpreter
|
||||
|
||||
def register(self, cls): # type: (Type[RequirementSubstitution]) -> None
|
||||
self.handlers.append(cls(self._session))
|
||||
@@ -529,6 +531,9 @@ class RequirementsManager(object):
|
||||
pass
|
||||
return requirements
|
||||
|
||||
def get_interpreter(self):
|
||||
return self._base_interpreter
|
||||
|
||||
@staticmethod
|
||||
def get_cuda_version(config): # type: (ConfigTree) -> (Text, Text)
|
||||
# we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
|
||||
@@ -537,6 +542,9 @@ class RequirementsManager(object):
|
||||
if cuda_version and cudnn_version:
|
||||
return normalize_cuda_version(cuda_version), normalize_cuda_version(cudnn_version)
|
||||
|
||||
if not cuda_version:
|
||||
cuda_version = get_driver_cuda_version()
|
||||
|
||||
if not cuda_version and is_windows_platform():
|
||||
try:
|
||||
cuda_vers = [int(k.replace('CUDA_PATH_V', '').replace('_', '')) for k in os.environ.keys()
|
||||
@@ -601,4 +609,3 @@ class RequirementsManager(object):
|
||||
|
||||
return (normalize_cuda_version(cuda_version or 0),
|
||||
normalize_cuda_version(cudnn_version or 0))
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ import re
|
||||
import subprocess
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from copy import copy
|
||||
from distutils.spawn import find_executable
|
||||
from itertools import chain, repeat, islice
|
||||
from os.path import devnull
|
||||
@@ -276,9 +276,9 @@ class CommandSequence(Executable):
|
||||
self.commands = []
|
||||
for c in commands:
|
||||
if isinstance(c, CommandSequence):
|
||||
self.commands.extend(deepcopy(c.commands))
|
||||
self.commands.extend([copy(p) for p in c.commands])
|
||||
elif isinstance(c, Argv):
|
||||
self.commands.append(deepcopy(c))
|
||||
self.commands.append(copy(c))
|
||||
else:
|
||||
self.commands.append(Argv(*c, log=self._log))
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = '0.17.1'
|
||||
__version__ = '0.17.2'
|
||||
|
||||
@@ -24,7 +24,9 @@ agent {
|
||||
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: ""
|
||||
# force_git_ssh_port: 0
|
||||
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||
# force_git_ssh_user: git
|
||||
|
||||
# unique name of this worker, if None, created based on hostname:process_id
|
||||
# Overridden with os environment: CLEARML_WORKER_NAME
|
||||
@@ -139,12 +141,15 @@ agent {
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
|
||||
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host"]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# CUDA versions used for Conda setup & solving PyTorch wheel packages
|
||||
# it Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
|
||||
Reference in New Issue
Block a user