mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5446aed9cf | ||
|
|
b94ec85461 | ||
|
|
f55f4f7535 | ||
|
|
c87da3a079 | ||
|
|
c3590a53a8 | ||
|
|
a4315722ab | ||
|
|
c901bd331c | ||
|
|
df97f170a2 | ||
|
|
a30a2dad66 | ||
|
|
2432f5bb68 | ||
|
|
341086d86a | ||
|
|
1163c96438 | ||
|
|
4c120d7cd0 | ||
|
|
966a9758b8 | ||
|
|
f58071fc74 | ||
|
|
8712c5e636 |
@@ -61,7 +61,7 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
|
||||
We think Kubernetes is awesome, but it should be a choice.
|
||||
We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
|
||||
|
||||
Find Dockerfiles in [docker](./docker) dir and a helm Chart in https://github.com/allegroai/clearml-helm-charts
|
||||
Find Dockerfiles in the [docker](./docker) dir and a helm Chart in https://github.com/allegroai/clearml-helm-charts
|
||||
#### Benefits of integrating existing K8s with ClearML-Agent
|
||||
- ClearML-Agent adds the missing scheduling capabilities to K8s
|
||||
- Allowing for more flexible automation from code
|
||||
|
||||
@@ -11,7 +11,11 @@
|
||||
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# Notice: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# **Notice**: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# To learn how to generate a git token for GitHub/Bitbucket/GitLab, see:
|
||||
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
|
||||
# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
|
||||
# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
|
||||
# git_user: ""
|
||||
# git_pass: ""
|
||||
# git_host: ""
|
||||
|
||||
@@ -16,6 +16,7 @@ ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type
|
||||
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
|
||||
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
|
||||
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
|
||||
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
|
||||
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
||||
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
||||
)
|
||||
|
||||
@@ -27,9 +27,9 @@ except Exception:
|
||||
|
||||
host_description = """
|
||||
Editing configuration file: {CONFIG_FILE}
|
||||
Enter the url of the clearml-server's Web service, for example: {HOST}
|
||||
Enter the url of the clearml-server's Web service, for example: {HOST} or https://app.clear.ml
|
||||
""".format(
|
||||
CONFIG_FILE=LOCAL_CONFIG_FILES[0],
|
||||
CONFIG_FILE=LOCAL_CONFIG_FILES[-1],
|
||||
HOST=def_host,
|
||||
)
|
||||
|
||||
@@ -84,7 +84,7 @@ def main():
|
||||
host = input_url('API Host', api_server)
|
||||
else:
|
||||
print(host_description)
|
||||
host = input_url('WEB Host', '')
|
||||
host = input_url('WEB Host', 'https://app.clear.ml')
|
||||
|
||||
parsed_host = verify_url(host)
|
||||
api_host, files_host, web_host = parse_host(parsed_host, allow_input=True)
|
||||
@@ -116,9 +116,15 @@ def main():
|
||||
print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
|
||||
git_user = input()
|
||||
if git_user.strip():
|
||||
print('Enter password for user \'{}\': '.format(git_user), end='')
|
||||
print(
|
||||
"Git personal token is equivalent to a password, to learn how to generate a token:\n"
|
||||
" GitHub: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token\n" # noqa
|
||||
" Bitbucket: https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/\n"
|
||||
" GitLab: https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html\n"
|
||||
)
|
||||
print('Enter git password token for user \'{}\': '.format(git_user), end='')
|
||||
git_pass = input()
|
||||
print('Git repository cloning will be using user={} password={}'.format(git_user, git_pass))
|
||||
print('Git repository cloning will be using user={} token={}'.format(git_user, git_pass))
|
||||
else:
|
||||
git_user = None
|
||||
git_pass = None
|
||||
|
||||
@@ -41,7 +41,7 @@ from clearml_agent.backend_api.services import workers as workers_api
|
||||
from clearml_agent.backend_api.session import CallResult
|
||||
from clearml_agent.backend_api.session.defs import (
|
||||
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
|
||||
ENV_VENV_CONFIGURED, )
|
||||
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
|
||||
from clearml_agent.backend_config.defs import UptimeConf
|
||||
from clearml_agent.backend_config.utils import apply_environment, apply_files
|
||||
from clearml_agent.commands.base import resolve_names, ServiceCommandSection
|
||||
@@ -639,7 +639,7 @@ class Worker(ServiceCommandSection):
|
||||
pass
|
||||
|
||||
def run_one_task(self, queue, task_id, worker_args, docker=None, task_session=None):
|
||||
# type: (Text, Text, WorkerParams, Optional[Text]) -> ()
|
||||
# type: (Text, Text, WorkerParams, Optional[Text]) -> int
|
||||
"""
|
||||
Run one task pulled from queue.
|
||||
:param queue: ID of queue that task was pulled from
|
||||
@@ -647,6 +647,8 @@ class Worker(ServiceCommandSection):
|
||||
:param worker_args: Worker command line arguments
|
||||
:param task_session: The session for running operations on the passed task
|
||||
:param docker: Docker image in which the execution task will run
|
||||
|
||||
:return: exit code (0 is success)
|
||||
"""
|
||||
# start new process and execute task id
|
||||
# "Running task '{}'".format(task_id)
|
||||
@@ -848,6 +850,8 @@ class Worker(ServiceCommandSection):
|
||||
# unregister this worker, it was killed
|
||||
self._unregister()
|
||||
|
||||
return status
|
||||
|
||||
def get_task_session(self, user, company):
|
||||
"""
|
||||
Get task session for the user by cloning the agent session
|
||||
@@ -1878,6 +1882,9 @@ class Worker(ServiceCommandSection):
|
||||
base_interpreter=package_api.requirements_manager.get_interpreter(),
|
||||
requirement_substitutions=[OnlyExternalRequirements],
|
||||
)
|
||||
# manually update the current state,
|
||||
# for the external git reference change (in the replace callback)
|
||||
package_api.requirements_manager.update_installed_packages_state(package_api.freeze())
|
||||
# make sure we run the handlers
|
||||
cached_requirements = \
|
||||
{k: package_api.requirements_manager.replace(requirements[k] or '')
|
||||
@@ -2098,7 +2105,7 @@ class Worker(ServiceCommandSection):
|
||||
)
|
||||
try:
|
||||
self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
|
||||
self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
|
||||
status = self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
|
||||
finally:
|
||||
self.stop_monitor()
|
||||
self._unregister()
|
||||
@@ -2106,7 +2113,7 @@ class Worker(ServiceCommandSection):
|
||||
if full_monitoring and self.temp_config_path:
|
||||
safe_remove_file(self._session.config_file)
|
||||
Singleton.close_pid_file()
|
||||
return
|
||||
return status if ENV_PROPAGATE_EXITCODE.get() else 0
|
||||
|
||||
self._apply_extra_configuration()
|
||||
|
||||
@@ -2174,8 +2181,22 @@ class Worker(ServiceCommandSection):
|
||||
if not custom_build_script:
|
||||
if self._session.config.get("agent.package_manager.force_repo_requirements_txt", False):
|
||||
requirements = None
|
||||
print("[package_manager.force_repo_requirements_txt=true] "
|
||||
"Skipping requirements, using repository \"requirements.txt\" ")
|
||||
print("\n[package_manager.force_repo_requirements_txt=true] "
|
||||
"Skipping requirements, using repository \"requirements.txt\" \n")
|
||||
elif self._session.config.get("agent.package_manager.force_original_requirements", False):
|
||||
try:
|
||||
requirements = current_task.script.requirements
|
||||
if isinstance(requirements, dict):
|
||||
if 'org_pip' in requirements:
|
||||
requirements['pip'] = requirements['org_pip']
|
||||
print("\n[package_manager.force_original_requirements=true] "
|
||||
"Using original requirements: \n{}\n".format(requirements['org_pip']))
|
||||
if 'org_conda' in requirements:
|
||||
requirements['conda'] = requirements['org_conda']
|
||||
print("\n[package_manager.force_original_requirements=true] "
|
||||
"Using original requirements: \n{}\n".format(requirements['org_conda']))
|
||||
except AttributeError:
|
||||
requirements = None
|
||||
else:
|
||||
try:
|
||||
requirements = current_task.script.requirements
|
||||
@@ -2226,6 +2247,9 @@ class Worker(ServiceCommandSection):
|
||||
base_interpreter=package_api.requirements_manager.get_interpreter(),
|
||||
requirement_substitutions=[OnlyExternalRequirements]
|
||||
)
|
||||
# manually update the current state,
|
||||
# for the external git reference change (in the replace callback)
|
||||
package_api.requirements_manager.update_installed_packages_state(package_api.freeze())
|
||||
# make sure we run the handlers
|
||||
cached_requirements = \
|
||||
{k: package_api.requirements_manager.replace(requirements[k] or '')
|
||||
@@ -2790,7 +2814,7 @@ class Worker(ServiceCommandSection):
|
||||
if self._session.debug_mode and temp_file:
|
||||
rm_file(temp_file.name)
|
||||
# call post installation callback
|
||||
requirements_manager.post_install(self._session)
|
||||
requirements_manager.post_install(self._session, package_manager=package_api)
|
||||
# mark as successful installation
|
||||
repo_requirements_installed = True
|
||||
|
||||
@@ -3175,6 +3199,10 @@ class Worker(ServiceCommandSection):
|
||||
if standalone_mode:
|
||||
self.package_api = VirtualenvPip(**package_manager_params)
|
||||
else:
|
||||
if not Path(executable_name).is_file():
|
||||
executable_name_path = find_executable(executable_name)
|
||||
print("Interpreter '{}' found at '{}'".format(executable_name, executable_name_path))
|
||||
executable_name = executable_name_path
|
||||
# we can change it, no one is going to use it anyhow
|
||||
package_manager_params['path'] = None
|
||||
package_manager_params['interpreter'] = executable_name
|
||||
@@ -3609,11 +3637,11 @@ class Worker(ServiceCommandSection):
|
||||
' libsm6 libxext6 libxrender-dev libglib2.0-0' if install_opencv_libs else ""),
|
||||
"[ ! -z $(which git) ] || export CLEARML_APT_INSTALL=\"$CLEARML_APT_INSTALL git\"",
|
||||
"declare LOCAL_PYTHON",
|
||||
"for i in {{10..5}}; do which {python_single_digit}.$i && " +
|
||||
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which {python_single_digit}.$i && " +
|
||||
"{python_single_digit}.$i -m pip --version && " +
|
||||
"export LOCAL_PYTHON=$(which {python_single_digit}.$i) && break ; done",
|
||||
"[ ! -z $LOCAL_PYTHON ] || export CLEARML_APT_INSTALL=\"$CLEARML_APT_INSTALL {python_single_digit}-pip\"", # noqa
|
||||
"[ -z \"$CLEARML_APT_INSTALL\" ] || (apt-get update && apt-get install -y $CLEARML_APT_INSTALL)",
|
||||
"[ -z \"$CLEARML_APT_INSTALL\" ] || (apt-get update -y ; apt-get install -y $CLEARML_APT_INSTALL)",
|
||||
]
|
||||
|
||||
if preprocess_bash_script:
|
||||
|
||||
@@ -69,7 +69,7 @@ class K8sIntegration(Worker):
|
||||
"apt-get update",
|
||||
"apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||
"declare LOCAL_PYTHON",
|
||||
"for i in {{10..5}}; do which python3.$i && python3.$i -m pip --version && "
|
||||
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which python3.$i && python3.$i -m pip --version && "
|
||||
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
|
||||
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
|
||||
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
|
||||
|
||||
@@ -424,7 +424,7 @@ class CondaAPI(PackageManager):
|
||||
finally:
|
||||
PackageManager._selected_manager = self
|
||||
|
||||
self.requirements_manager.post_install(self.session)
|
||||
self.requirements_manager.post_install(self.session, package_manager=self)
|
||||
|
||||
def load_requirements(self, requirements):
|
||||
# if we are in read only mode, do not uninstall anything
|
||||
@@ -642,7 +642,7 @@ class CondaAPI(PackageManager):
|
||||
finally:
|
||||
PackageManager._selected_manager = self
|
||||
|
||||
self.requirements_manager.post_install(self.session)
|
||||
self.requirements_manager.post_install(self.session, package_manager=self)
|
||||
return True
|
||||
|
||||
def _parse_conda_result_bad_packges(self, result_dict):
|
||||
|
||||
@@ -46,11 +46,10 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
post_install_req = self.post_install_req
|
||||
self.post_install_req = []
|
||||
for req in post_install_req:
|
||||
try:
|
||||
freeze_base = PackageManager.out_of_scope_freeze() or ''
|
||||
except:
|
||||
freeze_base = ''
|
||||
|
||||
if self.is_already_installed(req):
|
||||
print("No need to reinstall \'{}\' from VCS, "
|
||||
"the exact same version is already installed".format(req.name))
|
||||
continue
|
||||
req_line = self._add_vcs_credentials(req, session)
|
||||
|
||||
# if we have older pip version we have to make sure we replace back the package name with the
|
||||
@@ -175,5 +174,11 @@ class OnlyExternalRequirements(ExternalRequirements):
|
||||
# Do not store the skipped requirements
|
||||
# mark skip package
|
||||
if super(OnlyExternalRequirements, self).match(req):
|
||||
if self.is_already_installed(req):
|
||||
print("No need to reinstall \'{}\' from VCS, "
|
||||
"the exact same version is already installed".format(req.name))
|
||||
return Text('')
|
||||
|
||||
return self._add_vcs_credentials(req, self._session)
|
||||
|
||||
return Text('')
|
||||
|
||||
@@ -39,7 +39,7 @@ class VirtualenvPip(SystemPip, PackageManager):
|
||||
if isinstance(requirements, dict) and requirements.get("pip"):
|
||||
requirements["pip"] = self.requirements_manager.replace(requirements["pip"])
|
||||
super(VirtualenvPip, self).load_requirements(requirements)
|
||||
self.requirements_manager.post_install(self.session)
|
||||
self.requirements_manager.post_install(self.session, package_manager=self)
|
||||
|
||||
def create_flags(self):
|
||||
"""
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
import sys
|
||||
import platform
|
||||
from furl import furl
|
||||
import urllib.parse
|
||||
from operator import itemgetter
|
||||
@@ -245,10 +246,15 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
return "macos"
|
||||
raise RuntimeError("unrecognized OS")
|
||||
|
||||
@staticmethod
|
||||
def get_arch():
|
||||
return str(platform.machine()).lower()
|
||||
|
||||
def _get_link_from_torch_page(self, req, torch_url):
|
||||
links_parser = LinksHTMLParser()
|
||||
links_parser.feed(requests.get(torch_url, timeout=10).text)
|
||||
platform_wheel = "win" if self.get_platform() == "windows" else self.get_platform()
|
||||
arch_wheel = self.get_arch()
|
||||
py_ver = self.python_major_minor_str.replace('.', '')
|
||||
url = None
|
||||
last_v = None
|
||||
@@ -269,8 +275,11 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
continue
|
||||
if len(parts) < 3 or not parts[2].endswith(py_ver):
|
||||
continue
|
||||
if len(parts) < 5 or platform_wheel not in parts[4]:
|
||||
if len(parts) < 5 or platform_wheel not in parts[4].lower():
|
||||
continue
|
||||
if len(parts) < 5 or arch_wheel not in parts[4].lower():
|
||||
continue
|
||||
|
||||
# yes this is for linux python 2.7 support, this is the only python 2.7 we support...
|
||||
if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
|
||||
continue
|
||||
|
||||
@@ -179,7 +179,7 @@ class MarkerRequirement(object):
|
||||
if self.remove_local_file_ref():
|
||||
# print warning
|
||||
logging.getLogger(__name__).warning(
|
||||
'Local file not found [{}], references removed !'.format(line))
|
||||
'Local file not found [{}], references removed'.format(line))
|
||||
|
||||
|
||||
class SimpleVersion:
|
||||
@@ -437,6 +437,7 @@ class RequirementSubstitution(object):
|
||||
self.config = session.config # type: ConfigTree
|
||||
self.suffix = '.post{config[agent.cuda_version]}.dev{config[agent.cudnn_version]}'.format(config=self.config)
|
||||
self.package_manager = self.config['agent.package_manager.type']
|
||||
self._is_already_installed_cb = None
|
||||
|
||||
@abstractmethod
|
||||
def match(self, req): # type: (MarkerRequirement) -> bool
|
||||
@@ -452,6 +453,20 @@ class RequirementSubstitution(object):
|
||||
"""
|
||||
pass
|
||||
|
||||
def set_is_already_installed_cb(self, cb):
|
||||
self._is_already_installed_cb = cb
|
||||
|
||||
def is_already_installed(self, req):
|
||||
if not self._is_already_installed_cb:
|
||||
return False
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
return self._is_already_installed_cb(req)
|
||||
except BaseException as ex:
|
||||
# the installed-package check itself failed: print a warning and treat the package as not installed
|
||||
print("Warning: Requirements post install callback exception (check if package installed): {}".format(ex))
|
||||
return False
|
||||
|
||||
def post_scan_add_req(self): # type: () -> Optional[MarkerRequirement]
|
||||
"""
|
||||
Allows the RequirementSubstitution to add an extra line/requirements after
|
||||
@@ -562,6 +577,7 @@ class RequirementsManager(object):
|
||||
cache_dir=pip_cache_dir.as_posix())
|
||||
self._base_interpreter = base_interpreter
|
||||
self._cwd = None
|
||||
self._installed_parsed_packages = set()
|
||||
|
||||
def register(self, cls): # type: (Type[RequirementSubstitution]) -> None
|
||||
self.handlers.append(cls(self._session))
|
||||
@@ -619,7 +635,9 @@ class RequirementsManager(object):
|
||||
|
||||
return join_lines(result)
|
||||
|
||||
def post_install(self, session):
|
||||
def post_install(self, session, package_manager=None):
|
||||
if package_manager:
|
||||
self.update_installed_packages_state(package_manager.freeze())
|
||||
for h in self.handlers:
|
||||
try:
|
||||
h.post_install(session)
|
||||
@@ -641,6 +659,34 @@ class RequirementsManager(object):
|
||||
def get_interpreter(self):
|
||||
return self._base_interpreter
|
||||
|
||||
def update_installed_packages_state(self, requirements):
|
||||
"""
|
||||
Updates internal Installed Packages objects, so that later we can detect
|
||||
if we already have a pre-installed package
|
||||
:param requirements: is the output of a freeze() call, i.e. dict {'pip': "package==version"}
|
||||
"""
|
||||
requirements = requirements if not isinstance(requirements, dict) else requirements.get("pip")
|
||||
self._installed_parsed_packages = self.parse_requirements_section_to_marker_requirements(
|
||||
requirements=requirements, cwd=self._cwd)
|
||||
for h in self.handlers:
|
||||
h.set_is_already_installed_cb(self._callback_is_already_installed)
|
||||
|
||||
def _callback_is_already_installed(self, req):
|
||||
for p in (self._installed_parsed_packages or []):
|
||||
if p.name != req.name:
|
||||
continue
|
||||
# if this is a version control package, only return true if both the installed and the requested package specify the same commit ID
|
||||
if req.vcs:
|
||||
return p.vcs and req.revision and req.revision == p.revision
|
||||
|
||||
if not req.specs and not p.specs:
|
||||
return True
|
||||
|
||||
# return if this is the same version
|
||||
return req.specs and p.specs and req.compare_version(p, op="==")
|
||||
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def get_cuda_version(config): # type: (ConfigTree) -> (Text, Text)
|
||||
# we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = '1.2.0rc6'
|
||||
__version__ = '1.2.2'
|
||||
|
||||
@@ -15,7 +15,11 @@ api {
|
||||
agent {
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# Notice: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# **Notice**: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# To learn how to generate a git token for GitHub/Bitbucket/GitLab, see:
|
||||
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
|
||||
# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
|
||||
# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
|
||||
git_user=""
|
||||
git_pass=""
|
||||
# Limit credentials to a single domain, for example: github.com,
|
||||
|
||||
@@ -12,6 +12,6 @@ pyjwt>=1.6.4,<2.1.0
|
||||
PyYAML>=3.12,<5.5.0
|
||||
requests>=2.20.0,<2.26.0
|
||||
six>=1.13.0,<1.16.0
|
||||
typing>=3.6.4,<3.8.0
|
||||
typing>=3.6.4,<3.8.0 ; python_version < '3.5'
|
||||
urllib3>=1.21.1,<1.27.0
|
||||
virtualenv>=16,<21
|
||||
|
||||
Reference in New Issue
Block a user