Compare commits


1 Commit

Author SHA1 Message Date
allegroai
7022df2670 Fix pathlib2 six conflict
Version bump to v1.1.2
2022-02-09 18:08:56 +02:00
34 changed files with 344 additions and 1726 deletions

View File

@@ -8,8 +8,8 @@ ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
[![GitHub license](https://img.shields.io/github/license/allegroai/clearml-agent.svg)](https://img.shields.io/github/license/allegroai/clearml-agent.svg)
[![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml-agent.svg)](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
[![PyPI version shields.io](https://img.shields.io/pypi/v/clearml-agent.svg)](https://img.shields.io/pypi/v/clearml-agent.svg)
[![PyPI Downloads](https://pepy.tech/badge/clearml-agent/month)](https://pypi.org/project/clearml-agent/)
[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/allegroai)](https://artifacthub.io/packages/search?repo=allegroai)
</div>
---
@@ -29,7 +29,7 @@ ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
It is a zero-configuration, fire-and-forget execution agent that provides a full ML/DL cluster solution.
**Full Automation in 5 steps**
1. ClearML Server [self-hosted](https://github.com/allegroai/clearml-server) or [free tier hosting](https://app.clear.ml)
1. ClearML Server [self-hosted](https://github.com/allegroai/clearml-server) or [free tier hosting](https://app.community.clear.ml)
2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine: on-premises / cloud / ...)
3. Create a [job](https://github.com/allegroai/clearml/docs/clearml-task.md) or Add [ClearML](https://github.com/allegroai/clearml) to your code with just 2 lines
4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
@@ -37,8 +37,8 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
"All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
**Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server) or [Free tier Hosting](https://app.clear.ml)
<a href="https://app.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>
**Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server) or [Free tier Hosting](https://app.community.clear.ml)
<a href="https://app.community.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>
### Simple, Flexible Experiment Orchestration
**The ClearML Agent was built to address the DL/ML R&D DevOps needs:**
@@ -60,8 +60,6 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
### Kubernetes Integration (Optional)
We think Kubernetes is awesome, but it should be a choice.
We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
Find Dockerfiles in the [docker](./docker) dir and a Helm chart in https://github.com/allegroai/clearml-helm-charts
#### Benefits of integrating existing K8s with ClearML-Agent
- ClearML-Agent adds the missing scheduling capabilities to K8s
- Allowing for more flexible automation from code
@@ -221,7 +219,7 @@ clearml-agent daemon --queue important_jobs default
```
The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, and only then will it fetch a job from the `default` queue.
Adding queues, managing job order within a queue, and moving jobs between queues are all available using the Web UI; see the example on our [free server](https://app.clear.ml/workers-and-queues/queues)
Adding queues, managing job order within a queue, and moving jobs between queues are all available using the Web UI; see the example on our [free server](https://app.community.clear.ml/workers-and-queues/queues)
##### Stopping the ClearML Agent

View File

@@ -12,7 +12,7 @@ from clearml_agent.definitions import FileBuffering, CONFIG_FILE
from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
from clearml_agent.helper.process import ExitStatus
from . import interface, session, definitions, commands
from .errors import ConfigFileNotFound, Sigterm, APIError, CustomBuildScriptFailed
from .errors import ConfigFileNotFound, Sigterm, APIError
from .helper.trace import PackageTrace
from .interface import get_parser
@@ -44,8 +44,6 @@ def run_command(parser, args, command_name):
debug = command._session.debug_mode
func = getattr(command, command_name)
return func(**args_dict)
except CustomBuildScriptFailed as e:
command_class.exit(e.message, e.errno)
except ConfigFileNotFound:
message = 'Cannot find configuration file in "{}".\n' \
'To create a configuration file, run:\n' \

View File

@@ -11,11 +11,6 @@
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
# **Notice**: a GitHub personal token is equivalent to a password; you can put it directly into `git_pass`
# To learn how to generate a git token on GitHub/Bitbucket/GitLab:
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
# git_user: ""
# git_pass: ""
# git_host: ""
@@ -35,15 +30,6 @@
# specific python version and the system supports multiple python the agent will use the requested python version)
# ignore_requested_python_version: true
# Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
# default false, only the working directory will be added to the PYTHONPATH
# force_git_root_python_path: false
# in docker mode, if container's entrypoint automatically activated a virtual environment
# use the activated virtual environment and install everything there
# set to False to disable, and always create a new venv inheriting from the system_site_packages
# docker_use_activated_venv: true
# select python package manager:
# currently supported: pip, conda and poetry
# if "pip" or "conda" are used, the agent installs the required packages
@@ -58,8 +44,6 @@
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
pip_version: "<20.2",
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
# poetry_version: "<2",
# virtual environment inherits packages from system
system_site_packages: false,
@@ -83,7 +67,7 @@
# set the optional priority packages to be installed before the rest of the required packages.
# In case a package installation fails, the package will be ignored,
# and the virtual environment process will continue
priority_optional_packages: ["pygobject", ]
# priority_optional_packages: ["pygobject", ]
# set the post packages to be installed after all the rest of the required packages
# post_packages: ["horovod", ]
@@ -172,7 +156,7 @@
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host", ]
@@ -217,7 +201,6 @@
hide_docker_command_env_vars {
enabled: true
extra_keys: []
parse_embedded_urls: true
}
# allow setting internal mount points inside the docker container,
@@ -278,34 +261,4 @@
# target_format: json
# }
# }
# Specifies a custom environment setup script to be executed instead of installing a virtual environment.
# If provided, this script is executed following Git cloning. The script command may include environment variables,
# which will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
# The script can also be specified using the CLEARML_AGENT_CUSTOM_BUILD_SCRIPT environment variable.
#
# When running the script, the following environment variables will be set:
# - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
# contents in JSON format
# - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
# - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
# - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
# - CLEARML_GIT_ROOT: path to the cloned Git repository
# - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
# this file must be in the following JSON format:
# ```json
# {
# "binary": "/absolute/path/to/python-executable",
# "entry_point": "/absolute/path/to/task-entrypoint-script",
# "working_dir": "/absolute/path/to/task-working/dir"
# }
# ```
# If provided, the agent will use these instead of the predefined task script section to execute the task and will
# skip virtual environment creation.
#
# In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
# In case the custom script is specified but does not exist, or if the custom script does not write valid content
# into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
# standard flow.
custom_build_script: ""
}
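To make the contract above concrete, here is a minimal sketch of such a build script, written in Python (the `CLEARML_*` variables are the ones documented above; the venv layout and the `requirements.txt` path are assumptions about the repository):

```python
#!/usr/bin/env python3
# Hypothetical custom build script, e.g. saved as "$CLEARML_GIT_ROOT/build.py".
import json
import os
import subprocess

venv = os.environ["CLEARML_VENV_PATH"]
root = os.environ["CLEARML_GIT_ROOT"]

# Build the environment ourselves instead of letting the agent create a venv.
subprocess.check_call(["python3", "-m", "venv", venv])
subprocess.check_call([os.path.join(venv, "bin", "pip"), "install", "-r",
                       os.path.join(root, "requirements.txt")])

# Tell the agent what to execute by writing the documented JSON output file.
with open(os.environ["CLEARML_CUSTOM_BUILD_OUTPUT"], "w") as f:
    json.dump({
        "binary": os.path.join(venv, "bin", "python"),
        "entry_point": os.path.join(root, os.environ["CLEARML_TASK_SCRIPT_ENTRY"]),
        "working_dir": os.path.join(root, os.environ["CLEARML_TASK_WORKING_DIR"]),
    }, f)
```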

View File

@@ -15,17 +15,6 @@ ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT
ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
)
"""
Experimental option to set the request method for all API requests and auth login.
This can be useful when GET requests with payloads are blocked by a server, as
POST requests can be used instead.
However, this has not been rigorously tested and may have unintended consequences.
"""
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD", default="GET")
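As a usage sketch, the override is read from the process environment, so something like the following (set before the agent starts) would switch all requests to POST:

```python
import os

# Hypothetical opt-in; per the guard in request.py below, only GET/POST (any case) are accepted.
os.environ["CLEARML_API_DEFAULT_REQ_METHOD"] = "POST"
```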

View File

@@ -5,17 +5,10 @@ import six
from .apimodel import ApiModel
from .datamodel import DataModel
from .defs import ENV_API_DEFAULT_REQ_METHOD
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
raise ValueError(
"CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get' or 'post' (any case is allowed)."
)
class Request(ApiModel):
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
_method = 'get'
def __init__(self, **kwargs):
if kwargs:

View File

@@ -15,7 +15,7 @@ from six.moves.urllib.parse import urlparse, urlunparse
from .callresult import CallResult
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE
from .request import Request, BatchRequest
from .token_manager import TokenManager
from ..config import load
@@ -142,7 +142,7 @@ class Session(TokenManager):
"Could not find host server definition "
"(missing `~/clearml.conf` or Environment CLEARML_API_HOST)\n"
"To get started with ClearML: setup your own `clearml-server`, "
"or create a free account at https://app.clear.ml and run `clearml-agent init`"
"or create a free account at https://app.community.clear.ml and run `clearml-agent init`"
)
self.__host = host.strip("/")
@@ -206,7 +206,7 @@ class Session(TokenManager):
http_retries_config = dict(**http_retries_config)
http_retries_config['connect'] = connect_retries
return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)
return http_retries_config, get_http_session_with_retry(**http_retries_config)
def load_vaults(self):
if not self.check_min_api_version("2.15") or self.feature_set == "basic":
@@ -240,12 +240,6 @@ class Session(TokenManager):
except Exception as ex:
print("Failed getting vaults: {}".format(ex))
def verify_feature_set(self, feature_set):
if isinstance(feature_set, str):
feature_set = [feature_set]
if self.feature_set not in feature_set:
raise ValueError('ClearML-server does not support requested feature set {}'.format(feature_set))
def _send_request(
self,
service,
@@ -615,7 +609,6 @@ class Session(TokenManager):
try:
data = {"expiration_sec": exp} if exp else {}
res = self._send_request(
method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
service="auth",
action="login",
auth=auth,

View File

@@ -11,10 +11,10 @@ from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES
description = """
Please create new clearml credentials through the settings page in your `clearml-server` web app,
or create a free account at https://app.clear.ml/settings/webapp-configuration
Please create new clearml credentials through the profile page in your `clearml-server` web app,
or create a free account at https://app.community.clear.ml/profile
In the settings > workspace page, press "Create new credentials", then press "Copy to clipboard".
In the profile page, press "Create new credentials", then press "Copy to clipboard".
Paste copied configuration here:
"""
@@ -27,9 +27,9 @@ except Exception:
host_description = """
Editing configuration file: {CONFIG_FILE}
Enter the url of the clearml-server's Web service, for example: {HOST} or https://app.clear.ml
Enter the url of the clearml-server's Web service, for example: {HOST}
""".format(
CONFIG_FILE=LOCAL_CONFIG_FILES[-1],
CONFIG_FILE=LOCAL_CONFIG_FILES[0],
HOST=def_host,
)
@@ -84,7 +84,7 @@ def main():
host = input_url('API Host', api_server)
else:
print(host_description)
host = input_url('WEB Host', 'https://app.clear.ml')
host = input_url('WEB Host', '')
parsed_host = verify_url(host)
api_host, files_host, web_host = parse_host(parsed_host, allow_input=True)
@@ -116,15 +116,9 @@ def main():
print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
git_user = input()
if git_user.strip():
print(
"Git personal token is equivalent to a password, to learn how to generate a token:\n"
" GitHub: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token\n" # noqa
" Bitbucket: https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/\n"
" GitLab: https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html\n"
)
print('Enter git password token for user \'{}\': '.format(git_user), end='')
print('Enter password for user \'{}\': '.format(git_user), end='')
git_pass = input()
print('Git repository cloning will be using user={} token={}'.format(git_user, git_pass))
print('Git repository cloning will be using user={} password={}'.format(git_user, git_pass))
else:
git_user = None
git_pass = None
@@ -163,7 +157,7 @@ def main():
' api_server: %s\n' \
' web_server: %s\n' \
' files_server: %s\n' \
' # Credentials are generated using the webapp, %s/settings\n' \
' # Credentials are generated using the webapp, %s/profile\n' \
' # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY\n' \
' credentials {"access_key": "%s", "secret_key": "%s"}\n' \
'}\n\n' % (api_host, web_host, files_host,

View File

@@ -1,166 +0,0 @@
import json
import re
import shlex
from clearml_agent.helper.package.requirements import (
RequirementsManager, MarkerRequirement,
compare_version_rules, )
def resolve_default_container(session, task_id, container_config):
container_lookup = session.config.get('agent.default_docker.match_rules', None)
if not session.check_min_api_version("2.13") or not container_lookup:
return container_config
# check backend support before sending any more requests (because they will fail and crash the Task)
try:
session.verify_feature_set('advanced')
except ValueError:
return container_config
result = session.send_request(
service='tasks',
action='get_all',
version='2.14',
json={'id': [task_id],
'only_fields': ['script.requirements', 'script.binary',
'script.repository', 'script.branch',
'project', 'container'],
'search_hidden': True},
method='get',
async_enable=False,
)
try:
task_info = result.json()['data']['tasks'][0] if result.ok else {}
except (ValueError, TypeError):
return container_config
from clearml_agent.external.requirements_parser.requirement import Requirement
# store tasks repository
repository = task_info.get('script', {}).get('repository') or ''
branch = task_info.get('script', {}).get('branch') or ''
binary = task_info.get('script', {}).get('binary') or ''
requested_container = task_info.get('container', {})
# get project full path
project_full_name = ''
if task_info.get('project', None):
result = session.send_request(
service='projects',
action='get_all',
version='2.13',
json={
'id': [task_info.get('project')],
'only_fields': ['name'],
},
method='get',
async_enable=False,
)
try:
if result.ok:
project_full_name = result.json()['data']['projects'][0]['name'] or ''
except (ValueError, TypeError):
pass
task_packages_lookup = {}
for entry in container_lookup:
match = entry.get('match', None)
if not match:
continue
if match.get('project', None):
# noinspection PyBroadException
try:
if not re.search(match.get('project', None), project_full_name):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('project', None), entry))
continue
if match.get('script.repository', None):
# noinspection PyBroadException
try:
if not re.search(match.get('script.repository', None), repository):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('script.repository', None), entry))
continue
if match.get('script.branch', None):
# noinspection PyBroadException
try:
if not re.search(match.get('script.branch', None), branch):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('script.branch', None), entry))
continue
if match.get('script.binary', None):
# noinspection PyBroadException
try:
if not re.search(match.get('script.binary', None), binary):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('script.binary', None), entry))
continue
if match.get('container', None):
# noinspection PyBroadException
try:
if not re.search(match.get('container', None), requested_container.get('image', '')):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('container', None), entry))
continue
matched = True
for req_section in ['script.requirements.pip', 'script.requirements.conda']:
if not match.get(req_section, None):
continue
match_pip_reqs = [MarkerRequirement(Requirement.parse('{} {}'.format(k, v)))
for k, v in match.get(req_section, None).items()]
if not task_packages_lookup.get(req_section):
req_section_parts = req_section.split('.')
task_packages_lookup[req_section] = \
RequirementsManager.parse_requirements_section_to_marker_requirements(
requirements=task_info.get(req_section_parts[0], {}).get(
req_section_parts[1], {}).get(req_section_parts[2], None)
)
matched_all_reqs = True
for mr in match_pip_reqs:
matched_req = False
for pr in task_packages_lookup[req_section]:
if mr.req.name != pr.req.name:
continue
if compare_version_rules(mr.specs, pr.specs):
matched_req = True
break
if not matched_req:
matched_all_reqs = False
break
# if we have a match, check the next section
if matched_all_reqs:
continue
# no match, stop
matched = False
break
if matched:
if not container_config.get('container'):
container_config['container'] = entry.get('image', None)
if not container_config.get('arguments'):
container_config['arguments'] = entry.get('arguments', None)
container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
print('Matching default container with rule:\n{}'.format(json.dumps(entry)))
return container_config
return container_config
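For orientation, a hypothetical `agent.default_docker.match_rules` entry of the shape this lookup expects, written as the plain-dict equivalent of the HOCON configuration (all values are illustrative; the dotted keys mirror the `match.get(...)` calls above):

```python
# Hypothetical match_rules entry; every value below is illustrative only.
container_lookup = [
    {
        "image": "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04",
        "arguments": "--ipc=host",
        "match": {
            "project": "^examples/",                        # regex against the project full name
            "script.repository": "clearml",                 # regex against the repository URL
            "script.requirements.pip": {"torch": ">=1.6"},  # per-package version-rule comparison
        },
    },
]
```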

File diff suppressed because it is too large

View File

@@ -126,7 +126,6 @@ DEFAULT_VENV_UPDATE_URL = (
"https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
)
WORKING_REPOSITORY_DIR = "task_repository"
WORKING_STANDALONE_DIR = "code"
DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
PIP_EXTRA_INDICES = [
]
@@ -135,7 +134,6 @@ ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAG
ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
ENV_WORKER_TAGS = EnvironmentConfig('CLEARML_WORKER_TAGS')
ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PIP_VENV_INSTALL')
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL', type=bool)
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
@@ -149,38 +147,6 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEAR
ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
"""
Specifies a custom environment setup script to be executed instead of installing a virtual environment.
If provided, this script is executed following Git cloning. The script command may include environment variables,
which will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
The script can also be specified using the `agent.custom_build_script` configuration setting.
When running the script, the following environment variables will be set:
- CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
contents in JSON format
- CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
- CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
- CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
- CLEARML_GIT_ROOT: path to the cloned Git repository
- CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
this file must be in the following JSON format:
```json
{
"binary": "/absolute/path/to/python-executable",
"entry_point": "/absolute/path/to/task-entrypoint-script",
"working_dir": "/absolute/path/to/task-working/dir"
}
```
If provided, the agent will use these instead of the predefined task script section to execute the task and will
skip virtual environment creation.
In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
In case the custom script is specified but does not exist, or if the custom script does not write valid content
into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
standard flow.
"""
class FileBuffering(IntEnum):
"""

View File

@@ -84,13 +84,3 @@ class MissingPackageError(CommandFailedError):
def __str__(self):
return '{self.__class__.__name__}: ' \
'"{self.name}" package is required. Please run "pip install {self.name}"'.format(self=self)
class CustomBuildScriptFailed(CommandFailedError):
def __init__(self, errno, *args, **kwargs):
super(CustomBuildScriptFailed, self).__init__(*args, **kwargs)
self.errno = errno
class SkippedCustomBuildScript(CommandFailedError):
pass

View File

@@ -1,9 +1,6 @@
import os
import re
import warnings
from clearml_agent.definitions import PIP_EXTRA_INDICES
from .requirement import Requirement
@@ -45,14 +42,9 @@ def parse(reqstr, cwd=None):
yield requirement
elif line.startswith('-f') or line.startswith('--find-links') or \
line.startswith('-i') or line.startswith('--index-url') or \
line.startswith('--extra-index-url') or \
line.startswith('--no-index'):
warnings.warn('Private repos not supported. Skipping.')
elif line.startswith('--extra-index-url'):
extra_index = line[len('--extra-index-url'):].strip()
extra_index = re.sub(r"\s+#.*$", "", extra_index) # strip comments
if extra_index and extra_index not in PIP_EXTRA_INDICES:
PIP_EXTRA_INDICES.append(extra_index)
print(f"appended {extra_index} to list of extra pip indices")
continue
elif line.startswith('-Z') or line.startswith('--always-unzip'):
warnings.warn('Unused option --always-unzip. Skipping.')
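On the side of this diff that appends to `PIP_EXTRA_INDICES`, an extra-index line is captured rather than skipped with a warning; a sketch of the expected behavior (the requirement text is illustrative):

```python
# Illustrative input: the index URL is recorded in PIP_EXTRA_INDICES,
# and only "torch" is yielded as an actual requirement.
reqs = "--extra-index-url https://download.pytorch.org/whl/cu101\ntorch==1.6.0\n"
names = [r.name for r in parse(reqs)]  # -> ["torch"]
```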

View File

@@ -18,7 +18,7 @@ from typing import Text, List, Callable, Any, Collection, Optional, Union
import yaml
from clearml_agent.commands.events import Events
from clearml_agent.commands.worker import Worker, get_task_container, set_task_container, get_next_task
from clearml_agent.commands.worker import Worker, get_task_container, set_task_container
from clearml_agent.definitions import ENV_DOCKER_IMAGE
from clearml_agent.errors import APIError
from clearml_agent.helper.base import safe_remove_file
@@ -69,7 +69,7 @@ class K8sIntegration(Worker):
"apt-get update",
"apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
"declare LOCAL_PYTHON",
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which python3.$i && python3.$i -m pip --version && "
"for i in {{10..5}}; do which python3.$i && python3.$i -m pip --version && "
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
@@ -362,7 +362,7 @@ class K8sIntegration(Worker):
print('Failed getting number of used pods: {}'.format(ex))
return -2
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_session=None, **_):
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
@@ -398,19 +398,11 @@ class K8sIntegration(Worker):
self.conf_file_content
or Path(self._session._config_file).read_text()
).encode("ascii")
create_clearml_conf = ["echo '{}' | base64 --decode >> ~/clearml.conf".format(
create_clearml_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
base64.b64encode(
hocon_config_encoded
).decode('ascii')
)]
if task_session:
create_clearml_conf.append(
"export CLEARML_AUTH_TOKEN=$(echo '{}' | base64 --decode)".format(
base64.b64encode(task_session.token.encode("ascii")).decode('ascii')
)
)
)
if self.ports_mode:
print("Kubernetes looking for available pod to use")
@@ -602,22 +594,19 @@ class K8sIntegration(Worker):
extra_docker_bash_script=extra_docker_bash_script)
for line in container_bash_script])
extra_bash_commands = list(create_clearml_conf or [])
extra_bash_commands.append(
"echo '{}' | base64 --decode >> ~/__start_agent__.sh ; "
create_init_script = \
"echo '{}' | base64 --decode >> ~/__start_agent__.sh ; " \
"/bin/bash ~/__start_agent__.sh".format(
base64.b64encode(
script_encoded.encode('ascii')
).decode('ascii'))
)
# Notice: we always leave with exit code 0, so pods are never restarted
container = self._merge_containers(
container,
dict(name=name, image=docker_image,
command=['/bin/bash'],
args=['-c', '{} ; exit 0'.format(' ; '.join(extra_bash_commands))])
args=['-c', '{} ; {} ; exit 0'.format(create_clearml_conf, create_init_script)])
)
if template['spec']['containers']:
@@ -696,7 +685,7 @@ class K8sIntegration(Worker):
"--",
"/bin/sh",
"-c",
"{} ; {}".format(" ; ".join(create_clearml_conf or []), container_bash_script.format(
"{} ; {}".format(create_clearml_conf, container_bash_script.format(
extra_bash_init_cmd=self.extra_bash_init_script or "",
extra_docker_bash_script=docker_bash or "",
task_id=task_id
@@ -753,16 +742,14 @@ class K8sIntegration(Worker):
# get next task in queue
try:
response = get_next_task(
self._session, queue=queue, get_task_info=self._impersonate_as_task_owner
)
response = self._session.api_client.queues.get_next_task(queue=queue)
except Exception as e:
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
continue
else:
try:
task_id = response["entry"]["task"]
except (KeyError, TypeError, AttributeError):
task_id = response.entry.task
except AttributeError:
print("No tasks in queue {}".format(queue))
continue
events_service.send_log_events(
@@ -774,26 +761,8 @@ class K8sIntegration(Worker):
level="INFO",
)
task_session = None
if self._impersonate_as_task_owner:
try:
task_user = response["task_info"]["user"]
task_company = response["task_info"]["company"]
except (KeyError, TypeError, AttributeError):
print("Error: cannot retrieve owner user for the task '{}', skipping".format(task_id))
continue
task_session = self.get_task_session(task_user, task_company)
if not task_session:
print(
"Error: Could not login as the user '{}' for the task '{}', skipping".format(
task_user, task_id
)
)
continue
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
self.run_one_task(queue, task_id, worker_params, task_session)
self.run_one_task(queue, task_id, worker_params)
self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
break
else:
@@ -804,7 +773,7 @@ class K8sIntegration(Worker):
if self._session.config["agent.reload_config"]:
self.reload_config()
def k8s_daemon(self, queue, **kwargs):
def k8s_daemon(self, queue):
"""
Start the k8s Glue service.
This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
@@ -815,10 +784,8 @@ class K8sIntegration(Worker):
:param list(str) queue: queue name to pull from
"""
return self.daemon(
queues=[ObjectID(name=queue)] if queue else None,
log_level=logging.INFO, foreground=True, docker=False, **kwargs,
)
return self.daemon(queues=[ObjectID(name=queue)] if queue else None,
log_level=logging.INFO, foreground=True, docker=False)
@classmethod
def get_ssh_server_bash(cls, ssh_port_number):

View File

@@ -204,13 +204,10 @@ def get_python_path(script_dir, entry_point, package_api, is_conda_env=False):
["-c", "import sys; print('{}'.join(sys.path))".format(python_path_sep)])
org_python_path = python_path_cmd.get_output(cwd=script_dir)
# Add path of the script directory and executable directory
python_path = '{}{python_path_sep}'.format(
Path(script_dir).absolute().as_posix(), python_path_sep=python_path_sep)
if entry_point:
python_path += '{}{python_path_sep}'.format(
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
python_path_sep=python_path_sep)
python_path = '{}{python_path_sep}{}{python_path_sep}'.format(
Path(script_dir).absolute().as_posix(),
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
python_path_sep=python_path_sep)
if is_windows_platform():
python_path = python_path.replace('/', '\\')
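As a worked example of the conditional build above, assuming the POSIX path separator `:`: with `script_dir="/repo"` and `entry_point="src/train.py"` the computed prefix is `/repo:/repo/src:`, while with no entry point it is just `/repo:`; the interpreter's own `sys.path` (captured above as `org_python_path`) is presumably appended after it.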
@@ -506,38 +503,6 @@ def is_conda(config):
return config['agent.package_manager.type'].lower() == 'conda'
def convert_cuda_version_to_float_single_digit_str(cuda_version):
"""
Convert a cuda_version (string/float/int) into a float representation, e.g. 11.4
Notice: returns a string with a single decimal digit only!
:return str:
"""
cuda_version = str(cuda_version or 0)
# if we have a patch version, we parse it here
cuda_version_parts = [int(v) for v in cuda_version.split('.')]
if len(cuda_version_parts) > 1 or cuda_version_parts[0] < 60:
cuda_version = 10 * cuda_version_parts[0]
if len(cuda_version_parts) > 1:
cuda_version += float(".{:d}".format(cuda_version_parts[1]))*10
cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
else:
cuda_version = cuda_version_parts[0]
cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
return cuda_version_full
def convert_cuda_version_to_int_10_base_str(cuda_version):
"""
Convert a cuda_version (string/float/int) into a 10-base integer version string, e.g. "112" for cuda 11.2
:return str:
"""
cuda_version = convert_cuda_version_to_float_single_digit_str(cuda_version)
return str(int(float(cuda_version)*10))
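A few worked examples of these helpers, as implied by the parsing logic above:

```python
# Behavior implied by the converters above (illustrative, not from the source):
convert_cuda_version_to_float_single_digit_str("11.2")  # -> "11.2"
convert_cuda_version_to_float_single_digit_str(112)     # -> "11.2"  (10-base integer form)
convert_cuda_version_to_float_single_digit_str("11")    # -> "11.0"  (minor defaults to 0)
convert_cuda_version_to_int_10_base_str("11.2")         # -> "112"
```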
class NonStrictAttrs(object):
@classmethod

View File

@@ -2,7 +2,7 @@ from __future__ import unicode_literals, print_function
import csv
import sys
from collections.abc import Iterable
from collections import Iterable
from typing import List, Dict, Text, Any
from attr import attrs, attrib

View File

@@ -19,9 +19,7 @@ from clearml_agent.external.requirements_parser import parse
from clearml_agent.external.requirements_parser.requirement import Requirement
from clearml_agent.errors import CommandFailedError
from clearml_agent.helper.base import (
rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo,
convert_cuda_version_to_float_single_digit_str, convert_cuda_version_to_int_10_base_str, )
from clearml_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo
from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
from clearml_agent.helper.package.requirements import SimpleVersion
from clearml_agent.session import Session
@@ -169,7 +167,7 @@ class CondaAPI(PackageManager):
raise ValueError("Could not restore Conda environment, cannot find {}".format(
self.conda_pre_build_env_path))
command = Argv(
output = Argv(
self.conda,
"create",
"--yes",
@@ -177,9 +175,7 @@ class CondaAPI(PackageManager):
"--prefix",
self.path,
"python={}".format(self.python),
)
print('Executing Conda: {}'.format(command.serialize()))
output = command.get_output(stderr=DEVNULL)
).get_output(stderr=DEVNULL)
match = re.search(
r"\W*(.*activate) ({})".format(re.escape(str(self.path))), output
)
@@ -193,6 +189,14 @@ class CondaAPI(PackageManager):
if conda_env.is_file() and not is_windows_platform():
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
# install cuda toolkit
# noinspection PyBroadException
try:
cuda_version = float(int(self.session.config['agent.cuda_version'])) / 10.0
if cuda_version > 0:
self._install('cudatoolkit={:.1f}'.format(cuda_version))
except Exception:
pass
return self
def _init_existing_environment(self, conda_pre_build_env_path):
@@ -424,7 +428,7 @@ class CondaAPI(PackageManager):
finally:
PackageManager._selected_manager = self
self.requirements_manager.post_install(self.session, package_manager=self)
self.requirements_manager.post_install(self.session)
def load_requirements(self, requirements):
# if we are in read only mode, do not uninstall anything
@@ -452,18 +456,9 @@ class CondaAPI(PackageManager):
requirements['conda'] = requirements['conda'].split('\n')
has_torch = False
has_matplotlib = False
has_cudatoolkit = False
cuda_version_full = 0
# noinspection PyBroadException
try:
# notice this is an integer version: 112 (means 11.2)
cuda_version = str(self.session.config.get('agent.cuda_version', "")).strip()
if not cuda_version:
cuda_version = 0
else:
cuda_version_full = convert_cuda_version_to_float_single_digit_str(cuda_version)
cuda_version = int(convert_cuda_version_to_int_10_base_str(cuda_version))
except Exception:
cuda_version = int(self.session.config.get('agent.cuda_version', 0))
except:
cuda_version = 0
# notice 'conda' entry with empty string is a valid conda requirements list, it means pip only
@@ -480,7 +475,6 @@ class CondaAPI(PackageManager):
continue
m = MarkerRequirement(marker[0])
m.validate_local_file_ref()
# conda does not support version control links
if m.vcs:
pip_requirements.append(m)
@@ -494,19 +488,6 @@ class CondaAPI(PackageManager):
if '.' not in m.specs[0][1]:
continue
if m.name.lower() == 'cudatoolkit':
# skip cuda if we are running on CPU
if not cuda_version:
continue
has_cudatoolkit = True
# cuda version, only major.minor
requested_cuda_version = '.'.join(m.specs[0][1].split('.')[:2])
# make sure that the cuda_version we support can install the requested cuda (major version)
if int(float(requested_cuda_version)) > int(float(cuda_version)/10.0):
continue
m.specs = [(m.specs[0][0], str(requested_cuda_version)), ]
conda_supported_req_names.append(m.name.lower())
if m.req.name.lower() == 'matplotlib':
has_matplotlib = True
@@ -523,11 +504,6 @@ class CondaAPI(PackageManager):
reqs.append(m)
if not has_cudatoolkit and cuda_version:
m = MarkerRequirement(Requirement.parse("cudatoolkit == {}".format(cuda_version_full)))
has_cudatoolkit = True
reqs.append(m)
# if we have a conda list, the rest should be installed with pip,
# this means any experiment that was executed with a pip environment
# will be installed using pip
@@ -541,9 +517,9 @@ class CondaAPI(PackageManager):
continue
m = MarkerRequirement(marker[0])
# remove local files reference if it does not exist (leave the package name)
m.validate_local_file_ref()
# skip over local files (we cannot change the version to a local file)
if m.local_file:
continue
m_name = (m.name or '').lower()
if m_name in conda_supported_req_names:
# this package is in the conda list,
@@ -583,12 +559,8 @@ class CondaAPI(PackageManager):
# change _ to - in name but not the prefix _ (as this is conda prefix)
if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
r.name = r.name.replace('_', '-')
if has_cudatoolkit and r.specs and len(r.specs[0]) > 1 and r.name == 'cudatoolkit':
# select specific cuda version if it came from the requirements
r.specs = [(r.specs[0][0].replace('==', '='), r.specs[0][1].split('.post')[0])]
elif r.specs and r.specs[0] and len(r.specs[0]) > 1:
# remove .post from version numbers it fails with ~= version, and change == to ~=
# remove .post from version numbers, it fails ~= version, and change == to ~=
if r.specs and r.specs[0]:
r.specs = [(r.specs[0][0].replace('==', '~='), r.specs[0][1].split('.post')[0])]
while reqs:
@@ -642,7 +614,7 @@ class CondaAPI(PackageManager):
finally:
PackageManager._selected_manager = self
self.requirements_manager.post_install(self.session, package_manager=self)
self.requirements_manager.post_install(self.session)
return True
def _parse_conda_result_bad_packges(self, result_dict):

View File

@@ -46,10 +46,11 @@ class ExternalRequirements(SimpleSubstitution):
post_install_req = self.post_install_req
self.post_install_req = []
for req in post_install_req:
if self.is_already_installed(req):
print("No need to reinstall \'{}\' from VCS, "
"the exact same version is already installed".format(req.name))
continue
try:
freeze_base = PackageManager.out_of_scope_freeze() or ''
except:
freeze_base = ''
req_line = self._add_vcs_credentials(req, session)
# if we have older pip version we have to make sure we replace back the package name with the
@@ -95,8 +96,7 @@ class ExternalRequirements(SimpleSubstitution):
vcs._set_ssh_url()
new_req_line = 'git+{}{}{}'.format(
'' if scheme and '://' in vcs.url else scheme,
vcs_url if session.config.get('agent.force_git_ssh_protocol', None) else vcs.url_with_auth,
fragment
vcs.url_with_auth, fragment
)
if new_req_line != req_line:
furl_line = furl(new_req_line)
@@ -175,11 +175,5 @@ class OnlyExternalRequirements(ExternalRequirements):
# Do not store the skipped requirements
# mark skip package
if super(OnlyExternalRequirements, self).match(req):
if self.is_already_installed(req):
print("No need to reinstall \'{}\' from VCS, "
"the exact same version is already installed".format(req.name))
return Text('')
return self._add_vcs_credentials(req, self._session)
return Text('')

View File

@@ -12,7 +12,7 @@ from ..requirements import RequirementsManager
class VirtualenvPip(SystemPip, PackageManager):
def __init__(self, session, python, requirements_manager, path, interpreter=None, execution_info=None, **kwargs):
# type: (Session, str, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
# type: (Session, float, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
"""
Program interface to virtualenv pip.
Must be given either path to virtualenv or source command.
@@ -39,7 +39,7 @@ class VirtualenvPip(SystemPip, PackageManager):
if isinstance(requirements, dict) and requirements.get("pip"):
requirements["pip"] = self.requirements_manager.replace(requirements["pip"])
super(VirtualenvPip, self).load_requirements(requirements)
self.requirements_manager.post_install(self.session, package_manager=self)
self.requirements_manager.post_install(self.session)
def create_flags(self):
"""

View File

@@ -5,7 +5,6 @@ import attr
import sys
import os
from pathlib2 import Path
from clearml_agent.helper.process import Argv, DEVNULL, check_if_command_exists
from clearml_agent.session import Session, POETRY
@@ -82,32 +81,6 @@ class PoetryConfig:
@_guard_enabled
def initialize(self, cwd=None):
if not self._initialized:
if self.session.config.get("agent.package_manager.poetry_version", None) is not None:
version = str(self.session.config.get("agent.package_manager.poetry_version"))
print('Upgrading Poetry package {}'.format(version))
# first upgrade pip if we need to
try:
from clearml_agent.helper.package.pip_api.venv import VirtualenvPip
pip = VirtualenvPip(
session=self.session, python=self._python,
requirements_manager=None, path=None, interpreter=self._python)
pip.upgrade_pip()
except Exception as ex:
self.log.warning("failed upgrading pip: {}".format(ex))
# now install poetry
try:
version = version.replace(' ', '')
if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
version = version
elif version:
version = "==" + version
argv = Argv(self._python, "-m", "pip", "install", "poetry{}".format(version),
"--upgrade", "--disable-pip-version-check")
print(argv.get_output())
except Exception as ex:
self.log.warning("failed upgrading poetry: {}".format(ex))
self._initialized = True
try:
self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)

View File

@@ -1,4 +1,3 @@
import re
from typing import Text
from .base import PackageManager
@@ -12,14 +11,13 @@ class PriorityPackageRequirement(SimpleSubstitution):
def __init__(self, *args, **kwargs):
super(PriorityPackageRequirement, self).__init__(*args, **kwargs)
self._replaced_packages = {}
# check if we need to replace the packages:
priority_packages = self.config.get('agent.package_manager.priority_packages', None)
if priority_packages:
self.__class__.name = [p.lower() for p in priority_packages]
self.__class__.name = priority_packages
priority_optional_packages = self.config.get('agent.package_manager.priority_optional_packages', None)
if priority_optional_packages:
self.__class__.optional_package_names = [p.lower() for p in priority_optional_packages]
self.__class__.optional_package_names = priority_optional_packages
def match(self, req):
# match both Cython & cython
@@ -30,9 +28,7 @@ class PriorityPackageRequirement(SimpleSubstitution):
Replace a requirement
:raises: ValueError if version is pre-release
"""
self._replaced_packages[req.name] = req.line
if req.name.lower() in self.optional_package_names:
if req.name in self.optional_package_names:
# noinspection PyBroadException
try:
if PackageManager.out_of_scope_install_package(str(req)):
@@ -43,41 +39,6 @@ class PriorityPackageRequirement(SimpleSubstitution):
PackageManager.out_of_scope_install_package(str(req))
return Text(req)
def replace_back(self, list_of_requirements):
"""
:param list_of_requirements: {'pip': ['a==1.0', ]}
:return: {'pip': ['a==1.0', ]}
"""
# if we replaced setuptools, it means someone requested it, and since freeze will not contain it,
# we need to add it manually
if not self._replaced_packages or "setuptools" not in self._replaced_packages:
return list_of_requirements
try:
for k, lines in list_of_requirements.items():
# k is either pip/conda
if k not in ('pip', 'conda'):
continue
for i, line in enumerate(lines):
if not line or line.lstrip().startswith('#'):
continue
parts = [p for p in re.split(r'\s|=|\.|<|>|~|!|@|#', line) if p]
if not parts:
continue
# if we found setuptools, do nothing
if parts[0] == "setuptools":
return list_of_requirements
# if we are here it means we have not found setuptools
# we should add it:
if "pip" in list_of_requirements:
list_of_requirements["pip"] = [self._replaced_packages["setuptools"]] + list_of_requirements["pip"]
except Exception as ex: # noqa
return list_of_requirements
return list_of_requirements
class PackageCollectorRequirement(SimpleSubstitution):
"""

View File

@@ -2,19 +2,17 @@ from __future__ import unicode_literals
import re
import sys
import platform
from furl import furl
import urllib.parse
from operator import itemgetter
from html.parser import HTMLParser
from typing import Text, Optional, Dict
from typing import Text
import attr
import requests
import six
from .requirements import SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion, MarkerRequirement
from ...external.requirements_parser.requirement import Requirement
from .requirements import SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion
OS_TO_WHEEL_NAME = {"linux": "linux_x86_64", "windows": "win_amd64"}
@@ -176,43 +174,36 @@ class PytorchRequirement(SimpleSubstitution):
self.log = self._session.get_logger(__name__)
self.package_manager = self.config["agent.package_manager.type"].lower()
self.os = os_name or self.get_platform()
self.cuda = None
self.python_version_string = None
self.python_major_minor_str = None
self.python = None
self._fix_setuptools = None
self.exceptions = []
self.cuda = "cuda{}".format(self.cuda_version).lower()
self.python_version_string = str(self.config["agent.default_python"])
self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
if '.' not in self.python_major_minor_str:
raise PytorchResolutionError(
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
"must have both major and minor parts of the version (for example: '3.7')".format(
self.python_version_string
)
)
self.python = "python{}".format(self.python_major_minor_str)
self.exceptions = [
PytorchResolutionError(message)
for message in (
None,
'cuda version "{}" is not supported'.format(self.cuda),
'python version "{}" is not supported'.format(
self.python_version_string
),
)
]
try:
self.validate_python_version()
except PytorchResolutionError as e:
self.log.warn("will not be able to install pytorch wheels: %s", e.args[0])
self._original_req = []
def _init_python_ver_cuda_ver(self):
if self.cuda is None:
self.cuda = "cuda{}".format(self.cuda_version).lower()
if self.python_version_string is None:
self.python_version_string = str(self.config["agent.default_python"])
if self.python_major_minor_str is None:
self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
if '.' not in self.python_major_minor_str:
raise PytorchResolutionError(
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
"must have both major and minor parts of the version (for example: '3.7')".format(
self.python_version_string
)
)
if self.python is None:
self.python = "python{}".format(self.python_major_minor_str)
if not self.exceptions:
self.exceptions = [
PytorchResolutionError(message)
for message in (
None,
'cuda version "{}" is not supported'.format(self.cuda),
'python version "{}" is not supported'.format(
self.python_version_string
),
)
]
@property
def is_conda(self):
return self.package_manager == "conda"
@@ -225,8 +216,6 @@ class PytorchRequirement(SimpleSubstitution):
"""
Make sure python version has both major and minor versions as required for choosing pytorch wheel
"""
self._init_python_ver_cuda_ver()
if self.is_pip and not self.python_major_minor_str:
raise PytorchResolutionError(
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
@@ -248,15 +237,10 @@ class PytorchRequirement(SimpleSubstitution):
return "macos"
raise RuntimeError("unrecognized OS")
@staticmethod
def get_arch():
return str(platform.machine()).lower()
def _get_link_from_torch_page(self, req, torch_url):
links_parser = LinksHTMLParser()
links_parser.feed(requests.get(torch_url, timeout=10).text)
platform_wheel = "win" if self.get_platform() == "windows" else self.get_platform()
arch_wheel = self.get_arch()
py_ver = self.python_major_minor_str.replace('.', '')
url = None
last_v = None
@@ -277,11 +261,8 @@ class PytorchRequirement(SimpleSubstitution):
continue
if len(parts) < 3 or not parts[2].endswith(py_ver):
continue
if len(parts) < 5 or platform_wheel not in parts[4].lower():
if len(parts) < 5 or platform_wheel not in parts[4]:
continue
if len(parts) < 5 or arch_wheel not in parts[4].lower():
continue
# yes this is for linux python 2.7 support, this is the only python 2.7 we support...
if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
continue
@@ -313,21 +294,18 @@ class PytorchRequirement(SimpleSubstitution):
def get_url_for_platform(self, req):
# check if package is already installed with system packages
self.validate_python_version()
# noinspection PyBroadException
try:
if self.config.get("agent.package_manager.system_site_packages", None):
from pip._internal.commands.show import search_packages_info
installed_torch = list(search_packages_info([req.name]))
# notice the comparison order, the first part will make sure we have a valid installed package
installed_torch_version = (getattr(installed_torch[0], 'version', None) or installed_torch[0]['version']) \
if installed_torch else None
if installed_torch and installed_torch_version and \
req.compare_version(installed_torch_version):
if installed_torch and installed_torch[0]['version'] and \
req.compare_version(installed_torch[0]['version']):
print('PyTorch: requested "{}" version {}, using pre-installed version {}'.format(
req.name, req.specs[0] if req.specs else 'unspecified', installed_torch_version))
req.name, req.specs[0] if req.specs else 'unspecified', installed_torch[0]['version']))
# package already installed, do nothing
req.specs = [('==', str(installed_torch_version))]
req.specs = [('==', str(installed_torch[0]['version']))]
return '{} {} {}'.format(req.name, req.specs[0][0], req.specs[0][1]), True
except Exception:
pass
@@ -368,10 +346,6 @@ class PytorchRequirement(SimpleSubstitution):
else:
print('Trying PyTorch CUDA version {} support'.format(torch_url_key))
# fix broken pytorch setuptools incompatibility
if closest_matched_version and SimpleVersion.compare_versions(closest_matched_version, "<", "1.11.0"):
self._fix_setuptools = "setuptools < 59"
if not url:
url = PytorchWheel(
torch_version=fix_version(version),
@@ -512,7 +486,7 @@ class PytorchRequirement(SimpleSubstitution):
for i, line in enumerate(lines):
if not line or line.lstrip().startswith('#'):
continue
parts = [p for p in re.split(r'\s|=|\.|<|>|~|!|@|#', line) if p]
parts = [p for p in re.split('\s|=|\.|<|>|~|!|@|#', line) if p]
if not parts:
continue
for req, new_req in self._original_req:
@@ -534,16 +508,6 @@ class PytorchRequirement(SimpleSubstitution):
return list_of_requirements
def post_scan_add_req(self): # type: () -> Optional[MarkerRequirement]
"""
Allows the RequirementSubstitution to add an extra line/requirements after
the initial requirements scan is completed.
Called only once per requirements.txt object
"""
if self._fix_setuptools:
return MarkerRequirement(Requirement.parse(self._fix_setuptools))
return None
MAP = {
"windows": {
"cuda100": {

View File

@@ -14,11 +14,8 @@ from pathlib2 import Path
from pyhocon import ConfigTree
import six
import logging
from clearml_agent.definitions import PIP_EXTRA_INDICES
from clearml_agent.helper.base import (
warning, is_conda, which, join_lines, is_windows_platform,
convert_cuda_version_to_int_10_base_str, )
from clearml_agent.helper.base import warning, is_conda, which, join_lines, is_windows_platform
from clearml_agent.helper.process import Argv, PathLike
from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
from clearml_agent.session import Session, normalize_cuda_version
@@ -156,31 +153,6 @@ class MarkerRequirement(object):
return SimpleVersion.compare_versions(
version_a=requested_version, op=op, version_b=version, num_parts=num_parts)
def remove_local_file_ref(self):
if not self.local_file or self.vcs or self.editable or self.path:
return False
parts = re.split(r"@\s*{}".format(self.req.uri), self.req.line)
# if we did not find anything do nothing
if len(parts) < 2:
return False
self.req.line = ''.join(parts).strip()
self.req.uri = None
self.req.local_file = False
return True
def validate_local_file_ref(self):
# if local file does not exist, remove the reference to it
if self.vcs or self.editable or self.path or not self.local_file or not self.name or \
not self.uri or not self.uri.startswith("file://"):
return
local_path = Path(self.uri[len("file://"):])
if not local_path.exists():
line = self.line
if self.remove_local_file_ref():
# print warning
logging.getLogger(__name__).warning(
'Local file not found [{}], references removed'.format(line))
class SimpleVersion:
_sub_versions_pep440 = ['a', 'b', 'rc', '.post', '.dev', '+', ]
@@ -236,11 +208,7 @@ class SimpleVersion:
if not version_b:
return True
if not num_parts:
num_parts = max(len(version_a.split('.')), len(version_b.split('.')), )
if op == '~=':
num_parts = len(version_b.split('.')) - 1
num_parts = max(num_parts, 2)
op = '=='
ignore_sub_versions = True
@@ -277,16 +245,6 @@ class SimpleVersion:
return version_a_key < version_b_key
raise ValueError('Unrecognized comparison operator [{}]'.format(op))
@classmethod
def max_version(cls, version_a, version_b):
return version_a if cls.compare_versions(
version_a=version_a, op='>=', version_b=version_b, num_parts=None) else version_b
@classmethod
def min_version(cls, version_a, version_b):
return version_a if cls.compare_versions(
version_a=version_a, op='<=', version_b=version_b, num_parts=None) else version_b
@staticmethod
def _parse_letter_version(
letter, # type: str
@@ -355,77 +313,6 @@ class SimpleVersion:
return ()
def compare_version_rules(specs_a, specs_b):
# specs_a/b are a list of tuples: [('==', '1.2.3'), ] or [('>=', '1.2'), ('<', '1.3')]
# section definition:
class Section(object):
def __init__(self, left=None, left_eq=False, right=None, right_eq=False):
self.left, self.left_eq, self.right, self.right_eq = left, left_eq, right, right_eq
# first create a list of in/out sections for each spec
# >, >= are left rule
# <, <= are right rule
# ~= x.y.z is converted to: >= x.y and < x.y+1
# ==/=== are converted to: >= and <=
# != x.y.z will split a section into: left < x.y.z and right > x.y.z
def create_section(specs):
section = Section()
for op, v in specs:
a = section
if op == '>':
a.left = v
a.left_eq = False
elif op == '>=':
a.left = v
a.left_eq = True
elif op == '<':
a.right = v
a.right_eq = False
elif op == '<=':
a.right = v
a.right_eq = True
elif op == '==':
a.left = v
a.left_eq = True
a.right = v
a.right_eq = True
elif op == '~=':
new_v = v.split('.')
a_left = '.'.join(new_v[:-1])
a.left = a_left if not a.left else SimpleVersion.max_version(a_left, a.left)
a.left_eq = True
a_right = '.'.join(new_v[:-2] + [str(int(new_v[-2])+1)])
a.right = a_right if not a.right else SimpleVersion.min_version(a_right, a.right)
a.right_eq = False if a.right == a_right else a.right_eq
return section
section_a = create_section(specs_a)
section_b = create_section(specs_b)
i = Section()
# then we have a list of sections for spec A/B
if section_a.left == section_b.left:
i.left = section_a.left
i.left_eq = section_a.left_eq and section_b.left_eq
else:
i.left = SimpleVersion.max_version(section_a.left, section_b.left)
i.left_eq = section_a.left_eq if i.left == section_a.left else section_b.left_eq
if section_a.right == section_b.right:
i.right = section_a.right
i.right_eq = section_a.right_eq and section_b.right_eq
else:
i.right = SimpleVersion.min_version(section_a.right, section_b.right)
i.right_eq = section_a.right_eq if i.right == section_a.right else section_b.right_eq
# return true if any section from A intersects a section from B
valid = True
valid &= SimpleVersion.compare_versions(
version_a=i.left, op='<=' if i.left_eq else '<', version_b=i.right, num_parts=None)
valid &= SimpleVersion.compare_versions(
version_a=i.right, op='>=' if i.left_eq else '>', version_b=i.left, num_parts=None)
return valid
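A hypothetical usage sketch of compare_version_rules as defined above, assuming both spec lists bound their range on both sides (version numbers are illustrative only):

    # overlapping ranges [1.2, 2.0) and [1.5, 1.8) -> True
    compare_version_rules([('>=', '1.2'), ('<', '2.0')], [('>=', '1.5'), ('<', '1.8')])
    # disjoint ranges [1.2, 2.0) and [2.1, 3) -> False
    compare_version_rules([('>=', '1.2'), ('<', '2.0')], [('>=', '2.1'), ('<', '3')])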
@six.add_metaclass(ABCMeta)
class RequirementSubstitution(object):
@@ -437,7 +324,6 @@ class RequirementSubstitution(object):
self.config = session.config # type: ConfigTree
self.suffix = '.post{config[agent.cuda_version]}.dev{config[agent.cudnn_version]}'.format(config=self.config)
self.package_manager = self.config['agent.package_manager.type']
self._is_already_installed_cb = None
@abstractmethod
def match(self, req): # type: (MarkerRequirement) -> bool
@@ -453,20 +339,6 @@ class RequirementSubstitution(object):
"""
pass
def set_is_already_installed_cb(self, cb):
self._is_already_installed_cb = cb
def is_already_installed(self, req):
if not self._is_already_installed_cb:
return False
# noinspection PyBroadException
try:
return self._is_already_installed_cb(req)
except BaseException as ex:
# debug: the callback could not determine the install state of the package
print("Warning: Requirements post install callback exception (check if package installed): {}".format(ex))
return False
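A minimal sketch of wiring this callback, assuming a hypothetical handler instance and a dict built from a freeze() call (all names here are illustrative):

    installed = {"numpy": "1.21.0", "six": "1.15.0"}   # hypothetical freeze() output
    # report a requirement as pre-installed when its name is already present
    handler.set_is_already_installed_cb(lambda req: req.name in installed)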
def post_scan_add_req(self): # type: () -> Optional[MarkerRequirement]
"""
Allows the RequirementSubstitution to add an extra line/requirements after
@@ -491,7 +363,7 @@ class RequirementSubstitution(object):
@property
def cuda_version(self):
return convert_cuda_version_to_int_10_base_str(self.config['agent.cuda_version'])
return self.config['agent.cuda_version']
@property
def cudnn_version(self):
@@ -577,7 +449,6 @@ class RequirementsManager(object):
cache_dir=pip_cache_dir.as_posix())
self._base_interpreter = base_interpreter
self._cwd = None
self._installed_parsed_packages = set()
def register(self, cls): # type: (Type[RequirementSubstitution]) -> None
self.handlers.append(cls(self._session))
@@ -597,9 +468,20 @@ class RequirementsManager(object):
return None
def replace(self, requirements): # type: (Text) -> Text
parsed_requirements = self.parse_requirements_section_to_marker_requirements(
requirements=requirements, cwd=self._cwd)
def safe_parse(req_str):
# noinspection PyBroadException
try:
return list(parse(req_str, cwd=self._cwd))
except Exception as ex:
return [Requirement(req_str)]
parsed_requirements = tuple(
map(
MarkerRequirement,
[r for line in (requirements.splitlines() if isinstance(requirements, six.text_type) else requirements)
for r in safe_parse(line)]
)
)
if not parsed_requirements:
# return the original requirements just in case
return requirements
@@ -628,29 +510,14 @@ class RequirementsManager(object):
result = list(result)
# add post scan add requirements call back
double_req_set = None
for h in self.handlers:
reqs = h.post_scan_add_req()
if reqs:
if double_req_set is None:
def safe_parse_name(line):
try:
return Requirement.parse(line).name
except: # noqa
return None
double_req_set = set([safe_parse_name(r) for r in result if r])
for r in (reqs if isinstance(reqs, (tuple, list)) else [reqs]):
if r and (not r.name or r.name not in double_req_set):
result.append(r.tostr())
elif r:
print("SKIPPING additional auto installed package: \"{}\"".format(r))
req = h.post_scan_add_req()
if req:
result.append(req.tostr())
return join_lines(result)
def post_install(self, session, package_manager=None):
if package_manager:
self.update_installed_packages_state(package_manager.freeze())
def post_install(self, session):
for h in self.handlers:
try:
h.post_install(session)
@@ -672,34 +539,6 @@ class RequirementsManager(object):
def get_interpreter(self):
return self._base_interpreter
def update_installed_packages_state(self, requirements):
"""
Updates internal Installed Packages objects, so that later we can detect
if we already have a pre-installed package
:param requirements: is the output of a freeze() call, i.e. dict {'pip': "package==version"}
"""
requirements = requirements if not isinstance(requirements, dict) else requirements.get("pip")
self._installed_parsed_packages = self.parse_requirements_section_to_marker_requirements(
requirements=requirements, cwd=self._cwd)
for h in self.handlers:
h.set_is_already_installed_cb(self._callback_is_already_installed)
def _callback_is_already_installed(self, req):
for p in (self._installed_parsed_packages or []):
if p.name != req.name:
continue
# if this is a version control package, only return True if both the installed and the requested package specify a commit ID
if req.vcs:
return p.vcs and req.revision and req.revision == p.revision
if not req.specs and not p.specs:
return True
# otherwise, return True only if the installed and requested versions match exactly
return req.specs and p.specs and req.compare_version(p, op="==")
return False
@staticmethod
def get_cuda_version(config): # type: (ConfigTree) -> (Text, Text)
# we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
@@ -775,29 +614,3 @@ class RequirementsManager(object):
return (normalize_cuda_version(cuda_version or 0),
normalize_cuda_version(cudnn_version or 0))
@staticmethod
def parse_requirements_section_to_marker_requirements(requirements, cwd=None):
def safe_parse(req_str):
# noinspection PyBroadException
try:
return list(parse(req_str, cwd=cwd))
except Exception as ex:
return [Requirement(req_str)]
def create_req(x):
r = MarkerRequirement(x)
r.validate_local_file_ref()
return r
if not requirements:
return tuple()
parsed_requirements = tuple(
map(
create_req,
[r for line in (requirements.splitlines() if isinstance(requirements, str) else requirements)
for r in safe_parse(line)]
)
)
return parsed_requirements
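A hypothetical call to the static helper above (requirement lines are illustrative; local file references are validated and dropped with a warning if the file is missing):

    reqs = RequirementsManager.parse_requirements_section_to_marker_requirements(
        "numpy==1.21.0\nclearml>=1.1\nfile:///tmp/local_pkg.whl"
    )
    for r in reqs:
        print(r.name, r.specs)   # each entry is a MarkerRequirement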


@@ -108,7 +108,7 @@ class VCS(object):
)
self.url = url
self.location = Text(location)
self._revision = revision
self.revision = revision
self.log = self.session.get_logger(__name__)
@property
@@ -390,7 +390,7 @@ class VCS(object):
"""
Checkout repository at specified revision
"""
self.call("checkout", self._revision, *self.checkout_flags, cwd=self.location)
self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
@abc.abstractmethod
def pull(self):
@@ -482,7 +482,7 @@ class VCS(object):
parsed_url = furl(url)
except ValueError:
return url
if parsed_url.scheme in ["", "ssh"] or (parsed_url.scheme or '').startswith("git"):
if parsed_url.scheme in ["", "ssh"] or parsed_url.scheme.startswith("git"):
return parsed_url.url
config_user = ENV_AGENT_GIT_USER.get() or config.get("agent.{}_user".format(cls.executable_name), None)
config_pass = ENV_AGENT_GIT_PASS.get() or config.get("agent.{}_pass".format(cls.executable_name), None)
@@ -519,7 +519,7 @@ class VCS(object):
class Git(VCS):
executable_name = "git"
main_branch = ("master", "main")
main_branch = "master"
clone_flags = ("--quiet", "--recursive")
checkout_flags = ("--force",)
COMMAND_ENV = {
@@ -529,18 +529,9 @@ class Git(VCS):
"GIT_SSH_COMMAND": "ssh -oBatchMode=yes",
}
def __init__(self, *args, **kwargs):
super(Git, self).__init__(*args, **kwargs)
try:
self.call("config", "--global", "--replace-all", "safe.directory", "*", cwd=self.location)
except: # noqa
pass
@staticmethod
def remote_branch_name(branch):
return [
"origin/{}".format(b) for b in ([branch] if isinstance(branch, str) else branch)
]
return "origin/{}".format(branch)
def executable_not_found_error_help(self):
return 'Cannot find "{}" executable. {}'.format(
@@ -562,15 +553,7 @@ class Git(VCS):
"""
Checkout repository at specified revision
"""
revisions = [self._revision] if isinstance(self._revision, str) else self._revision
for i, revision in enumerate(revisions):
try:
self.call("checkout", revision, *self.checkout_flags, cwd=self.location)
break
except subprocess.CalledProcessError:
if i == len(revisions) - 1:
raise
self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
try:
self.call("submodule", "update", "--recursive", cwd=self.location)
except: # noqa
@@ -610,7 +593,7 @@ class Hg(VCS):
"pull",
self.url_with_auth,
cwd=self.location,
*(("-r", self._revision) if self._revision else ())
*(("-r", self.revision) if self.revision else ())
)
info_commands = dict(
@@ -680,9 +663,7 @@ def clone_repository_cached(session, execution, destination):
vcs.pull()
rm_tree(destination)
shutil.copytree(Text(cached_repo_path), Text(clone_folder),
symlinks=select_for_platform(linux=True, windows=False),
ignore_dangling_symlinks=True)
shutil.copytree(Text(cached_repo_path), Text(clone_folder))
if not clone_folder.is_dir():
raise CommandFailedError(
"copying of repository failed: from {} to {}".format(
@@ -690,9 +671,9 @@ def clone_repository_cached(session, execution, destination):
)
)
# checkout in the newly copied destination
vcs.location = Text(clone_folder)
vcs.checkout()
# checkout in the newly copied destination
vcs.location = Text(clone_folder)
vcs.checkout()
repo_info = vcs.get_repository_copy_info(clone_folder)


@@ -99,10 +99,8 @@ DAEMON_ARGS = dict({
'aliases': ['-d'],
},
'--stop': {
'help': 'Stop the running agent (based on the same set of arguments). '
'Optional: provide a list of specific local worker IDs to stop',
'nargs': '*',
'default': False,
'help': 'Stop the running agent (based on the same set of arguments)',
'action': 'store_true',
},
'--dynamic-gpus': {
'help': 'Allow dynamically allocating GPUs based on queue properties, '
@@ -167,7 +165,7 @@ COMMANDS = {
},
'--docker': {
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
'nargs': '*',
'default': False,
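A hypothetical invocation of the extended '--stop' form above (worker IDs are placeholders): clearml-agent daemon --stop worker_1 worker_2 stops only the listed local workers, while a bare --stop behaves as before.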
@@ -201,18 +199,11 @@ COMMANDS = {
},
'--docker': {
'help': 'Build the experiment inside a docker (v19.03 and above). Optional args <image> <arguments> or '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
'nargs': '*',
'default': False,
},
'--force-docker': {
'help': 'Force using the agent-specified docker image (either explicitly in the --docker argument or '
'using the agent\'s default docker image). If provided, the agent will not use any docker '
'container information stored on the task itself (default False)',
'default': False,
'action': 'store_true',
},
'--python-version': {
'help': 'Virtual environment python version to use',
},
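A hypothetical clearml-agent build invocation combining the flags from the newer side of this hunk (the task ID and image are placeholders):

    clearml-agent build --id aabbccdd --docker nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04 --force-docker --python-version 3.8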


@@ -229,35 +229,26 @@ class Session(_Session):
except:
pass
def print_configuration(
self,
remove_secret_keys=("secret", "pass", "token", "account_key", "contents"),
skip_value_keys=("environment", )
):
def print_configuration(self, remove_secret_keys=("secret", "pass", "token", "account_key")):
# remove all the secrets from the print
def recursive_remove_secrets(dictionary, secret_keys=(), empty_keys=()):
def recursive_remove_secrets(dictionary, secret_keys=()):
for k in list(dictionary):
for s in secret_keys:
if s in k:
dictionary.pop(k)
break
for s in empty_keys:
if s == k:
dictionary[k] = {key: '****' for key in dictionary[k]} \
if isinstance(dictionary[k], dict) else '****'
break
if isinstance(dictionary.get(k, None), dict):
recursive_remove_secrets(dictionary[k], secret_keys=secret_keys, empty_keys=empty_keys)
recursive_remove_secrets(dictionary[k], secret_keys=secret_keys)
elif isinstance(dictionary.get(k, None), (list, tuple)):
for item in dictionary[k]:
if isinstance(item, dict):
recursive_remove_secrets(item, secret_keys=secret_keys, empty_keys=empty_keys)
recursive_remove_secrets(item, secret_keys=secret_keys)
config = deepcopy(self.config.to_dict())
# remove the env variable, it's not important
config.pop('env', None)
if remove_secret_keys or skip_value_keys:
recursive_remove_secrets(config, secret_keys=remove_secret_keys, empty_keys=skip_value_keys)
if remove_secret_keys:
recursive_remove_secrets(config, secret_keys=remove_secret_keys)
# remove logging.loggers.urllib3.level from the print
try:
config['logging']['loggers']['urllib3'].pop('level', None)
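A minimal sketch of the masking behavior implemented above (dictionary contents invented for illustration): keys containing any secret substring are dropped entirely, while keys listed in skip_value_keys keep their structure but mask every value:

    cfg = {"api": {"secret_key": "abc"}, "environment": {"AWS_KEY": "xyz"}}
    # after recursive_remove_secrets(cfg, secret_keys=("secret",), empty_keys=("environment",)):
    # {"api": {}, "environment": {"AWS_KEY": "****"}}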


@@ -1 +1 @@
__version__ = '1.3.0'
__version__ = '1.1.2'


@@ -171,7 +171,7 @@ agent {
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host", ]


@@ -1,75 +0,0 @@
ARG TAG=3.7.12-alpine3.15
FROM python:${TAG} as build
RUN apk add --no-cache \
gcc \
musl-dev \
libffi-dev
RUN python3 \
-m pip \
install \
--prefix=/install \
--no-cache-dir \
-U \
clearml-agent \
"cryptography>=2.9"
FROM python:${TAG} as target
WORKDIR /app
ARG KUBECTL_VERSION=1.22.4
# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8
# ENV LANG=en_US.UTF-8
# ENV LANGUAGE=en_US.UTF-8
# ENV PYTHONIOENCODING=UTF-8
COPY --from=build /install /usr/local
ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/
RUN chmod +x /usr/bin/kubectl
RUN apk add --no-cache \
bash
COPY k8s_glue_example.py .
# AWS CLI
# https://github.com/kyleknap/aws-cli/blob/source-proposal/proposals/source-install.md#alpine-linux
# https://github.com/aws/aws-cli/issues/4685
# https://github.com/aws/aws-cli/pull/6352
# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/alpine/Dockerfile
FROM target as gcp
ARG CLOUD_SDK_VERSION=371.0.0
ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION
ENV PATH /google-cloud-sdk/bin:$PATH
WORKDIR /
RUN apk --no-cache add \
curl \
python3 \
py3-crcmod \
py3-openssl \
bash \
libc6-compat \
openssh-client \
git \
gnupg \
&& curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
tar xzf google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
rm google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
gcloud config set core/disable_usage_reporting true && \
gcloud config set component_manager/disable_update_check true && \
gcloud config set metrics/environment github_docker_image && \
gcloud --version
WORKDIR /app
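Assuming this file is saved as Dockerfile (the name and tag below are hypothetical), the GCP flavor could be built with:

    docker build --target gcp -t clearml-agent-k8s-alpine:gcp .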


@@ -1,82 +0,0 @@
ARG TAG=3.7.12-slim-bullseye
FROM python:${TAG} as target
ARG KUBECTL_VERSION=1.22.4
WORKDIR /app
RUN python3 \
-m pip \
install \
--no-cache-dir \
-U \
clearml-agent \
"cryptography>=2.9"
# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8
# ENV LANG=en_US.UTF-8
# ENV LANGUAGE=en_US.UTF-8
# ENV PYTHONIOENCODING=UTF-8
ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/
RUN chmod +x /usr/bin/kubectl
COPY k8s_glue_example.py .
CMD ["python3", "k8s_glue_example.py"]
FROM target as aws
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
# https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html
RUN apt-get update -qqy && \
apt-get install -qqy \
unzip && \
rm -rf /var/lib/apt/lists/*
ADD https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip awscliv2.zip
ADD https://amazon-eks.s3.us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator
RUN unzip awscliv2.zip && \
./aws/install && \
rm -r awscliv2.zip aws/ && \
chmod +x /usr/local/bin/aws-iam-authenticator && \
aws --version && \
aws-iam-authenticator version
# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/debian_slim/Dockerfile
FROM target as gcp
ARG CLOUD_SDK_VERSION=371.0.0
ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION
ENV PATH "$PATH:/opt/google-cloud-sdk/bin/"
ARG INSTALL_COMPONENTS
RUN mkdir -p /usr/share/man/man1/
RUN apt-get update -qqy && \
apt-get install -qqy \
curl \
gcc \
python3-dev \
python3-pip \
apt-transport-https \
lsb-release \
openssh-client \
git \
gnupg && \
rm -rf /var/lib/apt/lists/* && \
pip3 install -U crcmod && \
export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \
echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \
apt-get update && apt-get install -y google-cloud-sdk=${CLOUD_SDK_VERSION}-0 $INSTALL_COMPONENTS && \
gcloud config set core/disable_usage_reporting true && \
gcloud config set component_manager/disable_update_check true && \
gcloud config set metrics/environment github_docker_image && \
gcloud --version
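Likewise for this Debian-based variant, a hypothetical build of the AWS stage (image tag and kubectl version are placeholders):

    docker build --target aws --build-arg KUBECTL_VERSION=1.22.4 -t clearml-agent-k8s:aws .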


@@ -1,94 +0,0 @@
"""
This example assumes you have preconfigured services with selectors in the form of
"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
The K8sIntegration component will label each pod accordingly.
"""
from argparse import ArgumentParser
from clearml_agent.glue.k8s import K8sIntegration
def parse_args():
parser = ArgumentParser()
group = parser.add_mutually_exclusive_group()
parser.add_argument(
"--queue", type=str, help="Queue to pull tasks from"
)
group.add_argument(
"--ports-mode", action='store_true', default=False,
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
"Should not be used with max-pods"
)
parser.add_argument(
"--num-of-services", type=int, default=20,
help="Specify the number of k8s services to be used. Use only with ports-mode."
)
parser.add_argument(
"--base-port", type=int,
help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
"For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num"
"e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
)
parser.add_argument(
"--base-pod-num", type=int, default=1,
help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
"service (default: %(default)s)"
)
parser.add_argument(
"--gateway-address", type=str, default=None,
help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
)
parser.add_argument(
"--pod-clearml-conf", type=str,
help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
)
parser.add_argument(
"--overrides-yaml", type=str,
help="YAML file containing pod overrides to be used when launching a new pod"
)
parser.add_argument(
"--template-yaml", type=str,
help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
"and overrides are ignored, otherwise it will be scheduled with kubectl run"
)
parser.add_argument(
"--ssh-server-port", type=int, default=0,
help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
)
parser.add_argument(
"--namespace", type=str,
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
)
group.add_argument(
"--max-pods", type=int,
help="Limit the maximum number of pods that this service can run at the same time."
"Should not be used with ports-mode"
)
return parser.parse_args()
def main():
args = parse_args()
user_props_cb = None
if args.ports_mode and args.base_port:
def k8s_user_props_cb(pod_number=0):
user_prop = {"k8s-pod-port": args.base_port + pod_number}
if args.gateway_address:
user_prop["k8s-gateway-address"] = args.gateway_address
return user_prop
user_props_cb = k8s_user_props_cb
k8s = K8sIntegration(
ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
namespace=args.namespace, max_pods_limit=args.max_pods or None,
)
k8s.k8s_daemon(args.queue)
if __name__ == "__main__":
main()
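A hypothetical launch of this glue layer in ports-mode (queue name, service count, and addresses are placeholders):

    python3 k8s_glue_example.py --queue k8s_gpu --ports-mode --num-of-services 10 \
        --base-port 20000 --gateway-address 35.1.2.3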


@@ -4,7 +4,7 @@ api {
web_server: https://demoapp.demo.clear.ml
files_server: https://demofiles.demo.clear.ml
# Credentials are generated in the webapp, https://app.clear.ml/settings/workspace-configuration
# Credentials are generated in the webapp, https://demoapp.demo.clear.ml/profile
# Overridden with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
credentials {"access_key": "EGRTCO8JMSIGI6S39GTP43NFWXDQOW", "secret_key": "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"}
@@ -15,11 +15,6 @@ api {
agent {
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
# **Notice**: a GitHub personal token is equivalent to a password; you can put it directly into `git_pass`
# To learn how to generate git token GitHub/Bitbucket/GitLab:
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
git_user=""
git_pass=""
# Limit credentials to a single domain, for example: github.com,
@@ -34,12 +29,12 @@ agent {
# force_git_ssh_user: git
# unique name of this worker, if None, created based on hostname:process_id
# Overridden with os environment: CLEARML_WORKER_ID
# Overridden with os environment: CLEARML_WORKER_NAME
# worker_id: "clearml-agent-machine1:gpu0"
worker_id: ""
# worker name, replaces the hostname when creating a unique name for this worker
# Overridden with os environment: CLEARML_WORKER_NAME
# Overridden with os environment: CLEARML_WORKER_ID
# worker_name: "clearml-agent-machine1"
worker_name: ""
@@ -65,8 +60,6 @@ agent {
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
# pip_version: "<20"
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
# poetry_version: "<2",
# virtual environment inherits packages from system
system_site_packages: false,
@@ -162,57 +155,10 @@ agent {
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host"]
# lookup table rules for default container
# first matched rule will be picked, according to rule order
# enterprise version only
# match_rules: [
# {
# image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# arguments: "-e define=value"
# match: {
# script{
# # Optional: must match all requirements (not partial)
# requirements: {
# # version selection matching PEP-440
# pip: {
# tensorflow: "~=2.6"
# },
# }
# # Optional: matching based on regular expression, example: "^exact_match$"
# repository: "/my_repository/"
# branch: "main"
# binary: "python3.6"
# }
# # Optional: matching based on regular expression, example: "^exact_match$"
# project: "project/sub_project"
# }
# },
# {
# image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# arguments: "-e define=value"
# match: {
# # must match all requirements (not partial)
# script{
# requirements: {
# conda: {
# torch: ">=2.6,<2.8"
# }
# }
# # no repository matching required
# repository: ""
# }
# # no container image matching required (allow to replace one requested container with another)
# container: ""
# # no repository matching required
# project: ""
# }
# },
# ]
}
# set the OS environments based on the Task's Environment section before launching the Task process.
@@ -233,7 +179,6 @@ agent {
hide_docker_command_env_vars {
enabled: true
extra_keys: []
parse_embedded_urls: true
}
# allow to set internal mount points inside the docker,
@@ -340,7 +285,6 @@ sdk {
# secret: "12345678"
# multipart: false
# secure: false
# verify: /path/to/ca/bundle.crt OR false to not verify
# }
]
}


@@ -65,10 +65,6 @@ def parse_args():
help="Limit the maximum number of pods that this service can run at the same time."
"Should not be used with ports-mode"
)
parser.add_argument(
"--use-owner-token", action="store_true", default=False,
help="Generate and use task owner token for the execution of each task"
)
return parser.parse_args()
@@ -91,7 +87,7 @@ def main():
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
namespace=args.namespace, max_pods_limit=args.max_pods or None,
)
k8s.k8s_daemon(args.queue, use_owner_token=args.use_owner_token)
k8s.k8s_daemon(args.queue)
if __name__ == "__main__":


@@ -8,10 +8,10 @@ psutil>=3.4.2,<5.9.0
pyhocon>=0.3.38,<0.4.0
pyparsing>=2.0.3,<2.5.0
python-dateutil>=2.4.2,<2.9.0
pyjwt>=2.4.0,<2.5.0
pyjwt>=1.6.4,<2.1.0
PyYAML>=3.12,<5.5.0
requests>=2.20.0,<2.26.0
six>=1.13.0,<1.16.0
typing>=3.6.4,<3.8.0 ; python_version < '3.5'
typing>=3.6.4,<3.8.0
urllib3>=1.21.1,<1.27.0
virtualenv>=16,<21


@@ -61,7 +61,6 @@ setup(
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'License :: OSI Approved :: Apache Software License',
],