Mirror of https://github.com/clearml/clearml-agent (synced 2025-06-26 18:16:15 +00:00)
Compare commits
174 Commits
README.md: 55 lines changed
@@ -5,10 +5,11 @@
|
||||
**ClearML Agent - ML-Ops made easy
|
||||
ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
|
||||
|
||||
[](https://img.shields.io/github/license/allegroai/trains-agent.svg)
|
||||
[](https://img.shields.io/github/license/allegroai/clearml-agent.svg)
|
||||
[](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
|
||||
[](https://img.shields.io/pypi/v/clearml-agent.svg)
|
||||
|
||||
[](https://artifacthub.io/packages/search?repo=allegroai)
|
||||
|
||||
</div>
|
||||
|
||||
---
|
||||
@@ -21,23 +22,23 @@ ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
|
||||
* Implement optimized resource utilization policies
|
||||
* Deploy execution environments with either virtualenv or fully docker containerized with zero effort
|
||||
* Launch-and-Forget service containers
|
||||
* [Cloud autoscaling](https://allegro.ai/clearml/docs/examples/services/aws_autoscaler/aws_autoscaler/)
|
||||
* [Customizable cleanup](https://allegro.ai/clearml/docs/examples/services/cleanup/cleanup_service/)
|
||||
* Advanced [pipeline building and execution](https://allegro.ai/clearml/docs/examples/frameworks/pytorch/notebooks/table/tabular_training_pipeline/)
|
||||
* [Cloud autoscaling](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler)
|
||||
* [Customizable cleanup](https://clear.ml/docs/latest/docs/guides/services/cleanup_service)
|
||||
* Advanced [pipeline building and execution](https://clear.ml/docs/latest/docs/guides/frameworks/pytorch/notebooks/table/tabular_training_pipeline)
|
||||
|
||||
It is a zero configuration fire-and-forget execution agent, providing a full ML/DL cluster solution.
|
||||
|
||||
**Full Automation in 5 steps**
|
||||
1. ClearML Server [self-hosted](https://github.com/allegroai/trains-server) or [free tier hosting](https://app.community.clear.ml)
|
||||
1. ClearML Server [self-hosted](https://github.com/allegroai/clearml-server) or [free tier hosting](https://app.clear.ml)
|
||||
2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine: on-premises / cloud / ...)
|
||||
3. Create a [job](https://github.com/allegroai/clearml/docs/clearml-task.md) or Add [ClearML](https://github.com/allegroai/trains) to your code with just 2 lines
|
||||
3. Create a [job](https://github.com/allegroai/clearml/docs/clearml-task.md) or Add [ClearML](https://github.com/allegroai/clearml) to your code with just 2 lines
|
||||
4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
|
||||
5. :chart_with_downwards_trend: :chart_with_upwards_trend: :eyes: :beer:
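To illustrate step 3 above, the two-line integration is roughly the following sketch (the project and task names are placeholders, not values taken from this repository):

```python
# Minimal ClearML integration sketch for step 3 above (names are placeholders).
from clearml import Task

# Registers the run as an experiment on the ClearML Server; execution info is logged automatically.
task = Task.init(project_name="examples", task_name="my first experiment")
```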
|
||||
|
||||
"All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
|
||||
|
||||
**Try ClearML now** [Self Hosted](https://github.com/allegroai/trains-server) or [Free tier Hosting](https://app.community.clear.ml)
|
||||
<a href="https://app.community.clear.ml"><img src="https://raw.githubusercontent.com/allegroai/trains-agent/9f1e86c1ca45c984ee13edc9353c7b10c55d7257/docs/screenshots.gif" width="100%"></a>
|
||||
**Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server) or [Free tier Hosting](https://app.clear.ml)
|
||||
<a href="https://app.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>
|
||||
|
||||
### Simple, Flexible Experiment Orchestration
|
||||
**The ClearML Agent was built to address the DL/ML R&D DevOps needs:**
|
||||
@@ -59,6 +60,8 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
|
||||
### Kubernetes Integration (Optional)
|
||||
We think Kubernetes is awesome, but it should be a choice.
|
||||
We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
|
||||
|
||||
Find Dockerfiles in [docker](./docker) dir and a helm Chart in https://github.com/allegroai/clearml-helm-charts
|
||||
#### Benefits of integrating existing K8s with ClearML-Agent
|
||||
- ClearML-Agent adds the missing scheduling capabilities to K8s
|
||||
- Allowing for more flexible automation from code
|
||||
@@ -68,13 +71,13 @@ We designed `clearml-agent` so you can run bare-metal or inside a pod with any m
|
||||
|
||||
**Two K8s integration flavours**
|
||||
- Spin ClearML-Agent as a long-lasting service pod
|
||||
- use [clearml-agent](https://hub.docker.com/r/allegroai/trains-agent) docker image
|
||||
- use [clearml-agent](https://hub.docker.com/r/allegroai/clearml-agent) docker image
|
||||
- map docker socket into the pod (soon replaced by [podman](https://github.com/containers/podman))
|
||||
- allow the clearml-agent to manage sibling dockers
|
||||
- benefits: full use of the ClearML scheduling, no need to worry about wrong container images / lost pods etc.
|
||||
- downside: Sibling containers
|
||||
- Kubernetes Glue, map ClearML jobs directly to K8s jobs
|
||||
- Run the [clearml-k8s glue](https://github.com/allegroai/trains-agent/blob/master/examples/k8s_glue_example.py) on a K8s cpu node
|
||||
- Run the [clearml-k8s glue](https://github.com/allegroai/clearml-agent/blob/master/examples/k8s_glue_example.py) on a K8s cpu node
|
||||
- The clearml-k8s glue pulls jobs from the ClearML job execution queue and prepares a K8s job (based on provided yaml template)
|
||||
- Inside the pod itself, the clearml-agent will install the job (experiment) environment, then spin up and monitor the experiment's process
|
||||
- benefits: Kubernetes full view of all running jobs in the system
|
||||
@@ -122,7 +125,7 @@ The ClearML Agent executes experiments using the following process:
|
||||
|
||||
#### System Design & Flow
|
||||
|
||||
<img src="https://allegro.ai/clearml/docs/img/ClearML_Architecture.png" width="100%" alt="clearml-architecture">
|
||||
<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_architecture.png" width="100%" alt="clearml-architecture">
|
||||
|
||||
|
||||
#### Installing the ClearML Agent
|
||||
@@ -196,16 +199,16 @@ Notice: with `--detached` flag, the *clearml-agent* will be running in the backg
|
||||
clearml-agent daemon --detached --queue default --docker
|
||||
```
|
||||
|
||||
Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda docker:
|
||||
Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 docker:
|
||||
```bash
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
```
|
||||
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent, with default nvidia/cuda docker:
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent, with default nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 docker:
|
||||
```bash
|
||||
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
```
|
||||
|
||||
##### Starting the ClearML Agent - Priority Queues
|
||||
@@ -218,18 +221,18 @@ clearml-agent daemon --queue important_jobs default
|
||||
```
|
||||
The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, and only then fetch a job from the `default` queue.
|
||||
|
||||
Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [free server](https://app.community.clear.ml/workers-and-queues/queues)
|
||||
Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [free server](https://app.clear.ml/workers-and-queues/queues)
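As an illustrative sketch (not part of the README itself), a job can also be pushed to the high-priority queue from Python; the queue names follow the example above and the task ID is a placeholder:

```python
# Hedged sketch: clone an existing experiment and push it to the high-priority queue.
from clearml import Task

template = Task.get_task(task_id="<existing-task-id>")          # placeholder ID
urgent = Task.clone(source_task=template, name="urgent run")
Task.enqueue(urgent, queue_name="important_jobs")                # pulled before anything waiting in "default"
```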
|
||||
|
||||
##### Stopping the ClearML Agent
|
||||
|
||||
To stop a **ClearML Agent** running in the background, run the same command line used to start the agent with `--stop` appended.
|
||||
For example, to stop the first of the single-GPU agents shown above (running on the same machine):
|
||||
```bash
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda --stop
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 --stop
|
||||
```
|
||||
|
||||
### How do I create an experiment on the ClearML Server? <a name="from-scratch"></a>
|
||||
* Integrate [ClearML](https://github.com/allegroai/trains) with your code
|
||||
* Integrate [ClearML](https://github.com/allegroai/clearml) with your code
|
||||
* Execute the code on your machine (Manually / PyCharm / Jupyter Notebook)
|
||||
* As your code is running, **ClearML** creates an experiment logging all the necessary execution information:
|
||||
- Git repository link and commit ID (or an entire jupyter notebook)
|
||||
@@ -273,18 +276,18 @@ clearml-agent daemon --services-mode --detached --queue services --create-queue
|
||||
### AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
|
||||
The ClearML Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the ClearML package.
|
||||
|
||||
Sample AutoML & Orchestration examples can be found in the ClearML [example/automation](https://github.com/allegroai/trains/tree/master/examples/automation) folder.
|
||||
Sample AutoML & Orchestration examples can be found in the ClearML [example/automation](https://github.com/allegroai/clearml/tree/master/examples/automation) folder.
|
||||
|
||||
AutoML examples
|
||||
- [Toy Keras training experiment](https://github.com/allegroai/trains/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
|
||||
- [Toy Keras training experiment](https://github.com/allegroai/clearml/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
|
||||
- In order to create an experiment-template in the system, this code must be executed once manually
|
||||
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/trains/blob/master/examples/automation/manual_random_param_search_example.py)
|
||||
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/clearml/blob/master/examples/automation/manual_random_param_search_example.py)
|
||||
- This example will create multiple copies of the Keras experiment-template, with different hyper-parameter combinations
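The random-search flow can be sketched as follows; this is a simplified illustration rather than the linked example's actual code, and the template name and parameter path depend on the template task you created:

```python
# Simplified AutoML sketch: clone the experiment-template, change a hyper-parameter, enqueue for an agent.
import random
from clearml import Task

# The template task must have been created once manually (placeholder names below).
template = Task.get_task(project_name="examples", task_name="keras template")
for i in range(5):
    trial = Task.clone(source_task=template, name="random search trial {}".format(i))
    trial.set_parameters({"General/batch_size": random.choice([32, 64, 128])})  # parameter path depends on the template
    Task.enqueue(trial, queue_name="default")
```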
|
||||
|
||||
Experiment Pipeline examples
|
||||
- [First step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/task_piping_example.py)
|
||||
- [First step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/task_piping_example.py)
|
||||
- This example will "process data", and once done, will launch a copy of the 'second step' experiment-template
|
||||
- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/toy_base_task.py)
|
||||
- [Second step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/toy_base_task.py)
|
||||
- In order to create an experiment-template in the system, this code must be executed once manually
|
||||
|
||||
### License
|
||||
|
||||
@@ -12,7 +12,7 @@ from clearml_agent.definitions import FileBuffering, CONFIG_FILE
|
||||
from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
|
||||
from clearml_agent.helper.process import ExitStatus
|
||||
from . import interface, session, definitions, commands
|
||||
from .errors import ConfigFileNotFound, Sigterm, APIError
|
||||
from .errors import ConfigFileNotFound, Sigterm, APIError, CustomBuildScriptFailed
|
||||
from .helper.trace import PackageTrace
|
||||
from .interface import get_parser
|
||||
|
||||
@@ -44,6 +44,8 @@ def run_command(parser, args, command_name):
|
||||
debug = command._session.debug_mode
|
||||
func = getattr(command, command_name)
|
||||
return func(**args_dict)
|
||||
except CustomBuildScriptFailed as e:
|
||||
command_class.exit(e.message, e.errno)
|
||||
except ConfigFileNotFound:
|
||||
message = 'Cannot find configuration file in "{}".\n' \
|
||||
'To create a configuration file, run:\n' \
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# Notice: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# git_user: ""
|
||||
# git_pass: ""
|
||||
# git_host: ""
|
||||
@@ -19,21 +20,42 @@
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: 0
|
||||
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||
# force_git_ssh_user: git
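A rough Python sketch of the http-to-ssh link conversion that these options control; this is illustrative only, not the agent's implementation, which handles more cases:

```python
# Illustrative sketch of the http -> ssh git link conversion controlled by the options above.
from urllib.parse import urlparse

def https_to_ssh(url, ssh_user="git", ssh_port=None):
    p = urlparse(url)
    port = ":{}".format(ssh_port) if ssh_port else ""
    # the domain is kept the same, only scheme/user/port change
    return "ssh://{user}@{host}{port}/{path}".format(
        user=ssh_user, host=p.hostname or "", port=port, path=p.path.lstrip("/"))

print(https_to_ssh("https://github.com/clearml/clearml-agent.git"))
# -> ssh://git@github.com/clearml/clearml-agent.git
print(https_to_ssh("https://git.example.com/team/repo.git", ssh_port=2222))
# -> ssh://git@git.example.com:2222/team/repo.git
```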
|
||||
|
||||
# Set the python version to use when creating the virtual environment and launching the experiment
|
||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||
# The default is the python executing the clearml_agent
|
||||
python_binary: ""
|
||||
# Ignore any requested python version (default: false; if a Task requested a specific
# python version and the system supports multiple python versions, the agent will use the requested version)
|
||||
# ignore_requested_python_version: true
|
||||
|
||||
# Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
# default false, only the working directory will be added to the PYTHONPATH
|
||||
# force_git_root_python_path: false
|
||||
|
||||
# in docker mode, if container's entrypoint automatically activated a virtual environment
|
||||
# use the activated virtual environment and install everything there
|
||||
# set to False to disable, and always create a new venv inheriting from the system_site_packages
|
||||
# docker_use_activated_venv: true
|
||||
|
||||
# select python package manager:
|
||||
# currently supported pip and conda
|
||||
# poetry is used if pip selected and repository contains poetry.lock file
|
||||
# currently supported: pip, conda and poetry
|
||||
# if "pip" or "conda" are used, the agent installs the required packages
|
||||
# based on the "installed packages" section of the Task. If the "installed packages" is empty,
|
||||
# it will revert to using `requirements.txt` from the repository's root directory.
|
||||
# If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
|
||||
# the "installed packages" section is ignored, and poetry is used.
|
||||
# If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
|
||||
package_manager: {
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
pip_version: "<20.2",
|
||||
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
|
||||
# poetry_version: "<2",
|
||||
|
||||
# virtual environment inherits packages from the system
|
||||
system_site_packages: false,
|
||||
@@ -45,7 +67,7 @@
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["defaults", "conda-forge", "pytorch", ]
|
||||
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||
|
||||
# If set to true, Task's "installed packages" are ignored,
|
||||
# and the repository's "requirements.txt" is used instead
|
||||
@@ -75,6 +97,16 @@
|
||||
# target folder for virtual environments builds, created when executing experiment
|
||||
venvs_dir = ~/.clearml/venvs-builds
|
||||
|
||||
# cached virtual environment folder
|
||||
venvs_cache: {
|
||||
# maximum number of cached venvs
|
||||
max_entries: 10
|
||||
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
|
||||
free_space_threshold_gb: 2.0
|
||||
# uncomment to enable virtual environment caching
|
||||
# path: ~/.clearml/venvs-cache
|
||||
},
|
||||
|
||||
# cached git clone folder
|
||||
vcs_cache: {
|
||||
enabled: true,
|
||||
@@ -109,6 +141,11 @@
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
|
||||
# for backwards compatibility reasons, true as default,
|
||||
# change to false to skip installation and decrease docker spin up time
|
||||
# docker_install_opencv_libs: true
|
||||
|
||||
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
|
||||
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will be idle.
|
||||
@@ -131,12 +168,15 @@
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
|
||||
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host", ]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# set the initial bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||
@@ -162,4 +202,106 @@
|
||||
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
# cudnn_version: 7.6
|
||||
|
||||
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||
# docker command, set:
|
||||
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||
hide_docker_command_env_vars {
|
||||
enabled: true
|
||||
extra_keys: []
|
||||
parse_embedded_urls: true
|
||||
}
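An illustrative sketch of the masking behaviour described above (not the agent's implementation); the extra key is the hypothetical variable named in the comment:

```python
# Illustrative sketch: mask secret values before printing a docker command.
import re

DEFAULT_KEYS = ["CLEARML_API_SECRET_KEY", "CLEARML_AGENT_GIT_PASS", "AWS_SECRET_ACCESS_KEY", "AZURE_STORAGE_KEY"]

def hide_secrets(docker_cmd, extra_keys=()):
    for key in list(DEFAULT_KEYS) + list(extra_keys):
        # replace `KEY=value` occurrences with a masked value
        docker_cmd = re.sub(r"({}=)\S+".format(re.escape(key)), r"\1********", docker_cmd)
    return docker_cmd

cmd = "docker run -e CLEARML_API_SECRET_KEY=abc123 -e MY_SPECIAL_PASSWORD=hunter2 nvidia/cuda"
print(hide_secrets(cmd, extra_keys=["MY_SPECIAL_PASSWORD"]))
# docker run -e CLEARML_API_SECRET_KEY=******** -e MY_SPECIAL_PASSWORD=******** nvidia/cuda
```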
|
||||
|
||||
# allows setting internal mount points inside the docker container,
# especially useful for non-root docker container images.
|
||||
docker_internal_mounts {
|
||||
sdk_cache: "/clearml_agent_cache"
|
||||
apt_cache: "/var/cache/apt/archives"
|
||||
ssh_folder: "/root/.ssh"
|
||||
pip_cache: "/root/.cache/pip"
|
||||
poetry_cache: "/root/.cache/pypoetry"
|
||||
vcs_cache: "/root/.clearml/vcs-cache"
|
||||
venv_build: "/root/.clearml/venvs-builds"
|
||||
pip_download: "/root/.clearml/pip-download-cache"
|
||||
}
|
||||
|
||||
# Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
|
||||
# Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
|
||||
# Note: resulting name must start with an alphanumeric character and continue with alphanumeric characters,
|
||||
# underscores (_), dots (.) and/or dashes (-)
|
||||
#docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
|
||||
|
||||
# Apply top-level environment section from configuration into os.environ
|
||||
apply_environment: true
|
||||
# Top-level environment section is in the form of:
|
||||
# environment {
|
||||
# key: value
|
||||
# ...
|
||||
# }
|
||||
# and is applied to the OS environment as `key=value` for each key/value pair
|
||||
|
||||
# Apply top-level files section from configuration into local file system
|
||||
apply_files: true
|
||||
# Top-level files section allows auto-generating files at designated paths with predefined contents
|
||||
# and target format. Options include:
|
||||
# contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
|
||||
# format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
|
||||
# base64-encoded contents string, otherwise ignored
|
||||
# path: the target file's path, may include ~ and inplace env vars
|
||||
# target_format: format used to encode contents before writing into the target file. Supported values are json,
|
||||
# yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
|
||||
# overwrite: overwrite the target file in case it exists. Default is true.
|
||||
#
|
||||
# Example:
|
||||
# files {
|
||||
# myfile1 {
|
||||
# contents: "The quick brown fox jumped over the lazy dog"
|
||||
# path: "/tmp/fox.txt"
|
||||
# }
|
||||
# myjsonfile {
|
||||
# contents: {
|
||||
# some {
|
||||
# nested {
|
||||
# value: [1, 2, 3, 4]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# path: "/tmp/test.json"
|
||||
# target_format: json
|
||||
# }
|
||||
# }
|
||||
|
||||
# Specifies a custom environment setup script to be executed instead of installing a virtual environment.
|
||||
# If provided, this script is executed after Git cloning. The script command may include environment variables, which
# will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
|
||||
# The script can also be specified using the CLEARML_AGENT_CUSTOM_BUILD_SCRIPT environment variable.
|
||||
#
|
||||
# When running the script, the following environment variables will be set:
|
||||
# - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
|
||||
# contents in JSON format
|
||||
# - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
|
||||
# - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
|
||||
# - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
|
||||
# - CLEARML_GIT_ROOT: path to the cloned Git repository
|
||||
# - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
|
||||
# this file must be in the following JSON format:
|
||||
# ```json
|
||||
# {
|
||||
# "binary": "/absolute/path/to/python-executable",
|
||||
# "entry_point": "/absolute/path/to/task-entrypoint-script",
|
||||
# "working_dir": "/absolute/path/to/task-working/dir"
|
||||
# }
|
||||
# ```
|
||||
# If provided, the agent will use these instead of the predefined task script section to execute the task and will
|
||||
# skip virtual environment creation.
|
||||
#
|
||||
# In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
|
||||
# In case the custom script is specified but does not exist, or if the custom script does not write valid content
|
||||
# into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
|
||||
# standard flow.
|
||||
custom_build_script: ""
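A minimal sketch of a custom build script matching the contract described above; the environment variable names are taken from the comments, while the path layout (entry point relative to the Git root) is an assumption to adjust for your repository:

```python
#!/usr/bin/env python3
# Minimal sketch of a custom build script honouring the contract described above.
# It reads the task definition and tells the agent which interpreter/entry point to
# use by writing the CLEARML_CUSTOM_BUILD_OUTPUT file (skipping venv creation).
import json
import os
import sys

task = json.load(open(os.environ["CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON"]))
print("Preparing environment for task {}".format(task.get("id", "<unknown>")))

output = {
    "binary": sys.executable,  # e.g. a pre-built venv/conda interpreter
    # layout assumption: entry point and working dir are relative to the cloned repository root
    "entry_point": os.path.join(os.environ["CLEARML_GIT_ROOT"], os.environ["CLEARML_TASK_SCRIPT_ENTRY"]),
    "working_dir": os.path.join(os.environ["CLEARML_GIT_ROOT"], os.environ["CLEARML_TASK_WORKING_DIR"]),
}

with open(os.environ["CLEARML_CUSTOM_BUILD_OUTPUT"], "w") as f:
    json.dump(output, f)
```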
|
||||
}
|
||||
|
||||
@@ -31,7 +31,9 @@
|
||||
}
|
||||
|
||||
auth {
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token
|
||||
token_expiration_threshold_sec = 360
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token. Default 12 hours
|
||||
token_expiration_threshold_sec: 43200
|
||||
# When requesting a token, request specific expiration time. Server default (and maximum) is 30 days
|
||||
# request_token_expiration_sec: 2592000
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from ...backend_config.converters import safe_text_to_bool
|
||||
from ...backend_config.environment import EnvEntry
|
||||
|
||||
|
||||
@@ -6,6 +7,24 @@ ENV_WEB_HOST = EnvEntry("CLEARML_WEB_HOST", "TRAINS_WEB_HOST")
|
||||
ENV_FILES_HOST = EnvEntry("CLEARML_FILES_HOST", "TRAINS_FILES_HOST")
|
||||
ENV_ACCESS_KEY = EnvEntry("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY")
|
||||
ENV_SECRET_KEY = EnvEntry("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||
ENV_AUTH_TOKEN = EnvEntry("CLEARML_AUTH_TOKEN")
|
||||
ENV_VERBOSE = EnvEntry("CLEARML_API_VERBOSE", "TRAINS_API_VERBOSE", type=bool, default=False)
|
||||
ENV_HOST_VERIFY_CERT = EnvEntry("CLEARML_API_HOST_VERIFY_CERT", "TRAINS_API_HOST_VERIFY_CERT", type=bool, default=True)
|
||||
ENV_CONDA_ENV_PACKAGE = EnvEntry("CLEARML_CONDA_ENV_PACKAGE", "TRAINS_CONDA_ENV_PACKAGE")
|
||||
ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT_SERVER", type=bool, default=True)
|
||||
ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
|
||||
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
|
||||
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
|
||||
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
|
||||
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
||||
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
||||
)
|
||||
|
||||
"""
|
||||
Experimental option to set the request method for all API requests and auth login.
|
||||
This could be useful when GET requests with payloads are blocked by a server as
|
||||
POST requests can be used instead.
|
||||
|
||||
However, this has not been rigorously tested and may have unintended consequences.
|
||||
"""
|
||||
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD", default="GET")
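For example, switching all API calls (including the auth login) to POST is a matter of setting the variable before the session module is imported; a small sketch:

```python
# Sketch: force POST for all API requests (set before importing the session/request modules).
import os
os.environ["CLEARML_API_DEFAULT_REQ_METHOD"] = "POST"  # anything other than GET/POST raises ValueError at import time
```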
|
||||
|
||||
@@ -5,10 +5,17 @@ import six
|
||||
|
||||
from .apimodel import ApiModel
|
||||
from .datamodel import DataModel
|
||||
from .defs import ENV_API_DEFAULT_REQ_METHOD
|
||||
|
||||
|
||||
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
|
||||
raise ValueError(
|
||||
"CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get' or 'post' (any case is allowed)."
|
||||
)
|
||||
|
||||
|
||||
class Request(ApiModel):
|
||||
_method = 'get'
|
||||
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs:
|
||||
|
||||
@@ -1,17 +1,21 @@
|
||||
|
||||
import json as json_lib
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
from socket import gethostname
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
from typing import Optional
|
||||
|
||||
import jwt
|
||||
import requests
|
||||
import six
|
||||
from pyhocon import ConfigTree
|
||||
from pyhocon import ConfigTree, ConfigFactory
|
||||
from requests.auth import HTTPBasicAuth
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
|
||||
from .callresult import CallResult
|
||||
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST
|
||||
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
|
||||
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
|
||||
from .request import Request, BatchRequest
|
||||
from .token_manager import TokenManager
|
||||
from ..config import load
|
||||
@@ -40,11 +44,12 @@ class Session(TokenManager):
|
||||
_session_requests = 0
|
||||
_session_initial_timeout = (3.0, 10.)
|
||||
_session_timeout = (10.0, 30.)
|
||||
_session_initial_connect_retry = 4
|
||||
_session_initial_retry_connect_override = 4
|
||||
_write_session_data_size = 15000
|
||||
_write_session_timeout = (30.0, 30.)
|
||||
|
||||
api_version = '2.1'
|
||||
feature_set = 'basic'
|
||||
default_host = "https://demoapi.demo.clear.ml"
|
||||
default_web = "https://demoapp.demo.clear.ml"
|
||||
default_files = "https://demofiles.demo.clear.ml"
|
||||
@@ -99,42 +104,48 @@ class Session(TokenManager):
|
||||
if initialize_logging:
|
||||
self.config.initialize_logging(debug=kwargs.get('debug', False))
|
||||
|
||||
token_expiration_threshold_sec = self.config.get(
|
||||
"auth.token_expiration_threshold_sec", 60
|
||||
)
|
||||
|
||||
super(Session, self).__init__(
|
||||
token_expiration_threshold_sec=token_expiration_threshold_sec, **kwargs
|
||||
)
|
||||
super(Session, self).__init__(config=config, **kwargs)
|
||||
|
||||
self._verbose = verbose if verbose is not None else ENV_VERBOSE.get()
|
||||
self._logger = logger
|
||||
self.__auth_token = None
|
||||
|
||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key)
|
||||
)
|
||||
if not self.access_key:
|
||||
raise ValueError(
|
||||
"Missing access_key. Please set in configuration file or pass in session init."
|
||||
if ENV_AUTH_TOKEN.get(
|
||||
value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
|
||||
):
|
||||
self.set_auth_token(ENV_AUTH_TOKEN.get())
|
||||
else:
|
||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key),
|
||||
value_cb=lambda key, value: print("Using environment access key {}={}".format(key, value))
|
||||
)
|
||||
if not self.access_key:
|
||||
raise ValueError(
|
||||
"Missing access_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
|
||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret)
|
||||
)
|
||||
if not self.secret_key:
|
||||
raise ValueError(
|
||||
"Missing secret_key. Please set in configuration file or pass in session init."
|
||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
|
||||
value_cb=lambda key, value: print("Using environment secret key {}=********".format(key))
|
||||
)
|
||||
if not self.secret_key:
|
||||
raise ValueError(
|
||||
"Missing secret_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
|
||||
if self.access_key == self.default_key and self.secret_key == self.default_secret:
|
||||
print("Using built-in ClearML default key/secret")
|
||||
|
||||
host = host or self.get_api_server_host(config=self.config)
|
||||
if not host:
|
||||
raise ValueError("host is required in init or config")
|
||||
raise ValueError(
|
||||
"Could not find host server definition "
|
||||
"(missing `~/clearml.conf` or Environment CLEARML_API_HOST)\n"
|
||||
"To get started with ClearML: setup your own `clearml-server`, "
|
||||
"or create a free account at https://app.clear.ml and run `clearml-agent init`"
|
||||
)
|
||||
|
||||
self.__host = host.strip("/")
|
||||
http_retries_config = http_retries_config or self.config.get(
|
||||
"api.http.retries", ConfigTree()
|
||||
).as_plain_ordered_dict()
|
||||
http_retries_config["status_forcelist"] = self._retry_codes
|
||||
|
||||
self.__worker = worker or gethostname()
|
||||
|
||||
@@ -145,22 +156,25 @@ class Session(TokenManager):
|
||||
self.client = client or "api-{}".format(__version__)
|
||||
|
||||
# limit the reconnect retries, so we get an error if we are starting the session
|
||||
http_no_retries_config = dict(**http_retries_config)
|
||||
http_no_retries_config['connect'] = self._session_initial_connect_retry
|
||||
self.__http_session = get_http_session_with_retry(**http_no_retries_config)
|
||||
_, self.__http_session = self._setup_session(
|
||||
http_retries_config,
|
||||
initial_session=True,
|
||||
default_initial_connect_override=(False if kwargs.get("command") == "execute" else None)
|
||||
)
|
||||
# try to connect with the server
|
||||
self.refresh_token()
|
||||
# create the default session with many retries
|
||||
self.__http_session = get_http_session_with_retry(**http_retries_config)
|
||||
http_retries_config, self.__http_session = self._setup_session(http_retries_config)
|
||||
|
||||
# update api version from server response
|
||||
try:
|
||||
token_dict = jwt.decode(self.token, verify=False)
|
||||
token_dict = TokenManager.get_decoded_token(self.token, verify=False)
|
||||
api_version = token_dict.get('api_version')
|
||||
if not api_version:
|
||||
api_version = '2.2' if token_dict.get('env', '') == 'prod' else Session.api_version
|
||||
|
||||
Session.api_version = str(api_version)
|
||||
Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
|
||||
except (jwt.DecodeError, ValueError):
|
||||
pass
|
||||
|
||||
@@ -169,6 +183,69 @@ class Session(TokenManager):
|
||||
# notice: this is across the board warning omission
|
||||
urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)
|
||||
|
||||
def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
|
||||
# type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
|
||||
http_retries_config = http_retries_config or self.config.get(
|
||||
"api.http.retries", ConfigTree()
|
||||
).as_plain_ordered_dict()
|
||||
http_retries_config["status_forcelist"] = self._retry_codes
|
||||
|
||||
if initial_session:
|
||||
kwargs = {} if default_initial_connect_override is None else {
|
||||
"default": default_initial_connect_override
|
||||
}
|
||||
if ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(**kwargs):
|
||||
connect_retries = self._session_initial_retry_connect_override
|
||||
try:
|
||||
value = ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(converter=str)
|
||||
if not isinstance(value, bool):
|
||||
connect_retries = abs(int(value))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
http_retries_config = dict(**http_retries_config)
|
||||
http_retries_config['connect'] = connect_retries
|
||||
|
||||
return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)
|
||||
|
||||
def load_vaults(self):
|
||||
if not self.check_min_api_version("2.15") or self.feature_set == "basic":
|
||||
return
|
||||
|
||||
if ENV_DISABLE_VAULT_SUPPORT.get():
|
||||
print("Vault support is disabled")
|
||||
return
|
||||
|
||||
def parse(vault):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
d = vault.get('data', None)
|
||||
if d:
|
||||
r = ConfigFactory.parse_string(d)
|
||||
if isinstance(r, (ConfigTree, dict)):
|
||||
return r
|
||||
except Exception as e:
|
||||
print("Failed parsing vault {}: {}".format(vault.get("description", "<unknown>"), e))
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
res = self.send_request("users", "get_vaults", json={"enabled": True, "types": ["config"]})
|
||||
if res.ok:
|
||||
vaults = res.json().get("data", {}).get("vaults", [])
|
||||
data = list(filter(None, map(parse, vaults)))
|
||||
if data:
|
||||
self.config.set_overrides(*data)
|
||||
elif res.status_code != 404:
|
||||
raise Exception(res.json().get("meta", {}).get("result_msg", res.text))
|
||||
except Exception as ex:
|
||||
print("Failed getting vaults: {}".format(ex))
|
||||
|
||||
def verify_feature_set(self, feature_set):
|
||||
if isinstance(feature_set, str):
|
||||
feature_set = [feature_set]
|
||||
if self.feature_set not in feature_set:
|
||||
raise ValueError('ClearML-server does not support requested feature set {}'.format(feature_set))
|
||||
|
||||
def _send_request(
|
||||
self,
|
||||
service,
|
||||
@@ -242,6 +319,10 @@ class Session(TokenManager):
|
||||
headers[self._AUTHORIZATION_HEADER] = "Bearer {}".format(self.token)
|
||||
return headers
|
||||
|
||||
def set_auth_token(self, auth_token):
|
||||
self.__access_key = self.__secret_key = None
|
||||
self._set_token(auth_token)
|
||||
|
||||
def send_request(
|
||||
self,
|
||||
service,
|
||||
@@ -439,8 +520,11 @@ class Session(TokenManager):
|
||||
if not config:
|
||||
return None
|
||||
|
||||
return ENV_HOST.get(default=(config.get("api.api_server", None) or
|
||||
config.get("api.host", None) or cls.default_host))
|
||||
default = config.get("api.api_server", None) or config.get("api.host", None)
|
||||
if not ENV_NO_DEFAULT_SERVER.get():
|
||||
default = default or cls.default_host
|
||||
|
||||
return ENV_HOST.get(default=default)
|
||||
|
||||
@classmethod
|
||||
def get_app_server_host(cls, config=None):
|
||||
@@ -508,7 +592,7 @@ class Session(TokenManager):
|
||||
return v + (0,) * max(0, 3 - len(v))
|
||||
return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
|
||||
|
||||
def _do_refresh_token(self, old_token, exp=None):
|
||||
def _do_refresh_token(self, current_token, exp=None):
|
||||
""" TokenManager abstract method implementation.
|
||||
Here we obtain a new token, using basic-auth credentials when available, otherwise the current token for authorization.
|
||||
"""
|
||||
@@ -520,15 +604,23 @@ class Session(TokenManager):
|
||||
)
|
||||
)
|
||||
|
||||
auth = HTTPBasicAuth(self.access_key, self.secret_key)
|
||||
auth = None
|
||||
headers = None
|
||||
if self.access_key and self.secret_key:
|
||||
auth = HTTPBasicAuth(self.access_key, self.secret_key)
|
||||
elif current_token:
|
||||
headers = dict(Authorization="Bearer {}".format(current_token))
|
||||
|
||||
res = None
|
||||
try:
|
||||
data = {"expiration_sec": exp} if exp else {}
|
||||
res = self._send_request(
|
||||
method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
|
||||
service="auth",
|
||||
action="login",
|
||||
auth=auth,
|
||||
json=data,
|
||||
headers=headers,
|
||||
refresh_token_if_unauthorized=False,
|
||||
)
|
||||
try:
|
||||
@@ -544,7 +636,10 @@ class Session(TokenManager):
|
||||
)
|
||||
if verbose:
|
||||
self._logger.info("Received new token")
|
||||
return resp["data"]["token"]
|
||||
token = resp["data"]["token"]
|
||||
if ENV_AUTH_TOKEN.get():
|
||||
os.environ[ENV_AUTH_TOKEN.key] = token
|
||||
return token
|
||||
except LoginError:
|
||||
six.reraise(*sys.exc_info())
|
||||
except KeyError as ex:
|
||||
|
||||
@@ -3,11 +3,14 @@ from abc import ABCMeta, abstractmethod
|
||||
from time import time
|
||||
|
||||
import jwt
|
||||
from jwt.algorithms import get_default_algorithms
|
||||
import six
|
||||
|
||||
|
||||
@six.add_metaclass(ABCMeta)
|
||||
class TokenManager(object):
|
||||
_default_token_exp_threshold_sec = 12 * 60 * 60
|
||||
_default_req_token_expiration_sec = None
|
||||
|
||||
@property
|
||||
def token_expiration_threshold_sec(self):
|
||||
@@ -40,17 +43,30 @@ class TokenManager(object):
|
||||
return self.__token
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
token=None,
|
||||
req_token_expiration_sec=None,
|
||||
token_history=None,
|
||||
token_expiration_threshold_sec=60,
|
||||
**kwargs
|
||||
self,
|
||||
token=None,
|
||||
req_token_expiration_sec=None,
|
||||
token_history=None,
|
||||
token_expiration_threshold_sec=None,
|
||||
config=None,
|
||||
**kwargs
|
||||
):
|
||||
super(TokenManager, self).__init__()
|
||||
assert isinstance(token_history, (type(None), dict))
|
||||
self.token_expiration_threshold_sec = token_expiration_threshold_sec
|
||||
self.req_token_expiration_sec = req_token_expiration_sec
|
||||
if config:
|
||||
req_token_expiration_sec = req_token_expiration_sec or config.get(
|
||||
"api.auth.request_token_expiration_sec", None
|
||||
)
|
||||
token_expiration_threshold_sec = (
|
||||
token_expiration_threshold_sec
|
||||
or config.get("api.auth.token_expiration_threshold_sec", None)
|
||||
)
|
||||
self.token_expiration_threshold_sec = (
|
||||
token_expiration_threshold_sec or self._default_token_exp_threshold_sec
|
||||
)
|
||||
self.req_token_expiration_sec = (
|
||||
req_token_expiration_sec or self._default_req_token_expiration_sec
|
||||
)
|
||||
self._set_token(token)
|
||||
|
||||
def _calc_token_valid_period_sec(self, token, exp=None, at_least_sec=None):
|
||||
@@ -58,7 +74,9 @@ class TokenManager(object):
|
||||
try:
|
||||
exp = exp or self._get_token_exp(token)
|
||||
if at_least_sec:
|
||||
at_least_sec = max(at_least_sec, self.token_expiration_threshold_sec)
|
||||
at_least_sec = max(
|
||||
at_least_sec, self.token_expiration_threshold_sec
|
||||
)
|
||||
else:
|
||||
at_least_sec = self.token_expiration_threshold_sec
|
||||
return max(0, (exp - time() - at_least_sec))
|
||||
@@ -66,10 +84,26 @@ class TokenManager(object):
|
||||
pass
|
||||
return 0
|
||||
|
||||
@classmethod
|
||||
def get_decoded_token(cls, token, verify=False):
|
||||
""" Get token expiration time. If not present, assume forever """
|
||||
if hasattr(jwt, '__version__') and jwt.__version__[0] == '1':
|
||||
return jwt.decode(
|
||||
token,
|
||||
verify=verify,
|
||||
algorithms=get_default_algorithms(),
|
||||
)
|
||||
|
||||
return jwt.decode(
|
||||
token,
|
||||
options=dict(verify_signature=verify),
|
||||
algorithms=get_default_algorithms(),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _get_token_exp(cls, token):
|
||||
""" Get token expiration time. If not present, assume forever """
|
||||
return jwt.decode(token, verify=False).get('exp', sys.maxsize)
|
||||
return cls.get_decoded_token(token).get("exp", sys.maxsize)
|
||||
|
||||
def _set_token(self, token):
|
||||
if token:
|
||||
@@ -80,7 +114,9 @@ class TokenManager(object):
|
||||
self.__token_expiration_sec = 0
|
||||
|
||||
def get_token_valid_period_sec(self):
|
||||
return self._calc_token_valid_period_sec(self.__token, self.token_expiration_sec)
|
||||
return self._calc_token_valid_period_sec(
|
||||
self.__token, self.token_expiration_sec
|
||||
)
|
||||
|
||||
def _get_token(self):
|
||||
if self.get_token_valid_period_sec() <= 0:
|
||||
@@ -92,4 +128,6 @@ class TokenManager(object):
|
||||
pass
|
||||
|
||||
def refresh_token(self):
|
||||
self._set_token(self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec))
|
||||
self._set_token(
|
||||
self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec)
|
||||
)
|
||||
|
||||
@@ -6,16 +6,9 @@ import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util import Retry
|
||||
from urllib3 import PoolManager
|
||||
import six
|
||||
|
||||
from .session.defs import ENV_HOST_VERIFY_CERT
|
||||
|
||||
if six.PY3:
|
||||
from functools import lru_cache
|
||||
elif six.PY2:
|
||||
# python 2 support
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
|
||||
|
||||
__disable_certificate_verification_warning = 0
|
||||
|
||||
|
||||
@@ -4,15 +4,13 @@ import functools
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from fnmatch import fnmatch
|
||||
from os.path import expanduser
|
||||
from typing import Any
|
||||
|
||||
import pyhocon
|
||||
import six
|
||||
from pathlib2 import Path
|
||||
from pyhocon import ConfigTree
|
||||
from pyhocon import ConfigTree, ConfigFactory
|
||||
from pyparsing import (
|
||||
ParseFatalException,
|
||||
ParseException,
|
||||
@@ -71,6 +69,10 @@ class Config(object):
|
||||
|
||||
# used in place of None in Config.get as default value because None is a valid value
|
||||
_MISSING = object()
|
||||
extra_config_values_env_key_sep = "__"
|
||||
extra_config_values_env_key_prefix = [
|
||||
"CLEARML_AGENT" + extra_config_values_env_key_sep,
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -90,6 +92,7 @@ class Config(object):
|
||||
self._env = env or os.environ.get("TRAINS_ENV", Environment.default)
|
||||
self.config_paths = set()
|
||||
self.is_server = is_server
|
||||
self._overrides_configs = None
|
||||
|
||||
if self._verbose:
|
||||
print("Config env:%s" % str(self._env))
|
||||
@@ -100,6 +103,7 @@ class Config(object):
|
||||
)
|
||||
if self._env not in get_options(Environment):
|
||||
raise ValueError("Invalid environment %s" % env)
|
||||
|
||||
if relative_to is not None:
|
||||
self.load_relative_to(relative_to)
|
||||
|
||||
@@ -158,7 +162,9 @@ class Config(object):
|
||||
if LOCAL_CONFIG_PATHS:
|
||||
config = functools.reduce(
|
||||
lambda cfg, path: ConfigTree.merge_configs(
|
||||
cfg, self._read_recursive(path, verbose=self._verbose), copy_trees=True
|
||||
cfg,
|
||||
self._read_recursive(path, verbose=self._verbose),
|
||||
copy_trees=True,
|
||||
),
|
||||
LOCAL_CONFIG_PATHS,
|
||||
config,
|
||||
@@ -181,9 +187,38 @@ class Config(object):
|
||||
config,
|
||||
)
|
||||
|
||||
config = ConfigTree.merge_configs(
|
||||
config, self._read_extra_env_config_values(), copy_trees=True
|
||||
)
|
||||
|
||||
if self._overrides_configs:
|
||||
config = functools.reduce(
|
||||
lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
|
||||
self._overrides_configs,
|
||||
config,
|
||||
)
|
||||
|
||||
config["env"] = env
|
||||
return config
|
||||
|
||||
def _read_extra_env_config_values(self) -> ConfigTree:
|
||||
""" Loads extra configuration from environment-injected values """
|
||||
result = ConfigTree()
|
||||
|
||||
for prefix in self.extra_config_values_env_key_prefix:
|
||||
keys = sorted(k for k in os.environ if k.startswith(prefix))
|
||||
for key in keys:
|
||||
path = (
|
||||
key[len(prefix) :]
|
||||
.replace(self.extra_config_values_env_key_sep, ".")
|
||||
.lower()
|
||||
)
|
||||
result = ConfigTree.merge_configs(
|
||||
result, ConfigFactory.parse_string("{}: {}".format(path, os.environ[key]))
|
||||
)
|
||||
|
||||
return result
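For instance (an illustrative key, not one taken from this diff), an environment variable using the `CLEARML_AGENT__` prefix and `__` separators is parsed into a nested configuration path:

```python
# Sketch of the mapping implemented above: CLEARML_AGENT__<SECTION>__<KEY> -> section.key
import os

os.environ["CLEARML_AGENT__AGENT__PACKAGE_MANAGER__TYPE"] = "conda"
# _read_extra_env_config_values() strips the prefix, replaces "__" with "." and lower-cases the rest,
# producing the override "agent.package_manager.type: conda"
```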
|
||||
|
||||
def replace(self, config):
|
||||
self._config = config
|
||||
|
||||
@@ -340,3 +375,10 @@ class Config(object):
|
||||
except Exception as ex:
|
||||
print("Failed loading %s: %s" % (file_path, ex))
|
||||
raise
|
||||
|
||||
def set_overrides(self, *dicts):
|
||||
""" Set several override dictionaries or ConfigTree objects which should be merged onto the configuration """
|
||||
self._overrides_configs = [
|
||||
d if isinstance(d, ConfigTree) else pyhocon.ConfigFactory.from_dict(d) for d in dicts
|
||||
]
|
||||
self.reload()
|
||||
|
||||
@@ -24,6 +24,14 @@ def text_to_bool(value):
|
||||
return bool(strtobool(value))
|
||||
|
||||
|
||||
def safe_text_to_bool(value):
|
||||
# type: (Text) -> bool
|
||||
try:
|
||||
return text_to_bool(value)
|
||||
except ValueError:
|
||||
return bool(value)
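A quick illustration of how this differs from `text_to_bool`, assuming the functions defined above are in scope; the input strings are chosen for illustration:

```python
# text_to_bool("maybe") raises ValueError (strtobool rejects it);
# safe_text_to_bool falls back to bool(), so any non-empty unrecognized string becomes True.
print(safe_text_to_bool("true"))    # True
print(safe_text_to_bool("0"))       # False
print(safe_text_to_bool("maybe"))   # True (fallback to bool("maybe"))
```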
|
||||
|
||||
|
||||
def any_to_bool(value):
|
||||
# type: (Optional[Union[int, float, Text]]) -> bool
|
||||
if isinstance(value, six.text_type):
|
||||
|
||||
@@ -64,8 +64,8 @@ class Entry(object):
|
||||
converter = self.default_conversions().get(self.type, self.type)
|
||||
return converter(value)
|
||||
|
||||
def get_pair(self, default=NotSet, converter=None):
|
||||
# type: (Any, Converter) -> Optional[Tuple[Text, Any]]
|
||||
def get_pair(self, default=NotSet, converter=None, value_cb=None):
|
||||
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]]
|
||||
for key in self.keys:
|
||||
value = self._get(key)
|
||||
if value is NotSet:
|
||||
@@ -75,13 +75,20 @@ class Entry(object):
|
||||
except Exception as ex:
|
||||
self.error("invalid value {key}={value}: {ex}".format(**locals()))
|
||||
break
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if value_cb:
|
||||
value_cb(key, value)
|
||||
except Exception:
|
||||
pass
|
||||
return key, value
|
||||
|
||||
result = self.default if default is NotSet else default
|
||||
return self.key, result
|
||||
|
||||
def get(self, default=NotSet, converter=None):
|
||||
# type: (Any, Converter) -> Optional[Any]
|
||||
return self.get_pair(default=default, converter=converter)[1]
|
||||
def get(self, default=NotSet, converter=None, value_cb=None):
|
||||
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any]
|
||||
return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1]
|
||||
|
||||
def set(self, value):
|
||||
# type: (Any, Any) -> (Text, Any)
|
||||
|
||||
@@ -55,7 +55,7 @@ def backward_compatibility_support():
|
||||
continue
|
||||
|
||||
# set OS environ:
|
||||
keys = environ.keys()
|
||||
keys = list(environ.keys())
|
||||
for k in keys:
|
||||
if not k.startswith('CLEARML_'):
|
||||
continue
|
||||
|
||||
@@ -1,3 +1,14 @@
|
||||
import base64
|
||||
import os
|
||||
from os.path import expandvars, expanduser
|
||||
from pathlib import Path
|
||||
from typing import List, TYPE_CHECKING
|
||||
|
||||
from pyhocon import HOCONConverter, ConfigTree
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .config import Config
|
||||
|
||||
|
||||
def get_items(cls):
|
||||
""" get key/value items from an enum-like class (members represent enumeration key/value) """
|
||||
@@ -7,3 +18,95 @@ def get_items(cls):
|
||||
def get_options(cls):
|
||||
""" get options from an enum-like class (members represent enumeration key/value) """
|
||||
return get_items(cls).values()
|
||||
|
||||
|
||||
def apply_environment(config):
|
||||
# type: (Config) -> List[str]
|
||||
env_vars = config.get("environment", None)
|
||||
if not env_vars:
|
||||
return []
|
||||
if isinstance(env_vars, (list, tuple)):
|
||||
env_vars = dict(env_vars)
|
||||
|
||||
keys = list(filter(None, env_vars.keys()))
|
||||
|
||||
for key in keys:
|
||||
os.environ[str(key)] = str(env_vars[key] or "")
|
||||
|
||||
return keys
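A small usage sketch for the function above; the stub object stands in for the agent's `Config` class and the keys/values are placeholders:

```python
# Usage sketch for apply_environment() above, using a minimal stand-in for the Config object.
import os

class _StubConfig:
    def __init__(self, data):
        self._data = data

    def get(self, key, default=None):
        return self._data.get(key, default)

cfg = _StubConfig({"environment": {"MY_FLAG": "1", "DATA_DIR": "/data"}})
print(apply_environment(cfg))   # ['MY_FLAG', 'DATA_DIR']
print(os.environ["DATA_DIR"])   # '/data'
```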
|
||||
|
||||
|
||||
def apply_files(config):
|
||||
# type: (Config) -> None
|
||||
files = config.get("files", None)
|
||||
if not files:
|
||||
return
|
||||
|
||||
if isinstance(files, (list, tuple)):
|
||||
files = dict(files)
|
||||
|
||||
print("Creating files from configuration")
|
||||
for key, data in files.items():
|
||||
path = data.get("path")
|
||||
fmt = data.get("format", "string")
|
||||
target_fmt = data.get("target_format", "string")
|
||||
overwrite = bool(data.get("overwrite", True))
|
||||
contents = data.get("contents")
|
||||
|
||||
target = Path(expanduser(expandvars(path)))
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if target.is_dir():
|
||||
print("Skipped [{}]: is a directory {}".format(key, target))
|
||||
continue
|
||||
|
||||
if not overwrite and target.is_file():
|
||||
print("Skipped [{}]: file exists {}".format(key, target))
|
||||
continue
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: can't access {} ({})".format(key, target, ex))
|
||||
continue
|
||||
|
||||
if contents:
|
||||
try:
|
||||
if fmt == "base64":
|
||||
contents = base64.b64decode(contents)
|
||||
if target_fmt != "bytes":
|
||||
contents = contents.decode("utf-8")
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed decoding {} ({})".format(key, fmt, ex))
|
||||
continue
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed creating path {} ({})".format(key, target.parent, ex))
|
||||
continue
|
||||
|
||||
try:
|
||||
if target_fmt == "bytes":
|
||||
try:
|
||||
target.write_bytes(contents)
|
||||
except TypeError:
|
||||
# simpler error so the user won't get confused
|
||||
raise TypeError("a bytes-like object is required")
|
||||
else:
|
||||
try:
|
||||
if target_fmt == "json":
|
||||
text = HOCONConverter.to_json(contents)
|
||||
elif target_fmt in ("yaml", "yml"):
|
||||
text = HOCONConverter.to_yaml(contents)
|
||||
else:
|
||||
if isinstance(contents, ConfigTree):
|
||||
contents = contents.as_plain_ordered_dict()
|
||||
text = str(contents)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed encoding to {} ({})".format(key, target_fmt, ex))
|
||||
continue
|
||||
target.write_text(text)
|
||||
print("Saved [{}]: {}".format(key, target))
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed saving file {} ({})".format(key, target, ex))
|
||||
continue
|
||||
|
||||
@@ -118,11 +118,13 @@ class ServiceCommandSection(BaseCommandSection):
        """ The name of the REST service used by this command """
        pass

    def get(self, endpoint, *args, **kwargs):
        return self._session.get(service=self.service, action=endpoint, *args, **kwargs)
    def get(self, endpoint, *args, session=None, **kwargs):
        session = session or self._session
        return session.get(service=self.service, action=endpoint, *args, **kwargs)

    def post(self, endpoint, *args, **kwargs):
        return self._session.post(service=self.service, action=endpoint, *args, **kwargs)
    def post(self, endpoint, *args, session=None, **kwargs):
        session = session or self._session
        return session.post(service=self.service, action=endpoint, *args, **kwargs)

    def get_with_act_as(self, endpoint, *args, **kwargs):
        return self._session.get_with_act_as(service=self.service, action=endpoint, *args, **kwargs)

@@ -11,10 +11,10 @@ from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES


description = """
Please create new clearml credentials through the profile page in your clearml web app (e.g. https://demoapp.demo.clear.ml/profile)
Or with the free hosted service at https://app.community.clear.ml/profile
Please create new clearml credentials through the settings page in your `clearml-server` web app,
or create a free account at https://app.clear.ml/settings/webapp-configuration

In the profile page, press "Create new credentials", then press "Copy to clipboard".
In the settings > workspace page, press "Create new credentials", then press "Copy to clipboard".

Paste copied configuration here:
"""
@@ -157,7 +157,7 @@ def main():
' api_server: %s\n' \
' web_server: %s\n' \
' files_server: %s\n' \
' # Credentials are generated using the webapp, %s/profile\n' \
' # Credentials are generated using the webapp, %s/settings\n' \
' # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY\n' \
' credentials {"access_key": "%s", "secret_key": "%s"}\n' \
'}\n\n' % (api_host, web_host, files_host,

@@ -21,14 +21,16 @@ class Events(ServiceCommandSection):
        """ Events command service endpoint """
        return 'events'

    def send_events(self, list_events):
    def send_events(self, list_events, session=None):
        def send_packet(jsonlines):
            if not jsonlines:
                return 0
            num_lines = len(jsonlines)
            jsonlines = '\n'.join(jsonlines)

            new_events = self.post('add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'})
            new_events = self.post(
                'add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'}, session=session
            )
            if new_events['added'] != num_lines:
                print('Error (%s) sending events only %d of %d registered' %
                      (new_events['errors'], new_events['added'], num_lines))
@@ -57,7 +59,7 @@ class Events(ServiceCommandSection):
        # print('Sending events done: %d / %d events sent' % (sent_events, len(list_events)))
        return sent_events

    def send_log_events(self, worker_id, task_id, lines, level='DEBUG'):
    def send_log_events(self, worker_id, task_id, lines, level='DEBUG', session=None):
        log_events = []
        base_timestamp = int(time.time() * 1000)
        base_log_items = {
@@ -94,4 +96,4 @@ class Events(ServiceCommandSection):
        log_events.append(get_event(count))

        # now send the events
        return self.send_events(list_events=log_events)
        return self.send_events(list_events=log_events, session=session)

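# --- Hypothetical usage sketch (names illustrative, not from the diff): with the new
# --- optional `session` argument a caller that owns its own API session can route the
# --- add_batch request through it instead of the section's default self._session.
# events = ...       # an existing Events instance
# api_session = ...  # an already-initialized agent API session
events.send_log_events(
    worker_id="agent-worker:0",
    task_id="aabbccddee...",      # truncated, illustrative task id
    lines=["Running kubectl encountered an error: ..."],
    level="ERROR",
    session=api_session,          # e.g. a per-worker or per-thread session
)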
166 clearml_agent/commands/resolver.py (new file)
@@ -0,0 +1,166 @@
|
||||
import json
|
||||
import re
|
||||
import shlex
|
||||
from clearml_agent.helper.package.requirements import (
|
||||
RequirementsManager, MarkerRequirement,
|
||||
compare_version_rules, )
|
||||
|
||||
|
||||
def resolve_default_container(session, task_id, container_config):
|
||||
container_lookup = session.config.get('agent.default_docker.match_rules', None)
|
||||
if not session.check_min_api_version("2.13") or not container_lookup:
|
||||
return container_config
|
||||
|
||||
# check backend support before sending any more requests (because they will fail and crash the Task)
|
||||
try:
|
||||
session.verify_feature_set('advanced')
|
||||
except ValueError:
|
||||
return container_config
|
||||
|
||||
result = session.send_request(
|
||||
service='tasks',
|
||||
action='get_all',
|
||||
version='2.14',
|
||||
json={'id': [task_id],
|
||||
'only_fields': ['script.requirements', 'script.binary',
|
||||
'script.repository', 'script.branch',
|
||||
'project', 'container'],
|
||||
'search_hidden': True},
|
||||
method='get',
|
||||
async_enable=False,
|
||||
)
|
||||
try:
|
||||
task_info = result.json()['data']['tasks'][0] if result.ok else {}
|
||||
except (ValueError, TypeError):
|
||||
return container_config
|
||||
|
||||
from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
|
||||
# store tasks repository
|
||||
repository = task_info.get('script', {}).get('repository') or ''
|
||||
branch = task_info.get('script', {}).get('branch') or ''
|
||||
binary = task_info.get('script', {}).get('binary') or ''
|
||||
requested_container = task_info.get('container', {})
|
||||
|
||||
# get project full path
|
||||
project_full_name = ''
|
||||
if task_info.get('project', None):
|
||||
result = session.send_request(
|
||||
service='projects',
|
||||
action='get_all',
|
||||
version='2.13',
|
||||
json={
|
||||
'id': [task_info.get('project')],
|
||||
'only_fields': ['name'],
|
||||
},
|
||||
method='get',
|
||||
async_enable=False,
|
||||
)
|
||||
try:
|
||||
if result.ok:
|
||||
project_full_name = result.json()['data']['projects'][0]['name'] or ''
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
task_packages_lookup = {}
|
||||
for entry in container_lookup:
|
||||
match = entry.get('match', None)
|
||||
if not match:
|
||||
continue
|
||||
if match.get('project', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('project', None), project_full_name):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('project', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('script.repository', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('script.repository', None), repository):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('script.repository', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('script.branch', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('script.branch', None), branch):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('script.branch', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('script.binary', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('script.binary', None), binary):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('script.binary', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('container', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('container', None), requested_container.get('image', '')):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('container', None), entry))
|
||||
continue
|
||||
|
||||
matched = True
|
||||
for req_section in ['script.requirements.pip', 'script.requirements.conda']:
|
||||
if not match.get(req_section, None):
|
||||
continue
|
||||
|
||||
match_pip_reqs = [MarkerRequirement(Requirement.parse('{} {}'.format(k, v)))
|
||||
for k, v in match.get(req_section, None).items()]
|
||||
|
||||
if not task_packages_lookup.get(req_section):
|
||||
req_section_parts = req_section.split('.')
|
||||
task_packages_lookup[req_section] = \
|
||||
RequirementsManager.parse_requirements_section_to_marker_requirements(
|
||||
requirements=task_info.get(req_section_parts[0], {}).get(
|
||||
req_section_parts[1], {}).get(req_section_parts[2], None)
|
||||
)
|
||||
|
||||
matched_all_reqs = True
|
||||
for mr in match_pip_reqs:
|
||||
matched_req = False
|
||||
for pr in task_packages_lookup[req_section]:
|
||||
if mr.req.name != pr.req.name:
|
||||
continue
|
||||
if compare_version_rules(mr.specs, pr.specs):
|
||||
matched_req = True
|
||||
break
|
||||
if not matched_req:
|
||||
matched_all_reqs = False
|
||||
break
|
||||
|
||||
# if we have a match, check the next requirements section
|
||||
if matched_all_reqs:
|
||||
continue
|
||||
# no match, stop checking this rule
|
||||
matched = False
|
||||
break
|
||||
|
||||
if matched:
|
||||
if not container_config.get('container'):
|
||||
container_config['container'] = entry.get('image', None)
|
||||
if not container_config.get('arguments'):
|
||||
container_config['arguments'] = entry.get('arguments', None)
|
||||
container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
|
||||
print('Matching default container with rule:\n{}'.format(json.dumps(entry)))
|
||||
return container_config
|
||||
|
||||
return container_config
|
||||
|
||||
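# --- Illustrative only (not part of the diff): one match_rules entry of the shape
# --- resolve_default_container() iterates over; normally this lives in clearml.conf
# --- under agent.default_docker.match_rules. The keys mirror the lookups in the code above.
match_rules_example = [
    {
        "image": "nvidia/cuda:11.6.2-runtime-ubuntu20.04",   # default container to apply
        "arguments": "--ipc=host",                            # split with shlex before use
        "match": {
            "project": "^examples/",                          # regex against the project full name
            "script.repository": "github\\.com/allegroai",    # regex against the task repository
            "script.branch": "^main$",
            "script.requirements.pip": {"torch": ">=1.12"},   # per-package version rules
        },
    },
]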
File diff suppressed because it is too large
@@ -1,10 +1,10 @@
|
||||
import shlex
|
||||
from datetime import timedelta
|
||||
from distutils.util import strtobool
|
||||
from enum import IntEnum
|
||||
from os import getenv, environ
|
||||
from typing import Text, Optional, Union, Tuple, Any
|
||||
|
||||
from furl import furl
|
||||
from pathlib2 import Path
|
||||
|
||||
import six
|
||||
@@ -34,6 +34,7 @@ class EnvironmentConfig(object):
|
||||
conversions = {
|
||||
bool: lambda value: bool(strtobool(value)),
|
||||
six.text_type: lambda s: six.text_type(s).strip(),
|
||||
list: lambda s: shlex.split(s.strip()),
|
||||
}
|
||||
|
||||
def __init__(self, *names, **kwargs):
|
||||
@@ -62,14 +63,19 @@ class EnvironmentConfig(object):
|
||||
return None
|
||||
|
||||
|
||||
ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||
ENV_AGENT_AUTH_TOKEN = EnvironmentConfig("CLEARML_AUTH_TOKEN")
|
||||
ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
|
||||
ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")
|
||||
|
||||
ENVIRONMENT_CONFIG = {
|
||||
"api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
|
||||
"api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
|
||||
"api.web_server": EnvironmentConfig("CLEARML_WEB_HOST", "TRAINS_WEB_HOST", ),
|
||||
"api.credentials.access_key": EnvironmentConfig(
|
||||
"CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
|
||||
),
|
||||
"api.credentials.secret_key": EnvironmentConfig(
|
||||
"CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY",
|
||||
),
|
||||
"api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
|
||||
"agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
|
||||
"agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
|
||||
"agent.cuda_version": EnvironmentConfig(
|
||||
@@ -82,10 +88,10 @@ ENVIRONMENT_CONFIG = {
|
||||
names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
|
||||
),
|
||||
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
|
||||
"sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
|
||||
"sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
|
||||
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
|
||||
"sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
|
||||
'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")},
|
||||
'account_key': ENV_AZURE_ACCOUNT_KEY},
|
||||
"sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
|
||||
}
|
||||
|
||||
@@ -120,20 +126,60 @@ DEFAULT_VENV_UPDATE_URL = (
|
||||
"https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
|
||||
)
|
||||
WORKING_REPOSITORY_DIR = "task_repository"
|
||||
WORKING_STANDALONE_DIR = "code"
|
||||
DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
|
||||
PIP_EXTRA_INDICES = [
|
||||
]
|
||||
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
|
||||
ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAGE')
|
||||
ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
|
||||
ENV_WORKER_TAGS = EnvironmentConfig('CLEARML_WORKER_TAGS')
|
||||
ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PIP_VENV_INSTALL')
|
||||
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL', type=bool)
|
||||
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
|
||||
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
|
||||
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
|
||||
ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
|
||||
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool)
|
||||
ENV_SSH_AUTH_SOCK = EnvironmentConfig('SSH_AUTH_SOCK')
|
||||
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
|
||||
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
|
||||
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
|
||||
'TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
|
||||
ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
|
||||
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
|
||||
|
||||
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
|
||||
"""
|
||||
Specifies a custom environment setup script to be executed instead of installing a virtual environment.
|
||||
If provided, this script is executed following the Git clone. The script command may include environment variables, which
will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
|
||||
The script can also be specified using the `agent.custom_build_script` configuration setting.
|
||||
|
||||
When running the script, the following environment variables will be set:
|
||||
- CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
|
||||
contents in JSON format
|
||||
- CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
|
||||
- CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
|
||||
- CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
|
||||
- CLEARML_GIT_ROOT: path to the cloned Git repository
|
||||
- CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
|
||||
this file must be in the following JSON format:
|
||||
```json
|
||||
{
|
||||
"binary": "/absolute/path/to/python-executable",
|
||||
"entry_point": "/absolute/path/to/task-entrypoint-script",
|
||||
"working_dir": "/absolute/path/to/task-working/dir"
|
||||
}
|
||||
```
|
||||
If provided, the agent will use these instead of the predefined task script section to execute the task and will
|
||||
skip virtual environment creation.
|
||||
|
||||
In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
|
||||
In case the custom script is specified but does not exist, or if the custom script does not write valid content
|
||||
into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
|
||||
standard flow.
|
||||
"""
|
||||
|
||||
|
||||
class FileBuffering(IntEnum):
|
||||
|
||||
@@ -84,3 +84,13 @@ class MissingPackageError(CommandFailedError):
|
||||
def __str__(self):
|
||||
return '{self.__class__.__name__}: ' \
|
||||
'"{self.name}" package is required. Please run "pip install {self.name}"'.format(self=self)
|
||||
|
||||
|
||||
class CustomBuildScriptFailed(CommandFailedError):
|
||||
def __init__(self, errno, *args, **kwargs):
|
||||
super(CustomBuildScriptFailed, self).__init__(*args, **kwargs)
|
||||
self.errno = errno
|
||||
|
||||
|
||||
class SkippedCustomBuildScript(CommandFailedError):
|
||||
pass
|
||||
|
||||
@@ -4,19 +4,20 @@ import warnings
|
||||
from .requirement import Requirement
|
||||
|
||||
|
||||
def parse(reqstr):
|
||||
def parse(reqstr, cwd=None):
|
||||
"""
|
||||
Parse a requirements file into a list of Requirements
|
||||
|
||||
See: pip/req.py:parse_requirements()
|
||||
|
||||
:param reqstr: a string or file like object containing requirements
|
||||
:param cwd: Optional current working dir for -r file.txt loading
|
||||
:returns: a *generator* of Requirement objects
|
||||
"""
|
||||
filename = getattr(reqstr, 'name', None)
|
||||
try:
|
||||
# Python 2.x compatibility
|
||||
if not isinstance(reqstr, basestring):
|
||||
if not isinstance(reqstr, basestring): # noqa
|
||||
reqstr = reqstr.read()
|
||||
except NameError:
|
||||
# Python 3.x only
|
||||
@@ -30,10 +31,12 @@ def parse(reqstr):
|
||||
elif not line or line.startswith('#'):
|
||||
# comments are lines that start with # only
|
||||
continue
|
||||
elif line.startswith('-r') or line.startswith('--requirement'):
|
||||
elif line.startswith('-r ') or line.startswith('--requirement '):
|
||||
_, new_filename = line.split()
|
||||
new_file_path = os.path.join(os.path.dirname(filename or '.'),
|
||||
new_filename)
|
||||
new_file_path = os.path.join(
|
||||
os.path.dirname(filename or '.') if filename or not cwd else cwd, new_filename)
|
||||
if not os.path.exists(new_file_path):
|
||||
continue
|
||||
with open(new_file_path) as f:
|
||||
for requirement in parse(f):
|
||||
yield requirement
|
||||
|
||||
@@ -20,6 +20,15 @@ VCS_REGEX = re.compile(
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
VCS_EXT_REGEX = re.compile(
|
||||
r'^(?P<scheme>{0})(@)'.format(r'|'.join(
|
||||
[scheme.replace('+', r'\+') for scheme in ['git+git']])) +
|
||||
r'((?P<login>[^/@]+)@)?'
|
||||
r'(?P<path>[^#@]+)'
|
||||
r'(@(?P<revision>[^#]+))?'
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
# This matches just about everything
|
||||
LOCAL_REGEX = re.compile(
|
||||
r'^((?P<scheme>file)://)?'
|
||||
@@ -100,7 +109,7 @@ class Requirement(object):
|
||||
|
||||
req = cls('-e {0}'.format(line))
|
||||
req.editable = True
|
||||
vcs_match = VCS_REGEX.match(line)
|
||||
vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
|
||||
local_match = LOCAL_REGEX.match(line)
|
||||
|
||||
if vcs_match is not None:
|
||||
@@ -147,7 +156,7 @@ class Requirement(object):
|
||||
|
||||
req = cls(line)
|
||||
|
||||
vcs_match = VCS_REGEX.match(line)
|
||||
vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
|
||||
uri_match = URI_REGEX.match(line)
|
||||
local_match = LOCAL_REGEX.match(line)
|
||||
|
||||
@@ -226,7 +235,7 @@ class Requirement(object):
|
||||
# check if the name is valid & parsed
|
||||
Req.parse(name)
|
||||
# if we are here, name is a valid package name, check if the vcs part is valid
|
||||
if VCS_REGEX.match(uri):
|
||||
if VCS_REGEX.match(uri) or VCS_EXT_REGEX.match(uri):
|
||||
req = cls.parse_line(uri)
|
||||
req.name = name
|
||||
return req
|
||||
|
||||
@@ -1,20 +1,24 @@
|
||||
from __future__ import print_function, division, unicode_literals
|
||||
|
||||
import base64
|
||||
import functools
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from threading import Thread
|
||||
from time import sleep
|
||||
from typing import Text, List, Callable, Any, Collection, Optional, Union
|
||||
|
||||
import yaml
|
||||
import json
|
||||
from time import sleep
|
||||
from typing import Text, List
|
||||
|
||||
from clearml_agent.commands.events import Events
|
||||
from clearml_agent.commands.worker import Worker
|
||||
from clearml_agent.commands.worker import Worker, get_task_container, set_task_container
|
||||
from clearml_agent.definitions import ENV_DOCKER_IMAGE
|
||||
from clearml_agent.errors import APIError
|
||||
from clearml_agent.helper.base import safe_remove_file
|
||||
@@ -27,20 +31,24 @@ from clearml_agent.interface.base import ObjectID
|
||||
class K8sIntegration(Worker):
|
||||
K8S_PENDING_QUEUE = "k8s_scheduler"
|
||||
|
||||
KUBECTL_APPLY_CMD = "kubectl apply -f"
|
||||
K8S_DEFAULT_NAMESPACE = "clearml"
|
||||
AGENT_LABEL = "CLEARML=agent"
|
||||
LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
|
||||
|
||||
KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
|
||||
"--image {docker_image} " \
|
||||
"--restart=Never --replicas=1 " \
|
||||
"--generator=run-pod/v1 " \
|
||||
"--namespace=clearml"
|
||||
KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
|
||||
|
||||
KUBECTL_RUN_CMD = "kubectl run clearml-id-{task_id} " \
|
||||
"--image {docker_image} {docker_args} " \
|
||||
"--restart=Never " \
|
||||
"--namespace={namespace}"
|
||||
|
||||
KUBECTL_DELETE_CMD = "kubectl delete pods " \
|
||||
"--selector=TRAINS=agent " \
|
||||
"--selector={selector} " \
|
||||
"--field-selector=status.phase!=Pending,status.phase!=Running " \
|
||||
"--namespace=clearml"
|
||||
"--namespace={namespace}"
|
||||
|
||||
BASH_INSTALL_SSH_CMD = [
|
||||
"apt-get update",
|
||||
"apt-get install -y openssh-server",
|
||||
"mkdir -p /var/run/sshd",
|
||||
"echo 'root:training' | chpasswd",
|
||||
@@ -67,12 +75,10 @@ class K8sIntegration(Worker):
|
||||
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
|
||||
"$LOCAL_PYTHON -m pip install clearml-agent",
|
||||
"{extra_bash_init_cmd}",
|
||||
"{extra_docker_bash_script}",
|
||||
"$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
|
||||
]
|
||||
|
||||
AGENT_LABEL = "TRAINS=agent"
|
||||
LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
|
||||
|
||||
_edit_hyperparams_version = "2.9"
|
||||
|
||||
def __init__(
|
||||
@@ -83,11 +89,15 @@ class K8sIntegration(Worker):
|
||||
debug=False,
|
||||
ports_mode=False,
|
||||
num_of_services=20,
|
||||
base_pod_num=1,
|
||||
user_props_cb=None,
|
||||
overrides_yaml=None,
|
||||
template_yaml=None,
|
||||
trains_conf_file=None,
|
||||
clearml_conf_file=None,
|
||||
extra_bash_init_script=None,
|
||||
namespace=None,
|
||||
max_pods_limit=None,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the k8s integration glue layer daemon
|
||||
@@ -95,7 +105,7 @@ class K8sIntegration(Worker):
|
||||
:param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
|
||||
:param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
|
||||
example: "task={task_id} image={docker_image} queue_id={queue_id}"
|
||||
or a callable function: kubectl_cmd(task_id, docker_image, queue_id, task_data)
|
||||
or a callable function: kubectl_cmd(task_id, docker_image, docker_args, queue_id, task_data)
|
||||
:param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
|
||||
Notice this string will use format() call, if you have curly brackets they should be doubled { -> {{
|
||||
Format arguments passed: {task_id} and {extra_bash_init_cmd}
|
||||
@@ -104,14 +114,17 @@ class K8sIntegration(Worker):
|
||||
Requires the `num_of_services` parameter.
|
||||
:param int num_of_services: Number of k8s services configured in the cluster. Required if `port_mode` is True.
|
||||
(default: 20)
|
||||
:param int base_pod_num: Used when `ports_mode` is True, sets the base pod number to a given value (default: 1)
|
||||
:param callable user_props_cb: An Optional callable allowing additional user properties to be specified
|
||||
when scheduling a task to run in a pod. Callable can receive an optional pod number and should return
|
||||
a dictionary of user properties (name and value). Signature is [[Optional[int]], Dict[str,str]]
|
||||
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
|
||||
:param str template_yaml: YAML file containing the template for the pod (optional).
|
||||
If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
|
||||
:param str trains_conf_file: clearml.conf file to be use by the pod itself (optional)
|
||||
:param str clearml_conf_file: clearml.conf file to be used by the pod itself (optional)
|
||||
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
|
||||
:param str namespace: K8S namespace to be used when creating the new pods (default: clearml)
|
||||
:param int max_pods_limit: Maximum number of pods that K8S glue can run at the same time
|
||||
"""
|
||||
super(K8sIntegration, self).__init__()
|
||||
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
|
||||
@@ -125,16 +138,19 @@ class K8sIntegration(Worker):
|
||||
self.log.logger.setLevel(logging.INFO)
|
||||
self.ports_mode = ports_mode
|
||||
self.num_of_services = num_of_services
|
||||
self.base_pod_num = base_pod_num
|
||||
self._edit_hyperparams_support = None
|
||||
self._user_props_cb = user_props_cb
|
||||
self.trains_conf_file = None
|
||||
self.conf_file_content = None
|
||||
self.overrides_json_string = None
|
||||
self.template_dict = None
|
||||
self.extra_bash_init_script = extra_bash_init_script or None
|
||||
if self.extra_bash_init_script and not isinstance(self.extra_bash_init_script, str):
|
||||
self.extra_bash_init_script = ' ; '.join(self.extra_bash_init_script) # noqa
|
||||
self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE
|
||||
self.pod_limits = []
|
||||
self.pod_requests = []
|
||||
self.max_pods_limit = max_pods_limit if not self.ports_mode else None
|
||||
if overrides_yaml:
|
||||
with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
|
||||
overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
||||
@@ -161,11 +177,122 @@ class K8sIntegration(Worker):
|
||||
with open(os.path.expandvars(os.path.expanduser(str(template_yaml))), 'rt') as f:
|
||||
self.template_dict = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
||||
|
||||
if trains_conf_file:
|
||||
with open(os.path.expandvars(os.path.expanduser(str(trains_conf_file))), 'rt') as f:
|
||||
self.trains_conf_file = f.read()
|
||||
clearml_conf_file = clearml_conf_file or kwargs.get('trains_conf_file')
|
||||
|
||||
if clearml_conf_file:
|
||||
with open(os.path.expandvars(os.path.expanduser(str(clearml_conf_file))), 'rt') as f:
|
||||
self.conf_file_content = f.read()
|
||||
# make sure we use system packages!
|
||||
self.trains_conf_file += '\nagent.package_manager.system_site_packages=true\n'
|
||||
self.conf_file_content += '\nagent.package_manager.system_site_packages=true\n'
|
||||
|
||||
self._agent_label = None
|
||||
|
||||
self._monitor_hanging_pods()
|
||||
|
||||
def _monitor_hanging_pods(self):
|
||||
_check_pod_thread = Thread(target=self._monitor_hanging_pods_daemon)
|
||||
_check_pod_thread.daemon = True
|
||||
_check_pod_thread.start()
|
||||
|
||||
@staticmethod
|
||||
def _get_path(d, *path, default=None):
|
||||
try:
|
||||
return functools.reduce(
|
||||
lambda a, b: a[b], path, d
|
||||
)
|
||||
except (IndexError, KeyError):
|
||||
return default
|
||||
|
||||
def _monitor_hanging_pods_daemon(self):
|
||||
last_tasks_msgs = {} # last msg updated for every task
|
||||
|
||||
while True:
|
||||
output = get_bash_output('kubectl get pods -n {namespace} -o=JSON'.format(
|
||||
namespace=self.namespace
|
||||
))
|
||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||
try:
|
||||
output_config = json.loads(output)
|
||||
except Exception as ex:
|
||||
self.log.warning('K8S Glue pods monitor: Failed parsing kubectl output:\n{}\nEx: {}'.format(output, ex))
|
||||
sleep(self._polling_interval)
|
||||
continue
|
||||
pods = output_config.get('items', [])
|
||||
task_ids = set()
|
||||
for pod in pods:
|
||||
if self._get_path(pod, 'status', 'phase') != "Pending":
|
||||
continue
|
||||
|
||||
pod_name = pod.get('metadata', {}).get('name', None)
|
||||
if not pod_name:
|
||||
continue
|
||||
|
||||
task_id = pod_name.rpartition('-')[-1]
|
||||
if not task_id:
|
||||
continue
|
||||
|
||||
task_ids.add(task_id)
|
||||
|
||||
msg = None
|
||||
|
||||
waiting = self._get_path(pod, 'status', 'containerStatuses', 0, 'state', 'waiting')
|
||||
if not waiting:
|
||||
condition = self._get_path(pod, 'status', 'conditions', 0)
|
||||
if condition:
|
||||
reason = condition.get('reason')
|
||||
if reason == 'Unschedulable':
|
||||
message = condition.get('message')
|
||||
msg = reason + (" ({})".format(message) if message else "")
|
||||
else:
|
||||
reason = waiting.get("reason", None)
|
||||
message = waiting.get("message", None)
|
||||
|
||||
msg = reason + (" ({})".format(message) if message else "")
|
||||
|
||||
if reason == 'ImagePullBackOff':
|
||||
delete_pod_cmd = 'kubectl delete pods {} -n {}'.format(pod_name, self.namespace)
|
||||
get_bash_output(delete_pod_cmd)
|
||||
try:
|
||||
self._session.api_client.tasks.failed(
|
||||
task=task_id,
|
||||
status_reason="K8S glue error: {}".format(msg),
|
||||
status_message="Changed by K8S glue",
|
||||
force=True
|
||||
)
|
||||
except Exception as ex:
|
||||
self.log.warning(
|
||||
'K8S Glue pods monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
|
||||
)
|
||||
|
||||
# clean up any msg for this task
|
||||
last_tasks_msgs.pop(task_id, None)
|
||||
continue
|
||||
if msg and last_tasks_msgs.get(task_id, None) != msg:
|
||||
try:
|
||||
result = self._session.send_request(
|
||||
service='tasks',
|
||||
action='update',
|
||||
json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
|
||||
method='get',
|
||||
async_enable=False,
|
||||
)
|
||||
if not result.ok:
|
||||
result_msg = self._get_path(result.json(), 'meta', 'result_msg')
|
||||
raise Exception(result_msg or result.text)
|
||||
|
||||
# update last msg for this task
|
||||
last_tasks_msgs[task_id] = msg
|
||||
except Exception as ex:
|
||||
self.log.warning(
|
||||
'K8S Glue pods monitor: Failed setting status message for task "{}"\nEX: {}'.format(
|
||||
task_id, ex
|
||||
)
|
||||
)
|
||||
|
||||
# clean up any last message for a task that wasn't seen as a pod
|
||||
last_tasks_msgs = {k: v for k, v in last_tasks_msgs.items() if k in task_ids}
|
||||
|
||||
sleep(self._polling_interval)
|
||||
|
||||
def _set_task_user_properties(self, task_id: str, **properties: str):
|
||||
if self._edit_hyperparams_support is not True:
|
||||
@@ -197,6 +324,44 @@ class K8sIntegration(Worker):
|
||||
if error.code == 404:
|
||||
self._edit_hyperparams_support = self._session.api_version
|
||||
|
||||
def _get_agent_label(self):
|
||||
if not self.worker_id:
|
||||
print('WARNING! no worker ID found!!!')
|
||||
return self.AGENT_LABEL
|
||||
|
||||
if not self._agent_label:
|
||||
h = hashlib.md5()
|
||||
h.update(str(self.worker_id).encode('utf-8'))
|
||||
self._agent_label = '{}-{}'.format(self.AGENT_LABEL, h.hexdigest()[:8])
|
||||
|
||||
return self._agent_label
|
||||
|
||||
def _get_number_used_pods(self):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
|
||||
agent_label=self._get_agent_label(),
|
||||
namespace=self.namespace,
|
||||
)
|
||||
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||
error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
|
||||
|
||||
if not output:
|
||||
# No such pod exists, so we can use the pod_number we found
|
||||
return 0
|
||||
|
||||
try:
|
||||
current_pod_count = len(json.loads(output).get("items", []))
|
||||
except (ValueError, TypeError) as ex:
|
||||
return -1
|
||||
|
||||
return current_pod_count
|
||||
except Exception as ex:
|
||||
print('Failed getting number of used pods: {}'.format(ex))
|
||||
return -2
|
||||
|
||||
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
|
||||
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
|
||||
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
|
||||
@@ -204,29 +369,36 @@ class K8sIntegration(Worker):
|
||||
# push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
|
||||
try:
|
||||
print('Pushing task {} into temporary pending queue'.format(task_id))
|
||||
self._session.api_client.tasks.reset(task_id)
|
||||
self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
|
||||
status_reason='k8s pending scheduler')
|
||||
res = self._session.api_client.tasks.stop(task_id, force=True)
|
||||
res = self._session.api_client.tasks.enqueue(
|
||||
task_id,
|
||||
queue=self.k8s_pending_queue_name,
|
||||
status_reason='k8s pending scheduler',
|
||||
)
|
||||
if res.meta.result_code != 200:
|
||||
raise Exception(res.meta.result_msg)
|
||||
except Exception as e:
|
||||
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
|
||||
task_id, self.k8s_pending_queue_name, e))
|
||||
return
|
||||
|
||||
if task_data.execution.docker_cmd:
|
||||
docker_parts = task_data.execution.docker_cmd
|
||||
else:
|
||||
docker_parts = str(ENV_DOCKER_IMAGE.get() or
|
||||
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
|
||||
|
||||
# take the first part, this is the docker image name (not arguments)
|
||||
docker_parts = docker_parts.split()
|
||||
docker_image = docker_parts[0]
|
||||
docker_args = docker_parts[1:] if len(docker_parts) > 1 else []
|
||||
container = get_task_container(self._session, task_id)
|
||||
if not container.get('image'):
|
||||
container['image'] = str(
|
||||
ENV_DOCKER_IMAGE.get() or self._session.config.get("agent.default_docker.image", "nvidia/cuda")
|
||||
)
|
||||
container['arguments'] = self._session.config.get("agent.default_docker.arguments", None)
|
||||
set_task_container(
|
||||
self._session, task_id, docker_image=container['image'], docker_arguments=container['arguments']
|
||||
)
|
||||
|
||||
# get the clearml.conf encoded file
|
||||
# noinspection PyProtectedMember
|
||||
hocon_config_encoded = (self.trains_conf_file or self._session._config_file).encode('ascii')
|
||||
create_trains_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
||||
hocon_config_encoded = (
|
||||
self.conf_file_content
|
||||
or Path(self._session._config_file).read_text()
|
||||
).encode("ascii")
|
||||
create_clearml_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
||||
base64.b64encode(
|
||||
hocon_config_encoded
|
||||
).decode('ascii')
|
||||
@@ -241,17 +413,22 @@ class K8sIntegration(Worker):
|
||||
except Exception:
|
||||
queue_name = 'k8s'
|
||||
|
||||
# conform queue name to k8s standards
|
||||
safe_queue_name = queue_name.lower().strip()
|
||||
safe_queue_name = re.sub(r'\W+', '', safe_queue_name).replace('_', '').replace('-', '')
|
||||
|
||||
# Search for a free pod number
|
||||
pod_number = 1
|
||||
while self.ports_mode:
|
||||
kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n clearml".format(
|
||||
pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
|
||||
agent_label=self.AGENT_LABEL
|
||||
)
|
||||
pod_count = 0
|
||||
pod_number = self.base_pod_num
|
||||
while self.ports_mode or self.max_pods_limit:
|
||||
pod_number = self.base_pod_num + pod_count
|
||||
if self.ports_mode:
|
||||
kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
|
||||
pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
|
||||
agent_label=self._get_agent_label(),
|
||||
namespace=self.namespace,
|
||||
)
|
||||
else:
|
||||
kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
|
||||
agent_label=self._get_agent_label(),
|
||||
namespace=self.namespace,
|
||||
)
|
||||
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||
@@ -260,48 +437,86 @@ class K8sIntegration(Worker):
|
||||
if not output:
|
||||
# No such pod exist so we can use the pod_number we found
|
||||
break
|
||||
if pod_number >= self.num_of_services:
|
||||
# All pod numbers are taken, exit
|
||||
|
||||
if self.max_pods_limit:
|
||||
try:
|
||||
current_pod_count = len(json.loads(output).get("items", []))
|
||||
except (ValueError, TypeError) as ex:
|
||||
self.log.warning(
|
||||
"K8S Glue pods monitor: Failed parsing kubectl output:\n{}\ntask '{}' "
|
||||
"will be enqueued back to queue '{}'\nEx: {}".format(
|
||||
output, task_id, queue, ex
|
||||
)
|
||||
)
|
||||
self._session.api_client.tasks.stop(task_id, force=True)
|
||||
self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
|
||||
return
|
||||
max_count = self.max_pods_limit
|
||||
else:
|
||||
current_pod_count = pod_count
|
||||
max_count = self.num_of_services - 1
|
||||
|
||||
if current_pod_count >= max_count:
|
||||
# All pods are taken, exit
|
||||
self.log.debug(
|
||||
"kubectl last result: {}\n{}".format(error, output))
|
||||
self.log.warning(
|
||||
"kubectl last result: {}\n{}\nAll k8s services are in use, task '{}' "
|
||||
"All k8s services are in use, task '{}' "
|
||||
"will be enqueued back to queue '{}'".format(
|
||||
error, output, task_id, queue
|
||||
task_id, queue
|
||||
)
|
||||
)
|
||||
self._session.api_client.tasks.reset(task_id)
|
||||
self._session.api_client.tasks.stop(task_id, force=True)
|
||||
self._session.api_client.tasks.enqueue(
|
||||
task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
|
||||
return
|
||||
pod_number += 1
|
||||
elif self.max_pods_limit:
|
||||
# max pods limit hasn't been reached yet, so we can create the pod
|
||||
break
|
||||
pod_count += 1
|
||||
|
||||
labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
|
||||
labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + \
|
||||
[self._get_agent_label()]
|
||||
labels.append("clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)))
|
||||
labels.append("clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name)))
|
||||
|
||||
if self.ports_mode:
|
||||
print("Kubernetes scheduling task id={} on pod={}".format(task_id, pod_number))
|
||||
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
|
||||
else:
|
||||
print("Kubernetes scheduling task id={}".format(task_id))
|
||||
|
||||
kubectl_kwargs = dict(
|
||||
create_clearml_conf=create_clearml_conf,
|
||||
labels=labels,
|
||||
docker_image=container['image'],
|
||||
docker_args=container['arguments'],
|
||||
docker_bash=container.get('setup_shell_script'),
|
||||
task_id=task_id,
|
||||
queue=queue
|
||||
)
|
||||
|
||||
if self.template_dict:
|
||||
output, error = self._kubectl_apply(
|
||||
create_trains_conf=create_trains_conf,
|
||||
labels=labels, docker_image=docker_image, docker_args=docker_args,
|
||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
||||
output, error = self._kubectl_apply(**kubectl_kwargs)
|
||||
else:
|
||||
output, error = self._kubectl_run(
|
||||
create_trains_conf=create_trains_conf,
|
||||
labels=labels, docker_image=docker_image,
|
||||
task_data=task_data,
|
||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
||||
output, error = self._kubectl_run(task_data=task_data, **kubectl_kwargs)
|
||||
|
||||
error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
|
||||
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
|
||||
print('kubectl output:\n{}\n{}'.format(error, output))
|
||||
if error:
|
||||
self.log.error("Running kubectl encountered an error: {}".format(error))
|
||||
send_log = "Running kubectl encountered an error: {}".format(error)
|
||||
self.log.error(send_log)
|
||||
self.send_logs(task_id, send_log.splitlines())
|
||||
|
||||
user_props = {"k8s-queue": str(queue_name)}
|
||||
if self.ports_mode:
|
||||
user_props.update({"k8s-pod-number": pod_number, "k8s-pod-label": labels[0]})
|
||||
user_props.update(
|
||||
{
|
||||
"k8s-pod-number": pod_number,
|
||||
"k8s-pod-label": labels[0],
|
||||
"k8s-internal-pod-count": pod_count,
|
||||
}
|
||||
)
|
||||
|
||||
if self._user_props_cb:
|
||||
# noinspection PyBroadException
|
||||
@@ -317,40 +532,66 @@ class K8sIntegration(Worker):
|
||||
**user_props
|
||||
)
|
||||
|
||||
def _parse_docker_args(self, docker_args):
|
||||
# type: (list) -> dict
|
||||
kube_args = {'env': []}
|
||||
while docker_args:
|
||||
cmd = docker_args.pop().strip()
|
||||
if cmd in ('-e', '--env',):
|
||||
env = docker_args.pop().strip()
|
||||
key, value = env.split('=', 1)
|
||||
kube_args[key] += {key: value}
|
||||
def _get_docker_args(self, docker_args, flags, target=None, convert=None):
|
||||
# type: (List[str], Collection[str], Optional[str], Callable[[str], Any]) -> Union[dict, List[str]]
|
||||
"""
|
||||
Get docker args matching specific flags.
|
||||
|
||||
:argument docker_args: List of docker argument strings (flags and values)
|
||||
:argument flags: List of flags/names to intercept (e.g. "--env" etc.)
|
||||
:argument target: Controls return format. If provided, returns a dict with a target field containing a list
|
||||
of result strings, otherwise returns a list of result strings
|
||||
:argument convert: Optional conversion function for each result string
|
||||
"""
|
||||
args = docker_args[:] if docker_args else []
|
||||
results = []
|
||||
while args:
|
||||
cmd = args.pop(0).strip()
|
||||
if cmd in flags:
|
||||
env = args.pop(0).strip()
|
||||
if convert:
|
||||
env = convert(env)
|
||||
results.append(env)
|
||||
else:
|
||||
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
|
||||
return kube_args
|
||||
if target:
|
||||
return {target: results} if results else {}
|
||||
return results
|
||||
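# --- Hypothetical call (k8s is an assumed K8sIntegration instance); this mirrors how
# --- _kubectl_apply() converts docker "-e/--env" arguments into pod env entries:
docker_args = ["-e", "AWS_PROFILE=dev", "--env", "MY_FLAG=1", "--network", "host"]
env_block = k8s._get_docker_args(
    docker_args,
    target="env",
    flags={"-e", "--env"},
    convert=lambda env: {"name": env.partition("=")[0], "value": env.partition("=")[2]},
)
# env_block == {"env": [{"name": "AWS_PROFILE", "value": "dev"},
#                       {"name": "MY_FLAG", "value": "1"}]}
# "--network" and "host" are not intercepted, so each is skipped with a warning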
|
||||
def _kubectl_apply(self, create_trains_conf, docker_image, docker_args, labels, queue, task_id, queue_name):
|
||||
def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, docker_bash, labels, queue, task_id):
|
||||
template = deepcopy(self.template_dict)
|
||||
template.setdefault('apiVersion', 'v1')
|
||||
template['kind'] = 'Pod'
|
||||
template.setdefault('metadata', {})
|
||||
name = 'clearml-{queue}-id-{task_id}'.format(queue=queue_name, task_id=task_id)
|
||||
name = 'clearml-id-{task_id}'.format(task_id=task_id)
|
||||
template['metadata']['name'] = name
|
||||
template.setdefault('spec', {})
|
||||
template['spec'].setdefault('containers', [])
|
||||
template['spec'].setdefault('restartPolicy', 'Never')
|
||||
if labels:
|
||||
labels_dict = dict(pair.split('=', 1) for pair in labels)
|
||||
template['metadata'].setdefault('labels', {})
|
||||
template['metadata']['labels'].update(labels_dict)
|
||||
container = self._parse_docker_args(docker_args)
|
||||
|
||||
container = self._get_docker_args(
|
||||
docker_args,
|
||||
target="env",
|
||||
flags={"-e", "--env"},
|
||||
convert=lambda env: {'name': env.partition("=")[0], 'value': env.partition("=")[2]},
|
||||
)
|
||||
|
||||
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
|
||||
else self.container_bash_script
|
||||
|
||||
extra_docker_bash_script = '\n'.join(self._session.config.get("agent.extra_docker_shell_script", None) or [])
|
||||
if docker_bash:
|
||||
extra_docker_bash_script += '\n' + str(docker_bash) + '\n'
|
||||
|
||||
script_encoded = '\n'.join(
|
||||
['#!/bin/bash', ] +
|
||||
[line.format(extra_bash_init_cmd=self.extra_bash_init_script or '', task_id=task_id)
|
||||
[line.format(extra_bash_init_cmd=self.extra_bash_init_script or '',
|
||||
task_id=task_id,
|
||||
extra_docker_bash_script=extra_docker_bash_script)
|
||||
for line in container_bash_script])
|
||||
|
||||
create_init_script = \
|
||||
@@ -360,18 +601,23 @@ class K8sIntegration(Worker):
|
||||
script_encoded.encode('ascii')
|
||||
).decode('ascii'))
|
||||
|
||||
container = merge_dicts(
|
||||
# Notice: we always leave with exit code 0, so pods are never restarted
|
||||
container = self._merge_containers(
|
||||
container,
|
||||
dict(name=name, image=docker_image,
|
||||
command=['/bin/bash'],
|
||||
args=['-c', '{} ; {}'.format(create_trains_conf, create_init_script)])
|
||||
args=['-c', '{} ; {} ; exit 0'.format(create_clearml_conf, create_init_script)])
|
||||
)
|
||||
|
||||
if template['spec']['containers']:
|
||||
template['spec']['containers'][0] = merge_dicts(template['spec']['containers'][0], container)
|
||||
template['spec']['containers'][0] = self._merge_containers(template['spec']['containers'][0], container)
|
||||
else:
|
||||
template['spec']['containers'].append(container)
|
||||
|
||||
if self._docker_force_pull:
|
||||
for c in template['spec']['containers']:
|
||||
c.setdefault('imagePullPolicy', 'Always')
|
||||
|
||||
fp, yaml_file = tempfile.mkstemp(prefix='clearml_k8stmpl_', suffix='.yml')
|
||||
os.close(fp)
|
||||
with open(yaml_file, 'wt') as f:
|
||||
@@ -381,6 +627,7 @@ class K8sIntegration(Worker):
|
||||
task_id=task_id,
|
||||
docker_image=docker_image,
|
||||
queue_id=queue,
|
||||
namespace=self.namespace
|
||||
)
|
||||
# make sure we provide a list
|
||||
if isinstance(kubectl_cmd, str):
|
||||
@@ -398,15 +645,20 @@ class K8sIntegration(Worker):
|
||||
|
||||
return output, error
|
||||
|
||||
def _kubectl_run(self, create_trains_conf, docker_image, labels, queue, task_data, task_id, queue_name):
|
||||
def _kubectl_run(
|
||||
self, create_clearml_conf, docker_image, docker_args, docker_bash, labels, queue, task_data, task_id
|
||||
):
|
||||
if callable(self.kubectl_cmd):
|
||||
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name)
|
||||
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, docker_args, queue, task_data)
|
||||
else:
|
||||
kubectl_cmd = self.kubectl_cmd.format(
|
||||
queue_name=queue_name,
|
||||
task_id=task_id,
|
||||
docker_image=docker_image,
|
||||
queue_id=queue
|
||||
docker_args=" ".join(self._get_docker_args(
|
||||
docker_args, flags={"-e", "--env"}, convert=lambda env: '--env={}'.format(env))
|
||||
),
|
||||
queue_id=queue,
|
||||
namespace=self.namespace,
|
||||
)
|
||||
# make sure we provide a list
|
||||
if isinstance(kubectl_cmd, str):
|
||||
@@ -420,6 +672,9 @@ class K8sIntegration(Worker):
|
||||
if self.pod_requests:
|
||||
kubectl_cmd += ['--requests', ",".join(self.pod_requests)]
|
||||
|
||||
if self._docker_force_pull and not any(x.startswith("--image-pull-policy=") for x in kubectl_cmd):
|
||||
kubectl_cmd += ["--image-pull-policy='always'"]
|
||||
|
||||
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
|
||||
else self.container_bash_script
|
||||
container_bash_script = ' ; '.join(container_bash_script)
|
||||
@@ -430,8 +685,11 @@ class K8sIntegration(Worker):
|
||||
"--",
|
||||
"/bin/sh",
|
||||
"-c",
|
||||
"{} ; {}".format(create_trains_conf, container_bash_script.format(
|
||||
extra_bash_init_cmd=self.extra_bash_init_script, task_id=task_id)),
|
||||
"{} ; {}".format(create_clearml_conf, container_bash_script.format(
|
||||
extra_bash_init_cmd=self.extra_bash_init_script or "",
|
||||
extra_docker_bash_script=docker_bash or "",
|
||||
task_id=task_id
|
||||
)),
|
||||
]
|
||||
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
@@ -461,10 +719,26 @@ class K8sIntegration(Worker):
|
||||
|
||||
_last_machine_update_ts = 0
|
||||
while True:
|
||||
# check if we have a pod limit, and if so, whether we've hit it.
|
||||
if self.max_pods_limit:
|
||||
current_pods = self._get_number_used_pods()
|
||||
if current_pods >= self.max_pods_limit:
|
||||
print("Maximum pod limit reached {}/{}, sleeping for {:.1f} seconds".format(
|
||||
current_pods, self.max_pods_limit, self._polling_interval))
|
||||
# delete old completed / failed pods
|
||||
get_bash_output(
|
||||
self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self._get_agent_label())
|
||||
)
|
||||
# go to sleep
|
||||
sleep(self._polling_interval)
|
||||
continue
|
||||
|
||||
# iterate over queues (priority style, queues[0] is highest)
|
||||
for queue in queues:
|
||||
# delete old completed / failed pods
|
||||
get_bash_output(self.KUBECTL_DELETE_CMD)
|
||||
get_bash_output(
|
||||
self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self._get_agent_label())
|
||||
)
|
||||
|
||||
# get next task in queue
|
||||
try:
|
||||
@@ -516,3 +790,27 @@ class K8sIntegration(Worker):
|
||||
@classmethod
|
||||
def get_ssh_server_bash(cls, ssh_port_number):
|
||||
return ' ; '.join(line.format(port=ssh_port_number) for line in cls.BASH_INSTALL_SSH_CMD)
|
||||
|
||||
@staticmethod
|
||||
def _merge_containers(c1, c2):
|
||||
def merge_env(k, d1, d2, not_set):
|
||||
if k != "env":
|
||||
return not_set
|
||||
# Merge environment lists, second list overrides first
|
||||
return list({
|
||||
item['name']: item for envs in (d1, d2) for item in envs
|
||||
}.values())
|
||||
|
||||
return merge_dicts(
|
||||
c1, c2, custom_merge_func=merge_env
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _safe_k8s_label_value(value):
|
||||
""" Conform string to k8s standards for a label value """
|
||||
value = value.lower().strip()
|
||||
value = re.sub(r'^[^A-Za-z0-9]+', '', value) # strip leading non-alphanumeric chars
|
||||
value = re.sub(r'[^A-Za-z0-9]+$', '', value) # strip trailing non-alphanumeric chars
|
||||
value = re.sub(r'\W+', '-', value)  # allow only word chars (this also drops ".", which k8s supports, but that is acceptable)
|
||||
value = re.sub(r'-+', '-', value) # don't leave messy "--" after replacing previous chars
|
||||
return value[:63]
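# --- Illustrative example: how a queue name is conformed into a valid k8s label value.
K8sIntegration._safe_k8s_label_value("My Queue: GPU/2xA100!")   # -> "my-queue-gpu-2xa100"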
|
||||
|
||||
@@ -24,7 +24,6 @@ import pyhocon
|
||||
import yaml
|
||||
from attr import fields_dict
|
||||
from pathlib2 import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
import six
|
||||
from six.moves import reduce
|
||||
@@ -205,10 +204,13 @@ def get_python_path(script_dir, entry_point, package_api, is_conda_env=False):
|
||||
["-c", "import sys; print('{}'.join(sys.path))".format(python_path_sep)])
|
||||
org_python_path = python_path_cmd.get_output(cwd=script_dir)
|
||||
# Add path of the script directory and executable directory
|
||||
python_path = '{}{python_path_sep}{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(),
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
python_path = '{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(), python_path_sep=python_path_sep)
|
||||
if entry_point:
|
||||
python_path += '{}{python_path_sep}'.format(
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
|
||||
if is_windows_platform():
|
||||
python_path = python_path.replace('/', '\\')
|
||||
|
||||
@@ -399,12 +401,6 @@ class TqdmStream(object):
|
||||
self.buffer.write('\n')
|
||||
|
||||
|
||||
class TqdmLog(tqdm):
|
||||
|
||||
def __init__(self, iterable=None, file=None, **kwargs):
|
||||
super(TqdmLog, self).__init__(iterable, file=TqdmStream(file or sys.stderr), **kwargs)
|
||||
|
||||
|
||||
def url_join(first, *rest):
|
||||
"""
|
||||
Join url parts similarly to Path.join
|
||||
@@ -510,6 +506,38 @@ def is_conda(config):
|
||||
return config['agent.package_manager.type'].lower() == 'conda'
|
||||
|
||||
|
||||
def convert_cuda_version_to_float_single_digit_str(cuda_version):
|
||||
"""
|
||||
Convert a cuda_version (string/float/int) into a float representation, e.g. 11.4
|
||||
Note: returns a string with a single decimal digit only (e.g. "11.4")!
|
||||
:return str:
|
||||
"""
|
||||
cuda_version = str(cuda_version or 0)
|
||||
# if we have a patch version we parse it here
|
||||
cuda_version_parts = [int(v) for v in cuda_version.split('.')]
|
||||
if len(cuda_version_parts) > 1 or cuda_version_parts[0] < 60:
|
||||
cuda_version = 10 * cuda_version_parts[0]
|
||||
if len(cuda_version_parts) > 1:
|
||||
cuda_version += float(".{:d}".format(cuda_version_parts[1]))*10
|
||||
|
||||
cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
|
||||
else:
|
||||
cuda_version = cuda_version_parts[0]
|
||||
cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
|
||||
|
||||
return cuda_version_full
|
||||
|
||||
|
||||
def convert_cuda_version_to_int_10_base_str(cuda_version):
|
||||
"""
|
||||
Convert a cuda_version (string/float/int) into an integer version, e.g. 112 for cuda 11.2
|
||||
Return string
|
||||
:return str:
|
||||
"""
|
||||
cuda_version = convert_cuda_version_to_float_single_digit_str(cuda_version)
|
||||
return str(int(float(cuda_version)*10))
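# --- Worked examples (illustrative calls, assuming the two helpers above are importable):
# convert_cuda_version_to_float_single_digit_str("11.2")      -> "11.2"
# convert_cuda_version_to_float_single_digit_str(112)         -> "11.2"  (single values >= 60 are treated as 10x encoded)
# convert_cuda_version_to_float_single_digit_str("10.1.243")  -> "10.1"  (patch version is dropped)
# convert_cuda_version_to_int_10_base_str("11.2")             -> "112"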
|
||||
|
||||
|
||||
class NonStrictAttrs(object):
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import unicode_literals, print_function
|
||||
|
||||
import csv
|
||||
import sys
|
||||
from collections import Iterable
|
||||
from collections.abc import Iterable
|
||||
from typing import List, Dict, Text, Any
|
||||
|
||||
from attr import attrs, attrib
|
||||
|
||||
@@ -1,17 +1,23 @@
|
||||
from typing import Callable, Dict, Any
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
|
||||
_not_set = object()
|
||||
|
||||
|
||||
def filter_keys(filter_, dct): # type: (Callable[[Any], bool], Dict) -> Dict
|
||||
return {key: value for key, value in dct.items() if filter_(key)}
|
||||
|
||||
|
||||
def merge_dicts(dict1, dict2):
|
||||
def merge_dicts(dict1, dict2, custom_merge_func=None):
|
||||
# type: (Any, Any, Optional[Callable[[str, Any, Any, Any], Any]]) -> Any
|
||||
""" Recursively merges dict2 into dict1 """
|
||||
if not isinstance(dict1, dict) or not isinstance(dict2, dict):
|
||||
return dict2
|
||||
for k in dict2:
|
||||
if k in dict1:
|
||||
dict1[k] = merge_dicts(dict1[k], dict2[k])
|
||||
res = None
|
||||
if custom_merge_func:
|
||||
res = custom_merge_func(k, dict1[k], dict2[k], _not_set)
|
||||
dict1[k] = merge_dicts(dict1[k], dict2[k], custom_merge_func) if res is _not_set else res
|
||||
else:
|
||||
dict1[k] = dict2[k]
|
||||
return dict1
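# --- Illustrative use of the new custom_merge_func hook (names are made up): concatenate
# --- lists under the "env" key instead of letting dict2's list replace dict1's; returning
# --- the `not_set` sentinel falls back to the default recursive merge.
def merge_env(key, v1, v2, not_set):
    if key != "env":
        return not_set
    return v1 + v2

base = {"spec": {"env": [{"name": "A", "value": "1"}], "image": "python:3.9"}}
override = {"spec": {"env": [{"name": "B", "value": "2"}]}}
merge_dicts(base, override, custom_merge_func=merge_env)
# base["spec"]["env"]   -> [{"name": "A", "value": "1"}, {"name": "B", "value": "2"}]
# base["spec"]["image"] -> "python:3.9" (untouched)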
|
||||
|
||||
@@ -20,6 +20,7 @@ import platform
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import psutil
|
||||
from ..gpu import pynvml as N
|
||||
@@ -390,3 +391,38 @@ def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||
'''
|
||||
return GPUStatCollection.new_query(shutdown=shutdown, per_process_stats=per_process_stats,
|
||||
get_driver_info=get_driver_info)
|
||||
|
||||
|
||||
def get_driver_cuda_version():
|
||||
# type: () -> Optional[str]
|
||||
"""
|
||||
:return: Return detected CUDA version from driver. On fail return value is None.
|
||||
Example: `110` is cuda version 11.0
|
||||
"""
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
N.nvmlInit()
|
||||
except BaseException:
|
||||
return None
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = str(N.nvmlSystemGetCudaDriverVersion())
|
||||
except BaseException:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
|
||||
except BaseException:
|
||||
cuda_version = ''
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
N.nvmlShutdown()
|
||||
except BaseException:
|
||||
return None
|
||||
|
||||
# for some reason we get CUDA version 11020 instead of 11200, so this is the fix
|
||||
if cuda_version and len(cuda_version) >= 4 and cuda_version[2] == '0' and cuda_version[3] != '0':
|
||||
return cuda_version[:2]+cuda_version[3]
|
||||
|
||||
return cuda_version[:3] if cuda_version else None
|
||||
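Note: a hedged sketch of how a caller might use get_driver_cuda_version(); the fallback behaviour shown here is illustrative, not the agent's actual policy.
# Illustrative usage only
driver_cuda = get_driver_cuda_version()   # e.g. '110' for CUDA 11.0, or None on failure
if driver_cuda:
    print('Driver reported CUDA version: {}'.format(driver_cuda))
else:
    print('Could not query the NVIDIA driver, assuming CPU-only mode')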
|
||||
File diff suppressed because it is too large (Load Diff)
clearml_agent/helper/os/folder_cache.py (new file, 225 lines)
@@ -0,0 +1,225 @@
|
||||
import os
|
||||
import shutil
|
||||
from logging import warning
|
||||
from random import random
|
||||
from time import time
|
||||
from typing import List, Optional, Sequence
|
||||
|
||||
import psutil
|
||||
from pathlib2 import Path
|
||||
|
||||
from .locks import FileLock
|
||||
|
||||
|
||||
class FolderCache(object):
|
||||
_lock_filename = '.clearml.lock'
|
||||
_lock_timeout_seconds = 30
|
||||
_temp_entry_prefix = '_temp.'
|
||||
|
||||
def __init__(self, cache_folder, max_cache_entries=5, min_free_space_gb=None):
|
||||
self._cache_folder = Path(os.path.expandvars(cache_folder)).expanduser().absolute()
|
||||
self._cache_folder.mkdir(parents=True, exist_ok=True)
|
||||
self._max_cache_entries = max_cache_entries
|
||||
self._last_copied_entry_folder = None
|
||||
self._min_free_space_gb = min_free_space_gb if min_free_space_gb and min_free_space_gb > 0 else None
|
||||
self._lock = FileLock((self._cache_folder / self._lock_filename).as_posix())
|
||||
|
||||
def get_cache_folder(self):
|
||||
# type: () -> Path
|
||||
"""
|
||||
:return: Return the base cache folder
|
||||
"""
|
||||
return self._cache_folder
|
||||
|
||||
def copy_cached_entry(self, keys, destination):
|
||||
# type: (List[str], Path) -> Optional[Path]
|
||||
"""
|
||||
Copy a cached entry into a destination directory, if the cached entry does not exist return None
|
||||
:param keys:
|
||||
:param destination:
|
||||
:return: Target path, None if cached entry does not exist
|
||||
"""
|
||||
self._last_copied_entry_folder = None
|
||||
if not keys:
|
||||
return None
|
||||
|
||||
# lock so we make sure no one deletes it before we copy it
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout_seconds)
|
||||
except BaseException as ex:
|
||||
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
|
||||
return None
|
||||
|
||||
src = None
|
||||
try:
|
||||
src = self.get_entry(keys)
|
||||
if src:
|
||||
destination = Path(destination).absolute()
|
||||
destination.mkdir(parents=True, exist_ok=True)
|
||||
shutil.rmtree(destination.as_posix())
|
||||
shutil.copytree(src.as_posix(), dst=destination.as_posix(), symlinks=True)
|
||||
except BaseException as ex:
|
||||
warning('Could not copy cache folder {} to {}: {}'.format(src, destination, ex))
|
||||
self._lock.release()
|
||||
return None
|
||||
|
||||
# release Lock
|
||||
self._lock.release()
|
||||
|
||||
self._last_copied_entry_folder = src
|
||||
return destination if src else None
|
||||
|
||||
def get_entry(self, keys):
|
||||
# type: (List[str]) -> Optional[Path]
|
||||
"""
|
||||
Return a folder (a sub-folder inside the cache_folder) matching one of the keys
|
||||
:param keys: List of keys, return the first match to one of the keys, notice keys cannot contain '.'
|
||||
:return: Path to the sub-folder or None if none was found
|
||||
"""
|
||||
if not keys:
|
||||
return None
|
||||
# conform keys
|
||||
keys = [keys] if isinstance(keys, str) else keys
|
||||
keys = sorted([k.replace('.', '_') for k in keys])
|
||||
for cache_folder in self._cache_folder.glob('*'):
|
||||
if cache_folder.is_dir() and any(True for k in cache_folder.name.split('.') if k in keys):
|
||||
cache_folder.touch()
|
||||
return cache_folder
|
||||
return None
|
||||
|
||||
def add_entry(self, keys, source_folder, exclude_sub_folders=None):
|
||||
# type: (List[str], Path, Optional[Sequence[str]]) -> bool
|
||||
"""
|
||||
Add a local folder into the cache, copy all sub-folders inside `source_folder`
|
||||
excluding folders matching `exclude_sub_folders` list
|
||||
:param keys: Cache entry keys list (str)
|
||||
:param source_folder: Folder to copy into the cache
|
||||
:param exclude_sub_folders: List of sub-folders to exclude from the copy operation
|
||||
:return: True if a new entry was added to the cache
|
||||
"""
|
||||
if not keys:
|
||||
return False
|
||||
|
||||
keys = [keys] if isinstance(keys, str) else keys
|
||||
keys = sorted([k.replace('.', '_') for k in keys])
|
||||
|
||||
# If entry already exists skip it
|
||||
cached_entry = self.get_entry(keys)
|
||||
if cached_entry:
|
||||
# make sure the entry contains all keys
|
||||
cached_keys = cached_entry.name.split('.')
|
||||
if set(keys) - set(cached_keys):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout_seconds)
|
||||
except BaseException as ex:
|
||||
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
|
||||
# failed locking do nothing
|
||||
return True
|
||||
keys = sorted(list(set(keys) | set(cached_keys)))
|
||||
dst = cached_entry.parent / '.'.join(keys)
|
||||
# rename
|
||||
try:
|
||||
shutil.move(src=cached_entry.as_posix(), dst=dst.as_posix())
|
||||
except BaseException as ex:
|
||||
warning('Could not rename cache entry {} to {}: {}'.format(
cached_entry.as_posix(), dst.as_posix(), ex))
|
||||
# release lock
|
||||
self._lock.release()
|
||||
return True
|
||||
|
||||
# make sure we remove old entries
|
||||
self._remove_old_entries()
|
||||
|
||||
# if we do not have enough free space, do nothing.
|
||||
if not self._check_min_free_space():
|
||||
warning('Could not add cache entry, not enough free space on drive, '
|
||||
'free space threshold {} GB. Clearing all cache entries!'.format(self._min_free_space_gb))
|
||||
self._remove_old_entries(max_cache_entries=0)
|
||||
return False
|
||||
|
||||
# create the new entry for us
|
||||
exclude_sub_folders = exclude_sub_folders or []
|
||||
source_folder = Path(source_folder).absolute()
|
||||
# create temp folder
|
||||
temp_folder = \
|
||||
self._temp_entry_prefix + \
|
||||
'{}.{}'.format(str(time()).replace('.', '_'), str(random()).replace('.', '_'))
|
||||
temp_folder = self._cache_folder / temp_folder
|
||||
temp_folder.mkdir(parents=True, exist_ok=False)
|
||||
|
||||
for f in source_folder.glob('*'):
|
||||
if f.name in exclude_sub_folders:
|
||||
continue
|
||||
if f.is_dir():
|
||||
shutil.copytree(
|
||||
src=f.as_posix(), dst=(temp_folder / f.name).as_posix(),
|
||||
symlinks=True, ignore_dangling_symlinks=True)
|
||||
else:
|
||||
shutil.copy(
|
||||
src=f.as_posix(), dst=(temp_folder / f.name).as_posix(),
|
||||
follow_symlinks=False)
|
||||
|
||||
# rename the target folder
|
||||
target_cache_folder = self._cache_folder / '.'.join(keys)
|
||||
# if we failed moving it means someone else created the cached entry before us, we can just leave
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
shutil.move(src=temp_folder.as_posix(), dst=target_cache_folder.as_posix())
|
||||
except BaseException:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
shutil.rmtree(path=temp_folder.as_posix())
|
||||
except BaseException:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_last_copied_entry(self):
|
||||
# type: () -> Optional[Path]
|
||||
"""
|
||||
:return: the last copied cached entry folder inside the cache
|
||||
"""
|
||||
return self._last_copied_entry_folder
|
||||
|
||||
def _remove_old_entries(self, max_cache_entries=None):
|
||||
# type: (Optional[int]) -> ()
|
||||
"""
|
||||
Notice we only keep self._max_cache_entries-1, assuming we will be adding a new entry soon
|
||||
:param int max_cache_entries: if not None use instead of self._max_cache_entries
|
||||
"""
|
||||
folder_entries = [(cache_folder, cache_folder.stat().st_mtime)
|
||||
for cache_folder in self._cache_folder.glob('*')
|
||||
if cache_folder.is_dir() and not cache_folder.name.startswith(self._temp_entry_prefix)]
|
||||
folder_entries = sorted(folder_entries, key=lambda x: x[1], reverse=True)
|
||||
|
||||
# lock so we make sure no one deletes it before we copy it
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout_seconds)
|
||||
except BaseException as ex:
|
||||
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
|
||||
return
|
||||
|
||||
number_of_entries_to_keep = self._max_cache_entries - 1 \
|
||||
if max_cache_entries is None else max(0, int(max_cache_entries))
|
||||
for folder, ts in folder_entries[number_of_entries_to_keep:]:
|
||||
try:
|
||||
shutil.rmtree(folder.as_posix(), ignore_errors=True)
|
||||
except BaseException as ex:
|
||||
warning('Could not delete cache entry {}: {}'.format(folder.as_posix(), ex))
|
||||
|
||||
self._lock.release()
|
||||
|
||||
def _check_min_free_space(self):
|
||||
# type: () -> bool
|
||||
"""
|
||||
:return: False if we hit the free space limit.
If no free space limit was provided, always return True
|
||||
"""
|
||||
if not self._min_free_space_gb or not self._cache_folder:
|
||||
return True
|
||||
free_space = float(psutil.disk_usage(self._cache_folder.as_posix()).free)
|
||||
free_space /= 2**30
|
||||
return free_space > self._min_free_space_gb
|
||||
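Note: to make the intended FolderCache life cycle concrete, an illustrative sketch follows; the cache folder, keys and paths are made up, and the keys are assumed to contain no '.' characters as required by get_entry/add_entry.
# Illustrative usage only
cache = FolderCache('~/.clearml/venvs-cache', max_cache_entries=5, min_free_space_gb=2.0)

keys = ['9f1c6c3b']  # e.g. a hash of requirements + python version + docker image
cache.add_entry(keys, source_folder='/tmp/freshly_built_venv', exclude_sub_folders=['__pycache__'])

restored = cache.copy_cached_entry(keys, destination='/tmp/task_venv')
if restored:
    print('Restored cached entry into {}'.format(restored))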
clearml_agent/helper/os/locks.py (new file, 211 lines)
@@ -0,0 +1,211 @@
|
||||
import os
|
||||
import time
|
||||
import tempfile
|
||||
import contextlib
|
||||
|
||||
from .portalocker import constants, exceptions, lock, unlock
|
||||
|
||||
|
||||
current_time = getattr(time, "monotonic", time.time)
|
||||
|
||||
DEFAULT_TIMEOUT = 10 ** 8
|
||||
DEFAULT_CHECK_INTERVAL = 0.25
|
||||
LOCK_METHOD = constants.LOCK_EX | constants.LOCK_NB
|
||||
|
||||
__all__ = [
|
||||
'FileLock',
|
||||
'open_atomic',
|
||||
]
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def open_atomic(filename, binary=True):
|
||||
"""Open a file for atomic writing. Instead of locking this method allows
|
||||
you to write the entire file and move it to the actual location. Note that
|
||||
this makes the assumption that a rename is atomic on your platform which
|
||||
is generally the case but not a guarantee.
|
||||
|
||||
http://docs.python.org/library/os.html#os.rename
|
||||
|
||||
>>> filename = 'test_file.txt'
|
||||
>>> if os.path.exists(filename):
|
||||
... os.remove(filename)
|
||||
|
||||
>>> with open_atomic(filename) as fh:
|
||||
... written = fh.write(b'test')
|
||||
>>> assert os.path.exists(filename)
|
||||
>>> os.remove(filename)
|
||||
|
||||
"""
|
||||
assert not os.path.exists(filename), '%r exists' % filename
|
||||
path, name = os.path.split(filename)
|
||||
|
||||
# Create the parent directory if it doesn't exist
|
||||
if path and not os.path.isdir(path): # pragma: no cover
|
||||
os.makedirs(path)
|
||||
|
||||
temp_fh = tempfile.NamedTemporaryFile(
|
||||
mode=binary and 'wb' or 'w',
|
||||
dir=path,
|
||||
delete=False,
|
||||
)
|
||||
yield temp_fh
|
||||
temp_fh.flush()
|
||||
os.fsync(temp_fh.fileno())
|
||||
temp_fh.close()
|
||||
try:
|
||||
os.rename(temp_fh.name, filename)
|
||||
finally:
|
||||
try:
|
||||
os.remove(temp_fh.name)
|
||||
except Exception: # noqa
|
||||
pass
|
||||
|
||||
|
||||
class FileLock(object):
|
||||
|
||||
def __init__(
|
||||
self, filename, mode='a', timeout=DEFAULT_TIMEOUT,
|
||||
check_interval=DEFAULT_CHECK_INTERVAL, fail_when_locked=False,
|
||||
flags=LOCK_METHOD, **file_open_kwargs):
|
||||
"""Lock manager with build-in timeout
|
||||
|
||||
filename -- filename
|
||||
mode -- the open mode, 'a' or 'ab' should be used for writing
|
||||
truncate -- use truncate to emulate 'w' mode, None is disabled, 0 is
|
||||
truncate to 0 bytes
|
||||
timeout -- timeout when trying to acquire a lock
|
||||
check_interval -- check interval while waiting
|
||||
fail_when_locked -- after the initial lock failed, return an error
|
||||
or lock the file
|
||||
**file_open_kwargs -- The kwargs for the `open(...)` call
|
||||
|
||||
fail_when_locked is useful when multiple threads/processes can race
when creating a file. If set to true, the system will not keep retrying
until the timeout; an AlreadyLocked exception is raised as soon as the
initial lock attempt fails.
|
||||
|
||||
Note that the file is opened first and locked later. So using 'w' as
|
||||
mode will result in truncate _BEFORE_ the lock is checked.
|
||||
"""
|
||||
|
||||
if 'w' in mode:
|
||||
truncate = True
|
||||
mode = mode.replace('w', 'a')
|
||||
else:
|
||||
truncate = False
|
||||
|
||||
self.fh = None
|
||||
self.filename = filename
|
||||
self.mode = mode
|
||||
self.truncate = truncate
|
||||
self.timeout = timeout
|
||||
self.check_interval = check_interval
|
||||
self.fail_when_locked = fail_when_locked
|
||||
self.flags = flags
|
||||
self.file_open_kwargs = file_open_kwargs
|
||||
|
||||
def acquire(
|
||||
self, timeout=None, check_interval=None, fail_when_locked=None):
|
||||
"""Acquire the locked filehandle"""
|
||||
if timeout is None:
|
||||
timeout = self.timeout
|
||||
if timeout is None:
|
||||
timeout = 0
|
||||
|
||||
if check_interval is None:
|
||||
check_interval = self.check_interval
|
||||
|
||||
if fail_when_locked is None:
|
||||
fail_when_locked = self.fail_when_locked
|
||||
|
||||
# If we already have a filehandle, return it
|
||||
fh = self.fh
|
||||
if fh:
|
||||
return fh
|
||||
|
||||
# Get a new filehandler
|
||||
fh = self._get_fh()
|
||||
try:
|
||||
# Try to lock
|
||||
fh = self._get_lock(fh)
|
||||
except exceptions.LockException as exception:
|
||||
# Try till the timeout has passed
|
||||
timeoutend = current_time() + timeout
|
||||
while timeoutend > current_time():
|
||||
# Wait a bit
|
||||
time.sleep(check_interval)
|
||||
|
||||
# Try again
|
||||
try:
|
||||
|
||||
# We already tried to get the lock
|
||||
# If fail_when_locked is true, then stop trying
|
||||
if fail_when_locked:
|
||||
raise exceptions.AlreadyLocked(exception)
|
||||
|
||||
else: # pragma: no cover
|
||||
# We've got the lock
|
||||
fh = self._get_lock(fh)
|
||||
break
|
||||
|
||||
except exceptions.LockException:
|
||||
pass
|
||||
|
||||
else:
|
||||
# We got a timeout... reraising
|
||||
raise exceptions.LockException(exception)
|
||||
|
||||
# Prepare the filehandle (truncate if needed)
|
||||
fh = self._prepare_fh(fh)
|
||||
|
||||
self.fh = fh
|
||||
return fh
|
||||
|
||||
def release(self):
|
||||
"""Releases the currently locked file handle"""
|
||||
if self.fh:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
unlock(self.fh)
|
||||
except Exception:
|
||||
pass
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self.fh.close()
|
||||
except Exception:
|
||||
pass
|
||||
self.fh = None
|
||||
|
||||
def _get_fh(self):
|
||||
"""Get a new filehandle"""
|
||||
return open(self.filename, self.mode, **self.file_open_kwargs)
|
||||
|
||||
def _get_lock(self, fh):
|
||||
"""
|
||||
Try to lock the given filehandle
|
||||
|
||||
raises LockException if it fails"""
|
||||
lock(fh, self.flags)
|
||||
return fh
|
||||
|
||||
def _prepare_fh(self, fh):
|
||||
"""
|
||||
Prepare the filehandle for usage
|
||||
|
||||
If truncate is set, the file is truncated to 0 bytes
|
||||
"""
|
||||
if self.truncate:
|
||||
fh.seek(0)
|
||||
fh.truncate(0)
|
||||
|
||||
return fh
|
||||
|
||||
def __enter__(self):
|
||||
return self.acquire()
|
||||
|
||||
def __exit__(self, type_, value, tb):
|
||||
self.release()
|
||||
|
||||
def __delete__(self, instance): # pragma: no cover
|
||||
instance.release()
|
||||
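Note: FileLock also implements the context-manager protocol (__enter__/__exit__ above), so a hedged usage sketch looks like this; the lock file path and timeout are illustrative and the parent directory is assumed to exist.
# Illustrative usage only
lock = FileLock('/tmp/shared_cache/.clearml.lock', timeout=30)
try:
    lock.acquire()
    # ... safely read or modify the shared resource here ...
finally:
    lock.release()

# or equivalently, letting the context manager acquire and release the lock
with FileLock('/tmp/shared_cache/.clearml.lock'):
    pass  # the lock is released automatically on exit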
clearml_agent/helper/os/portalocker.py (new file, 193 lines)
@@ -0,0 +1,193 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
class exceptions:
|
||||
class BaseLockException(Exception):
|
||||
# Error codes:
|
||||
LOCK_FAILED = 1
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.fh = kwargs.pop('fh', None)
|
||||
Exception.__init__(self, *args, **kwargs)
|
||||
|
||||
class LockException(BaseLockException):
|
||||
pass
|
||||
|
||||
class AlreadyLocked(BaseLockException):
|
||||
pass
|
||||
|
||||
class FileToLarge(BaseLockException):
|
||||
pass
|
||||
|
||||
|
||||
class constants:
|
||||
# The actual tests will execute the code anyhow so the following code can
|
||||
# safely be ignored from the coverage tests
|
||||
if os.name == 'nt': # pragma: no cover
|
||||
import msvcrt
|
||||
|
||||
LOCK_EX = 0x1 #: exclusive lock
|
||||
LOCK_SH = 0x2 #: shared lock
|
||||
LOCK_NB = 0x4 #: non-blocking
|
||||
LOCK_UN = msvcrt.LK_UNLCK #: unlock
|
||||
|
||||
LOCKFILE_FAIL_IMMEDIATELY = 1
|
||||
LOCKFILE_EXCLUSIVE_LOCK = 2
|
||||
|
||||
elif os.name == 'posix': # pragma: no cover
|
||||
import fcntl
|
||||
|
||||
LOCK_EX = fcntl.LOCK_EX #: exclusive lock
|
||||
LOCK_SH = fcntl.LOCK_SH #: shared lock
|
||||
LOCK_NB = fcntl.LOCK_NB #: non-blocking
|
||||
LOCK_UN = fcntl.LOCK_UN #: unlock
|
||||
|
||||
else: # pragma: no cover
|
||||
raise RuntimeError('PortaLocker only defined for nt and posix platforms')
|
||||
|
||||
|
||||
if os.name == 'nt': # pragma: no cover
|
||||
import msvcrt
|
||||
|
||||
if sys.version_info.major == 2:
|
||||
lock_length = -1
|
||||
else:
|
||||
lock_length = int(2**31 - 1)
|
||||
|
||||
def lock(file_, flags):
|
||||
if flags & constants.LOCK_SH:
|
||||
import win32file
|
||||
import pywintypes
|
||||
import winerror
|
||||
__overlapped = pywintypes.OVERLAPPED()
|
||||
if sys.version_info.major == 2:
|
||||
if flags & constants.LOCK_NB:
|
||||
mode = constants.LOCKFILE_FAIL_IMMEDIATELY
|
||||
else:
|
||||
mode = 0
|
||||
|
||||
else:
|
||||
if flags & constants.LOCK_NB:
|
||||
mode = msvcrt.LK_NBRLCK
|
||||
else:
|
||||
mode = msvcrt.LK_RLCK
|
||||
|
||||
# is there any reason not to reuse the following structure?
|
||||
hfile = win32file._get_osfhandle(file_.fileno())
|
||||
try:
|
||||
win32file.LockFileEx(hfile, mode, 0, -0x10000, __overlapped)
|
||||
except pywintypes.error as exc_value:
|
||||
# error: (33, 'LockFileEx', 'The process cannot access the file
|
||||
# because another process has locked a portion of the file.')
|
||||
if exc_value.winerror == winerror.ERROR_LOCK_VIOLATION:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED,
|
||||
exc_value.strerror,
|
||||
fh=file_)
|
||||
else:
|
||||
# Q: Are there exceptions/codes we should be dealing with
|
||||
# here?
|
||||
raise
|
||||
else:
|
||||
mode = constants.LOCKFILE_EXCLUSIVE_LOCK
|
||||
if flags & constants.LOCK_NB:
|
||||
mode |= constants.LOCKFILE_FAIL_IMMEDIATELY
|
||||
|
||||
if flags & constants.LOCK_NB:
|
||||
mode = msvcrt.LK_NBLCK
|
||||
else:
|
||||
mode = msvcrt.LK_LOCK
|
||||
|
||||
# windows locks byte ranges, so make sure to lock from file start
|
||||
try:
|
||||
savepos = file_.tell()
|
||||
if savepos:
|
||||
# [ ] test exclusive lock fails on seek here
|
||||
# [ ] test if shared lock passes this point
|
||||
file_.seek(0)
|
||||
# [x] check if 0 param locks entire file (not documented in
|
||||
# Python)
|
||||
# [x] fails with "IOError: [Errno 13] Permission denied",
|
||||
# but -1 seems to do the trick
|
||||
|
||||
try:
|
||||
msvcrt.locking(file_.fileno(), mode, lock_length)
|
||||
except IOError as exc_value:
|
||||
# [ ] be more specific here
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED,
|
||||
exc_value.strerror,
|
||||
fh=file_)
|
||||
finally:
|
||||
if savepos:
|
||||
file_.seek(savepos)
|
||||
except IOError as exc_value:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED, exc_value.strerror,
|
||||
fh=file_)
|
||||
|
||||
def unlock(file_):
|
||||
try:
|
||||
savepos = file_.tell()
|
||||
if savepos:
|
||||
file_.seek(0)
|
||||
|
||||
try:
|
||||
msvcrt.locking(file_.fileno(), constants.LOCK_UN, lock_length)
|
||||
except IOError as exc_value:
|
||||
if exc_value.strerror == 'Permission denied':
|
||||
import pywintypes
|
||||
import win32file
|
||||
import winerror
|
||||
__overlapped = pywintypes.OVERLAPPED()
|
||||
hfile = win32file._get_osfhandle(file_.fileno())
|
||||
try:
|
||||
win32file.UnlockFileEx(
|
||||
hfile, 0, -0x10000, __overlapped)
|
||||
except pywintypes.error as exc_value:
|
||||
if exc_value.winerror == winerror.ERROR_NOT_LOCKED:
|
||||
# error: (158, 'UnlockFileEx',
|
||||
# 'The segment is already unlocked.')
|
||||
# To match the 'posix' implementation, silently
|
||||
# ignore this error
|
||||
pass
|
||||
else:
|
||||
# Q: Are there exceptions/codes we should be
|
||||
# dealing with here?
|
||||
raise
|
||||
else:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED,
|
||||
exc_value.strerror,
|
||||
fh=file_)
|
||||
finally:
|
||||
if savepos:
|
||||
file_.seek(savepos)
|
||||
except IOError as exc_value:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED, exc_value.strerror,
|
||||
fh=file_)
|
||||
|
||||
elif os.name == 'posix': # pragma: no cover
|
||||
import fcntl
|
||||
|
||||
def lock(file_, flags):
|
||||
locking_exceptions = IOError,
|
||||
try: # pragma: no cover
|
||||
locking_exceptions += BlockingIOError,
|
||||
except NameError: # pragma: no cover
|
||||
pass
|
||||
|
||||
try:
|
||||
fcntl.flock(file_.fileno(), flags)
|
||||
except locking_exceptions as exc_value:
|
||||
# The exception code varies on different systems so we'll catch
|
||||
# every IO error
|
||||
raise exceptions.LockException(exc_value, fh=file_)
|
||||
|
||||
def unlock(file_):
|
||||
fcntl.flock(file_.fileno(), constants.LOCK_UN)
|
||||
|
||||
else: # pragma: no cover
|
||||
raise RuntimeError('PortaLocker only defined for nt and posix platforms')
|
||||
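Note: the lock/unlock pair above is what FileLock builds on; a POSIX-flavoured, illustrative sketch of calling it directly (the file name is made up, and on Windows the unlock path differs slightly).
# Illustrative usage only: take a non-blocking exclusive lock on an open file
fh = open('/tmp/example.lock', 'a')
try:
    lock(fh, constants.LOCK_EX | constants.LOCK_NB)  # raises exceptions.LockException if busy
    fh.write('locked\n')
except exceptions.LockException:
    print('someone else holds the lock')
finally:
    unlock(fh)
    fh.close()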
@@ -1,11 +1,18 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
from typing import Text, Iterable, Union
|
||||
from hashlib import md5
|
||||
from typing import Text, Iterable, Union, Optional, Dict, List
|
||||
|
||||
import six
|
||||
from pathlib2 import Path
|
||||
|
||||
from clearml_agent.definitions import ENV_VENV_CACHE_PATH
|
||||
from clearml_agent.helper.base import mkstemp, safe_remove_file, join_lines, select_for_platform
|
||||
from clearml_agent.helper.console import ensure_binary
|
||||
from clearml_agent.helper.os.folder_cache import FolderCache
|
||||
from clearml_agent.helper.process import Executable, Argv, PathLike
|
||||
|
||||
|
||||
@@ -18,6 +25,12 @@ class PackageManager(object):
|
||||
_selected_manager = None
|
||||
_cwd = None
|
||||
_pip_version = None
|
||||
_config_cache_folder = 'agent.venvs_cache.path'
|
||||
_config_cache_max_entries = 'agent.venvs_cache.max_entries'
|
||||
_config_cache_free_space_threshold = 'agent.venvs_cache.free_space_threshold_gb'
|
||||
|
||||
def __init__(self):
|
||||
self._cache_manager = None
|
||||
|
||||
@abc.abstractproperty
|
||||
def bin(self):
|
||||
@@ -153,3 +166,103 @@ class PackageManager(object):
|
||||
@classmethod
|
||||
def get_pip_version(cls):
|
||||
return cls._pip_version or ''
|
||||
|
||||
def get_cached_venv(self, requirements, docker_cmd, python_version, cuda_version, destination_folder):
|
||||
# type: (Dict, Optional[Union[dict, str]], Optional[str], Optional[str], Path) -> Optional[Path]
|
||||
"""
|
||||
Copy a cached copy of the venv (based on the requirements) into destination_folder.
|
||||
Return None on failure or if the cached entry does not exist
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return None
|
||||
|
||||
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
|
||||
return self._get_cache_manager().copy_cached_entry(keys, destination_folder)
|
||||
|
||||
def add_cached_venv(
|
||||
self,
|
||||
requirements, # type: Union[Dict, List[Dict]]
|
||||
docker_cmd, # type: Optional[Union[dict, str]]
|
||||
python_version, # type: Optional[str]
|
||||
cuda_version, # type: Optional[str]
|
||||
source_folder, # type: Path
|
||||
exclude_sub_folders=None # type: Optional[List[str]]
|
||||
):
|
||||
# type: (...) -> ()
|
||||
"""
|
||||
Copy the local venv folder into the venv cache (keys are based on the requirements+python+docker).
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
|
||||
return self._get_cache_manager().add_entry(
|
||||
keys=keys, source_folder=source_folder, exclude_sub_folders=exclude_sub_folders)
|
||||
|
||||
def get_cache_folder(self):
|
||||
# type: () -> Optional[Path]
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
return self._get_cache_manager().get_cache_folder()
|
||||
|
||||
def get_last_used_entry_cache(self):
|
||||
# type: () -> Optional[Path]
|
||||
"""
|
||||
:return: the last used cached folder entry
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
return self._get_cache_manager().get_last_copied_entry()
|
||||
|
||||
@classmethod
|
||||
def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
|
||||
# type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
|
||||
requirements_list = requirements_list or dict()
|
||||
if not isinstance(requirements_list, (list, tuple)):
|
||||
requirements_list = [requirements_list]
|
||||
docker_cmd = dict(docker_cmd=docker_cmd) if isinstance(docker_cmd, str) else docker_cmd or dict()
|
||||
docker_cmd = OrderedDict(sorted(docker_cmd.items(), key=lambda t: t[0]))
|
||||
if 'docker_cmd' in docker_cmd:
|
||||
# we only take the first part of the docker_cmd which is the docker image name
|
||||
docker_cmd['docker_cmd'] = docker_cmd['docker_cmd'].strip('\r\n\t ').split(' ')[0]
|
||||
|
||||
keys = []
|
||||
strip_chars = '\n\r\t '
|
||||
for requirements in requirements_list:
|
||||
pip, conda = ('pip', 'conda')
|
||||
pip_reqs = requirements.get(pip, '')
|
||||
conda_reqs = requirements.get(conda, '')
|
||||
if isinstance(pip_reqs, str):
|
||||
pip_reqs = pip_reqs.split('\n')
|
||||
if isinstance(conda_reqs, str):
|
||||
conda_reqs = conda_reqs.split('\n')
|
||||
pip_reqs = sorted([p.strip(strip_chars) for p in pip_reqs
|
||||
if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
|
||||
conda_reqs = sorted([p.strip(strip_chars) for p in conda_reqs
|
||||
if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
|
||||
if not pip_reqs and not conda_reqs:
|
||||
continue
|
||||
# do not process "-r" or "--requirement" because we cannot know what we have in the git repo.
|
||||
if any(r.strip().startswith('-r ') or r.strip().startswith('--requirement ') for r in pip_reqs):
|
||||
continue
|
||||
hash_text = '{class_type}\n{docker_cmd}\n{cuda_ver}\n{python_version}\n{pip_reqs}\n{conda_reqs}'.format(
|
||||
class_type=str(cls),
|
||||
docker_cmd=str(docker_cmd or ''),
|
||||
cuda_ver=str(cuda_version or ''),
|
||||
python_version=str(python_version or ''),
|
||||
pip_reqs=str(pip_reqs or ''),
|
||||
conda_reqs=str(conda_reqs or ''),
|
||||
)
|
||||
keys.append(md5(ensure_binary(hash_text)).hexdigest())
|
||||
return sorted(list(set(keys)))
|
||||
|
||||
def _get_cache_manager(self):
|
||||
if not self._cache_manager:
|
||||
cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
|
||||
if not cache_folder:
|
||||
return None
|
||||
|
||||
max_entries = int(self.session.config.get(self._config_cache_max_entries, 10))
|
||||
free_space_threshold = float(self.session.config.get(self._config_cache_free_space_threshold, 0))
|
||||
self._cache_manager = FolderCache(
|
||||
cache_folder, max_cache_entries=max_entries, min_free_space_gb=free_space_threshold)
|
||||
return self._cache_manager
|
||||
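Note: putting the venv-cache hooks together, an illustrative sketch of the flow a concrete PackageManager subclass could follow; 'manager' stands for such an instance, and the requirements, docker image and paths are made up.
# Illustrative usage only ('manager' is a concrete PackageManager subclass instance)
requirements = {'pip': 'numpy==1.21.0\nrequests>=2.25'}
cached = manager.get_cached_venv(
    requirements=requirements, docker_cmd='nvidia/cuda:11.2-runtime',
    python_version='3.8', cuda_version='112', destination_folder=Path('/tmp/task_venv'))
if not cached:
    # ... build the virtual environment in /tmp/task_venv and install the requirements ...
    manager.add_cached_venv(
        requirements=requirements, docker_cmd='nvidia/cuda:11.2-runtime',
        python_version='3.8', cuda_version='112',
        source_folder=Path('/tmp/task_venv'), exclude_sub_folders=['__pycache__'])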
|
||||
@@ -19,7 +19,9 @@ from clearml_agent.external.requirements_parser import parse
|
||||
from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
|
||||
from clearml_agent.errors import CommandFailedError
|
||||
from clearml_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo
|
||||
from clearml_agent.helper.base import (
|
||||
rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo,
|
||||
convert_cuda_version_to_float_single_digit_str, convert_cuda_version_to_int_10_base_str, )
|
||||
from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
|
||||
from clearml_agent.helper.package.requirements import SimpleVersion
|
||||
from clearml_agent.session import Session
|
||||
@@ -69,6 +71,7 @@ class CondaAPI(PackageManager):
|
||||
:param python: base python version to use (e.g python3.6)
|
||||
:param path: path of env
|
||||
"""
|
||||
super(CondaAPI, self).__init__()
|
||||
self.session = session
|
||||
self.python = python
|
||||
self.source = None
|
||||
@@ -140,19 +143,7 @@ class CondaAPI(PackageManager):
|
||||
"""
|
||||
if self.conda_env_as_base_docker and self.conda_pre_build_env_path:
|
||||
if Path(self.conda_pre_build_env_path).is_dir():
|
||||
print("Using pre-existing Conda environment from {}".format(self.conda_pre_build_env_path))
|
||||
self.path = Path(self.conda_pre_build_env_path)
|
||||
self.source = ("conda", "activate", self.path.as_posix())
|
||||
self.pip = CondaPip(
|
||||
session=self.session,
|
||||
source=self.source,
|
||||
python=self.python,
|
||||
requirements_manager=self.requirements_manager,
|
||||
path=self.path,
|
||||
)
|
||||
conda_env = self._get_conda_sh()
|
||||
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
|
||||
self.env_read_only = True
|
||||
self._init_existing_environment(self.conda_pre_build_env_path)
|
||||
return self
|
||||
elif Path(self.conda_pre_build_env_path).is_file():
|
||||
print("Restoring Conda environment from {}".format(self.conda_pre_build_env_path))
|
||||
@@ -178,7 +169,7 @@ class CondaAPI(PackageManager):
|
||||
raise ValueError("Could not restore Conda environment, cannot find {}".format(
|
||||
self.conda_pre_build_env_path))
|
||||
|
||||
output = Argv(
|
||||
command = Argv(
|
||||
self.conda,
|
||||
"create",
|
||||
"--yes",
|
||||
@@ -186,7 +177,9 @@ class CondaAPI(PackageManager):
|
||||
"--prefix",
|
||||
self.path,
|
||||
"python={}".format(self.python),
|
||||
).get_output(stderr=DEVNULL)
|
||||
)
|
||||
print('Executing Conda: {}'.format(command.serialize()))
|
||||
output = command.get_output(stderr=DEVNULL)
|
||||
match = re.search(
|
||||
r"\W*(.*activate) ({})".format(re.escape(str(self.path))), output
|
||||
)
|
||||
@@ -200,16 +193,23 @@ class CondaAPI(PackageManager):
|
||||
if conda_env.is_file() and not is_windows_platform():
|
||||
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
|
||||
|
||||
# install cuda toolkit
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = float(int(self.session.config['agent.cuda_version'])) / 10.0
|
||||
if cuda_version > 0:
|
||||
self._install('cudatoolkit={:.1f}'.format(cuda_version))
|
||||
except Exception:
|
||||
pass
|
||||
return self
|
||||
|
||||
def _init_existing_environment(self, conda_pre_build_env_path):
|
||||
print("Using pre-existing Conda environment from {}".format(conda_pre_build_env_path))
|
||||
self.path = Path(conda_pre_build_env_path)
|
||||
self.source = ("conda", "activate", self.path.as_posix())
|
||||
self.pip = CondaPip(
|
||||
session=self.session,
|
||||
source=self.source,
|
||||
python=self.python,
|
||||
requirements_manager=self.requirements_manager,
|
||||
path=self.path,
|
||||
)
|
||||
conda_env = self._get_conda_sh()
|
||||
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
|
||||
self.env_read_only = True
|
||||
|
||||
def remove(self):
|
||||
"""
|
||||
Delete a conda environment.
|
||||
@@ -452,9 +452,18 @@ class CondaAPI(PackageManager):
|
||||
requirements['conda'] = requirements['conda'].split('\n')
|
||||
has_torch = False
|
||||
has_matplotlib = False
|
||||
has_cudatoolkit = False
|
||||
cuda_version_full = 0
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = int(self.session.config.get('agent.cuda_version', 0))
|
||||
except:
|
||||
# notice this is an integer version: 112 (means 11.2)
|
||||
cuda_version = str(self.session.config.get('agent.cuda_version', "")).strip()
|
||||
if not cuda_version:
|
||||
cuda_version = 0
|
||||
else:
|
||||
cuda_version_full = convert_cuda_version_to_float_single_digit_str(cuda_version)
|
||||
cuda_version = int(convert_cuda_version_to_int_10_base_str(cuda_version))
|
||||
except Exception:
|
||||
cuda_version = 0
|
||||
|
||||
# notice 'conda' entry with empty string is a valid conda requirements list, it means pip only
|
||||
@@ -471,6 +480,7 @@ class CondaAPI(PackageManager):
|
||||
continue
|
||||
|
||||
m = MarkerRequirement(marker[0])
|
||||
m.validate_local_file_ref()
|
||||
# conda does not support version control links
|
||||
if m.vcs:
|
||||
pip_requirements.append(m)
|
||||
@@ -484,6 +494,19 @@ class CondaAPI(PackageManager):
|
||||
if '.' not in m.specs[0][1]:
|
||||
continue
|
||||
|
||||
if m.name.lower() == 'cudatoolkit':
|
||||
# skip cuda if we are running on CPU
|
||||
if not cuda_version:
|
||||
continue
|
||||
|
||||
has_cudatoolkit = True
|
||||
# cuda version, only major.minor
|
||||
requested_cuda_version = '.'.join(m.specs[0][1].split('.')[:2])
|
||||
# make sure that the cuda_version we support can install the requested cuda (major version)
|
||||
if int(float(requested_cuda_version)) > int(float(cuda_version)/10.0):
|
||||
continue
|
||||
m.specs = [(m.specs[0][0], str(requested_cuda_version)), ]
|
||||
|
||||
conda_supported_req_names.append(m.name.lower())
|
||||
if m.req.name.lower() == 'matplotlib':
|
||||
has_matplotlib = True
|
||||
@@ -500,7 +523,14 @@ class CondaAPI(PackageManager):
|
||||
|
||||
reqs.append(m)
|
||||
|
||||
if not has_cudatoolkit and cuda_version:
|
||||
m = MarkerRequirement(Requirement.parse("cudatoolkit == {}".format(cuda_version_full)))
|
||||
has_cudatoolkit = True
|
||||
reqs.append(m)
|
||||
|
||||
# if we have a conda list, the rest should be installed with pip,
|
||||
# this means any experiment that was executed with a pip environment,
|
||||
# will be installed using pip
|
||||
if requirements.get('conda', None) is not None:
|
||||
for r in requirements['pip']:
|
||||
try:
|
||||
@@ -511,10 +541,10 @@ class CondaAPI(PackageManager):
|
||||
continue
|
||||
|
||||
m = MarkerRequirement(marker[0])
|
||||
# skip over local files (we cannot change the version to a local file)
|
||||
if m.local_file:
|
||||
continue
|
||||
m_name = m.name.lower()
|
||||
# remove local files reference if it does not exist (leave the package name)
|
||||
m.validate_local_file_ref()
|
||||
|
||||
m_name = (m.name or '').lower()
|
||||
if m_name in conda_supported_req_names:
|
||||
# this package is in the conda list,
|
||||
# make sure that if we changed version and we match it in conda
|
||||
@@ -551,10 +581,14 @@ class CondaAPI(PackageManager):
|
||||
# conform conda packages (version/name)
|
||||
for r in reqs:
|
||||
# change _ to - in name but not the prefix _ (as this is conda prefix)
|
||||
if not r.name.startswith('_') and not requirements.get('conda', None):
|
||||
if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
|
||||
r.name = r.name.replace('_', '-')
|
||||
# remove .post from version numbers, it fails ~= version, and change == to ~=
|
||||
if r.specs and r.specs[0]:
|
||||
|
||||
if has_cudatoolkit and r.specs and len(r.specs[0]) > 1 and r.name == 'cudatoolkit':
|
||||
# select specific cuda version if it came from the requirements
|
||||
r.specs = [(r.specs[0][0].replace('==', '='), r.specs[0][1].split('.post')[0])]
|
||||
elif r.specs and r.specs[0] and len(r.specs[0]) > 1:
|
||||
# remove .post from version numbers, it fails with ~= version, and change == to ~=
|
||||
r.specs = [(r.specs[0][0].replace('==', '~='), r.specs[0][1].split('.post')[0])]
|
||||
|
||||
while reqs:
|
||||
@@ -567,6 +601,8 @@ class CondaAPI(PackageManager):
|
||||
conda_env['dependencies'] = [clean_ver(r) for r in reqs]
|
||||
with self.temp_file("conda_env", yaml.dump(conda_env), suffix=".yml") as name:
|
||||
print('Conda: Trying to install requirements:\n{}'.format(conda_env['dependencies']))
|
||||
if self.session.debug_mode:
|
||||
print('{}:\n{}'.format(name, yaml.dump(conda_env)))
|
||||
result = self._run_command(
|
||||
("env", "update", "-p", self.path, "--file", name)
|
||||
)
|
||||
@@ -597,6 +633,8 @@ class CondaAPI(PackageManager):
|
||||
pip_req_str = [r.tostr() for r in pip_requirements if r.name not in ('pip', 'virtualenv', )]
|
||||
print('Conda: Installing requirements: step 2 - using pip:\n{}'.format(pip_req_str))
|
||||
PackageManager._selected_manager = self.pip
|
||||
if self.session.debug_mode:
|
||||
print('pip requirements.txt:\n{}'.format('\n'.join(pip_req_str)))
|
||||
self.pip.load_requirements({'pip': '\n'.join(pip_req_str)})
|
||||
except Exception as e:
|
||||
print(e)
|
||||
@@ -640,12 +678,16 @@ class CondaAPI(PackageManager):
|
||||
ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
|
||||
return ansi_escape.sub('', line)
|
||||
|
||||
# make sure we are not running it with our own PYTHONPATH
|
||||
env = dict(**os.environ)
|
||||
env.pop('PYTHONPATH', None)
|
||||
|
||||
command = Argv(*command) # type: Executable
|
||||
if not raw:
|
||||
command = (self.conda,) + command + ("--quiet", "--json")
|
||||
try:
|
||||
print('Executing Conda: {}'.format(command.serialize()))
|
||||
result = command.get_output(stdin=DEVNULL, **kwargs)
|
||||
result = command.get_output(stdin=DEVNULL, env=env, **kwargs)
|
||||
if self.session.debug_mode:
|
||||
print(result)
|
||||
except Exception as e:
|
||||
@@ -665,6 +707,8 @@ class CondaAPI(PackageManager):
|
||||
return result
|
||||
|
||||
def get_python_command(self, extra=()):
|
||||
if not self.source:
|
||||
self._init_existing_environment(self.path)
|
||||
return CommandSequence(self.source, self.pip.get_python_command(extra=extra))
|
||||
|
||||
def _get_conda_sh(self):
|
||||
|
||||
@@ -2,6 +2,8 @@ import re
|
||||
from collections import OrderedDict
|
||||
from typing import Text
|
||||
|
||||
from pathlib2 import Path
|
||||
|
||||
from .base import PackageManager
|
||||
from .requirements import SimpleSubstitution
|
||||
from ..base import safe_furl as furl
|
||||
@@ -10,13 +12,27 @@ from ..base import safe_furl as furl
|
||||
class ExternalRequirements(SimpleSubstitution):
|
||||
|
||||
name = "external_link"
|
||||
cwd = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(ExternalRequirements, self).__init__(*args, **kwargs)
|
||||
self.post_install_req = []
|
||||
self.post_install_req_lookup = OrderedDict()
|
||||
self.post_install_local_req_lookup = OrderedDict()
|
||||
|
||||
def match(self, req):
|
||||
# match local folder building:
|
||||
if self.is_local_folder_package(req):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
folder_path = req.req.line.strip().split('#')[0].strip()
|
||||
if self.cwd and not Path(folder_path).is_absolute():
|
||||
folder_path = (Path(self.cwd) / Path(folder_path)).absolute().as_posix()
|
||||
self.post_install_local_req_lookup['file://{}'.format(folder_path)] = req.req.line
|
||||
except Exception:
|
||||
pass
|
||||
return True
|
||||
|
||||
# match both editable or code or unparsed
|
||||
if not (not req.name or req.req and (req.req.editable or req.req.vcs)):
|
||||
return False
|
||||
@@ -35,31 +51,7 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
except:
|
||||
freeze_base = ''
|
||||
|
||||
req_line = req.tostr(markers=False)
|
||||
if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
|
||||
req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
|
||||
|
||||
if req.req.vcs and req_line.startswith('git+'):
|
||||
try:
|
||||
url_no_frag = furl(req_line)
|
||||
url_no_frag.set(fragment=None)
|
||||
# reverse replace
|
||||
fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
|
||||
vcs_url = req_line[4:]
|
||||
# reverse replace
|
||||
vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
|
||||
from ..repo import Git
|
||||
vcs = Git(session=session, url=vcs_url, location=None, revision=None)
|
||||
vcs._set_ssh_url()
|
||||
new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)
|
||||
if new_req_line != req_line:
|
||||
furl_line = furl(new_req_line)
|
||||
print('Replacing original pip vcs \'{}\' with \'{}\''.format(
|
||||
req_line,
|
||||
furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
|
||||
req_line = new_req_line
|
||||
except Exception:
|
||||
print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
|
||||
req_line = self._add_vcs_credentials(req, session)
|
||||
|
||||
# if we have older pip version we have to make sure we replace back the package name with the
|
||||
# git repository link. In new versions this is supported and we get "package @ git+https://..."
|
||||
@@ -79,6 +71,43 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
if not PackageManager.out_of_scope_install_package(req_line):
|
||||
raise ValueError("Failed installing GIT/HTTPs package \'{}\'".format(req_line))
|
||||
|
||||
@staticmethod
|
||||
def _add_vcs_credentials(req, session):
|
||||
req_line = req.tostr(markers=False)
|
||||
if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
|
||||
req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
|
||||
if req.req.vcs and req_line.startswith('git+'):
|
||||
try:
|
||||
url_no_frag = furl(req_line)
|
||||
url_no_frag.set(fragment=None)
|
||||
# reverse replace
|
||||
fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
|
||||
vcs_url = req_line[4:]
|
||||
# reverse replace
|
||||
vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
|
||||
# remove ssh:// or git:// prefix for git detection and credentials
|
||||
scheme = ''
|
||||
if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
|
||||
scheme = 'ssh://' # notice git:// is actually ssh://
|
||||
vcs_url = vcs_url[6:]
|
||||
|
||||
from ..repo import Git
|
||||
vcs = Git(session=session, url=vcs_url, location=None, revision=None)
|
||||
vcs._set_ssh_url()
|
||||
new_req_line = 'git+{}{}{}'.format(
|
||||
'' if scheme and '://' in vcs.url else scheme,
|
||||
vcs.url_with_auth, fragment
|
||||
)
|
||||
if new_req_line != req_line:
|
||||
furl_line = furl(new_req_line)
|
||||
print('Replacing original pip vcs \'{}\' with \'{}\''.format(
|
||||
req_line,
|
||||
furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
|
||||
req_line = new_req_line
|
||||
except Exception:
|
||||
print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
|
||||
return req_line
|
||||
|
||||
def replace(self, req):
|
||||
"""
|
||||
Replace a requirement
|
||||
@@ -103,4 +132,48 @@ class ExternalRequirements(SimpleSubstitution):
|
||||
if r not in self.post_install_req_lookup]
|
||||
list_of_requirements[k] += [self.post_install_req_lookup.get(r, '')
|
||||
for r in self.post_install_req_lookup.keys() if r in original_requirements]
|
||||
|
||||
if self.post_install_local_req_lookup:
|
||||
original_requirements = list_of_requirements[k]
|
||||
list_of_requirements[k] = [
|
||||
r for r in original_requirements
|
||||
if len(r.split('@', 1)) != 2 or r.split('@', 1)[1].strip() not in self.post_install_local_req_lookup]
|
||||
|
||||
list_of_requirements[k] += [
|
||||
self.post_install_local_req_lookup.get(r.split('@', 1)[1].strip(), '')
|
||||
for r in original_requirements
|
||||
if len(r.split('@', 1)) == 2 and r.split('@', 1)[1].strip() in self.post_install_local_req_lookup]
|
||||
|
||||
return list_of_requirements
|
||||
|
||||
@classmethod
|
||||
def is_local_folder_package(cls, req):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not req.name and req.req and not req.req.editable and not req.req.vcs and \
|
||||
req.req.line and req.req.line.strip().split('#')[0] and \
|
||||
not req.req.line.strip().split('#')[0].lower().endswith('.whl') and \
|
||||
not (req.req.line.strip().startswith('-r ') or req.req.line.strip().startswith('--requirement ')):
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
class OnlyExternalRequirements(ExternalRequirements):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(OnlyExternalRequirements, self).__init__(*args, **kwargs)
|
||||
|
||||
def match(self, req):
|
||||
return True
|
||||
|
||||
def replace(self, req):
|
||||
"""
|
||||
Replace a requirement
|
||||
:raises: ValueError if version is pre-release
|
||||
"""
|
||||
# Do not store the skipped requirements
|
||||
# mark skip package
|
||||
if super(OnlyExternalRequirements, self).match(req):
|
||||
return self._add_vcs_credentials(req, self._session)
|
||||
return Text('')
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import os
|
||||
import sys
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Text, Optional
|
||||
|
||||
from clearml_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
|
||||
@@ -17,7 +19,8 @@ class SystemPip(PackageManager):
|
||||
"""
|
||||
Program interface to the system pip.
|
||||
"""
|
||||
self._bin = interpreter or sys.executable
|
||||
super(SystemPip, self).__init__()
|
||||
self._bin = Path(interpreter or sys.executable)
|
||||
self.session = session
|
||||
|
||||
@property
|
||||
@@ -81,7 +84,10 @@ class SystemPip(PackageManager):
|
||||
:param kwargs: kwargs for get_output/check_output command
|
||||
"""
|
||||
command = self._make_command(command)
|
||||
return (command.get_output if output else command.check_call)(stdin=DEVNULL, **kwargs)
|
||||
# make sure we are not running it with our own PYTHONPATH
|
||||
env = dict(**os.environ)
|
||||
env.pop('PYTHONPATH', None)
|
||||
return (command.get_output if output else command.check_call)(stdin=DEVNULL, env=env, **kwargs)
|
||||
|
||||
def _make_command(self, command):
|
||||
return Argv(self.bin, '-m', 'pip', '--disable-pip-version-check', *command)
|
||||
|
||||
@@ -12,7 +12,7 @@ from ..requirements import RequirementsManager
|
||||
|
||||
class VirtualenvPip(SystemPip, PackageManager):
|
||||
def __init__(self, session, python, requirements_manager, path, interpreter=None, execution_info=None, **kwargs):
|
||||
# type: (Session, float, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
|
||||
# type: (Session, str, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
|
||||
"""
|
||||
Program interface to virtualenv pip.
|
||||
Must be given either path to virtualenv or source command.
|
||||
|
||||
@@ -5,6 +5,7 @@ import attr
|
||||
import sys
|
||||
import os
|
||||
from pathlib2 import Path
|
||||
|
||||
from clearml_agent.helper.process import Argv, DEVNULL, check_if_command_exists
|
||||
from clearml_agent.session import Session, POETRY
|
||||
|
||||
@@ -81,6 +82,32 @@ class PoetryConfig:
|
||||
@_guard_enabled
|
||||
def initialize(self, cwd=None):
|
||||
if not self._initialized:
|
||||
if self.session.config.get("agent.package_manager.poetry_version", None) is not None:
|
||||
version = str(self.session.config.get("agent.package_manager.poetry_version"))
|
||||
print('Upgrading Poetry package {}'.format(version))
|
||||
# first upgrade pip if we need to
|
||||
try:
|
||||
from clearml_agent.helper.package.pip_api.venv import VirtualenvPip
|
||||
pip = VirtualenvPip(
|
||||
session=self.session, python=self._python,
|
||||
requirements_manager=None, path=None, interpreter=self._python)
|
||||
pip.upgrade_pip()
|
||||
except Exception as ex:
|
||||
self.log.warning("failed upgrading pip: {}".format(ex))
|
||||
|
||||
# now install poetry
|
||||
try:
|
||||
version = version.replace(' ', '')
|
||||
if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
|
||||
version = version
|
||||
elif version:
|
||||
version = "==" + version
|
||||
argv = Argv(self._python, "-m", "pip", "install", "poetry{}".format(version),
|
||||
"--upgrade", "--disable-pip-version-check")
|
||||
print(argv.get_output())
|
||||
except Exception as ex:
|
||||
self.log.warning("failed upgrading poetry: {}".format(ex))
|
||||
|
||||
self._initialized = True
|
||||
try:
|
||||
self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)
|
||||
|
||||
@@ -174,36 +174,42 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
self.log = self._session.get_logger(__name__)
|
||||
self.package_manager = self.config["agent.package_manager.type"].lower()
|
||||
self.os = os_name or self.get_platform()
|
||||
self.cuda = "cuda{}".format(self.cuda_version).lower()
|
||||
self.python_version_string = str(self.config["agent.default_python"])
|
||||
self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
|
||||
if '.' not in self.python_major_minor_str:
|
||||
raise PytorchResolutionError(
|
||||
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
|
||||
"must have both major and minor parts of the version (for example: '3.7')".format(
|
||||
self.python_version_string
|
||||
)
|
||||
)
|
||||
self.python = "python{}".format(self.python_major_minor_str)
|
||||
|
||||
self.exceptions = [
|
||||
PytorchResolutionError(message)
|
||||
for message in (
|
||||
None,
|
||||
'cuda version "{}" is not supported'.format(self.cuda),
|
||||
'python version "{}" is not supported'.format(
|
||||
self.python_version_string
|
||||
),
|
||||
)
|
||||
]
|
||||
|
||||
try:
|
||||
self.validate_python_version()
|
||||
except PytorchResolutionError as e:
|
||||
self.log.warn("will not be able to install pytorch wheels: %s", e.args[0])
|
||||
|
||||
self.cuda = None
|
||||
self.python_version_string = None
|
||||
self.python_major_minor_str = None
|
||||
self.python = None
|
||||
self.exceptions = []
|
||||
self._original_req = []
|
||||
|
||||
def _init_python_ver_cuda_ver(self):
|
||||
if self.cuda is None:
|
||||
self.cuda = "cuda{}".format(self.cuda_version).lower()
|
||||
if self.python_version_string is None:
|
||||
self.python_version_string = str(self.config["agent.default_python"])
|
||||
if self.python_major_minor_str is None:
|
||||
self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
|
||||
if '.' not in self.python_major_minor_str:
|
||||
raise PytorchResolutionError(
|
||||
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
|
||||
"must have both major and minor parts of the version (for example: '3.7')".format(
|
||||
self.python_version_string
|
||||
)
|
||||
)
|
||||
if self.python is None:
|
||||
self.python = "python{}".format(self.python_major_minor_str)
|
||||
|
||||
if not self.exceptions:
|
||||
self.exceptions = [
|
||||
PytorchResolutionError(message)
|
||||
for message in (
|
||||
None,
|
||||
'cuda version "{}" is not supported'.format(self.cuda),
|
||||
'python version "{}" is not supported'.format(
|
||||
self.python_version_string
|
||||
),
|
||||
)
|
||||
]
|
||||
|
||||
@property
|
||||
def is_conda(self):
|
||||
return self.package_manager == "conda"
|
||||
@@ -216,6 +222,8 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
"""
|
||||
Make sure python version has both major and minor versions as required for choosing pytorch wheel
|
||||
"""
|
||||
self._init_python_ver_cuda_ver()
|
||||
|
||||
if self.is_pip and not self.python_major_minor_str:
|
||||
raise PytorchResolutionError(
|
||||
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
|
||||
@@ -263,6 +271,9 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
continue
|
||||
if len(parts) < 5 or platform_wheel not in parts[4]:
|
||||
continue
|
||||
# yes this is for linux python 2.7 support, this is the only python 2.7 we support...
|
||||
if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
|
||||
continue
|
||||
# update the closest matched version (from above)
|
||||
if not closest_v:
|
||||
closest_v = v
|
||||
@@ -291,6 +302,7 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
|
||||
def get_url_for_platform(self, req):
|
||||
# check if package is already installed with system packages
|
||||
self.validate_python_version()
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if self.config.get("agent.package_manager.system_site_packages", None):
|
||||
|
||||
@@ -14,9 +14,13 @@ from pathlib2 import Path
|
||||
from pyhocon import ConfigTree
|
||||
|
||||
import six
|
||||
import logging
|
||||
from clearml_agent.definitions import PIP_EXTRA_INDICES
|
||||
from clearml_agent.helper.base import warning, is_conda, which, join_lines, is_windows_platform
|
||||
from clearml_agent.helper.base import (
|
||||
warning, is_conda, which, join_lines, is_windows_platform,
|
||||
convert_cuda_version_to_int_10_base_str, )
|
||||
from clearml_agent.helper.process import Argv, PathLike
|
||||
from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
|
||||
from clearml_agent.session import Session, normalize_cuda_version
|
||||
from clearml_agent.external.requirements_parser import parse
|
||||
from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
@@ -152,6 +156,31 @@ class MarkerRequirement(object):
|
||||
return SimpleVersion.compare_versions(
|
||||
version_a=requested_version, op=op, version_b=version, num_parts=num_parts)
|
||||
|
||||
def remove_local_file_ref(self):
|
||||
if not self.local_file or self.vcs or self.editable or self.path:
|
||||
return False
|
||||
parts = re.split(r"@\s*{}".format(self.req.uri), self.req.line)
|
||||
# if we did not find anything do nothing
|
||||
if len(parts) < 2:
|
||||
return False
|
||||
self.req.line = ''.join(parts).strip()
|
||||
self.req.uri = None
|
||||
self.req.local_file = False
|
||||
return True
|
||||
|
||||
def validate_local_file_ref(self):
|
||||
# if local file does not exist, remove the reference to it
|
||||
if self.vcs or self.editable or self.path or not self.local_file or not self.name or \
|
||||
not self.uri or not self.uri.startswith("file://"):
|
||||
return
|
||||
local_path = Path(self.uri[len("file://"):])
|
||||
if not local_path.exists():
|
||||
line = self.line
|
||||
if self.remove_local_file_ref():
|
||||
# print warning
|
||||
logging.getLogger(__name__).warning(
|
||||
'Local file not found [{}], reference removed!'.format(line))
|
||||
|
||||
|
||||
class SimpleVersion:
|
||||
_sub_versions_pep440 = ['a', 'b', 'rc', '.post', '.dev', '+', ]
|
||||
@@ -207,7 +236,11 @@ class SimpleVersion:
|
||||
if not version_b:
|
||||
return True
|
||||
|
||||
if not num_parts:
|
||||
num_parts = max(len(version_a.split('.')), len(version_b.split('.')), )
|
||||
|
||||
if op == '~=':
|
||||
num_parts = len(version_b.split('.')) - 1
|
||||
num_parts = max(num_parts, 2)
|
||||
op = '=='
|
||||
ignore_sub_versions = True
|
||||
@@ -244,6 +277,16 @@ class SimpleVersion:
|
||||
return version_a_key < version_b_key
|
||||
raise ValueError('Unrecognized comparison operator [{}]'.format(op))
|
||||
|
||||
@classmethod
|
||||
def max_version(cls, version_a, version_b):
|
||||
return version_a if cls.compare_versions(
|
||||
version_a=version_a, op='>=', version_b=version_b, num_parts=None) else version_b
|
||||
|
||||
@classmethod
|
||||
def min_version(cls, version_a, version_b):
|
||||
return version_a if cls.compare_versions(
|
||||
version_a=version_a, op='<=', version_b=version_b, num_parts=None) else version_b
|
||||
|
||||
@staticmethod
|
||||
def _parse_letter_version(
|
||||
letter, # type: str
|
||||
@@ -312,6 +355,77 @@ class SimpleVersion:
|
||||
return ()
|
||||
|
||||
|
||||
def compare_version_rules(specs_a, specs_b):
|
||||
# specs_a/b are a list of tuples: [('==', '1.2.3'), ] or [('>=', '1.2'), ('<', '1.3')]
|
||||
# section definition:
|
||||
class Section(object):
|
||||
def __init__(self, left=None, left_eq=False, right=None, right_eq=False):
|
||||
self.left, self.left_eq, self.right, self.right_eq = left, left_eq, right, right_eq
|
||||
# first create a list of in/out sections for each spec
|
||||
# >, >= are left rule
|
||||
# <, <= are right rule
|
||||
# ~= x.y.z is converted to: >= x.y and < x.y+1
|
||||
# ==/=== are converted to: >= and <=
|
||||
# != x.y.z will split a section into: left < x.y.z and right > x.y.z
|
||||
def create_section(specs):
|
||||
section = Section()
|
||||
for op, v in specs:
|
||||
a = section
|
||||
if op == '>':
|
||||
a.left = v
|
||||
a.left_eq = False
|
||||
elif op == '>=':
|
||||
a.left = v
|
||||
a.left_eq = True
|
||||
elif op == '<':
|
||||
a.right = v
|
||||
a.right_eq = False
|
||||
elif op == '<=':
|
||||
a.right = v
|
||||
a.right_eq = True
|
||||
elif op == '==':
|
||||
a.left = v
|
||||
a.left_eq = True
|
||||
a.right = v
|
||||
a.right_eq = True
|
||||
elif op == '~=':
|
||||
new_v = v.split('.')
|
||||
a_left = '.'.join(new_v[:-1])
|
||||
a.left = a_left if not a.left else SimpleVersion.max_version(a_left, a.left)
|
||||
a.left_eq = True
|
||||
a_right = '.'.join(new_v[:-2] + [str(int(new_v[-2])+1)])
|
||||
a.right = a_right if not a.right else SimpleVersion.min_version(a_right, a.right)
|
||||
a.right_eq = False if a.right == a_right else a.right_eq
|
||||
|
||||
return section
|
||||
|
||||
section_a = create_section(specs_a)
|
||||
section_b = create_section(specs_b)
|
||||
i = Section()
|
||||
# then we have a list of sections for spec A/B
|
||||
if section_a.left == section_b.left:
|
||||
i.left = section_a.left
|
||||
i.left_eq = section_a.left_eq and section_b.left_eq
|
||||
else:
|
||||
i.left = SimpleVersion.max_version(section_a.left, section_b.left)
|
||||
i.left_eq = section_a.left_eq if i.left == section_a.left else section_b.left_eq
|
||||
if section_a.right == section_b.right:
|
||||
i.right = section_a.right
|
||||
i.right_eq = section_a.right_eq and section_b.right_eq
|
||||
else:
|
||||
i.right = SimpleVersion.min_version(section_a.right, section_b.right)
|
||||
i.right_eq = section_a.right_eq if i.right == section_a.right else section_b.right_eq
|
||||
|
||||
# return true if any section from A intersects a section from B
|
||||
valid = True
|
||||
valid &= SimpleVersion.compare_versions(
|
||||
version_a=i.left, op='<=' if i.left_eq else '<', version_b=i.right, num_parts=None)
|
||||
valid &= SimpleVersion.compare_versions(
|
||||
version_a=i.right, op='>=' if i.right_eq else '>', version_b=i.left, num_parts=None)
|
||||
|
||||
return valid
|
||||
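A hedged usage sketch for `compare_version_rules` (import path assumed): it should report whether two sets of pip-style specifiers intersect, i.e. whether at least one version could satisfy both:

```python
from clearml_agent.helper.package.requirements import compare_version_rules  # assumed path

# 1.5.1 falls inside the >=1.2,<2.0 range, so the rule sets intersect
print(compare_version_rules([('>=', '1.2'), ('<', '2.0')], [('==', '1.5.1')]))  # expected: True
# two different exact pins can never be satisfied together
print(compare_version_rules([('==', '1.2.3')], [('==', '1.4.0')]))              # expected: False
```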
|
||||
|
||||
@six.add_metaclass(ABCMeta)
|
||||
class RequirementSubstitution(object):
|
||||
|
||||
@@ -362,7 +476,7 @@ class RequirementSubstitution(object):
|
||||
|
||||
@property
|
||||
def cuda_version(self):
|
||||
return self.config['agent.cuda_version']
|
||||
return convert_cuda_version_to_int_10_base_str(self.config['agent.cuda_version'])
|
||||
|
||||
@property
|
||||
def cudnn_version(self):
|
||||
@@ -446,10 +560,15 @@ class RequirementsManager(object):
|
||||
'cu'+agent['cuda_version'] if self.found_cuda else 'cpu')
|
||||
self.translator = RequirementsTranslator(session, interpreter=base_interpreter,
|
||||
cache_dir=pip_cache_dir.as_posix())
|
||||
self._base_interpreter = base_interpreter
|
||||
self._cwd = None
|
||||
|
||||
def register(self, cls): # type: (Type[RequirementSubstitution]) -> None
|
||||
self.handlers.append(cls(self._session))
|
||||
|
||||
def set_cwd(self, cwd):
|
||||
self._cwd = str(cwd) if cwd else None
|
||||
|
||||
def _replace_one(self, req): # type: (MarkerRequirement) -> Optional[Text]
|
||||
match = re.search(r';\s*(.*)', Text(req))
|
||||
if match:
|
||||
@@ -462,19 +581,9 @@ class RequirementsManager(object):
|
||||
return None
|
||||
|
||||
def replace(self, requirements): # type: (Text) -> Text
|
||||
def safe_parse(req_str):
|
||||
try:
|
||||
return next(parse(req_str))
|
||||
except Exception as ex:
|
||||
return Requirement(req_str)
|
||||
parsed_requirements = self.parse_requirements_section_to_marker_requirements(
|
||||
requirements=requirements, cwd=self._cwd)
|
||||
|
||||
parsed_requirements = tuple(
|
||||
map(
|
||||
MarkerRequirement,
|
||||
[safe_parse(line) for line in (requirements.splitlines()
|
||||
if isinstance(requirements, six.text_type) else requirements)]
|
||||
)
|
||||
)
|
||||
if not parsed_requirements:
|
||||
# return the original requirements just in case
|
||||
return requirements
|
||||
@@ -529,6 +638,9 @@ class RequirementsManager(object):
|
||||
pass
|
||||
return requirements
|
||||
|
||||
def get_interpreter(self):
|
||||
return self._base_interpreter
|
||||
|
||||
@staticmethod
|
||||
def get_cuda_version(config): # type: (ConfigTree) -> (Text, Text)
|
||||
# we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
|
||||
@@ -537,6 +649,9 @@ class RequirementsManager(object):
|
||||
if cuda_version and cudnn_version:
|
||||
return normalize_cuda_version(cuda_version), normalize_cuda_version(cudnn_version)
|
||||
|
||||
if not cuda_version:
|
||||
cuda_version = get_driver_cuda_version()
|
||||
|
||||
if not cuda_version and is_windows_platform():
|
||||
try:
|
||||
cuda_vers = [int(k.replace('CUDA_PATH_V', '').replace('_', '')) for k in os.environ.keys()
|
||||
@@ -602,3 +717,28 @@ class RequirementsManager(object):
|
||||
return (normalize_cuda_version(cuda_version or 0),
|
||||
normalize_cuda_version(cudnn_version or 0))
|
||||
|
||||
@staticmethod
|
||||
def parse_requirements_section_to_marker_requirements(requirements, cwd=None):
|
||||
def safe_parse(req_str):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
return list(parse(req_str, cwd=cwd))
|
||||
except Exception as ex:
|
||||
return [Requirement(req_str)]
|
||||
|
||||
def create_req(x):
|
||||
r = MarkerRequirement(x)
|
||||
r.validate_local_file_ref()
|
||||
return r
|
||||
|
||||
if not requirements:
|
||||
return tuple()
|
||||
|
||||
parsed_requirements = tuple(
|
||||
map(
|
||||
create_req,
|
||||
[r for line in (requirements.splitlines() if isinstance(requirements, str) else requirements)
|
||||
for r in safe_parse(line)]
|
||||
)
|
||||
)
|
||||
return parsed_requirements
|
||||
|
||||
@@ -7,7 +7,7 @@ import re
|
||||
import subprocess
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from copy import copy
|
||||
from distutils.spawn import find_executable
|
||||
from itertools import chain, repeat, islice
|
||||
from os.path import devnull
|
||||
@@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
|
||||
return output if not strip or not output else output.strip()
|
||||
|
||||
|
||||
def terminate_process(pid, timeout=10.):
|
||||
def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
proc = psutil.Process(pid)
|
||||
children = proc.children(recursive=True) if include_children else []
|
||||
proc.terminate()
|
||||
cnt = 0
|
||||
while proc.is_running() and cnt < timeout:
|
||||
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||
sleep(1.)
|
||||
cnt += 1
|
||||
proc.terminate()
|
||||
|
||||
# terminate children
|
||||
for c in children:
|
||||
c.terminate()
|
||||
|
||||
cnt = 0
|
||||
while proc.is_running() and cnt < timeout:
|
||||
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||
sleep(1.)
|
||||
cnt += 1
|
||||
|
||||
# kill children
|
||||
for c in children:
|
||||
c.kill()
|
||||
|
||||
proc.kill()
|
||||
except Exception:
|
||||
pass
|
||||
@@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10.):
|
||||
return True
|
||||
|
||||
|
||||
def kill_all_child_processes(pid=None):
|
||||
def kill_all_child_processes(pid=None, include_parent=True):
|
||||
# get current process if pid not provided
|
||||
include_parent = True
|
||||
if not pid:
|
||||
pid = os.getpid()
|
||||
include_parent = False
|
||||
@@ -84,6 +94,23 @@ def kill_all_child_processes(pid=None):
|
||||
parent.kill()
|
||||
|
||||
|
||||
def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
|
||||
# get current process if pid not provided
|
||||
if not pid:
|
||||
pid = os.getpid()
|
||||
include_parent = False
|
||||
try:
|
||||
parent = psutil.Process(pid)
|
||||
except psutil.Error:
|
||||
# could not find parent process id
|
||||
return
|
||||
for child in parent.children(recursive=False):
|
||||
print('Terminating child process {}'.format(child.pid))
|
||||
terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
|
||||
if include_parent:
|
||||
terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
|
||||
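A hedged usage sketch for the process helpers above (module path assumed to be `clearml_agent.helper.process`, as referenced elsewhere in this diff): terminate a worker and its children gracefully, escalating to kill after the timeout:

```python
import subprocess
from clearml_agent.helper.process import terminate_process, terminate_all_child_processes

# spawn a dummy long-running worker (Linux), then ask it and any children to stop
worker = subprocess.Popen(["sleep", "600"])
terminate_process(worker.pid, timeout=5., ignore_zombie=False, include_children=True)

# from inside the agent process itself: stop every child we spawned;
# with pid omitted, the current (parent) process is always kept alive
terminate_all_child_processes(timeout=5.)
```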
|
||||
|
||||
def get_docker_id(docker_cmd_contains):
|
||||
try:
|
||||
containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
|
||||
@@ -103,9 +130,10 @@ def shutdown_docker_process(docker_cmd_contains=None, docker_id=None):
|
||||
docker_id = get_docker_id(docker_cmd_contains=docker_cmd_contains)
|
||||
if docker_id:
|
||||
# we found our docker, stop it
|
||||
get_bash_output(cmd='docker stop -t 1 {}'.format(docker_id))
|
||||
return get_bash_output(cmd='docker stop -t 1 {}'.format(docker_id))
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def commit_docker(container_name, docker_cmd_contains=None, docker_id=None, apply_change=None):
|
||||
@@ -193,6 +221,7 @@ class Argv(Executable):
|
||||
"""
|
||||
self.argv = argv
|
||||
self._log = kwargs.pop("log", None)
|
||||
self._display_argv = kwargs.pop("display_argv", argv)
|
||||
if not self._log:
|
||||
self._log = logging.getLogger(__name__)
|
||||
self._log.propagate = False
|
||||
@@ -217,10 +246,10 @@ class Argv(Executable):
|
||||
return self.argv
|
||||
|
||||
def __repr__(self):
|
||||
return "<Argv{}>".format(self.argv)
|
||||
return "<Argv{}>".format(self._display_argv)
|
||||
|
||||
def __str__(self):
|
||||
return "Executing: {}".format(self.argv)
|
||||
return "Executing: {}".format(self._display_argv)
|
||||
|
||||
def __iter__(self):
|
||||
if is_windows_platform():
|
||||
@@ -276,9 +305,9 @@ class CommandSequence(Executable):
|
||||
self.commands = []
|
||||
for c in commands:
|
||||
if isinstance(c, CommandSequence):
|
||||
self.commands.extend(deepcopy(c.commands))
|
||||
self.commands.extend([copy(p) for p in c.commands])
|
||||
elif isinstance(c, Argv):
|
||||
self.commands.append(deepcopy(c))
|
||||
self.commands.append(copy(c))
|
||||
else:
|
||||
self.commands.append(Argv(*c, log=self._log))
|
||||
|
||||
@@ -420,7 +449,7 @@ SOURCE_COMMAND = select_for_platform(linux="source", windows="call")
|
||||
class ExitStatus(object):
|
||||
success = 0
|
||||
failure = 1
|
||||
interrupted = 2
|
||||
interrupted = -2
|
||||
|
||||
|
||||
COMMAND_SUCCESS = 0
|
||||
|
||||
@@ -4,7 +4,9 @@ import shutil
|
||||
import subprocess
|
||||
from distutils.spawn import find_executable
|
||||
from hashlib import md5
|
||||
from os import environ, getenv
|
||||
from os import environ
|
||||
from random import random
|
||||
from threading import Lock
|
||||
from typing import Text, Sequence, Mapping, Iterable, TypeVar, Callable, Tuple, Optional
|
||||
|
||||
import attr
|
||||
@@ -23,6 +25,7 @@ from clearml_agent.helper.base import (
|
||||
normalize_path,
|
||||
create_file_if_not_exists,
|
||||
)
|
||||
from clearml_agent.helper.os.locks import FileLock
|
||||
from clearml_agent.helper.process import DEVNULL, Argv, PathLike, COMMAND_SUCCESS
|
||||
from clearml_agent.session import Session
|
||||
|
||||
@@ -88,7 +91,7 @@ class VCS(object):
|
||||
# additional environment variables for VCS commands
|
||||
COMMAND_ENV = {}
|
||||
|
||||
PATCH_ADDED_FILE_RE = re.compile(r"^\+\+\+ b/(?P<path>.*)")
|
||||
PATCH_ADDED_FILE_RE = re.compile(r"^--- a/(?P<path>.*)")
|
||||
|
||||
def __init__(self, session, url, location, revision):
|
||||
# type: (Session, Text, PathLike, Text) -> ()
|
||||
@@ -105,7 +108,7 @@ class VCS(object):
|
||||
)
|
||||
self.url = url
|
||||
self.location = Text(location)
|
||||
self.revision = revision
|
||||
self._revision = revision
|
||||
self.log = self.session.get_logger(__name__)
|
||||
|
||||
@property
|
||||
@@ -115,21 +118,21 @@ class VCS(object):
|
||||
"""
|
||||
return self.add_auth(self.session.config, self.url)
|
||||
|
||||
@abc.abstractproperty
|
||||
@abc.abstractmethod
|
||||
def executable_name(self):
|
||||
"""
|
||||
Name of command executable
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractproperty
|
||||
@abc.abstractmethod
|
||||
def main_branch(self):
|
||||
"""
|
||||
Name of default/main branch
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractproperty
|
||||
@abc.abstractmethod
|
||||
def checkout_flags(self):
|
||||
# type: () -> Sequence[Text]
|
||||
"""
|
||||
@@ -137,7 +140,7 @@ class VCS(object):
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractproperty
|
||||
@abc.abstractmethod
|
||||
def patch_base(self):
|
||||
# type: () -> Sequence[Text]
|
||||
"""
|
||||
@@ -254,15 +257,15 @@ class VCS(object):
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def replace_http_url(cls, url, port=None):
|
||||
# type: (Text, Optional[int]) -> Text
|
||||
def replace_http_url(cls, url, port=None, username=None):
|
||||
# type: (Text, Optional[int], Optional[str]) -> Text
|
||||
"""
|
||||
Replace HTTPS URL with SSH URL when applicable
|
||||
"""
|
||||
parsed_url = furl(url)
|
||||
if parsed_url.scheme == "https":
|
||||
parsed_url.scheme = "ssh"
|
||||
parsed_url.username = "git"
|
||||
parsed_url.username = username or "git"
|
||||
parsed_url.password = None
|
||||
# make sure there is no port in the final url (safe_furl support)
|
||||
# the original port was an https port, and we do not know if there is a different ssh port,
|
||||
@@ -271,6 +274,18 @@ class VCS(object):
|
||||
url = parsed_url.url
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def rewrite_ssh_url(cls, url, port=None, username=None):
|
||||
# type: (Text, Optional[int], Optional[str]) -> Text
|
||||
"""
|
||||
Rewrite SSH URL with custom port and username
|
||||
"""
|
||||
parsed_url = furl(url)
|
||||
if parsed_url.scheme == "ssh":
|
||||
parsed_url.username = username or "git"
|
||||
parsed_url.port = port or None
|
||||
return parsed_url.url
|
||||
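A hedged illustration of the two URL helpers above (the `VCS` import path is assumed; the outputs are the expected furl-based rewrites, not verified):

```python
from clearml_agent.helper.repo import VCS  # assumed module path

# an https clone URL becomes an ssh one: credentials dropped, username forced to 'git'
print(VCS.replace_http_url("https://user:token@github.com/allegroai/clearml-agent.git"))
# expected: "ssh://git@github.com/allegroai/clearml-agent.git"

# an existing ssh URL gets a custom user and port when the config forces them
print(VCS.rewrite_ssh_url("ssh://git@git.internal/team/project.git", port=2222, username="builder"))
# expected: "ssh://builder@git.internal:2222/team/project.git"
```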
|
||||
def _set_ssh_url(self):
|
||||
"""
|
||||
Replace instance URL with SSH substitution result and report to log.
|
||||
@@ -285,12 +300,29 @@ class VCS(object):
|
||||
return
|
||||
if parsed_url.scheme == "https":
|
||||
new_url = self.replace_http_url(
|
||||
self.url, port=self.session.config.get('agent.force_git_ssh_port', None))
|
||||
self.url,
|
||||
port=self.session.config.get('agent.force_git_ssh_port', None),
|
||||
username=self.session.config.get('agent.force_git_ssh_user', None)
|
||||
)
|
||||
if new_url != self.url:
|
||||
print("Using SSH credentials - replacing https url '{}' with ssh url '{}'".format(
|
||||
self.url, new_url))
|
||||
self.url = new_url
|
||||
return
|
||||
# rewrite ssh URLs only if either ssh port or ssh user are forced in config
|
||||
if parsed_url.scheme == "ssh" and (
|
||||
self.session.config.get('agent.force_git_ssh_port', None) or
|
||||
self.session.config.get('agent.force_git_ssh_user', None)
|
||||
):
|
||||
new_url = self.rewrite_ssh_url(
|
||||
self.url,
|
||||
port=self.session.config.get('agent.force_git_ssh_port', None),
|
||||
username=self.session.config.get('agent.force_git_ssh_user', None)
|
||||
)
|
||||
if new_url != self.url:
|
||||
print("Using SSH credentials - ssh url '{}' with ssh url '{}'".format(
|
||||
self.url, new_url))
|
||||
self.url = new_url
|
||||
|
||||
if not self.session.config.agent.translate_ssh:
|
||||
return
|
||||
@@ -358,7 +390,7 @@ class VCS(object):
|
||||
"""
|
||||
Checkout repository at specified revision
|
||||
"""
|
||||
self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
|
||||
self.call("checkout", self._revision, *self.checkout_flags, cwd=self.location)
|
||||
|
||||
@abc.abstractmethod
|
||||
def pull(self):
|
||||
@@ -450,7 +482,7 @@ class VCS(object):
|
||||
parsed_url = furl(url)
|
||||
except ValueError:
|
||||
return url
|
||||
if parsed_url.scheme in ["", "ssh"] or parsed_url.scheme.startswith("git"):
|
||||
if parsed_url.scheme in ["", "ssh"] or (parsed_url.scheme or '').startswith("git"):
|
||||
return parsed_url.url
|
||||
config_user = ENV_AGENT_GIT_USER.get() or config.get("agent.{}_user".format(cls.executable_name), None)
|
||||
config_pass = ENV_AGENT_GIT_PASS.get() or config.get("agent.{}_pass".format(cls.executable_name), None)
|
||||
@@ -464,7 +496,7 @@ class VCS(object):
|
||||
parsed_url.set(username=config_user, password=config_pass)
|
||||
return parsed_url.url
|
||||
|
||||
@abc.abstractproperty
|
||||
@abc.abstractmethod
|
||||
def info_commands(self):
|
||||
# type: () -> Mapping[Text, Argv]
|
||||
"""
|
||||
@@ -487,7 +519,7 @@ class VCS(object):
|
||||
|
||||
class Git(VCS):
|
||||
executable_name = "git"
|
||||
main_branch = "master"
|
||||
main_branch = ("master", "main")
|
||||
clone_flags = ("--quiet", "--recursive")
|
||||
checkout_flags = ("--force",)
|
||||
COMMAND_ENV = {
|
||||
@@ -499,7 +531,9 @@ class Git(VCS):
|
||||
|
||||
@staticmethod
|
||||
def remote_branch_name(branch):
|
||||
return "origin/{}".format(branch)
|
||||
return [
|
||||
"origin/{}".format(b) for b in ([branch] if isinstance(branch, str) else branch)
|
||||
]
|
||||
|
||||
def executable_not_found_error_help(self):
|
||||
return 'Cannot find "{}" executable. {}'.format(
|
||||
@@ -521,10 +555,18 @@ class Git(VCS):
|
||||
"""
|
||||
Checkout repository at specified revision
|
||||
"""
|
||||
self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
|
||||
revisions = [self._revision] if isinstance(self._revision, str) else self._revision
|
||||
for i, revision in enumerate(revisions):
|
||||
try:
|
||||
self.call("checkout", revision, *self.checkout_flags, cwd=self.location)
|
||||
break
|
||||
except subprocess.CalledProcessError:
|
||||
if i == len(revisions) - 1:
|
||||
raise
|
||||
|
||||
try:
|
||||
self.call("submodule", "update", "--recursive", cwd=self.location)
|
||||
except:
|
||||
except: # noqa
|
||||
pass
|
||||
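The checkout above now tries each candidate revision in order (useful for the `master`/`main` default-branch fallback) and only re-raises on the last failure. A minimal standalone sketch of the same pattern, independent of the agent's classes:

```python
# Hypothetical helper; the branch names and path below are illustrative only.
import subprocess

def checkout_first_available(revisions, cwd):
    """Try each revision in order; raise only if the last one also fails."""
    for i, revision in enumerate(revisions):
        try:
            subprocess.check_call(["git", "checkout", "--force", revision], cwd=cwd)
            return revision
        except subprocess.CalledProcessError:
            if i == len(revisions) - 1:
                raise

# e.g. checkout_first_available(["origin/master", "origin/main"], cwd="/path/to/clone")
```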
|
||||
info_commands = dict(
|
||||
@@ -561,7 +603,7 @@ class Hg(VCS):
|
||||
"pull",
|
||||
self.url_with_auth,
|
||||
cwd=self.location,
|
||||
*(("-r", self.revision) if self.revision else ())
|
||||
*(("-r", self._revision) if self._revision else ())
|
||||
)
|
||||
|
||||
info_commands = dict(
|
||||
@@ -582,7 +624,10 @@ def clone_repository_cached(session, execution, destination):
|
||||
:return: repository information
|
||||
:raises: CommandFailedError if git/hg is not installed
|
||||
"""
|
||||
repo_url = execution.repository # type: str
|
||||
# mock lock
|
||||
repo_lock = Lock()
|
||||
repo_lock_timeout_sec = 300
|
||||
repo_url = execution.repository or '' # type: str
|
||||
parsed_url = furl(repo_url)
|
||||
no_password_url = parsed_url.copy().remove(password=True).url
|
||||
|
||||
@@ -593,41 +638,54 @@ def clone_repository_cached(session, execution, destination):
|
||||
if standalone_mode:
|
||||
cached_repo_path = clone_folder
|
||||
else:
|
||||
cached_repo_path = (
|
||||
Path(session.config["agent.vcs_cache.path"]).expanduser()
|
||||
/ "{}.{}".format(clone_folder_name, md5(ensure_binary(repo_url)).hexdigest())
|
||||
/ clone_folder_name
|
||||
) # type: Path
|
||||
vcs_cache_path = Path(session.config["agent.vcs_cache.path"]).expanduser()
|
||||
repo_hash = md5(ensure_binary(repo_url)).hexdigest()
|
||||
# create lock
|
||||
repo_lock = FileLock(filename=(vcs_cache_path / '{}.lock'.format(repo_hash)).as_posix())
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
repo_lock.acquire(timeout=repo_lock_timeout_sec)
|
||||
except BaseException:
|
||||
print('Could not lock cache folder "{}" (timeout {} sec), using temp vcs cache.'.format(
|
||||
clone_folder_name, repo_lock_timeout_sec))
|
||||
repo_hash = '{}_{}'.format(repo_hash, str(random()).replace('.', ''))
|
||||
# use mock lock for the context
|
||||
repo_lock = Lock()
|
||||
# select vcs cache folder
|
||||
cached_repo_path = vcs_cache_path / "{}.{}".format(clone_folder_name, repo_hash) / clone_folder_name
|
||||
|
||||
vcs = VcsFactory.create(
|
||||
session, execution_info=execution, location=cached_repo_path
|
||||
)
|
||||
if not find_executable(vcs.executable_name):
|
||||
raise CommandFailedError(vcs.executable_not_found_error_help())
|
||||
with repo_lock:
|
||||
vcs = VcsFactory.create(
|
||||
session, execution_info=execution, location=cached_repo_path
|
||||
)
|
||||
if not find_executable(vcs.executable_name):
|
||||
raise CommandFailedError(vcs.executable_not_found_error_help())
|
||||
|
||||
if not standalone_mode:
|
||||
if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
|
||||
print('Using cached repository in "{}"'.format(cached_repo_path))
|
||||
if not standalone_mode:
|
||||
if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
|
||||
print('Using cached repository in "{}"'.format(cached_repo_path))
|
||||
|
||||
else:
|
||||
print("cloning: {}".format(no_password_url))
|
||||
rm_tree(cached_repo_path)
|
||||
# We clone the entire repository, not a specific branch
|
||||
vcs.clone() # branch=execution.branch)
|
||||
else:
|
||||
print("cloning: {}".format(no_password_url))
|
||||
rm_tree(cached_repo_path)
|
||||
# We clone the entire repository, not a specific branch
|
||||
vcs.clone() # branch=execution.branch)
|
||||
|
||||
vcs.pull()
|
||||
rm_tree(destination)
|
||||
shutil.copytree(Text(cached_repo_path), Text(clone_folder))
|
||||
if not clone_folder.is_dir():
|
||||
raise CommandFailedError(
|
||||
"copying of repository failed: from {} to {}".format(
|
||||
cached_repo_path, clone_folder
|
||||
vcs.pull()
|
||||
rm_tree(destination)
|
||||
shutil.copytree(Text(cached_repo_path), Text(clone_folder),
|
||||
symlinks=select_for_platform(linux=True, windows=False),
|
||||
ignore_dangling_symlinks=True)
|
||||
if not clone_folder.is_dir():
|
||||
raise CommandFailedError(
|
||||
"copying of repository failed: from {} to {}".format(
|
||||
cached_repo_path, clone_folder
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# checkout in the newly copied destination
|
||||
vcs.location = Text(clone_folder)
|
||||
vcs.checkout()
|
||||
# checkout in the newly copied destination
|
||||
vcs.location = Text(clone_folder)
|
||||
vcs.checkout()
|
||||
|
||||
repo_info = vcs.get_repository_copy_info(clone_folder)
|
||||
|
||||
@@ -635,3 +693,70 @@ def clone_repository_cached(session, execution, destination):
|
||||
repo_info = attr.evolve(repo_info, url=no_password_url)
|
||||
|
||||
return vcs, repo_info
|
||||
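A hedged sketch of the cache-locking flow used by `clone_repository_cached` above, with the same `FileLock` helper this diff imports; when the shared cache lock cannot be acquired in time, it falls back to a throwaway, uniquely named cache entry guarded by an in-process lock:

```python
from random import random
from threading import Lock
from clearml_agent.helper.os.locks import FileLock  # import path as shown in the diff

def acquire_vcs_cache_slot(vcs_cache_path, repo_hash, timeout_sec=300):
    """Return (lock, repo_hash); repo_hash is made unique if the shared lock times out."""
    lock = FileLock(filename=(vcs_cache_path / '{}.lock'.format(repo_hash)).as_posix())
    # noinspection PyBroadException
    try:
        lock.acquire(timeout=timeout_sec)
    except BaseException:
        # could not lock the shared cache folder: use a private temp cache entry instead
        repo_hash = '{}_{}'.format(repo_hash, str(random()).replace('.', ''))
        lock = Lock()
    return lock, repo_hash
```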
|
||||
|
||||
def fix_package_import_diff_patch(entry_script_file):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
with open(entry_script_file, 'rt') as f:
|
||||
lines = f.readlines()
|
||||
except Exception:
|
||||
return
|
||||
# make sure we are the first import (i.e. we patched the source code)
|
||||
if not lines or not lines[0].strip().startswith('from clearml ') or 'Task.init' not in lines[1]:
|
||||
return
|
||||
|
||||
original_lines = lines
|
||||
# skip over the first two lines, they are ours
|
||||
# then skip over empty or comment lines
|
||||
lines = [(i, line.split('#', 1)[0].rstrip()) for i, line in enumerate(lines)
|
||||
if i >= 2 and line.strip('\r\n\t ') and not line.strip().startswith('#')]
|
||||
|
||||
# remove triple quotes ' """ '
|
||||
nested_c = -1
|
||||
skip_lines = []
|
||||
for i, line_pair in enumerate(lines):
|
||||
for _ in line_pair[1].split('"""')[1:]:
|
||||
if nested_c >= 0:
|
||||
skip_lines.extend(list(range(nested_c, i+1)))
|
||||
nested_c = -1
|
||||
else:
|
||||
nested_c = i
|
||||
# now keep only the lines that fall outside the triple-quoted blocks
|
||||
lines = [pair for i, pair in enumerate(lines) if i not in skip_lines]
|
||||
|
||||
from_future = re.compile(r"^from[\s]*__future__[\s]*")
|
||||
import_future = re.compile(r"^import[\s]*__future__[\s]*")
|
||||
# test if we have __future__ import
|
||||
found_index = -1
|
||||
for a_i, (_, a_line) in enumerate(lines):
|
||||
if found_index >= a_i:
|
||||
continue
|
||||
if from_future.match(a_line) or import_future.match(a_line):
|
||||
found_index = a_i
|
||||
# check the last import block
|
||||
i, line = lines[found_index]
|
||||
# either we have a \\ character at the end of the line or the import continues inside parentheses
|
||||
parenthesized_lines = '(' in line and ')' not in line
|
||||
while line.endswith('\\') or parenthesized_lines:
|
||||
found_index += 1
|
||||
i, line = lines[found_index]
|
||||
if ')' in line:
|
||||
break
|
||||
|
||||
else:
|
||||
break
|
||||
|
||||
# no imports found
|
||||
if found_index < 0:
|
||||
return
|
||||
|
||||
# now we need to move back the patched two lines
|
||||
entry_line = lines[found_index][0]
|
||||
new_lines = original_lines[2:entry_line + 1] + original_lines[0:2] + original_lines[entry_line + 1:]
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
with open(entry_script_file, 'wt') as f:
|
||||
f.writelines(new_lines)
|
||||
except Exception:
|
||||
return
|
||||
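For clarity, a hypothetical before/after of what `fix_package_import_diff_patch` is meant to achieve: the two agent-injected lines at the top of the entry script are moved below the `__future__` imports so the patched script remains valid Python:

```python
# Illustrative file contents only (the real injected lines come from the agent's patch).
patched_entry_script = """from clearml import Task
task = Task.init(project_name='example', task_name='example')
from __future__ import print_function
import argparse
"""

# after fix_package_import_diff_patch() rewrites the file:
expected_entry_script = """from __future__ import print_function
from clearml import Task
task = Task.init(project_name='example', task_name='example')
import argparse
"""
```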
|
||||
@@ -2,6 +2,7 @@ from __future__ import unicode_literals, division
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
from collections import deque
|
||||
from itertools import starmap
|
||||
from threading import Thread, Event
|
||||
@@ -12,6 +13,7 @@ import attr
|
||||
import psutil
|
||||
from pathlib2 import Path
|
||||
from clearml_agent.session import Session
|
||||
from clearml_agent.definitions import ENV_WORKER_TAGS
|
||||
|
||||
try:
|
||||
from .gpu import gpustat
|
||||
@@ -59,6 +61,7 @@ class ResourceMonitor(object):
|
||||
sample_frequency_per_sec=2.0,
|
||||
report_frequency_sec=30.0,
|
||||
first_report_sec=None,
|
||||
worker_tags=None,
|
||||
):
|
||||
self.session = session
|
||||
self.queue = deque(maxlen=1)
|
||||
@@ -76,6 +79,9 @@ class ResourceMonitor(object):
|
||||
self._gpustat_fail = 0
|
||||
self._gpustat = gpustat
|
||||
self._active_gpus = None
|
||||
if not worker_tags and ENV_WORKER_TAGS.get():
|
||||
worker_tags = shlex.split(ENV_WORKER_TAGS.get())
|
||||
self._worker_tags = worker_tags
|
||||
if os.environ.get('NVIDIA_VISIBLE_DEVICES') == 'none':
|
||||
# NVIDIA_VISIBLE_DEVICES set to none, marks cpu_only flag
|
||||
# active_gpus == False means no GPU reporting
|
||||
@@ -118,6 +124,7 @@ class ResourceMonitor(object):
|
||||
machine_stats=stats,
|
||||
timestamp=(int(time()) * 1000),
|
||||
worker=self._worker_id,
|
||||
tags=self._worker_tags,
|
||||
**self.get_report().to_dict()
|
||||
)
|
||||
log.debug("sending report: %s", report)
|
||||
|
||||
@@ -129,8 +129,9 @@ def get_uptime_string(entry):
|
||||
|
||||
|
||||
def get_runtime_properties_string(runtime_properties):
|
||||
# type: (List[dict]) -> Tuple[Optional[str], str]
|
||||
# type: (Optional[List[dict]]) -> Tuple[Optional[str], str]
|
||||
server_string = []
|
||||
runtime_properties = runtime_properties or []
|
||||
force_flag = next(
|
||||
(prop for prop in runtime_properties if prop["key"] == UptimeConf.worker_key),
|
||||
None,
|
||||
|
||||
@@ -7,7 +7,7 @@ from tempfile import gettempdir, NamedTemporaryFile
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
from clearml_agent.definitions import ENV_DOCKER_HOST_MOUNT
|
||||
from clearml_agent.helper.base import warning
|
||||
from clearml_agent.helper.base import warning, is_windows_platform, safe_remove_file
|
||||
|
||||
|
||||
class Singleton(object):
|
||||
@@ -22,6 +22,13 @@ class Singleton(object):
|
||||
_lock_timeout = 10
|
||||
_pid = None
|
||||
|
||||
@classmethod
|
||||
def close_pid_file(cls):
|
||||
if cls._pid_file:
|
||||
cls._pid_file.close()
|
||||
safe_remove_file(cls._pid_file.name)
|
||||
cls._pid_file = None
|
||||
|
||||
@classmethod
|
||||
def update_pid_file(cls):
|
||||
new_pid = str(os.getpid())
|
||||
@@ -115,7 +122,7 @@ class Singleton(object):
|
||||
|
||||
@classmethod
|
||||
def _register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None, allow_double=False):
|
||||
if cls.worker_id:
|
||||
if cls.worker_id and cls.instance_slot is not None:
|
||||
return cls.worker_id, cls.instance_slot
|
||||
# make sure we have a unique name
|
||||
instance_num = 0
|
||||
@@ -167,7 +174,9 @@ class Singleton(object):
|
||||
# create lock
|
||||
cls._pid = str(os.getpid())
|
||||
cls._pid_file = NamedTemporaryFile(
|
||||
dir=cls._get_temp_folder(), prefix=cls.prefix + cls.sep + cls._pid + cls.sep, suffix=cls.ext)
|
||||
dir=cls._get_temp_folder(), prefix=cls.prefix + cls.sep + cls._pid + cls.sep, suffix=cls.ext,
|
||||
delete=False if is_windows_platform() else True
|
||||
)
|
||||
cls._pid_file.write(('{}\n{}'.format(unique_worker_id, cls.instance_slot)).encode())
|
||||
cls._pid_file.flush()
|
||||
cls.worker_id = unique_worker_id
|
||||
|
||||
@@ -50,7 +50,7 @@ DAEMON_ARGS = dict({
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
|
||||
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
@@ -78,7 +78,16 @@ DAEMON_ARGS = dict({
|
||||
},
|
||||
'--services-mode': {
|
||||
'help': 'Launch multiple long-term docker services. Implies docker & cpu-only flags.',
|
||||
'action': 'store_true',
|
||||
'nargs': '?',
|
||||
'const': -1,
|
||||
'type': int,
|
||||
'default': None,
|
||||
},
|
||||
'--child-report-tags': {
|
||||
'help': 'List of tags to send with the status reports from the worker that runs a task',
|
||||
'nargs': '+',
|
||||
'type': str,
|
||||
'default': None,
|
||||
},
|
||||
'--create-queue': {
|
||||
'help': 'Create requested queue if it does not exist already.',
|
||||
@@ -90,7 +99,16 @@ DAEMON_ARGS = dict({
|
||||
'aliases': ['-d'],
|
||||
},
|
||||
'--stop': {
|
||||
'help': 'Stop the running agent (based on the same set of arguments)',
|
||||
'help': 'Stop the running agent (based on the same set of arguments). '
|
||||
'Optional: provide a list of specific local worker IDs to stop',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
},
|
||||
'--dynamic-gpus': {
|
||||
'help': 'Allow to dynamically allocate gpus based on queue properties, '
|
||||
'configure with \'--queue <queue_name>=<num_gpus>\'.'
|
||||
' Example: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 single_gpu=1\''
|
||||
' Example Opportunistic: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 max_quad_gpus=1-4 \'',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--uptime': {
|
||||
@@ -101,7 +119,7 @@ DAEMON_ARGS = dict({
|
||||
'default': None,
|
||||
},
|
||||
'--downtime': {
|
||||
'help': 'Specify uptime for clearml-agent in "<hours> <days>" format. for example, use "09-13 TUE" to set '
|
||||
'help': 'Specify downtime for clearml-agent in "<hours> <days>" format. for example, use "09-13 TUE" to set '
|
||||
'Tuesday\'s downtime to 09-13. '
|
||||
'Note: Make sure to have only one of uptime/downtime configuration and not both.',
|
||||
'nargs': '*',
|
||||
@@ -111,6 +129,10 @@ DAEMON_ARGS = dict({
|
||||
'help': 'Print the worker\'s schedule (uptime properties, server\'s runtime properties and listening queues)',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--use-owner-token': {
|
||||
'help': 'Generate and use task owner token for the execution of the task',
|
||||
'action': 'store_true',
|
||||
}
|
||||
}, **WORKER_ARGS)
|
||||
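The option tables above are plain dictionaries; how the agent turns them into an argparse parser is internal, but a hypothetical sketch (not the agent's actual wiring) shows how entries such as the new `--services-mode` spec or the `aliases` key could be expanded:

```python
from argparse import ArgumentParser

def build_parser(arg_table):
    # 'aliases' is treated as extra option strings; everything else goes to add_argument()
    parser = ArgumentParser(prog="clearml-agent daemon")
    for flag, spec in arg_table.items():
        spec = dict(spec)
        aliases = spec.pop("aliases", [])
        parser.add_argument(flag, *aliases, **spec)
    return parser

parser = build_parser({
    "--services-mode": {"help": "Launch multiple long-term docker services.",
                        "nargs": "?", "const": -1, "type": int, "default": None},
    "--detached": {"help": "Detached mode.", "action": "store_true", "aliases": ["-d"]},
})
print(parser.parse_args(["--services-mode", "5", "-d"]))
# expected: services_mode=5, detached=True
```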
|
||||
COMMANDS = {
|
||||
@@ -145,7 +167,7 @@ COMMANDS = {
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
|
||||
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
@@ -179,11 +201,18 @@ COMMANDS = {
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Build the experiment inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
|
||||
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
},
|
||||
'--force-docker': {
|
||||
'help': 'Force using the agent-specified docker image (either explicitly in the --docker argument or '
|
||||
'using the agent\'s default docker image). If provided, the agent will not use any docker '
|
||||
'container information stored on the task itself (default False)',
|
||||
'default': False,
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--python-version': {
|
||||
'help': 'Virtual environment python version to use',
|
||||
},
|
||||
|
||||
@@ -204,7 +204,7 @@ class Session(_Session):
|
||||
folder_keys = ('agent.venvs_dir', 'agent.vcs_cache.path',
|
||||
'agent.pip_download_cache.path',
|
||||
'agent.docker_pip_cache', 'agent.docker_apt_cache')
|
||||
singleton_folders = ('agent.venvs_dir', 'agent.vcs_cache.path', 'agent.docker_apt_cache')
|
||||
singleton_folders = ('agent.venvs_dir', 'agent.docker_apt_cache')
|
||||
|
||||
if ENV_TASK_EXECUTE_AS_USER.get():
|
||||
folder_keys = tuple(list(folder_keys) + ['sdk.storage.cache.default_base_dir'])
|
||||
@@ -229,26 +229,35 @@ class Session(_Session):
|
||||
except:
|
||||
pass
|
||||
|
||||
def print_configuration(self, remove_secret_keys=("secret", "pass", "token", "account_key")):
|
||||
def print_configuration(
|
||||
self,
|
||||
remove_secret_keys=("secret", "pass", "token", "account_key", "contents"),
|
||||
skip_value_keys=("environment", )
|
||||
):
|
||||
# remove all the secrets from the print
|
||||
def recursive_remove_secrets(dictionary, secret_keys=()):
|
||||
def recursive_remove_secrets(dictionary, secret_keys=(), empty_keys=()):
|
||||
for k in list(dictionary):
|
||||
for s in secret_keys:
|
||||
if s in k:
|
||||
dictionary.pop(k)
|
||||
break
|
||||
for s in empty_keys:
|
||||
if s == k:
|
||||
dictionary[k] = {key: '****' for key in dictionary[k]} \
|
||||
if isinstance(dictionary[k], dict) else '****'
|
||||
break
|
||||
if isinstance(dictionary.get(k, None), dict):
|
||||
recursive_remove_secrets(dictionary[k], secret_keys=secret_keys)
|
||||
recursive_remove_secrets(dictionary[k], secret_keys=secret_keys, empty_keys=empty_keys)
|
||||
elif isinstance(dictionary.get(k, None), (list, tuple)):
|
||||
for item in dictionary[k]:
|
||||
if isinstance(item, dict):
|
||||
recursive_remove_secrets(item, secret_keys=secret_keys)
|
||||
recursive_remove_secrets(item, secret_keys=secret_keys, empty_keys=empty_keys)
|
||||
|
||||
config = deepcopy(self.config.to_dict())
|
||||
# remove the env variable, it's not important
|
||||
config.pop('env', None)
|
||||
if remove_secret_keys:
|
||||
recursive_remove_secrets(config, secret_keys=remove_secret_keys)
|
||||
if remove_secret_keys or skip_value_keys:
|
||||
recursive_remove_secrets(config, secret_keys=remove_secret_keys, empty_keys=skip_value_keys)
|
||||
# remove logging.loggers.urllib3.level from the print
|
||||
try:
|
||||
config['logging']['loggers']['urllib3'].pop('level', None)
|
||||
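A standalone sketch (hypothetical helper, mirroring `recursive_remove_secrets` above) of the intended masking: keys containing a secret word are dropped entirely, while keys listed in `skip_value_keys` keep their structure but have every value replaced with `****`:

```python
def mask_config(cfg, secret_keys=("secret", "pass", "token", "account_key", "contents"),
                skip_value_keys=("environment",)):
    masked = {}
    for k, v in cfg.items():
        if any(s in k for s in secret_keys):
            continue  # drop secret-bearing keys entirely
        if k in skip_value_keys:
            masked[k] = {key: "****" for key in v} if isinstance(v, dict) else "****"
        elif isinstance(v, dict):
            masked[k] = mask_config(v, secret_keys, skip_value_keys)
        else:
            masked[k] = v
    return masked

print(mask_config({"api": {"access_key": "abc", "secret_key": "shh"},
                   "environment": {"MY_VAR": "value"}}))
# -> {'api': {'access_key': 'abc'}, 'environment': {'MY_VAR': '****'}}
```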
|
||||
@@ -1 +1 @@
|
||||
__version__ = '0.17.1'
|
||||
__version__ = '1.2.0rc6'
|
||||
|
||||
@@ -10,9 +10,9 @@ RUN apt-get dist-upgrade -y
|
||||
RUN apt-get install -y curl python3-pip git
|
||||
RUN curl -sSL https://get.docker.com/ | sh
|
||||
RUN python3 -m pip install -U pip
|
||||
RUN python3 -m pip install trains-agent
|
||||
RUN python3 -m pip install clearml-agent
|
||||
RUN python3 -m pip install -U "cryptography>=2.9"
|
||||
|
||||
ENV TRAINS_DOCKER_SKIP_GPUS_FLAG=1
|
||||
ENV CLEARML_DOCKER_SKIP_GPUS_FLAG=1
|
||||
|
||||
ENTRYPOINT ["/usr/agent/entrypoint.sh"]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
LOWER_PIP_UPDATE_VERSION="$(echo "$PIP_UPDATE_VERSION" | tr '[:upper:]' '[:lower:]')"
|
||||
LOWER_TRAINS_AGENT_UPDATE_VERSION="$(echo "$TRAINS_AGENT_UPDATE_VERSION" | tr '[:upper:]' '[:lower:]')"
|
||||
LOWER_CLEARML_AGENT_UPDATE_VERSION="$(echo "${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}" | tr '[:upper:]' '[:lower:]')"
|
||||
|
||||
if [ "$LOWER_PIP_UPDATE_VERSION" = "yes" ] || [ "$LOWER_PIP_UPDATE_VERSION" = "true" ] ; then
|
||||
python3 -m pip install -U pip
|
||||
@@ -9,11 +9,11 @@ elif [ ! -z "$LOWER_PIP_UPDATE_VERSION" ] ; then
|
||||
python3 -m pip install pip$LOWER_PIP_UPDATE_VERSION ;
|
||||
fi
|
||||
|
||||
echo "TRAINS_AGENT_UPDATE_VERSION = $LOWER_TRAINS_AGENT_UPDATE_VERSION"
|
||||
if [ "$LOWER_TRAINS_AGENT_UPDATE_VERSION" = "yes" ] || [ "$LOWER_TRAINS_AGENT_UPDATE_VERSION" = "true" ] ; then
|
||||
python3 -m pip install trains-agent -U
|
||||
elif [ ! -z "$LOWER_TRAINS_AGENT_UPDATE_VERSION" ] ; then
|
||||
python3 -m pip install trains-agent$LOWER_TRAINS_AGENT_UPDATE_VERSION ;
|
||||
echo "CLEARML_AGENT_UPDATE_VERSION = $LOWER_CLEARML_AGENT_UPDATE_VERSION"
|
||||
if [ "$LOWER_CLEARML_AGENT_UPDATE_VERSION" = "yes" ] || [ "$LOWER_CLEARML_AGENT_UPDATE_VERSION" = "true" ] ; then
|
||||
python3 -m pip install clearml-agent -U
|
||||
elif [ ! -z "$LOWER_CLEARML_AGENT_UPDATE_VERSION" ] ; then
|
||||
python3 -m pip install clearml-agent$LOWER_CLEARML_AGENT_UPDATE_VERSION ;
|
||||
fi
|
||||
|
||||
python3 -m trains_agent daemon --docker "$TRAINS_AGENT_DEFAULT_BASE_DOCKER" --force-current-version $TRAINS_AGENT_EXTRA_ARGS
|
||||
python3 -m clearml_agent daemon --docker "${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}" --force-current-version ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
|
||||
docker/k8s-glue/README.md (new file, 14 lines)
@@ -0,0 +1,14 @@
|
||||
This folder contains an example docker image and templates for running the k8s glue as a pod in a k8s cluster.
|
||||
|
||||
Please note that ClearML credentials and server addresses should either be filled in the clearml.conf file before
|
||||
building the glue docker or provided in the k8s-glue.yml template.
|
||||
|
||||
To run, you'll need to:
|
||||
* Create a secret from pod_template.yml:
|
||||
```bash
|
||||
kubectl -n clearml create secret generic k8s-glue-pod-template --from-file=pod_template.yml
|
||||
```
|
||||
* Apply the k8s glue template:
|
||||
```bash
|
||||
kubectl -n clearml apply -f k8s-glue.yml
|
||||
```
|
||||
docker/k8s-glue/build-resources/clearml.conf (new file, 402 lines)
@@ -0,0 +1,402 @@
|
||||
# CLEARML-AGENT configuration file
|
||||
api {
|
||||
# Notice: 'host' is the api server (default port 8008), not the web server.
|
||||
api_server: ""
|
||||
web_server: ""
|
||||
files_server: ""
|
||||
# Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
|
||||
credentials {"access_key": "", "secret_key": ""}
|
||||
}
|
||||
|
||||
# Set GIT user/pass credentials
|
||||
# leave blank for GIT SSH credentials
|
||||
agent.git_user=""
|
||||
agent.git_pass=""
|
||||
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
|
||||
agent.package_manager.extra_index_url= [
|
||||
|
||||
]
|
||||
|
||||
agent {
|
||||
# unique name of this worker, if None, created based on hostname:process_id
|
||||
# Override with os environment: CLEARML_WORKER_ID
|
||||
# worker_id: "clearml-agent-machine1:gpu0"
|
||||
worker_id: ""
|
||||
|
||||
# worker name, replaces the hostname when creating a unique name for this worker
|
||||
# Override with os environment: CLEARML_WORKER_NAME
|
||||
# worker_name: "clearml-agent-machine1"
|
||||
worker_name: ""
|
||||
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# git_user: ""
|
||||
# git_pass: ""
|
||||
# git_host: ""
|
||||
|
||||
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: 0
|
||||
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||
# force_git_ssh_user: git
|
||||
|
||||
# Set the python version to use when creating the virtual environment and launching the experiment
|
||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||
# The default is the python executing the clearml_agent
|
||||
python_binary: ""
|
||||
# ignore any requested python version (Default: False, if a Task was using a
|
||||
# specific python version and the system supports multiple python versions, the agent will use the requested python version)
|
||||
# ignore_requested_python_version: true
|
||||
|
||||
# select python package manager:
|
||||
# currently supported pip and conda
|
||||
# poetry is used if pip selected and repository contains poetry.lock file
|
||||
package_manager: {
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
pip_version: "<20.2",
|
||||
|
||||
# virtual environment inherits packages from system
|
||||
system_site_packages: false,
|
||||
|
||||
# install with --upgrade
|
||||
force_upgrade: false,
|
||||
|
||||
# additional artifact repositories to use when installing python packages
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||
|
||||
# If set to true, Task's "installed packages" are ignored,
|
||||
# and the repository's "requirements.txt" is used instead
|
||||
# force_repo_requirements_txt: false
|
||||
|
||||
# set the priority packages to be installed before the rest of the required packages
|
||||
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||
|
||||
# set the optional priority packages to be installed before the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# priority_optional_packages: ["pygobject", ]
|
||||
|
||||
# set the post packages to be installed after all the rest of the required packages
|
||||
# post_packages: ["horovod", ]
|
||||
|
||||
# set the optional post packages to be installed after all the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# post_optional_packages: []
|
||||
|
||||
# set to True to support torch nightly build installation,
|
||||
# notice: torch nightly builds are ephemeral and are deleted from time to time
|
||||
torch_nightly: false,
|
||||
},
|
||||
|
||||
# target folder for virtual environments builds, created when executing experiment
|
||||
venvs_dir = ~/.clearml/venvs-builds
|
||||
|
||||
# cached virtual environment folder
|
||||
venvs_cache: {
|
||||
# maximum number of cached venvs
|
||||
max_entries: 10
|
||||
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
|
||||
free_space_threshold_gb: 2.0
|
||||
# uncomment to enable virtual environment caching
|
||||
# path: ~/.clearml/venvs-cache
|
||||
},
|
||||
|
||||
# cached git clone folder
|
||||
vcs_cache: {
|
||||
enabled: true,
|
||||
path: ~/.clearml/vcs-cache
|
||||
},
|
||||
|
||||
# use venv-update in order to accelerate python virtual environment building
|
||||
# Still in beta, turned off by default
|
||||
venv_update: {
|
||||
enabled: false,
|
||||
},
|
||||
|
||||
# cached folder for specific python package download (used for pytorch package caching)
|
||||
pip_download_cache {
|
||||
enabled: true,
|
||||
path: ~/.clearml/pip-download-cache
|
||||
},
|
||||
|
||||
translate_ssh: true,
|
||||
# reload configuration file every daemon execution
|
||||
reload_config: false,
|
||||
|
||||
# pip cache folder mapped into docker, used for python package caching
|
||||
docker_pip_cache = ~/.clearml/pip-cache
|
||||
# apt cache folder mapped into docker, used for ubuntu package caching
|
||||
docker_apt_cache = ~/.clearml/apt-cache
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
|
||||
# for backwards compatibility reasons, true as default,
|
||||
# change to false to skip installation and decrease docker spin up time
|
||||
# docker_install_opencv_libs: true
|
||||
|
||||
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
|
||||
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will be idle.
|
||||
# Defined using a list of items of the format: "<hours> <days>".
|
||||
# hours - use values 0-23, single values would count as start hour and end at midnight.
|
||||
# days - use days in abbreviated format (SUN-SAT)
|
||||
# use '-' for ranges and ',' to separate singular values.
|
||||
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
|
||||
# uptime: ["17-20 SUN,TUE"]
|
||||
|
||||
# optional downtime configuration, can be used only when uptime is not used.
|
||||
# If downtime is specified, agent will be idle in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
|
||||
# Use the same format as described above for uptime
|
||||
# downtime: []
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host", ]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# set the initial bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||
# docker_init_bash_script = [
|
||||
# "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
|
||||
# "chown -R root /root/.cache/pip",
|
||||
# "apt-get update",
|
||||
# "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
|
||||
# ]
|
||||
|
||||
# set the preprocessing bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# docker_preprocess_bash_script = [
|
||||
# "echo \"starting docker\"",
|
||||
#]
|
||||
|
||||
# If False replace \r with \n and display full console output
|
||||
# default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
|
||||
# suppress_carriage_return: true
|
||||
|
||||
# cuda versions used for solving pytorch wheel packages
|
||||
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
# cudnn_version: 7.6
|
||||
|
||||
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||
# docker command, set:
|
||||
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||
hide_docker_command_env_vars {
|
||||
enabled: true
|
||||
extra_keys: []
|
||||
}
|
||||
}
|
||||
|
||||
sdk {
|
||||
# ClearML - default SDK configuration
|
||||
|
||||
storage {
|
||||
cache {
|
||||
# Defaults to system temp folder / cache
|
||||
default_base_dir: "~/.clearml/cache"
|
||||
size {
|
||||
# max_used_bytes = -1
|
||||
min_free_bytes = 10GB
|
||||
# cleanup_margin_percent = 5%
|
||||
}
|
||||
}
|
||||
|
||||
direct_access: [
|
||||
# Objects matching are considered to be available for direct access, i.e. they will not be downloaded
|
||||
# or cached, and any download request will return a direct reference.
|
||||
# Objects are specified in glob format, available for url and content_type.
|
||||
{ url: "file://*" } # file-urls are always directly referenced
|
||||
]
|
||||
}
|
||||
|
||||
metrics {
|
||||
# History size for debug files per metric/variant. For each metric/variant combination with an attached file
|
||||
# (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
|
||||
# X files are stored in the upload destination for each metric/variant combination.
|
||||
file_history_size: 100
|
||||
|
||||
# Max history size for matplotlib imshow files per plot title.
|
||||
# File names for the uploaded images will be recycled in such a way that no more than
|
||||
# X images are stored in the upload destination for each matplotlib plot title.
|
||||
matplotlib_untitled_history_size: 100
|
||||
|
||||
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
|
||||
# plot_max_num_digits: 5
|
||||
|
||||
# Settings for generated debug images
|
||||
images {
|
||||
format: JPEG
|
||||
quality: 87
|
||||
subsampling: 0
|
||||
}
|
||||
|
||||
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
|
||||
tensorboard_single_series_per_graph: false
|
||||
}
|
||||
|
||||
network {
|
||||
metrics {
|
||||
# Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
|
||||
# a specific iteration
|
||||
file_upload_threads: 4
|
||||
|
||||
# Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
|
||||
# being sent for upload
|
||||
file_upload_starvation_warning_sec: 120
|
||||
}
|
||||
|
||||
iteration {
|
||||
# Max number of retries when getting frames if the server returned an error (http code 500)
|
||||
max_retries_on_server_error: 5
|
||||
# Backoff factory for consecutive retry attempts.
|
||||
# SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
|
||||
retry_backoff_factor_sec: 10
|
||||
}
|
||||
}
|
||||
aws {
|
||||
s3 {
|
||||
# S3 credentials, used for read/write access by various SDK elements
|
||||
|
||||
# default, used for any bucket not specified below
|
||||
key: ""
|
||||
secret: ""
|
||||
region: ""
|
||||
|
||||
credentials: [
|
||||
# specifies key/secret credentials to use when handling s3 urls (read or write)
|
||||
# {
|
||||
# bucket: "my-bucket-name"
|
||||
# key: "my-access-key"
|
||||
# secret: "my-secret-key"
|
||||
# },
|
||||
# {
|
||||
# # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
|
||||
# host: "my-minio-host:9000"
|
||||
# key: "12345678"
|
||||
# secret: "12345678"
|
||||
# multipart: false
|
||||
# secure: false
|
||||
# }
|
||||
]
|
||||
}
|
||||
boto3 {
|
||||
pool_connections: 512
|
||||
max_multipart_concurrency: 16
|
||||
}
|
||||
}
|
||||
google.storage {
|
||||
# # Default project and credentials file
|
||||
# # Will be used when no bucket configuration is found
|
||||
# project: "clearml"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
|
||||
# # Specific credentials per bucket and sub directory
|
||||
# credentials = [
|
||||
# {
|
||||
# bucket: "my-bucket"
|
||||
# subdir: "path/in/bucket" # Not required
|
||||
# project: "clearml"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
# },
|
||||
# ]
|
||||
}
|
||||
azure.storage {
|
||||
# containers: [
|
||||
# {
|
||||
# account_name: "clearml"
|
||||
# account_key: "secret"
|
||||
# # container_name:
|
||||
# }
|
||||
# ]
|
||||
}
|
||||
|
||||
log {
|
||||
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
|
||||
null_log_propagate: false
|
||||
task_log_buffer_capacity: 66
|
||||
|
||||
# disable urllib info and lower levels
|
||||
disable_urllib3_info: true
|
||||
}
|
||||
|
||||
development {
|
||||
# Development-mode options
|
||||
|
||||
# dev task reuse window
|
||||
task_reuse_time_window_in_hours: 72.0
|
||||
|
||||
# Run VCS repository detection asynchronously
|
||||
vcs_repo_detect_async: true
|
||||
|
||||
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
|
||||
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
|
||||
store_uncommitted_code_diff: true
|
||||
|
||||
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
|
||||
support_stopping: true
|
||||
|
||||
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
|
||||
default_output_uri: ""
|
||||
|
||||
# Default auto generated requirements optimize for smaller requirements
|
||||
# If True, analyze the entire repository regardless of the entry point.
|
||||
# If False, first analyze the entry point script; if it does not reference other local files,
|
||||
# do not analyze the entire repository.
|
||||
force_analyze_entire_repo: false
|
||||
|
||||
# If set to true, *clearml* update message will not be printed to the console
|
||||
# this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
|
||||
suppress_update_message: false
|
||||
|
||||
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
|
||||
detect_with_pip_freeze: false
|
||||
|
||||
# Development mode worker
|
||||
worker {
|
||||
# Status report period in seconds
|
||||
report_period_sec: 2
|
||||
|
||||
# ping to the server - check connectivity
|
||||
ping_period_sec: 30
|
||||
|
||||
# Log all stdout & stderr
|
||||
log_stdout: true
|
||||
|
||||
# compatibility feature, report memory usage for the entire machine
|
||||
# default (false), report only on the running process and its sub-processes
|
||||
report_global_mem_used: false
|
||||
}
|
||||
}
|
||||
}
|
||||
docker/k8s-glue/build-resources/entrypoint.sh (new file, 36 lines)
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash -x
|
||||
|
||||
export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}
|
||||
|
||||
if [ -z "$CLEARML_FILES_HOST" ]; then
|
||||
CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
|
||||
fi
|
||||
|
||||
export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
|
||||
export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
|
||||
export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}
|
||||
|
||||
echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2
|
||||
|
||||
if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
|
||||
if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
|
||||
python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
|
||||
else
|
||||
python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
|
||||
fi
|
||||
fi
|
||||
|
||||
QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
|
||||
MAX_PODS=${K8S_GLUE_MAX_PODS:-2}
|
||||
EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}
|
||||
|
||||
# shellcheck disable=SC2129
|
||||
echo "api.credentials.access_key: ${CLEARML_API_ACCESS_KEY}" >> ~/clearml.conf
|
||||
echo "api.credentials.secret_key: ${CLEARML_API_SECRET_KEY}" >> ~/clearml.conf
|
||||
echo "api.api_server: ${CLEARML_API_HOST}" >> ~/clearml.conf
|
||||
echo "api.web_server: ${CLEARML_WEB_HOST}" >> ~/clearml.conf
|
||||
echo "api.files_server: ${CLEARML_FILES_HOST}" >> ~/clearml.conf
|
||||
|
||||
./provider_entrypoint.sh
|
||||
|
||||
python3 k8s_glue_example.py --queue ${QUEUE} --max-pods ${MAX_PODS} ${EXTRA_ARGS}
|
||||
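For reference, a purely illustrative local smoke test of the env-var plumbing in this entrypoint could look as follows; the host URLs and credential values are placeholders, the image tag is the one referenced by the k8s-glue-aws.yml pod spec further down, and a real deployment is expected to go through those pod specs rather than a plain docker run:

    docker run --rm \
      -e CLEARML_API_HOST="https://api.clear.ml" \
      -e CLEARML_WEB_HOST="https://app.clear.ml" \
      -e CLEARML_FILES_HOST="https://files.clear.ml" \
      -e CLEARML_API_ACCESS_KEY="<access_key>" \
      -e CLEARML_API_SECRET_KEY="<secret_key>" \
      -e K8S_GLUE_QUEUE="k8s_glue" \
      -e K8S_GLUE_MAX_PODS="2" \
      allegroai/clearml-agent-k8s:aws-latest-1.21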
94  docker/k8s-glue/build-resources/k8s_glue_example.py  Normal file
@@ -0,0 +1,94 @@
"""
This example assumes you have preconfigured services with selectors in the form of
"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
The K8sIntegration component will label each pod accordingly.
"""
from argparse import ArgumentParser

from clearml_agent.glue.k8s import K8sIntegration


def parse_args():
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()

    parser.add_argument(
        "--queue", type=str, help="Queue to pull tasks from"
    )
    group.add_argument(
        "--ports-mode", action='store_true', default=False,
        help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
             "Should not be used with max-pods"
    )
    parser.add_argument(
        "--num-of-services", type=int, default=20,
        help="Specify the number of k8s services to be used. Use only with ports-mode."
    )
    parser.add_argument(
        "--base-port", type=int,
        help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
             "For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num"
             "e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
    )
    parser.add_argument(
        "--base-pod-num", type=int, default=1,
        help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
             "service (default: %(default)s)"
    )
    parser.add_argument(
        "--gateway-address", type=str, default=None,
        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
    )
    parser.add_argument(
        "--pod-clearml-conf", type=str,
        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
    )
    parser.add_argument(
        "--overrides-yaml", type=str,
        help="YAML file containing pod overrides to be used when launching a new pod"
    )
    parser.add_argument(
        "--template-yaml", type=str,
        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
             "and overrides are ignored, otherwise it will be scheduled with kubectl run"
    )
    parser.add_argument(
        "--ssh-server-port", type=int, default=0,
        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
    )
    parser.add_argument(
        "--namespace", type=str,
        help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
    )
    group.add_argument(
        "--max-pods", type=int,
        help="Limit the maximum number of pods that this service can run at the same time."
             "Should not be used with ports-mode"
    )
    return parser.parse_args()


def main():
    args = parse_args()

    user_props_cb = None
    if args.ports_mode and args.base_port:
        def k8s_user_props_cb(pod_number=0):
            user_prop = {"k8s-pod-port": args.base_port + pod_number}
            if args.gateway_address:
                user_prop["k8s-gateway-address"] = args.gateway_address
            return user_prop
        user_props_cb = k8s_user_props_cb

    k8s = K8sIntegration(
        ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
        user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
        template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
            ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
        namespace=args.namespace, max_pods_limit=args.max_pods or None,
    )
    k8s.k8s_daemon(args.queue)


if __name__ == "__main__":
    main()
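A possible direct invocation, mirroring the values the entrypoint above passes in (queue name, max-pods, namespace and template path are taken from the other files in this change):

    python3 k8s_glue_example.py --queue k8s_glue --max-pods 2 --namespace clearml \
        --template-yaml /root/template/pod_template.yml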
14  docker/k8s-glue/build-resources/setup.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

chmod +x /root/entrypoint.sh

apt-get update -y
apt-get dist-upgrade -y
apt-get install -y curl unzip less locales

locale-gen en_US.UTF-8

apt-get install -y curl python3-pip git
python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"
22  docker/k8s-glue/glue-build-aws/Dockerfile  Normal file
@@ -0,0 +1,22 @@
FROM ubuntu:18.04

USER root
WORKDIR /root

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8

COPY ../build-resources/setup.sh /root/setup.sh
RUN /root/setup.sh

COPY ./setup_aws.sh /root/setup_aws.sh
RUN /root/setup_aws.sh

COPY ../build-resources/entrypoint.sh /root/entrypoint.sh
COPY ./provider_entrypoint.sh /root/provider_entrypoint.sh
COPY ./build-resources/k8s_glue_example.py /root/k8s_glue_example.py
COPY ./clearml.conf /root/clearml.conf

ENTRYPOINT ["/root/entrypoint.sh"]
4  docker/k8s-glue/glue-build-aws/provider_entrypoint.sh  Normal file
@@ -0,0 +1,4 @@
#!/bin/bash -x

source /root/.bashrc
export PATH=$PATH:$HOME/bin
14  docker/k8s-glue/glue-build-aws/setup_aws.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
./aws/install

curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/kubectl
#curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin

curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator
#curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/aws-iam-authenticator
chmod +x ./aws-iam-authenticator && mkdir -p $HOME/bin && cp ./aws-iam-authenticator $HOME/bin/aws-iam-authenticator && export PATH=$PATH:$HOME/bin
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc
22  docker/k8s-glue/glue-build-gcp/Dockerfile  Normal file
@@ -0,0 +1,22 @@
FROM ubuntu:18.04

USER root
WORKDIR /root

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8

COPY ../build-resources/setup.sh /root/setup.sh
RUN /root/setup.sh

COPY ./setup_gcp.sh /root/setup_gcp.sh
RUN /root/setup_gcp.sh

COPY ../build-resources/entrypoint.sh /root/entrypoint.sh
COPY ./provider_entrypoint.sh /root/provider_entrypoint.sh
COPY ./build-resources/k8s_glue_example.py /root/k8s_glue_example.py
COPY ./clearml.conf /root/clearml.conf

ENTRYPOINT ["/root/entrypoint.sh"]
4  docker/k8s-glue/glue-build-gcp/provider_entrypoint.sh  Normal file
@@ -0,0 +1,4 @@
#!/bin/bash -x

gcloud auth activate-service-account ${CLEARML_SERVICE_ACC} --key-file=/root/keys/${SERVICE_ACC_KEY_JSON}
gcloud container clusters get-credentials ${CLUSTER_CRED}
14  docker/k8s-glue/glue-build-gcp/setup_gcp.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

curl -LO https://dl.k8s.io/release/v1.21.0/bin/linux/amd64/kubectl

install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

sudo apt-get install -y apt-transport-https ca-certificates gnupg

echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list

curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

apt-get update -y
apt-get install -y google-cloud-sdk
75  docker/k8s-glue/glue-build/Dockerfile.alpine  Normal file
@@ -0,0 +1,75 @@
ARG TAG=3.7.12-alpine3.15

FROM python:${TAG} as build

RUN apk add --no-cache \
    gcc \
    musl-dev \
    libffi-dev

RUN python3 \
    -m pip \
    install \
    --prefix=/install \
    --no-cache-dir \
    -U \
    clearml-agent \
    cryptography>=2.9

FROM python:${TAG} as target

WORKDIR /app

ARG KUBECTL_VERSION=1.22.4

# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8
# ENV LANG=en_US.UTF-8
# ENV LANGUAGE=en_US.UTF-8
# ENV PYTHONIOENCODING=UTF-8

COPY --from=build /install /usr/local

ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/

RUN chmod +x /usr/bin/kubectl

RUN apk add --no-cache \
    bash

COPY k8s_glue_example.py .

# AWS CLI
# https://github.com/kyleknap/aws-cli/blob/source-proposal/proposals/source-install.md#alpine-linux
# https://github.com/aws/aws-cli/issues/4685
# https://github.com/aws/aws-cli/pull/6352

# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/alpine/Dockerfile

FROM target as gcp

ARG CLOUD_SDK_VERSION=371.0.0
ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION
ENV PATH /google-cloud-sdk/bin:$PATH

WORKDIR /

RUN apk --no-cache add \
    curl \
    python3 \
    py3-crcmod \
    py3-openssl \
    bash \
    libc6-compat \
    openssh-client \
    git \
    gnupg \
    && curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
    tar xzf google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
    rm google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
    gcloud config set core/disable_usage_reporting true && \
    gcloud config set component_manager/disable_update_check true && \
    gcloud config set metrics/environment github_docker_image && \
    gcloud --version

WORKDIR /app
82  docker/k8s-glue/glue-build/Dockerfile.bullseye  Normal file
@@ -0,0 +1,82 @@
ARG TAG=3.7.12-slim-bullseye

FROM python:${TAG} as target

ARG KUBECTL_VERSION=1.22.4

WORKDIR /app

RUN python3 \
    -m pip \
    install \
    --no-cache-dir \
    -U \
    clearml-agent \
    cryptography>=2.9

# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8
# ENV LANG=en_US.UTF-8
# ENV LANGUAGE=en_US.UTF-8
# ENV PYTHONIOENCODING=UTF-8

ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/

RUN chmod +x /usr/bin/kubectl

COPY k8s_glue_example.py .

CMD ["python3", "k8s_glue_example.py"]

FROM target as aws

# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
# https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html

RUN apt-get update -qqy && \
    apt-get install -qqy \
    unzip && \
    rm -rf /var/lib/apt/lists/*

ADD https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip awscliv2.zip
ADD https://amazon-eks.s3.us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator

RUN unzip awscliv2.zip && \
    ./aws/install && \
    rm -r awscliv2.zip aws/ && \
    chmod +x /usr/local/bin/aws-iam-authenticator && \
    aws --version && \
    aws-iam-authenticator version

# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/debian_slim/Dockerfile

FROM target as gcp

ARG CLOUD_SDK_VERSION=371.0.0
ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION

ENV PATH "$PATH:/opt/google-cloud-sdk/bin/"

ARG INSTALL_COMPONENTS
RUN mkdir -p /usr/share/man/man1/
RUN apt-get update -qqy && \
    apt-get install -qqy \
    curl \
    gcc \
    python3-dev \
    python3-pip \
    apt-transport-https \
    lsb-release \
    openssh-client \
    git \
    gnupg && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install -U crcmod && \
    export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \
    echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \
    apt-get update && apt-get install -y google-cloud-sdk=${CLOUD_SDK_VERSION}-0 $INSTALL_COMPONENTS && \
    gcloud config set core/disable_usage_reporting true && \
    gcloud config set component_manager/disable_update_check true && \
    gcloud config set metrics/environment github_docker_image && \
    gcloud --version
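Build commands are not part of this change, but given the named stages defined above, provider-specific images could plausibly be built along these lines (the local tags clearml-agent-k8s:aws / :gcp are illustrative only):

    docker build -f Dockerfile.bullseye --target aws -t clearml-agent-k8s:aws .
    docker build -f Dockerfile.bullseye --target gcp -t clearml-agent-k8s:gcp .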
94  docker/k8s-glue/glue-build/k8s_glue_example.py  Normal file
@@ -0,0 +1,94 @@
|
||||
"""
|
||||
This example assumes you have preconfigured services with selectors in the form of
|
||||
"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
|
||||
The K8sIntegration component will label each pod accordingly.
|
||||
"""
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from clearml_agent.glue.k8s import K8sIntegration
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = ArgumentParser()
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
|
||||
parser.add_argument(
|
||||
"--queue", type=str, help="Queue to pull tasks from"
|
||||
)
|
||||
group.add_argument(
|
||||
"--ports-mode", action='store_true', default=False,
|
||||
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
|
||||
"Should not be used with max-pods"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-of-services", type=int, default=20,
|
||||
help="Specify the number of k8s services to be used. Use only with ports-mode."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-port", type=int,
|
||||
help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
|
||||
"For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num"
|
||||
"e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-pod-num", type=int, default=1,
|
||||
help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
|
||||
"service (default: %(default)s)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gateway-address", type=str, default=None,
|
||||
help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pod-clearml-conf", type=str,
|
||||
help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overrides-yaml", type=str,
|
||||
help="YAML file containing pod overrides to be used when launching a new pod"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--template-yaml", type=str,
|
||||
help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
|
||||
"and overrides are ignored, otherwise it will be scheduled with kubectl run"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ssh-server-port", type=int, default=0,
|
||||
help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--namespace", type=str,
|
||||
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
|
||||
)
|
||||
group.add_argument(
|
||||
"--max-pods", type=int,
|
||||
help="Limit the maximum number of pods that this service can run at the same time."
|
||||
"Should not be used with ports-mode"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
user_props_cb = None
|
||||
if args.ports_mode and args.base_port:
|
||||
def k8s_user_props_cb(pod_number=0):
|
||||
user_prop = {"k8s-pod-port": args.base_port + pod_number}
|
||||
if args.gateway_address:
|
||||
user_prop["k8s-gateway-address"] = args.gateway_address
|
||||
return user_prop
|
||||
user_props_cb = k8s_user_props_cb
|
||||
|
||||
k8s = K8sIntegration(
|
||||
ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
|
||||
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
|
||||
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
||||
namespace=args.namespace, max_pods_limit=args.max_pods or None,
|
||||
)
|
||||
k8s.k8s_daemon(args.queue)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
47  docker/k8s-glue/k8s-glue-aws.yml  Normal file
@@ -0,0 +1,47 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  serviceAccountName: ""
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:aws-latest-1.21
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "source /root/.bashrc && /root/entrypoint.sh"
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
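One plausible way to wire this pod up, assuming the pod_template.yml from this directory is used as the secret payload (the secret name matches the secretName referenced above, and the commands are expected to run against the namespace the glue pod is deployed into):

    kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml
    kubectl apply -f k8s-glue-aws.yml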
58  docker/k8s-glue/k8s-glue-gcp.yml  Normal file
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  serviceAccountName: ""
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:gcp-latest-1.21
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "source /root/.bashrc && /root/entrypoint.sh"
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    - name: service-acc-key
      mountPath: /root/keys
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
    - name: CLEARML_SERVICE_ACC
      value: ""
    - name: SERVICE_ACC_KEY_JSON
      value: service-account-key.json
    - name: CLUSTER_CRED
      value: ""
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
  - name: service-acc-key
    secret:
      secretName: k8s-glue-service-acc-key
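For the GCP variant the extra service-acc-key volume also needs a secret; assuming the key file is named service-account-key.json, as SERVICE_ACC_KEY_JSON defaults to above, the setup could look like:

    kubectl create secret generic k8s-glue-service-acc-key --from-file=service-account-key.json
    kubectl apply -f k8s-glue-gcp.yml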
13  docker/k8s-glue/pod_template.yml  Normal file
@@ -0,0 +1,13 @@
apiVersion: v1
metadata:
  namespace: clearml
spec:
  containers:
  - resources:
      limits:
        cpu: 1000m
        memory: 4G
      requests:
        cpu: 1000m
        memory: 4G
  restartPolicy: Never
7  docker/k8s-glue/task-pod-build/Dockerfile  Normal file
@@ -0,0 +1,7 @@
FROM ubuntu:18.04

USER root
WORKDIR /root
COPY ./setup.sh /root/setup.sh

RUN /root/setup.sh
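This task-pod image is not referenced by name anywhere in the change, so the registry and tag below are purely illustrative; the resulting image could be what CLEARML_DOCKER_IMAGE in the glue pod specs is pointed at instead of the plain ubuntu:18.04 default:

    docker build -t <registry>/clearml-task-pod:latest docker/k8s-glue/task-pod-build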
10  docker/k8s-glue/task-pod-build/setup.sh  Normal file
@@ -0,0 +1,10 @@
echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean
chown -R root /root/.cache/pip

apt-get update -y
apt-get dist-upgrade -y
apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0 curl python3-pip

python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"
@@ -19,7 +19,7 @@ RUN locale-gen en_US.UTF-8
 RUN apt-get install -y curl python3-pip git
 RUN curl -sSL https://get.docker.com/ | sh
 RUN python3 -m pip install -U pip
-RUN python3 -m pip install trains-agent
+RUN python3 -m pip install clearml-agent
 RUN python3 -m pip install -U "cryptography>=2.9"

 ENTRYPOINT ["/usr/agent/entrypoint.sh"]
@@ -1,14 +1,16 @@
 #!/bin/sh

-if [ -z "$TRAINS_FILES_HOST" ]; then
-  TRAINS_HOST_IP=${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}
+CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}
+
+if [ -z "$CLEARML_FILES_HOST" ]; then
+  CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
 fi

-TRAINS_FILES_HOST=${TRAINS_FILES_HOST:-"http://$TRAINS_HOST_IP:8081"}
-TRAINS_WEB_HOST=${TRAINS_WEB_HOST:-"http://$TRAINS_HOST_IP:8080"}
-TRAINS_API_HOST=${TRAINS_API_HOST:-"http://$TRAINS_HOST_IP:8008"}
+CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
+CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
+CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}

-echo $TRAINS_FILES_HOST $TRAINS_WEB_HOST $TRAINS_API_HOST 1>&2
+echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2

-python3 -m pip install -q -U "trains-agent${TRAINS_AGENT_UPDATE_VERSION}"
-trains-agent daemon --services-mode --queue services --create-queue --docker "$TRAINS_AGENT_DEFAULT_BASE_DOCKER" --cpu-only $TRAINS_AGENT_EXTRA_ARGS
+python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
+clearml-agent daemon --services-mode --queue services --create-queue --docker "${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}" --cpu-only ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
@@ -4,7 +4,7 @@ api {
|
||||
web_server: https://demoapp.demo.clear.ml
|
||||
files_server: https://demofiles.demo.clear.ml
|
||||
|
||||
# Credentials are generated in the webapp, https://demoapp.demo.clear.ml/profile
|
||||
# Credentials are generated in the webapp, https://app.clear.ml/settings/workspace-configuration
|
||||
# Overridden with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
|
||||
credentials {"access_key": "EGRTCO8JMSIGI6S39GTP43NFWXDQOW", "secret_key": "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"}
|
||||
|
||||
@@ -15,6 +15,7 @@ api {
|
||||
agent {
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# Notice: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
git_user=""
|
||||
git_pass=""
|
||||
# Limit credentials to a single domain, for example: github.com,
|
||||
@@ -24,7 +25,9 @@ agent {
|
||||
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: ""
|
||||
# force_git_ssh_port: 0
|
||||
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||
# force_git_ssh_user: git
|
||||
|
||||
# unique name of this worker, if None, created based on hostname:process_id
|
||||
# Overridden with os environment: CLEARML_WORKER_NAME
|
||||
@@ -40,16 +43,26 @@ agent {
|
||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||
# The default is the python executing the clearml_agent
|
||||
python_binary: ""
|
||||
# ignore any requested python version (Default: False, if a Task was using a
|
||||
# specific python version and the system supports multiple python the agent will use the requested python version)
|
||||
# ignore_requested_python_version: true
|
||||
|
||||
# select python package manager:
|
||||
# currently supported pip and conda
|
||||
# poetry is used if pip selected and repository contains poetry.lock file
|
||||
# currently supported: pip, conda and poetry
|
||||
# if "pip" or "conda" are used, the agent installs the required packages
|
||||
# based on the "installed packages" section of the Task. If the "installed packages" is empty,
|
||||
# it will revert to using `requirements.txt` from the repository's root directory.
|
||||
# If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
|
||||
# the "installed packages" section is ignored, and poetry is used.
|
||||
# If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
|
||||
package_manager: {
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
# pip_version: "<20"
|
||||
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
|
||||
# poetry_version: "<2",
|
||||
|
||||
# virtual environment inherits packages from the system
|
||||
system_site_packages: false,
|
||||
@@ -61,7 +74,7 @@ agent {
|
||||
extra_index_url: []
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["pytorch", "conda-forge", ]
|
||||
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||
# conda_full_env_update: false
|
||||
# conda_env_as_base_docker: false
|
||||
|
||||
@@ -89,17 +102,28 @@ agent {
|
||||
# target folder for virtual environments builds, created when executing experiment
|
||||
venvs_dir = ~/.clearml/venvs-builds
|
||||
|
||||
# cached virtual environment folder
|
||||
venvs_cache: {
|
||||
# maximum number of cached venvs
|
||||
max_entries: 10
|
||||
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
|
||||
free_space_threshold_gb: 2.0
|
||||
# unmark to enable virtual environment caching
|
||||
# path: ~/.clearml/venvs-cache
|
||||
},
|
||||
|
||||
# cached git clone folder
|
||||
vcs_cache: {
|
||||
enabled: true,
|
||||
path: ~/.clearml/vcs-cache
|
||||
},
|
||||
|
||||
# DEPRECATED: please use `venvs_cache` and set `venvs_cache.path`
|
||||
# use venv-update in order to accelerate python virtual environment building
|
||||
# Still in beta, turned off by default
|
||||
venv_update: {
|
||||
enabled: false,
|
||||
},
|
||||
# venv_update: {
|
||||
# enabled: false,
|
||||
# },
|
||||
|
||||
# cached folder for specific python package download (mostly pytorch versions)
|
||||
pip_download_cache {
|
||||
@@ -118,27 +142,114 @@ agent {
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
# extra_docker_arguments: ["--ipc=host", "-v", "/mnt/host/data:/mnt/data"]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0).
# For backwards compatibility reasons this defaults to true;
# change to false to skip installation and decrease docker spin-up time
|
||||
# docker_install_opencv_libs: true
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
|
||||
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host"]
|
||||
|
||||
# lookup table rules for default container
|
||||
# first matched rule will be picked, according to rule order
|
||||
# enterprise version only
|
||||
# match_rules: [
|
||||
# {
|
||||
# image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
|
||||
# arguments: "-e define=value"
|
||||
# match: {
|
||||
# script{
|
||||
# # Optional: must match all requirements (not partial)
|
||||
# requirements: {
|
||||
# # version selection matching PEP-440
|
||||
# pip: {
|
||||
# tensorflow: "~=2.6"
|
||||
# },
|
||||
# }
|
||||
# # Optional: matching based on regular expression, example: "^exact_match$"
|
||||
# repository: "/my_repository/"
|
||||
# branch: "main"
|
||||
# binary: "python3.6"
|
||||
# }
|
||||
# # Optional: matching based on regular expression, example: "^exact_match$"
|
||||
# project: "project/sub_project"
|
||||
# }
|
||||
# },
|
||||
# {
|
||||
# image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
|
||||
# arguments: "-e define=value"
|
||||
# match: {
|
||||
# # must match all requirements (not partial)
|
||||
# script{
|
||||
# requirements: {
|
||||
# conda: {
|
||||
# torch: ">=2.6,<2.8"
|
||||
# }
|
||||
# }
|
||||
# # no repository matching required
|
||||
# repository: ""
|
||||
# }
|
||||
# # no container image matching required (allow to replace one requested container with another)
|
||||
# container: ""
|
||||
# # no repository matching required
|
||||
# project: ""
|
||||
# }
|
||||
# },
|
||||
# ]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# CUDA versions used for Conda setup & solving PyTorch wheel packages
|
||||
# It should be detected automatically. Override with the os environment variables CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
# cudnn_version: 7.6
|
||||
|
||||
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||
# docker command, set:
|
||||
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||
hide_docker_command_env_vars {
|
||||
enabled: true
|
||||
extra_keys: []
|
||||
parse_embedded_urls: true
|
||||
}
|
||||
|
||||
# allow to set internal mount points inside the docker,
|
||||
# especially useful for non-root docker container images.
|
||||
# docker_internal_mounts {
|
||||
# sdk_cache: "/clearml_agent_cache"
|
||||
# apt_cache: "/var/cache/apt/archives"
|
||||
# ssh_folder: "/root/.ssh"
|
||||
# pip_cache: "/root/.cache/pip"
|
||||
# poetry_cache: "/root/.cache/pypoetry"
|
||||
# vcs_cache: "/root/.clearml/vcs-cache"
|
||||
# venv_build: "/root/.clearml/venvs-builds"
|
||||
# pip_download: "/root/.clearml/pip-download-cache"
|
||||
# }
|
||||
|
||||
# Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
|
||||
# Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
|
||||
# Note: resulting name must start with an alphanumeric character and
|
||||
# continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
|
||||
# docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
|
||||
}
|
||||
|
||||
sdk {
|
||||
@@ -225,6 +336,7 @@ sdk {
|
||||
# secret: "12345678"
|
||||
# multipart: false
|
||||
# secure: false
|
||||
# verify: /path/to/ca/bundle.crt OR false to not verify
|
||||
# }
|
||||
]
|
||||
}
|
||||
@@ -299,5 +411,45 @@ sdk {
|
||||
log_stdout: True
|
||||
}
|
||||
}
|
||||
|
||||
# Apply top-level environment section from configuration into os.environ
|
||||
apply_environment: true
|
||||
# Top-level environment section is in the form of:
|
||||
# environment {
|
||||
# key: value
|
||||
# ...
|
||||
# }
|
||||
# and is applied to the OS environment as `key=value` for each key/value pair
|
||||
|
||||
# Apply top-level files section from configuration into local file system
|
||||
apply_files: true
|
||||
# Top-level files section allows auto-generating files at designated paths with a predefined contents
|
||||
# and target format. Options include:
|
||||
# contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
|
||||
# format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
|
||||
# base64-encoded contents string, otherwise ignored
|
||||
# path: the target file's path, may include ~ and inplace env vars
|
||||
# target_format: format used to encode contents before writing into the target file. Supported values are json,
|
||||
# yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
|
||||
# overwrite: overwrite the target file in case it exists. Default is true.
|
||||
#
|
||||
# Example:
|
||||
# files {
|
||||
# myfile1 {
|
||||
# contents: "The quick brown fox jumped over the lazy dog"
|
||||
# path: "/tmp/fox.txt"
|
||||
# }
|
||||
# myjsonfile {
|
||||
# contents: {
|
||||
# some {
|
||||
# nested {
|
||||
# value: [1, 2, 3, 4]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# path: "/tmp/test.json"
|
||||
# target_format: json
|
||||
# }
|
||||
# }
|
||||
}
|
||||
|
||||
|
||||
BIN  docs/clearml_architecture.png  Normal file
Binary file not shown. (After: 123 KiB)
Binary file not shown. (Before: 2.0 MiB, After: 1.1 MiB)
@@ -10,12 +10,15 @@ from clearml_agent.glue.k8s import K8sIntegration
|
||||
|
||||
def parse_args():
|
||||
parser = ArgumentParser()
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
|
||||
parser.add_argument(
|
||||
"--queue", type=str, help="Queue to pull tasks from"
|
||||
)
|
||||
parser.add_argument(
|
||||
group.add_argument(
|
||||
"--ports-mode", action='store_true', default=False,
|
||||
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
|
||||
"Should not be used with max-pods"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-of-services", type=int, default=20,
|
||||
@@ -24,7 +27,13 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--base-port", type=int,
|
||||
help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
|
||||
"For pod #X, the port will be <base-port>+X"
|
||||
"For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num"
|
||||
"e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-pod-num", type=int, default=1,
|
||||
help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
|
||||
"service (default: %(default)s)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gateway-address", type=str, default=None,
|
||||
@@ -47,6 +56,15 @@ def parse_args():
|
||||
"--ssh-server-port", type=int, default=0,
|
||||
help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--namespace", type=str,
|
||||
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
|
||||
)
|
||||
group.add_argument(
|
||||
"--max-pods", type=int,
|
||||
help="Limit the maximum number of pods that this service can run at the same time."
|
||||
"Should not be used with ports-mode"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -63,10 +81,11 @@ def main():
|
||||
user_props_cb = k8s_user_props_cb
|
||||
|
||||
k8s = K8sIntegration(
|
||||
ports_mode=args.ports_mode, num_of_services=args.num_of_services, user_props_cb=user_props_cb,
|
||||
overrides_yaml=args.overrides_yaml, trains_conf_file=args.pod_trains_conf, template_yaml=args.template_yaml,
|
||||
extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None
|
||||
ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
|
||||
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
|
||||
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
||||
namespace=args.namespace, max_pods_limit=args.max_pods or None,
|
||||
)
|
||||
k8s.k8s_daemon(args.queue)
|
||||
|
||||
|
||||
@@ -2,19 +2,16 @@ attrs>=18.0,<20.4.0
enum34>=0.9,<1.2.0 ; python_version < '3.6'
furl>=2.0.0,<2.2.0
future>=0.16.0,<0.19.0
humanfriendly>=2.1,<9.2
jsonschema>=2.6.0,<3.3.0
pathlib2>=2.3.0,<2.4.0
psutil>=3.4.2,<5.9.0
pyhocon>=0.3.38,<0.4.0
pyparsing>=2.0.3,<2.5.0
python-dateutil>=2.4.2,<2.9.0
pyjwt>=1.6.4,<1.8.0
PyYAML>=3.12,<5.4.0
requests-file>=1.4.2,<1.6.0
pyjwt>=1.6.4,<2.1.0
PyYAML>=3.12,<5.5.0
requests>=2.20.0,<2.26.0
six>=1.11.0,<1.16.0
tqdm>=4.19.5,<4.55.0
six>=1.13.0,<1.16.0
typing>=3.6.4,<3.8.0
urllib3>=1.21.1,<1.27.0
virtualenv>=16,<20
virtualenv>=16,<21