Compare commits

..

132 Commits

Author SHA1 Message Date
allegroai
d9b9b4984b Version bump to v0.17.2 2021-03-04 20:12:50 +02:00
allegroai
8a46dc6b03 Update default_docker in docs 2021-03-04 20:07:34 +02:00
allegroai
205f9dd816 Fix k8s glue does not pass docker environment variables
Remove deprecated flags
2021-03-03 15:07:06 +02:00
allegroai
9dfa1294e2 Add agent.enable_task_env set the OS environment based on the Environment section of the Task. 2021-02-28 19:47:44 +02:00
allegroai
f019905720 Fix venv cache support for local folders 2021-02-28 19:47:09 +02:00
allegroai
9c257858dd Fix venv cache support for local folders 2021-02-23 18:54:38 +02:00
allegroai
2006ab20dd Fix conda support for git+http links 2021-02-23 12:46:06 +02:00
allegroai
0caf31719c Fix venv caching always reinstall git repositories and local repositories 2021-02-23 12:45:34 +02:00
allegroai
5da7184276 Add agent.ignore_requested_python_version (control for multi python environments) 2021-02-23 12:45:00 +02:00
allegroai
50fccdab96 PEP8 2021-02-23 12:44:26 +02:00
allegroai
77d6ff6630 Fix docker mode without venvs cache dir 2021-02-17 00:04:07 +02:00
allegroai
99614702ea Add missing default configuration value 2021-02-17 00:03:42 +02:00
allegroai
58cb344ee6 Upgrade pynvml add detect CUDA version from driver level 2021-02-17 00:03:16 +02:00
allegroai
22d5892b12 Use shared git cache between multiple agents on the same machine 2021-02-14 13:49:29 +02:00
allegroai
f619969efc Add venvs_cache configuration 2021-02-14 13:48:57 +02:00
allegroai
ca242424ab Fix service-mode support for venvs
Fix --services-mode with venvs
2021-02-14 13:45:17 +02:00
allegroai
407deb84e9 Fix multi instances on Windows 2021-02-14 13:44:39 +02:00
allegroai
14589aa094 Fix CPU mode 2021-02-14 13:44:00 +02:00
allegroai
1260e3d942 Update cache entries on conda package manager 2021-02-11 14:47:26 +02:00
allegroai
b22d926d94 Fix cache to take cuda version into account 2021-02-11 14:47:05 +02:00
allegroai
410cc8c7be Add --dynamic-gpus and limit in --services-mode 2021-02-11 14:46:37 +02:00
allegroai
784c676f5b Fix "from clearml" runtime diff patching (make sure we move it to after all the __future__ imports) include handling triple quotes in comments 2021-02-11 14:46:06 +02:00
allegroai
296f7970df Fix file not found error (no 2) interpreted as aborted (i.e. ctrl-c) 2021-02-11 14:44:54 +02:00
allegroai
cd59933c9c Remove unused packages 2021-02-11 14:44:35 +02:00
allegroai
b95d3f5300 Add venv caching with docker mode support 2021-02-11 14:44:19 +02:00
allegroai
fa0d5d8469 Fix --detached not supported on Windows, ignore and issue warning 2021-02-11 14:40:09 +02:00
allegroai
8229843018 Add base-pod-number parameter to k8s glue and example 2021-01-26 20:00:18 +02:00
allegroai
c578b37c6d Change dump configuration and ssh on every docker run 2021-01-24 08:48:10 +02:00
allegroai
8ea062c0bd Fix environment variables CLEARML_WEB_HOST/CLEARML_FILES_HOST not passed to running tasks (or updated on the config object) 2021-01-24 08:47:33 +02:00
allegroai
5d8bbde434 Fix applying git diff on new added file 2021-01-24 08:46:42 +02:00
allegroai
0462af6a3d Allow providing namespace in k8s glue and k8s glue example 2021-01-20 19:01:03 +02:00
allegroai
5a94a4048e Update agent and services docker files 2021-01-18 11:40:11 +02:00
allegroai
2602301e1d Improve agent.extra_docker_arguments documentation 2021-01-10 12:40:24 +02:00
allegroai
161993f66f Add agent.force_git_ssh_user configuration value (issue #42)
Change default docker to nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
2021-01-10 12:38:45 +02:00
allegroai
b7f87fb8d3 Detect and delete "stuck" k8s pods k8s glue 2021-01-10 12:37:13 +02:00
allegroai
8fdb87f1f5 Fix docker --network returns None 2020-12-30 16:57:04 +02:00
Allegro AI
a9a68d230e Update README.md 2020-12-25 04:23:12 +02:00
allegroai
a1f2941ffd version bump 2020-12-25 02:10:06 +02:00
allegroai
c548eeacfc status stable 2020-12-25 02:09:54 +02:00
allegroai
428781af86 Fix support for Windows pip and Conda requirements.txt 2020-12-25 02:06:40 +02:00
Allegro AI
72efe2e9fe Update README.md 2020-12-23 01:42:10 +02:00
allegroai
a455003c7f version bump 2020-12-23 00:13:51 +02:00
allegroai
8c46cc55a3 Update READEME.md 2020-12-23 00:12:17 +02:00
Allegro AI
d1e3d93332 Update README.md 2020-12-22 23:58:39 +02:00
allegroai
b4d143812e initial clearml-agent v0.17.0 2020-12-22 23:00:57 +02:00
allegroai
6e1f74402e Rename trains-agent -> clearml-agent 2020-12-22 21:21:29 +02:00
allegroai
090327234a Version bump to v0.16.3 2020-12-22 20:18:30 +02:00
allegroai
3620c3a12d Update PyJWT requirement (v2.0.0 breaks interface) as well as other requirements constraints 2020-12-22 20:18:14 +02:00
allegroai
9a3f950ac6 Fix conform queue name to k8s standard 2020-12-13 16:21:29 +02:00
allegroai
0b36cb0f85 Change k8s pod naming scheme to include queue name 2020-12-10 14:19:19 +02:00
allegroai
dd42423482 Version bump to v0.16.2 2020-12-10 13:02:19 +02:00
allegroai
69eb25db1f Fix running trains-agent from conda environment - conda.sh not found in first conda PATH match 2020-12-10 09:53:18 +02:00
allegroai
a41ea52f87 Add multiple packages support 2020-12-10 09:52:00 +02:00
allegroai
259113c989 Add PackageCollectorRequirement to allow multiple entries of the same package 2020-12-06 12:16:56 +02:00
allegroai
1afa3a3914 Add torchcsprng and torchtext to PyTorch resolving. Improve debug prints on auto cuda version resolving. 2020-12-06 12:15:12 +02:00
allegroai
448e23825c Fix requirements dict with null entry in pip should be considered None and we should install from requirements.txt 2020-12-06 12:14:22 +02:00
allegroai
b0c0f41f62 Allow zero context diffs (useful when blind patching repository) 2020-12-06 12:13:28 +02:00
allegroai
d2c5fb6512 Add K8s glue example --gateway-address settings properties/k8s-gateway-address on all Tasks 2020-12-06 12:12:42 +02:00
allegroai
b89cf4ec23 version bump 2020-11-29 23:17:50 +02:00
allegroai
74b646af9e Add pass TRAINS_DOCKER_IMAGE into docker for interactive sessions 2020-11-29 23:16:40 +02:00
allegroai
0cf485f7a9 Improve k8s nvidia container integration 2020-11-26 01:15:49 +02:00
allegroai
ea63e4f66e Add --ssh-server-port to k8s glue service 2020-11-26 01:15:20 +02:00
allegroai
58eb5fbd5f Fix torch CUDA 11.1 support 2020-11-26 01:14:36 +02:00
allegroai
a8c543ef7b Fix nvidia pytorch dockers support 2020-11-25 16:45:09 +02:00
allegroai
64e198a57a Fix nvidia docker support on some linux distros (SUSE) 2020-11-25 16:44:37 +02:00
allegroai
de332b9e6b Document '--stop' usage 2020-11-19 12:36:58 +02:00
allegroai
60eeff292d version bump 2020-11-11 17:11:51 +02:00
allegroai
52f30b306a Fix git diff with empty line at the end of the git diff will cause corrupt diff apply message 2020-11-11 17:11:28 +02:00
allegroai
6df0f81ca0 Fix uid is None causes ValueError in str.startswith(). Fix str.split (should be on the filename itself, not the path). 2020-11-11 16:32:47 +02:00
allegroai
40b3c1502d Add extra_bash_init_script to k8s glue. Default config is the raw config file (not created at runtime) 2020-11-11 16:31:25 +02:00
allegroai
a61265effe Improve trying to find conda executable 2020-11-11 16:29:50 +02:00
allegroai
92efea6b76 Add agent.package_manager.force_repo_requirements_txt. If True, "Installed Packages" on Task are ignored, and only repo requirements.txt is used 2020-11-11 16:29:00 +02:00
allegroai
216b3e2179 Allow to specifying cudatoolkit version in "installed packages" when using Conda as package manager (trains issue #229) 2020-10-30 10:06:02 +02:00
allegroai
293a92f486 Improve k8s glue add --template-yaml 2020-10-23 01:28:22 +03:00
allegroai
6bad2b5352 Fix support non-ascii git diff 2020-10-23 01:27:59 +03:00
allegroai
a09a638b9c Improve k8s glue layer 2020-10-22 18:09:56 +03:00
allegroai
24f57270ed version bump 2020-10-22 18:09:23 +03:00
allegroai
1b7964ce98 Add k8s select external trains.conf file for the pod itself 2020-10-21 19:04:38 +03:00
allegroai
5a510882b8 Ignore environment SSH_AUTH_SOCK. Only check if git_user/pass are configured, if they are not, leave the links as they are 2020-10-21 19:02:29 +03:00
allegroai
601ed03198 Add support for k8s pod custom user properties 2020-10-20 23:48:02 +03:00
allegroai
90fe4570b9 Show k8s pod number in task's User Properties configuration section 2020-10-20 23:27:04 +03:00
allegroai
92fc8e838f Add K8s glue support for limited number of services exposing ports 2020-10-20 14:17:30 +03:00
allegroai
89a3020c5e Fix ubuntu/debian support by making sure not to ask for input (fix tzdata install) 2020-10-15 23:32:17 +03:00
allegroai
fc3e47b67e Add suppress_carriage_return to documentation
Add docker_preprocess_bash_script to allow preprocessing bash to be added
Fix multiple python versions installed in the same docker by finding the highest installed python inside the docker
Fix conda_env_as_base_docker not set to False in docker mode
2020-10-15 23:31:01 +03:00
allegroai
b2a80ca314 Fix Trains examples references 2020-10-15 23:28:53 +03:00
allegroai
14655f19a0 Fix conda PYTHONPATH (point only to code, not to venv) 2020-10-15 23:26:58 +03:00
allegroai
47092c47db Fix apply git diff from submodule only 2020-10-15 23:26:52 +03:00
allegroai
8e6fce8d63 Add conda support for read-only pre-built environment (pass conda folder as docker_cmd on Task).
Fix conda restore prebuild tar.gz file, fix conda prefix by call conda-unpack from unzipped conda env.
2020-10-15 23:25:57 +03:00
allegroai
3c514e3418 Make sure TRAINS_AGENT_K8S_HOST_MOUNT is used only once per mount 2020-10-15 23:24:51 +03:00
allegroai
8a425b100b Fix k8s glue script to trains-agent default docker script 2020-10-15 23:24:21 +03:00
allegroai
eb942cfedd Add agent.package_manager.conda_env_as_base_docker allowing "docker_cmd" to contain link to a full pre-packaged conda environment (conda-pack outputs a tar.gz). Use TRAINS_CONDA_ENV_PACKAGE to specify conda tar.gz file. 2020-10-15 23:23:46 +03:00
Allegro AI
0a7fc06108 Merge pull request #31 from eliorc/master
Fix broken links in README.md
2020-10-14 16:13:40 +03:00
Elior Cohen
0ae35afa76 📝 Broken links in README.md 2020-10-14 10:43:33 +03:00
allegroai
a2156e73bf Fix conda pip freeze to be consistent with trains 0.16.3 2020-10-11 11:25:35 +03:00
allegroai
9fe77f3c28 Fix conda environment support for trains 0.16.3 full env. Add agent.package_manager.conda_full_env_update to allow conda to update back the requirements (default is false, to preserve previous behavior) 2020-10-11 11:24:52 +03:00
allegroai
6f078afafd Add Requirement.clone() 2020-10-11 11:21:49 +03:00
allegroai
15f4aa613e Suppress "\r" when reading a current chunk of a file. Add agent.suppress_carriage_return (default True) to support previous behavior. 2020-10-11 11:21:08 +03:00
allegroai
7cd9fa6c41 Version bump to v0.16.1 2020-10-05 18:27:07 +03:00
allegroai
234d5fac2c When using force ssh protocol, only enforce on git_host if provided, otherwise apply everywhere 2020-10-05 18:26:21 +03:00
allegroai
6cbfb96ff8 Rename git_domain to git_host 2020-10-05 11:25:03 +03:00
allegroai
6e54e55c31 Add agent.force_git_ssh_port to control https to ssh link conversion for non standard ssh port 2020-10-04 19:42:44 +03:00
allegroai
3ff85b7b85 Replace back package version on conda and pip 2020-10-04 19:41:26 +03:00
allegroai
5640489f57 Replace torch version on pre-installed local file 2020-10-04 19:40:39 +03:00
allegroai
8135a6facf Add agent.git_domain setting for limiting git credential usage for a specific domain (env var TRAINS_AGENT_GIT_DOMAIN is also supported) 2020-10-04 19:40:04 +03:00
allegroai
b6ae4f211d Fix "package @ " should processed by us (pip will not test pre-installed version of the package compared with the link) 2020-10-04 19:38:33 +03:00
allegroai
a56f032ec4 Fix torch support to not change back the same link 2020-10-04 19:37:12 +03:00
allegroai
075736de20 Translate downloaded URL back to original link when new pip version is installed (otherwise we end up with file:///... links) 2020-10-04 19:36:14 +03:00
allegroai
d8543c892e When new pip version is installed, no need to install git packages twice (pip freeze will detect the correct git link version) 2020-10-04 19:35:26 +03:00
allegroai
ca0870b048 Allow parsing of "package @ scheme://link" lines in requirements 2020-10-04 19:34:32 +03:00
allegroai
c7a739fafa Add support for detecting new pip version (20+) supporting @ in requirements 2020-10-04 19:33:52 +03:00
allegroai
7170296162 Remove warning on '.' (same as an empty working directory) 2020-10-04 19:32:48 +03:00
allegroai
3bed0ef33c Add protection against bad file name parsing in git diff apply 2020-10-04 19:31:48 +03:00
allegroai
d419fa1e4f Update torch version after using system pre-installed version 2020-10-04 19:29:47 +03:00
allegroai
31a56c71bd Add preliminary agent uptime/downtime support 2020-09-29 19:34:51 +03:00
allegroai
28f47419b0 Fix incorrect check for spaces in current execution folder (only check in cache folders) 2020-09-15 20:26:02 +03:00
allegroai
6a24da2849 Add post_packages post_optional_packages to control packages installed after all the rest (e.g. horovod)
Rename CythonReq to PriorityPackageRequirement and HorovodReq to PostRequirement
2020-09-15 20:20:55 +03:00
allegroai
782668fd21 Add sdk.metrics.plot_max_num_digits to reduce plot storage size 2020-09-05 16:37:17 +03:00
allegroai
aaf8d802e7 Update documentation 2020-09-05 16:37:17 +03:00
allegroai
ca89a1e322 Fix pre-installed packages are ignored when installing a git package wheel. Reinstalling a git+http link is enough to make sure all requirements are met/installed (trains issue #196) 2020-09-05 16:37:17 +03:00
allegroai
121dec2a62 Version bump to v0.16.0 2020-08-10 17:28:00 +03:00
allegroai
4aacf9005e Fix GPU Windows monitoring support (Trains Issue #177) 2020-08-10 08:07:51 +03:00
allegroai
6b333202e9 Sync generated conf file with latest Trains 2020-08-08 14:44:45 +03:00
allegroai
ce6831368f Fix GPU monitoring on Windows machines 2020-08-08 14:43:25 +03:00
allegroai
e4111c830b Fix GIT user/pass in requirements and support for '-e git+http' lines 2020-07-30 14:30:23 +03:00
allegroai
52c1772b04 Add requirement_parser into trains-agent instead as a dependency. Fix requirement_parser to support 'package @ git+http' lines 2020-07-30 14:29:37 +03:00
allegroai
699d13bbb3 Fix task status change to queued should also never happen during Task runtime 2020-07-14 23:42:11 +03:00
allegroai
2c8d7d3d9a Fix --debug to set all specified loggers to DEBUG
Add set_urllib_log_level, in debug set urllib log level to DEBUG
2020-07-11 01:45:46 +03:00
allegroai
b13cc1e8e7 Add error message when Trains API Server is not accessible on startup 2020-07-11 01:44:45 +03:00
allegroai
17d2bf2a3e Change daemon --stop without any specific flag to terminate the agents by worker id lexicographic order 2020-07-11 01:43:54 +03:00
allegroai
94997f9c88 Add daemon --order-fairness for round-robin queue pulling
Add daemon --stop to terminate running agent (assume all the rest of the arguments are the same)
Clean up all log files on termination unless executed with --debug
2020-07-11 01:42:56 +03:00
allegroai
c6d998c4df Add terminate process and rmtree utilities 2020-07-11 01:40:50 +03:00
allegroai
f8ea445339 Fix docker to use UTF-8 encoding, so prints won't break it 2020-07-11 01:40:14 +03:00
146 changed files with 8853 additions and 3308 deletions

287
README.md
View File

@@ -1,80 +1,106 @@
# Allegro Trains Agent
## Deep Learning DevOps For Everyone - Now supporting all platforms (Linux, macOS, and Windows)
<div align="center">
"All the Deep-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_agent_logo.png?raw=true" width="250px">
**ClearML Agent - ML-Ops made easy
ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
[![GitHub license](https://img.shields.io/github/license/allegroai/trains-agent.svg)](https://img.shields.io/github/license/allegroai/trains-agent.svg)
[![PyPI pyversions](https://img.shields.io/pypi/pyversions/trains-agent.svg)](https://img.shields.io/pypi/pyversions/trains-agent.svg)
[![PyPI version shields.io](https://img.shields.io/pypi/v/trains-agent.svg)](https://img.shields.io/pypi/v/trains-agent.svg)
[![PyPI status](https://img.shields.io/pypi/status/trains-agent.svg)](https://pypi.python.org/pypi/trains-agent/)
[![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml-agent.svg)](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
[![PyPI version shields.io](https://img.shields.io/pypi/v/clearml-agent.svg)](https://img.shields.io/pypi/v/clearml-agent.svg)
### Help improve Trains by filling our 2-min [user survey](https://allegro.ai/lp/trains-user-survey/)
</div>
**Trains Agent is an AI experiment cluster solution.**
---
It is a zero configuration fire-and-forget execution agent, which combined with trains-server provides a full AI cluster solution.
### ClearML-Agent
#### *Formerly known as Trains Agent*
**Full AutoML in 5 steps**
1. Install the [Trains Server](https://github.com/allegroai/trains-agent) (or use our [open server](https://demoapp.trains.allegro.ai))
2. `pip install trains-agent` ([install](#installing-the-trains-agent) the Trains Agent on any GPU machine: on-premises / cloud / ...)
3. Add [Trains](https://github.com/allegroai/trains) to your code with just 2 lines & run it once (on your machine / laptop)
4. Change the [parameters](#using-the-trains-agent) in the UI & schedule for [execution](#using-the-trains-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
* Run jobs (experiments) on any local or cloud based resource
* Implement optimized resource utilization policies
* Deploy execution environments with either virtualenv or fully docker containerized with zero effort
* Launch-and-Forget service containers
* [Cloud autoscaling](https://allegro.ai/clearml/docs/docs/examples/services/aws_autoscaler/aws_autoscaler.html)
* [Customizable cleanup](https://allegro.ai/clearml/docs/docs/examples/services/cleanup/cleanup_service.html)
* Advanced [pipeline building and execution](https://allegro.ai/clearml/docs/docs/examples/frameworks/pytorch/notebooks/table/tabular_training_pipeline.html)
It is a zero configuration fire-and-forget execution agent, providing a full ML/DL cluster solution.
**Full Automation in 5 steps**
1. ClearML Server [self-hosted](https://github.com/allegroai/trains-server) or [free tier hosting](https://app.community.clear.ml)
2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine: on-premises / cloud / ...)
3. Create a [job](https://github.com/allegroai/clearml/docs/clearml-task.md) or Add [ClearML](https://github.com/allegroai/trains) to your code with just 2 lines
4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
5. :chart_with_downwards_trend: :chart_with_upwards_trend: :eyes: :beer:
"All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
**Using the Trains Agent, you can now set up a dynamic cluster with \*epsilon DevOps**
**Try ClearML now** [Self Hosted](https://github.com/allegroai/trains-server) or [Free tier Hosting](https://app.community.clear.ml)
<a href="https://app.community.clear.ml"><img src="https://raw.githubusercontent.com/allegroai/trains-agent/9f1e86c1ca45c984ee13edc9353c7b10c55d7257/docs/screenshots.gif" width="100%"></a>
*epsilon - Because we are scientists :triangular_ruler: and nothing is really zero work
(Experience Trains live at [https://demoapp.trains.allegro.ai](https://demoapp.trains.allegro.ai))
<a href="https://demoapp.trains.allegro.ai"><img src="https://raw.githubusercontent.com/allegroai/trains-agent/9f1e86c1ca45c984ee13edc9353c7b10c55d7257/docs/screenshots.gif" width="100%"></a>
## Simple, Flexible Experiment Orchestration
**The Trains Agent was built to address the DL/ML R&D DevOps needs:**
### Simple, Flexible Experiment Orchestration
**The ClearML Agent was built to address the DL/ML R&D DevOps needs:**
* Easily add & remove machines from the cluster
* Reuse machines without the need for any dedicated containers or images
* **Combine GPU resources across any cloud and on-prem**
* **No need for yaml/json/template configuration of any kind**
* **No need for yaml / json / template configuration of any kind**
* **User friendly UI**
* Manageable resource allocation that can be used by researchers and engineers
* Flexible and controllable scheduler with priority support
* Automatic instance spinning in the cloud **(coming soon)**
* Automatic instance spinning in the cloud
**Using the ClearML Agent, you can now set up a dynamic cluster with \*epsilon DevOps**
*epsilon - Because we are :triangular_ruler: and nothing is really zero work
## But ... K8S?
We think Kubernetes is awesome.
Combined with KubeFlow it is a robust solution for production-grade DevOps.
We've observed, however, that it can be a bit of an overkill as an R&D DL/ML solution.
If you are considering K8S for your research, also consider that you will soon be managing **hundreds** of containers...
### Kubernetes Integration (Optional)
We think Kubernetes is awesome, but it should be a choice.
We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
#### Benefits of integrating existing K8s with ClearML-Agent
- ClearML-Agent adds the missing scheduling capabilities to K8s
- Allowing for more flexible automation from code
- A programmatic interface for easier learning curve (and debugging)
- Seamless integration with ML/DL experiment manager
- Web UI for customization, scheduling & prioritization of jobs
In our experience, handling and building the environments, having to package every experiment in a docker, managing those hundreds (or more) containers and building pipelines on top of it all, is very complicated (also, its usually out of scope for the research team, and overwhelming even for the DevOps team).
**Two K8s integration flavours**
- Spin ClearML-Agent as a long-lasting service pod
- use [clearml-agent](https://hub.docker.com/r/allegroai/trains-agent) docker image
- map docker socket into the pod (soon replaced by [podman](https://github.com/containers/podman))
- allow the clearml-agent to manage sibling dockers
- benefits: full use of the ClearML scheduling, no need to worry about wrong container images / lost pods etc.
- downside: Sibling containers
- Kubernetes Glue, map ClearML jobs directly to K8s jobs
- Run the [clearml-k8s glue](https://github.com/allegroai/trains-agent/blob/master/examples/k8s_glue_example.py) on a K8s cpu node
- The clearml-k8s glue pulls jobs from the ClearML job execution queue and prepares a K8s job (based on provided yaml template)
- Inside the pod itself the clearml-agent will install the job (experiment) environment and spin and monitor the experiment's process
- benefits: Kubernetes full view of all running jobs in the system
- downside: No real scheduling (k8s scheduler), no docker image verification (post-mortem only)
We feel there has to be a better way, that can be just as powerful for R&D and at the same time allow integration with K8S **when the need arises**.
(If you already have a K8S cluster for AI, detailed instructions on how to integrate Trains into your K8S cluster are [here](https://github.com/allegroai/trains-server-k8s/tree/master/trains-server-chart) with included [helm chart](https://github.com/allegroai/trains-server-helm))
## Using the Trains Agent
### Using the ClearML Agent
**Full scale HPC with a click of a button**
The Trains Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the job and monitors its progress.
The ClearML Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the job and monitors its progress.
Any 'Draft' experiment can be scheduled for execution by a Trains agent.
Any 'Draft' experiment can be scheduled for execution by a ClearML agent.
A previously run experiment can be put into 'Draft' state by either of two methods:
* Using the **'Reset'** action from the experiment right-click context menu in the
Trains UI - This will clear any results and artifacts the previous run had created.
ClearML UI - This will clear any results and artifacts the previous run had created.
* Using the **'Clone'** action from the experiment right-click context menu in the
Trains UI - This will create a new 'Draft' experiment with the same configuration as the original experiment.
ClearML UI - This will create a new 'Draft' experiment with the same configuration as the original experiment.
An experiment is scheduled for execution using the **'Enqueue'** action from the experiment
right-click context menu in the Trains UI and selecting the execution queue.
right-click context menu in the ClearML UI and selecting the execution queue.
See [creating an experiment and enqueuing it for execution](#from-scratch).
Once an experiment is enqueued, it will be picked up and executed by a Trains agent monitoring this queue.
Once an experiment is enqueued, it will be picked up and executed by a ClearML agent monitoring this queue.
The Trains UI Workers & Queues page provides ongoing execution information:
The ClearML UI Workers & Queues page provides ongoing execution information:
- Workers Tab: Monitor you cluster
- Review available resources
- Monitor machines statistics (CPU / GPU / Disk / Network)
@@ -83,154 +109,129 @@ The Trains UI Workers & Queues page provides ongoing execution information:
- Cancel or abort job execution
- Move jobs between execution queues
### What The Trains Agent Actually Does
The Trains Agent executes experiments using the following process:
#### What The ClearML Agent Actually Does
The ClearML Agent executes experiments using the following process:
- Create a new virtual environment (or launch the selected docker image)
- Clone the code into the virtual-environment (or inside the docker)
- Install python packages based on the package requirements listed for the experiment
- Special note for PyTorch: The Trains Agent will automatically select the
- Special note for PyTorch: The ClearML Agent will automatically select the
torch packages based on the CUDA_VERSION environment variable of the machine
- Execute the code, while monitoring the process
- Log all stdout/stderr in the Trains UI, including the cloning and installation process, for easy debugging
- Monitor the execution and allow you to manually abort the job using the Trains UI (or, in the unfortunate case of a code crash, catch the error and signal the experiment has failed)
- Log all stdout/stderr in the ClearML UI, including the cloning and installation process, for easy debugging
- Monitor the execution and allow you to manually abort the job using the ClearML UI (or, in the unfortunate case of a code crash, catch the error and signal the experiment has failed)
### System Design & Flow
```text
+-----------------+
| GPU Machine |
Development Machine | |
+------------------------+ | +-------------+ |
| Data Scientist's | +--------------+ | |Trains Agent | |
| DL/ML Code | | WEB UI | | | | |
| | | | | | +---------+ | |
| | | | | | | DL/ML | | |
| | +--------------+ | | | Code | | |
| | User Clones Exp #1 / . . . . . . . / | | | | | |
| +-------------------+ | into Exp #2 / . . . . . . . / | | +---------+ | |
| | Trains | | +---------------/-_____________-/ | | | |
| +---------+---------+ | | | | ^ | |
+-----------|------------+ | | +------|------+ |
| | +--------|--------+
Auto-Magically | |
Creates Exp #1 | The Trains Agent
\ User Change Hyper-Parameters Pulls Exp #2, setup the
| | environment & clone code.
| | Start execution with the
+------------|------------+ | +--------------------+ new set of Hyper-Parameters.
| +---------v---------+ | | | Trains Server | |
| | Experiment #1 | | | | | |
| +-------------------+ | | | Execution Queue | |
| || | | | | |
| +-------------------+<----------+ | | |
| | | | | | |
| | Experiment #2 | | | | |
| +-------------------<------------\ | | |
| | ------------->---------------+ | |
| | User Send Exp #2 | |Execute Exp #2 +--------------------+
| | For Execution | +---------------+ |
| Trains Server | | |
+-------------------------+ +--------------------+
```
#### System Design & Flow
### Installing the Trains Agent
<img src="https://allegro.ai/clearml/docs/_images/ClearML_Architecture.png" width="100%" alt="clearml-architecture">
#### Installing the ClearML Agent
```bash
pip install trains-agent
pip install clearml-agent
```
### Trains Agent Usage Examples
#### ClearML Agent Usage Examples
Full Interface and capabilities are available with
```bash
trains-agent --help
trains-agent daemon --help
clearml-agent --help
clearml-agent daemon --help
```
### Configuring the Trains Agent
#### Configuring the ClearML Agent
```bash
trains-agent init
clearml-agent init
```
Note: The Trains Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default Trains Agent cache folder is `~/.trains`
Note: The ClearML Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default ClearML Agent cache folder is `~/.clearml`
See full details in your configuration file at `~/trains.conf`
See full details in your configuration file at `~/clearml.conf`
Note: The **Trains agent** extends the **Trains** configuration file `~/trains.conf`
They are designed to share the same configuration file, see example [here](docs/trains.conf)
Note: The **ClearML agent** extends the **ClearML** configuration file `~/clearml.conf`
They are designed to share the same configuration file, see example [here](docs/clearml.conf)
### Running the Trains Agent
#### Running the ClearML Agent
For debug and experimentation, start the Trains agent in `foreground` mode, where all the output is printed to screen
For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen
```bash
trains-agent daemon --queue default --foreground
clearml-agent daemon --queue default --foreground
```
For actual service mode, all the stdout will be stored automatically into a temporary file (no need to pipe)
Notice: with `--detached` flag, the *trains-agent* will be running in the background
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
```bash
trains-agent daemon --detached --queue default
clearml-agent daemon --detached --queue default
```
GPU allocation is controlled via the standard OS environment `NVIDIA_VISIBLE_DEVICES` or `--gpus` flag (or disabled with `--cpu-only`).
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for the `trains-agent` <br>
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for the `trains-agent`
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for the `clearml-agent` <br>
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for the `clearml-agent`
Example: spin two agents, one per gpu on the same machine:
Notice: with `--detached` flag, the *trains-agent* will be running in the background
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
```bash
trains-agent daemon --detached --gpus 0 --queue default
trains-agent daemon --detached --gpus 1 --queue default
clearml-agent daemon --detached --gpus 0 --queue default
clearml-agent daemon --detached --gpus 1 --queue default
```
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent
```bash
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu
```
#### Starting the Trains Agent in docker mode
##### Starting the ClearML Agent in docker mode
For debug and experimentation, start the Trains agent in `foreground` mode, where all the output is printed to screen
For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen
```bash
trains-agent daemon --queue default --docker --foreground
clearml-agent daemon --queue default --docker --foreground
```
For actual service mode, all the stdout will be stored automatically into a file (no need to pipe)
Notice: with `--detached` flag, the *trains-agent* will be running in the background
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
```bash
trains-agent daemon --detached --queue default --docker
clearml-agent daemon --detached --queue default --docker
```
Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda docker:
```bash
trains-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
trains-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
```
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent, with default nvidia/cuda docker:
```bash
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
```
#### Starting the Trains Agent - Priority Queues
##### Starting the ClearML Agent - Priority Queues
Priority Queues are also supported, example use case:
High priority queue: `important_jobs` Low priority queue: `default`
```bash
trains-agent daemon --queue important_jobs default
clearml-agent daemon --queue important_jobs default
```
The **Trains Agent** will first try to pull jobs from the `important_jobs` queue, only then it will fetch a job from the `default` queue.
The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, only then it will fetch a job from the `default` queue.
Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [open server](https://demoapp.trains.allegro.ai/workers-and-queues/queues)
Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [free server](https://app.community.clear.ml/workers-and-queues/queues)
## How do I create an experiment on the Trains Server? <a name="from-scratch"></a>
* Integrate [Trains](https://github.com/allegroai/trains) with your code
##### Stopping the ClearML Agent
To stop a **ClearML Agent** running in the background, run the same command line used to start the agent with `--stop` appended.
For example, to stop the first of the above shown same machine, single gpu agents:
```bash
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda --stop
```
### How do I create an experiment on the ClearML Server? <a name="from-scratch"></a>
* Integrate [ClearML](https://github.com/allegroai/trains) with your code
* Execute the code on your machine (Manually / PyCharm / Jupyter Notebook)
* As your code is running, **Trains** creates an experiment logging all the necessary execution information:
* As your code is running, **ClearML** creates an experiment logging all the necessary execution information:
- Git repository link and commit ID (or an entire jupyter notebook)
- Git diff (were not saying you never commit and push, but still...)
- Python packages used by your code (including specific versions used)
@@ -239,7 +240,7 @@ Adding queues, managing job order within a queue and moving jobs between queues,
You now have a 'template' of your experiment with everything required for automated execution
* In the Trains UI, Right click on the experiment and select 'clone'. A copy of your experiment will be created.
* In the ClearML UI, Right click on the experiment and select 'clone'. A copy of your experiment will be created.
* You now have a new draft experiment cloned from your original experiment, feel free to edit it
- Change the Hyper-Parameters
- Switch to the latest code base of the repository
@@ -248,44 +249,44 @@ Adding queues, managing job order within a queue and moving jobs between queues,
- Or simply change nothing to run the same experiment again...
* Schedule the newly created experiment for execution: Right-click the experiment and select 'enqueue'
## Trains-Agent Services Mode <a name="services"></a>
### ClearML-Agent Services Mode <a name="services"></a>
Trains-Agent Services is a special mode of Trains-Agent that provides the ability to launch long-lasting jobs
that previously had to be executed on local / dedicated machines. It allows a single agent to
launch multiple dockers (Tasks) for different use cases. To name a few use cases, auto-scaler service (spinning instances
ClearML-Agent Services is a special mode of ClearML-Agent that provides the ability to launch long-lasting jobs
that previously had to be executed on local / dedicated machines. It allows a single agent to
launch multiple dockers (Tasks) for different use cases. To name a few use cases, auto-scaler service (spinning instances
when the need arises and the budget allows), Controllers (Implementing pipelines and more sophisticated DevOps logic),
Optimizer (such as Hyper-parameter Optimization or sweeping), and Application (such as interactive Bokeh apps for
Optimizer (such as Hyper-parameter Optimization or sweeping), and Application (such as interactive Bokeh apps for
increased data transparency)
Trains-Agent Services mode will spin **any** task enqueued into the specified queue.
Every task launched by Trains-Agent Services will be registered as a new node in the system,
providing tracking and transparency capabilities.
Currently trains-agent in services-mode supports cpu only configuration. Trains-agent services mode can be launched alongside GPU agents.
ClearML-Agent Services mode will spin **any** task enqueued into the specified queue.
Every task launched by ClearML-Agent Services will be registered as a new node in the system,
providing tracking and transparency capabilities.
Currently clearml-agent in services-mode supports cpu only configuration. ClearML-agent services mode can be launched alongside GPU agents.
```bash
trains-agent daemon --services-mode --detached --queue services --create-queue --docker ubuntu:18.04 --cpu-only
clearml-agent daemon --services-mode --detached --queue services --create-queue --docker ubuntu:18.04 --cpu-only
```
**Note**: It is the user's responsibility to make sure the proper tasks are pushed into the specified queue.
**Note**: It is the user's responsibility to make sure the proper tasks are pushed into the specified queue.
## AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
The Trains Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the Trains package.
### AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
The ClearML Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the ClearML package.
Sample AutoML & Orchestration examples can be found in the Trains [example/automl](https://github.com/allegroai/trains/tree/master/examples/automl) folder.
Sample AutoML & Orchestration examples can be found in the ClearML [example/automation](https://github.com/allegroai/trains/tree/master/examples/automation) folder.
AutoML examples
- [Toy Keras training experiment](https://github.com/allegroai/trains/blob/master/examples/automl/automl_base_template_keras_simple.py)
- [Toy Keras training experiment](https://github.com/allegroai/trains/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
- In order to create an experiment-template in the system, this code must be executed once manually
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/trains/blob/master/examples/automl/automl_random_search_example.py)
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/trains/blob/master/examples/automation/manual_random_param_search_example.py)
- This example will create multiple copies of the Keras experiment-template, with different hyper-parameter combinations
Experiment Pipeline examples
- [First step experiment](https://github.com/allegroai/trains/blob/master/examples/automl/task_piping_example.py)
- [First step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/task_piping_example.py)
- This example will "process data", and once done, will launch a copy of the 'second step' experiment-template
- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automl/toy_base_task.py)
- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/toy_base_task.py)
- In order to create an experiment-template in the system, this code must be executed once manually
## License
### License
Apache License, Version 2.0 (see the [LICENSE](https://www.apache.org/licenses/LICENSE-2.0.html) for more information)

View File

@@ -4,13 +4,13 @@ import argparse
import sys
import warnings
from trains_agent.backend_api.session.datamodel import UnusedKwargsWarning
from clearml_agent.backend_api.session.datamodel import UnusedKwargsWarning
import trains_agent
from trains_agent.config import get_config
from trains_agent.definitions import FileBuffering, CONFIG_FILE
from trains_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
from trains_agent.helper.process import ExitStatus
import clearml_agent
from clearml_agent.config import get_config
from clearml_agent.definitions import FileBuffering, CONFIG_FILE
from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
from clearml_agent.helper.process import ExitStatus
from . import interface, session, definitions, commands
from .errors import ConfigFileNotFound, Sigterm, APIError
from .helper.trace import PackageTrace
@@ -47,7 +47,7 @@ def run_command(parser, args, command_name):
except ConfigFileNotFound:
message = 'Cannot find configuration file in "{}".\n' \
'To create a configuration file, run:\n' \
'$ trains_agent init'.format(reverse_home_folder_expansion(CONFIG_FILE))
'$ clearml_agent init'.format(reverse_home_folder_expansion(CONFIG_FILE))
command_class.exit(message)
except APIError as api_error:
if not debug:

View File

@@ -1,25 +1,30 @@
{
# unique name of this worker, if None, created based on hostname:process_id
# Override with os environment: TRAINS_WORKER_ID
# worker_id: "trains-agent-machine1:gpu0"
# Override with os environment: CLEARML_WORKER_ID
# worker_id: "clearml-agent-machine1:gpu0"
worker_id: ""
# worker name, replaces the hostname when creating a unique name for this worker
# Override with os environment: TRAINS_WORKER_NAME
# worker_name: "trains-agent-machine1"
# Override with os environment: CLEARML_WORKER_NAME
# worker_name: "clearml-agent-machine1"
worker_name: ""
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
# git_user: ""
# git_pass: ""
# git_host: ""
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
force_git_ssh_protocol: false
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
# force_git_ssh_port: 0
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
# force_git_ssh_user: git
# Set the python version to use when creating the virtual environment and launching the experiment
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
# The default is the python executing the trains_agent
# The default is the python executing the clearml_agent
python_binary: ""
# select python package manager:
@@ -39,23 +44,53 @@
force_upgrade: false,
# additional artifact repositories to use when installing python packages
# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]
# extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
# additional conda channels to use when installing with conda package manager
conda_channels: ["defaults", "conda-forge", "pytorch", ]
# If set to true, Task's "installed packages" are ignored,
# and the repository's "requirements.txt" is used instead
# force_repo_requirements_txt: false
# set the priority packages to be installed before the rest of the required packages
# priority_packages: ["cython", "numpy", "setuptools", ]
# set the optional priority packages to be installed before the rest of the required packages,
# In case a package installation fails, the package will be ignored,
# and the virtual environment process will continue
# priority_optional_packages: ["pygobject", ]
# set the post packages to be installed after all the rest of the required packages
# post_packages: ["horovod", ]
# set the optional post packages to be installed after all the rest of the required packages,
# In case a package installation fails, the package will be ignored,
# and the virtual environment process will continue
# post_optional_packages: []
# set to True to support torch nightly build installation,
# notice: torch nightly builds are ephemeral and are deleted from time to time
torch_nightly: false,
},
# target folder for virtual environments builds, created when executing experiment
venvs_dir = ~/.trains/venvs-builds
venvs_dir = ~/.clearml/venvs-builds
# cached virtual environment folder
venvs_cache: {
# maximum number of cached venvs
max_entries: 10
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
free_space_threshold_gb: 2.0
# unmark to enable virtual environment caching
# path: ~/.clearml/venvs-cache
},
# cached git clone folder
vcs_cache: {
enabled: true,
path: ~/.trains/vcs-cache
path: ~/.clearml/vcs-cache
},
# use venv-update in order to accelerate python virtual environment building
@@ -67,7 +102,7 @@
# cached folder for specific python package download (used for pytorch package caching)
pip_download_cache {
enabled: true,
path: ~/.trains/pip-download-cache
path: ~/.clearml/pip-download-cache
},
translate_ssh: true,
@@ -75,9 +110,9 @@
reload_config: false,
# pip cache folder mapped into docker, used for python package caching
docker_pip_cache = ~/.trains/pip-cache
docker_pip_cache = ~/.clearml/pip-cache
# apt cache folder mapped into docker, used for ubuntu package caching
docker_apt_cache = ~/.trains/apt-cache
docker_apt_cache = ~/.clearml/apt-cache
# optional arguments to pass to docker image
# these are local for this agent and will not be updated in the experiment's docker_cmd section
@@ -86,18 +121,37 @@
# optional shell script to run in docker when started before the experiment is started
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
# Outside of the specified time-spans, the agent will be idle.
# Defined using a list of items of the format: "<hours> <days>".
# hours - use values 0-23, single values would count as start hour and end at midnight.
# days - use days in abbreviated format (SUN-SAT)
# use '-' for ranges and ',' to separate singular values.
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
# uptime: ["17-20 SUN,TUE"]
# optional downtime configuration, can be used only when uptime is not used.
# If downtime is specified, agent will be idle in the time-spans defined here.
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
# Use the same format as described above for uptime
# downtime: []
# set to true in order to force "docker pull" before running an experiment using a docker image.
# This makes sure the docker image is updated.
docker_force_pull: false
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host", ]
}
# set the OS environments based on the Task's Environment section before launching the Task process.
enable_task_env: false
# set the initial bash script to execute at the startup of any docker.
# all lines will be executed regardless of their exit code.
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
@@ -109,6 +163,16 @@
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
# ]
# set the preprocessing bash script to execute at the startup of any docker.
# all lines will be executed regardless of their exit code.
# docker_preprocess_bash_script = [
# "echo \"starting docker\"",
#]
# If False replace \r with \n and display full console output
# default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
# suppress_carriage_return: true
# cuda versions used for solving pytorch wheel packages
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
# cuda_version: 10.1

View File

@@ -1,10 +1,10 @@
{
# TRAINS - default SDK configuration
# ClearML - default SDK configuration
storage {
cache {
# Defaults to system temp folder / cache
default_base_dir: "~/.trains/cache"
default_base_dir: "~/.clearml/cache"
size {
# max_used_bytes = -1
min_free_bytes = 10GB
@@ -31,12 +31,18 @@
# X images are stored in the upload destination for each matplotlib plot title.
matplotlib_untitled_history_size: 100
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
# plot_max_num_digits: 5
# Settings for generated debug images
images {
format: JPEG
quality: 87
subsampling: 0
}
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
tensorboard_single_series_per_graph: false
}
network {
@@ -92,7 +98,7 @@
google.storage {
# # Default project and credentials file
# # Will be used when no bucket configuration is found
# project: "trains"
# project: "clearml"
# credentials_json: "/path/to/credentials.json"
# # Specific credentials per bucket and sub directory
@@ -100,7 +106,7 @@
# {
# bucket: "my-bucket"
# subdir: "path/in/bucket" # Not required
# project: "trains"
# project: "clearml"
# credentials_json: "/path/to/credentials.json"
# },
# ]
@@ -108,7 +114,7 @@
azure.storage {
# containers: [
# {
# account_name: "trains"
# account_name: "clearml"
# account_key: "secret"
# # container_name:
# }
@@ -117,11 +123,11 @@
log {
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
null_log_propagate: False
null_log_propagate: false
task_log_buffer_capacity: 66
# disable urllib info and lower levels
disable_urllib3_info: True
disable_urllib3_info: true
}
development {
@@ -131,14 +137,30 @@
task_reuse_time_window_in_hours: 72.0
# Run VCS repository detection asynchronously
vcs_repo_detect_async: True
vcs_repo_detect_async: true
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
store_uncommitted_code_diff_on_train: True
store_uncommitted_code_diff: true
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
support_stopping: True
support_stopping: true
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
default_output_uri: ""
# Default auto generated requirements optimize for smaller requirements
# If True, analyze the entire repository regardless of the entry point.
# If False, first analyze the entry point script, if it does not contain other to local files,
# do not analyze the entire repository.
force_analyze_entire_repo: false
# If set to true, *clearml* update message will not be printed to the console
# this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
suppress_update_message: false
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
detect_with_pip_freeze: false
# Development mode worker
worker {
@@ -149,7 +171,11 @@
ping_period_sec: 30
# Log all stdout & stderr
log_stdout: True
log_stdout: true
# compatibility feature, report memory usage for the entire machine
# default (false), report only on the running process and its sub-processes
report_global_mem_used: false
}
}
}
}

View File

@@ -106,15 +106,15 @@ class StrictSession(Session):
init()
return
original = os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR, None)
original = LOCAL_CONFIG_FILE_OVERRIDE_VAR.get() or None
try:
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = str(config_file)
LOCAL_CONFIG_FILE_OVERRIDE_VAR.set(str(config_file))
init()
finally:
if original is None:
os.environ.pop(LOCAL_CONFIG_FILE_OVERRIDE_VAR, None)
LOCAL_CONFIG_FILE_OVERRIDE_VAR.pop()
else:
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = original
LOCAL_CONFIG_FILE_OVERRIDE_VAR.set(original)
def send(self, request, *args, **kwargs):
result = super(StrictSession, self).send(request, *args, **kwargs)
@@ -222,7 +222,7 @@ class TableResponse(Response):
return "" if result is None else result
fields = fields or self.fields
from trains_agent.helper.base import create_table
from clearml_agent.helper.base import create_table
return create_table(
(dict((attr, getter(item, attr)) for attr in fields) for item in self),
titles=fields, columns=fields, headers=True,

View File

@@ -0,0 +1,11 @@
from ...backend_config.environment import EnvEntry
ENV_HOST = EnvEntry("CLEARML_API_HOST", "TRAINS_API_HOST")
ENV_WEB_HOST = EnvEntry("CLEARML_WEB_HOST", "TRAINS_WEB_HOST")
ENV_FILES_HOST = EnvEntry("CLEARML_FILES_HOST", "TRAINS_FILES_HOST")
ENV_ACCESS_KEY = EnvEntry("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY")
ENV_SECRET_KEY = EnvEntry("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
ENV_VERBOSE = EnvEntry("CLEARML_API_VERBOSE", "TRAINS_API_VERBOSE", type=bool, default=False)
ENV_HOST_VERIFY_CERT = EnvEntry("CLEARML_API_HOST_VERIFY_CERT", "TRAINS_API_HOST_VERIFY_CERT", type=bool, default=True)
ENV_CONDA_ENV_PACKAGE = EnvEntry("CLEARML_CONDA_ENV_PACKAGE", "TRAINS_CONDA_ENV_PACKAGE")

View File

@@ -29,24 +29,25 @@ class MaxRequestSizeError(Exception):
class Session(TokenManager):
""" TRAINS API Session class. """
""" ClearML API Session class. """
_AUTHORIZATION_HEADER = "Authorization"
_WORKER_HEADER = "X-Trains-Worker"
_ASYNC_HEADER = "X-Trains-Async"
_CLIENT_HEADER = "X-Trains-Agent"
_WORKER_HEADER = ("X-ClearML-Worker", "X-Trains-Worker", )
_ASYNC_HEADER = ("X-ClearML-Async", "X-Trains-Async", )
_CLIENT_HEADER = ("X-ClearML-Agent", "X-Trains-Agent", )
_async_status_code = 202
_session_requests = 0
_session_initial_timeout = (3.0, 10.)
_session_timeout = (10.0, 30.)
_session_initial_connect_retry = 4
_write_session_data_size = 15000
_write_session_timeout = (30.0, 30.)
api_version = '2.1'
default_host = "https://demoapi.trains.allegro.ai"
default_web = "https://demoapp.trains.allegro.ai"
default_files = "https://demofiles.trains.allegro.ai"
default_host = "https://demoapi.demo.clear.ml"
default_web = "https://demoapp.demo.clear.ml"
default_files = "https://demofiles.demo.clear.ml"
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
@@ -96,7 +97,7 @@ class Session(TokenManager):
else:
self.config = load()
if initialize_logging:
self.config.initialize_logging()
self.config.initialize_logging(debug=kwargs.get('debug', False))
token_expiration_threshold_sec = self.config.get(
"auth.token_expiration_threshold_sec", 60
@@ -134,7 +135,6 @@ class Session(TokenManager):
"api.http.retries", ConfigTree()
).as_plain_ordered_dict()
http_retries_config["status_forcelist"] = self._retry_codes
self.__http_session = get_http_session_with_retry(**http_retries_config)
self.__worker = worker or gethostname()
@@ -144,7 +144,14 @@ class Session(TokenManager):
self.client = client or "api-{}".format(__version__)
# limit the reconnect retries, so we get an error if we are starting the session
http_no_retries_config = dict(**http_retries_config)
http_no_retries_config['connect'] = self._session_initial_connect_retry
self.__http_session = get_http_session_with_retry(**http_no_retries_config)
# try to connect with the server
self.refresh_token()
# create the default session with many retries
self.__http_session = get_http_session_with_retry(**http_retries_config)
# update api version from server response
try:
@@ -185,8 +192,10 @@ class Session(TokenManager):
"""
host = self.host
headers = headers.copy() if headers else {}
headers[self._WORKER_HEADER] = self.worker
headers[self._CLIENT_HEADER] = self.client
for h in self._WORKER_HEADER:
headers[h] = self.worker
for h in self._CLIENT_HEADER:
headers[h] = self.client
token_refreshed_on_error = False
url = (
@@ -261,7 +270,8 @@ class Session(TokenManager):
headers.copy() if headers else {}
)
if async_enable:
headers[self._ASYNC_HEADER] = "1"
for h in self._ASYNC_HEADER:
headers[h] = "1"
return self._send_request(
service=service,
action=action,
@@ -427,16 +437,15 @@ class Session(TokenManager):
@classmethod
def get_api_server_host(cls, config=None):
if not config:
from ...config import config_obj
config = config_obj
return None
return ENV_HOST.get(default=(config.get("api.api_server", None) or
config.get("api.host", None) or cls.default_host))
@classmethod
def get_app_server_host(cls, config=None):
if not config:
from ...config import config_obj
config = config_obj
return None
# get from config/environment
web_host = ENV_WEB_HOST.get(default=config.get("api.web_server", None))
@@ -458,13 +467,13 @@ class Session(TokenManager):
if parsed.port == 8008:
return host.replace(':8008', ':8080', 1)
raise ValueError('Could not detect TRAINS web application server')
raise ValueError('Could not detect ClearML web application server')
@classmethod
def get_files_server_host(cls, config=None):
if not config:
from ...config import config_obj
config = config_obj
return None
# get from config/environment
files_host = ENV_FILES_HOST.get(default=(config.get("api.files_server", None)))
if files_host:
@@ -542,10 +551,13 @@ class Session(TokenManager):
# check if this is a misconfigured api server (getting 200 without the data section)
if res and res.status_code == 200:
raise ValueError('It seems *api_server* is misconfigured. '
'Is this the TRAINS API server {} ?'.format(self.get_api_server_host()))
'Is this the ClearML API server {} ?'.format(self.get_api_server_host()))
else:
raise LoginError("Response data mismatch: No 'token' in 'data' value from res, receive : {}, "
"exception: {}".format(res, ex))
except requests.ConnectionError as ex:
raise ValueError('Connection Error: it seems *api_server* is misconfigured. '
'Is this the ClearML API server {} ?'.format('/'.join(ex.request.url.split('/')[:3])))
except Exception as ex:
raise LoginError('Unrecognized Authentication Error: {} {}'.format(type(ex), ex))

View File

@@ -107,7 +107,7 @@ def get_http_session_with_retry(
if not session.verify and __disable_certificate_verification_warning < 2:
# show warning
__disable_certificate_verification_warning += 1
logging.getLogger('TRAINS').warning(
logging.getLogger('ClearML').warning(
msg='InsecureRequestWarning: Certificate verification is disabled! Adding '
'certificate verification is strongly advised. See: '
'https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings')

View File

@@ -1,4 +1,3 @@
from .defs import Environment
from .config import Config, ConfigEntry
from .errors import ConfigurationError
from .environment import EnvEntry

View File

@@ -138,7 +138,7 @@ class Config(object):
else:
env_config_paths = []
env_config_path_override = os.environ.get(ENV_CONFIG_PATH_OVERRIDE_VAR)
env_config_path_override = ENV_CONFIG_PATH_OVERRIDE_VAR.get()
if env_config_path_override:
env_config_paths = [expanduser(env_config_path_override)]
@@ -165,7 +165,7 @@ class Config(object):
)
local_config_files = LOCAL_CONFIG_FILES
local_config_override = os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR)
local_config_override = LOCAL_CONFIG_FILE_OVERRIDE_VAR.get()
if local_config_override:
local_config_files = [expanduser(local_config_override)]
@@ -190,7 +190,7 @@ class Config(object):
def reload(self):
self.replace(self._reload())
def initialize_logging(self):
def initialize_logging(self, debug=False):
logging_config = self._config.get("logging", None)
if not logging_config:
return False
@@ -217,6 +217,8 @@ class Config(object):
)
for logger in loggers:
handlers = logger.get("handlers", None)
if debug:
logger['level'] = 'DEBUG'
if not handlers:
continue
logger["handlers"] = [h for h in handlers if h not in deleted]

View File

@@ -1,6 +1,8 @@
from os.path import expanduser
from pathlib2 import Path
from ..backend_config.environment import EnvEntry
ENV_VAR = 'TRAINS_ENV'
""" Name of system environment variable that can be used to specify the config environment name """
@@ -17,23 +19,24 @@ ENV_CONFIG_PATHS = [
LOCAL_CONFIG_PATHS = [
# '/etc/opt/trains', # used by servers for docker-generated configuration
# expanduser('~/.trains/config'),
# '/etc/opt/clearml', # used by servers for docker-generated configuration
# expanduser('~/.clearml/config'),
]
""" Local config paths, not related to environment """
LOCAL_CONFIG_FILES = [
expanduser('~/trains.conf'), # used for workstation configuration (end-users, workers)
expanduser('~/clearml.conf'), # used for workstation configuration (end-users, workers)
]
""" Local config files (not paths) """
LOCAL_CONFIG_FILE_OVERRIDE_VAR = 'TRAINS_CONFIG_FILE'
LOCAL_CONFIG_FILE_OVERRIDE_VAR = EnvEntry('CLEARML_CONFIG_FILE', 'TRAINS_CONFIG_FILE', )
""" Local config file override environment variable. If this is set, no other local config files will be used. """
ENV_CONFIG_PATH_OVERRIDE_VAR = 'TRAINS_CONFIG_PATH'
ENV_CONFIG_PATH_OVERRIDE_VAR = EnvEntry('CLEARML_CONFIG_PATH', 'TRAINS_CONFIG_PATH', )
"""
Environment-related config path override environment variable. If this is set, no other env config path will be used.
"""
@@ -46,6 +49,15 @@ class Environment(object):
local = 'local'
class UptimeConf(object):
min_api_version = "2.10"
queue_tag_on = "force_workers:on"
queue_tag_off = "force_workers:off"
worker_key = "force"
worker_value_off = ["off"]
worker_value_on = ["on"]
CONFIG_FILE_EXTENSION = '.conf'

View File

@@ -85,8 +85,9 @@ class Entry(object):
def set(self, value):
# type: (Any, Any) -> (Text, Any)
key, _ = self.get_pair(default=None, converter=None)
self._set(key, str(value))
# key, _ = self.get_pair(default=None, converter=None)
for k in self.keys:
self._set(k, str(value))
def _set(self, key, value):
# type: (Text, Text) -> None

View File

@@ -0,0 +1,64 @@
from os import getenv, environ
from .converters import text_to_bool
from .entry import Entry, NotSet
class EnvEntry(Entry):
@classmethod
def default_conversions(cls):
conversions = super(EnvEntry, cls).default_conversions().copy()
conversions[bool] = text_to_bool
return conversions
def pop(self):
for k in self.keys:
environ.pop(k, None)
def _get(self, key):
value = getenv(key, "").strip()
return value or NotSet
def _set(self, key, value):
environ[key] = value
def __str__(self):
return "env:{}".format(super(EnvEntry, self).__str__())
def error(self, message):
print("Environment configuration: {}".format(message))
def backward_compatibility_support():
from ..definitions import ENVIRONMENT_CONFIG, ENVIRONMENT_SDK_PARAMS, ENVIRONMENT_BACKWARD_COMPATIBLE
if ENVIRONMENT_BACKWARD_COMPATIBLE.get():
# Add TRAINS_ prefix on every CLEARML_ os environment we support
for k, v in ENVIRONMENT_CONFIG.items():
try:
trains_vars = [var for var in v.vars if var.startswith('CLEARML_')]
if not trains_vars:
continue
alg_var = trains_vars[0].replace('CLEARML_', 'TRAINS_', 1)
if alg_var not in v.vars:
v.vars = tuple(list(v.vars) + [alg_var])
except:
continue
for k, v in ENVIRONMENT_SDK_PARAMS.items():
try:
trains_vars = [var for var in v if var.startswith('CLEARML_')]
if not trains_vars:
continue
alg_var = trains_vars[0].replace('CLEARML_', 'TRAINS_', 1)
if alg_var not in v:
ENVIRONMENT_SDK_PARAMS[k] = tuple(list(v) + [alg_var])
except:
continue
# set OS environ:
keys = list(environ.keys())
for k in keys:
if not k.startswith('CLEARML_'):
continue
backwards_k = k.replace('CLEARML_', 'TRAINS_', 1)
if backwards_k not in keys:
environ[backwards_k] = environ[k]

View File

@@ -4,11 +4,11 @@ from pathlib2 import Path
def logger(path=None):
name = "trains"
name = "clearml"
if path:
p = Path(path)
module = (p.parent if p.stem.startswith('_') else p).stem
name = "trains.%s" % module
name = "clearml.%s" % module
return logging.getLogger(name)

View File

@@ -9,16 +9,16 @@ from operator import attrgetter
from traceback import print_exc
from typing import Text
from trains_agent.helper.console import ListFormatter, print_text
from trains_agent.helper.dicts import filter_keys
from clearml_agent.helper.console import ListFormatter, print_text
from clearml_agent.helper.dicts import filter_keys
import six
from trains_agent.backend_api import services
from clearml_agent.backend_api import services
from trains_agent.errors import APIError, CommandFailedError
from trains_agent.helper.base import Singleton, return_list, print_parameters, dump_yaml, load_yaml, error, warning
from trains_agent.interface.base import ObjectID
from trains_agent.session import Session
from clearml_agent.errors import APIError, CommandFailedError
from clearml_agent.helper.base import Singleton, return_list, print_parameters, dump_yaml, load_yaml, error, warning
from clearml_agent.interface.base import ObjectID
from clearml_agent.session import Session
class NameResolutionError(CommandFailedError):
@@ -74,7 +74,7 @@ class BaseCommandSection(object):
@staticmethod
def log(message, *args):
print("trains-agent: {}".format(message % args))
print("clearml-agent: {}".format(message % args))
@classmethod
def exit(cls, message, code=1): # type: (Text, int) -> ()

View File

@@ -1,4 +1,4 @@
from trains_agent.commands.base import ServiceCommandSection
from clearml_agent.commands.base import ServiceCommandSection
class Config(ServiceCommandSection):

View File

@@ -5,13 +5,15 @@ from pyhocon import ConfigFactory, ConfigMissingException
from pathlib2 import Path
from six.moves.urllib.parse import urlparse
from trains_agent.backend_api.session import Session
from trains_agent.backend_api.session.defs import ENV_HOST
from trains_agent.backend_config.defs import LOCAL_CONFIG_FILES
from clearml_agent.backend_api.session import Session
from clearml_agent.backend_api.session.defs import ENV_HOST
from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES
description = """
Please create new trains credentials through the profile page in your trains web app (e.g. https://demoapp.trains.allegro.ai/profile)
Please create new clearml credentials through the profile page in your clearml web app (e.g. https://demoapp.demo.clear.ml/profile)
Or with the free hosted service at https://app.community.clear.ml/profile
In the profile page, press "Create new credentials", then press "Copy to clipboard".
Paste copied configuration here:
@@ -25,7 +27,7 @@ except Exception:
host_description = """
Editing configuration file: {CONFIG_FILE}
Enter the url of the trains-server's Web service, for example: {HOST}
Enter the url of the clearml-server's Web service, for example: {HOST}
""".format(
CONFIG_FILE=LOCAL_CONFIG_FILES[0],
HOST=def_host,
@@ -33,8 +35,12 @@ Enter the url of the trains-server's Web service, for example: {HOST}
def main():
print('TRAINS-AGENT setup process')
conf_file = Path(LOCAL_CONFIG_FILES[0]).absolute()
print('CLEARML-AGENT setup process')
for f in LOCAL_CONFIG_FILES:
conf_file = Path(f).absolute()
if conf_file.exists():
break
if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
print('Configuration file already exists: {}'.format(str(conf_file)))
print('Leaving setup, feel free to edit the configuration file.')
@@ -42,7 +48,12 @@ def main():
print(description, end='')
sentinel = ''
parse_input = '\n'.join(iter(input, sentinel))
parse_input = ''
for line in iter(input, sentinel):
parse_input += line+'\n'
if line.rstrip() == '}':
break
credentials = None
api_server = None
web_server = None
@@ -86,7 +97,7 @@ def main():
files_host = input_url('File Store Host', files_host)
print('\nTRAINS Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
print('\nClearML Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
web_host, api_host, files_host))
retry = 1
@@ -140,13 +151,14 @@ def main():
# noinspection PyBroadException
try:
with open(str(conf_file), 'wt') as f:
header = '# TRAINS-AGENT configuration file\n' \
header = '# CLEARML-AGENT configuration file\n' \
'api {\n' \
' # Notice: \'host\' is the api server (default port 8008), not the web server.\n' \
' api_server: %s\n' \
' web_server: %s\n' \
' files_server: %s\n' \
' # Credentials are generated using the webapp, %s/profile\n' \
' # Override with os environment: TRAINS_API_ACCESS_KEY / TRAINS_API_SECRET_KEY\n' \
' # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY\n' \
' credentials {"access_key": "%s", "secret_key": "%s"}\n' \
'}\n\n' % (api_host, web_host, files_host,
web_host, credentials['access_key'], credentials['secret_key'])
@@ -157,7 +169,7 @@ def main():
'agent.git_pass=\"{}\"\n' \
'\n'.format(git_user or '', git_pass or '')
f.write(git_credentials)
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]\n' \
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]\n' \
'agent.package_manager.extra_index_url= ' \
'[\n{}\n]\n\n'.format("\n".join(map("\"{}\"".format, extra_index_urls)))
f.write(extra_index_str)
@@ -167,7 +179,7 @@ def main():
return
print('\nNew configuration stored in {}'.format(str(conf_file)))
print('TRAINS-AGENT setup completed successfully.')
print('CLEARML-AGENT setup completed successfully.')
def parse_host(parsed_host, allow_input=True):
@@ -308,7 +320,7 @@ def verify_url(parse_input):
parsed_host = None
except Exception:
parsed_host = None
print('Could not parse url {}\nEnter your trains-server host: '.format(parse_input), end='')
print('Could not parse url {}\nEnter your clearml-server host: '.format(parse_input), end='')
return parsed_host

View File

@@ -5,8 +5,8 @@ import time
from future.builtins import super
from trains_agent.commands.base import ServiceCommandSection
from trains_agent.helper.base import return_list
from clearml_agent.commands.base import ServiceCommandSection
from clearml_agent.helper.base import return_list
class Events(ServiceCommandSection):

View File

@@ -1,8 +1,8 @@
"""
Script for generating command-line completion.
Called by trains_agent/utilities/complete.sh (or a copy of it) like so:
Called by clearml_agent/utilities/complete.sh (or a copy of it) like so:
python -m trains_agent.complete "current command line"
python -m clearml_agent.complete "current command line"
And writes line-separated completion targets to stdout.
Results are line-separated in order to enable other whitespace in results.
@@ -13,7 +13,7 @@ from __future__ import print_function
import argparse
import sys
from trains_agent.interface import get_parser
from clearml_agent.interface import get_parser
def is_argument_required(action):

View File

@@ -1,7 +1,7 @@
from pyhocon import ConfigTree
import six
from trains_agent.helper.base import Singleton
from clearml_agent.helper.base import Singleton
@six.add_metaclass(Singleton)

View File

@@ -1,22 +1,22 @@
from datetime import timedelta
from distutils.util import strtobool
from enum import IntEnum
from os import getenv
from os import getenv, environ
from typing import Text, Optional, Union, Tuple, Any
from furl import furl
from pathlib2 import Path
import six
from trains_agent.helper.base import normalize_path
from clearml_agent.helper.base import normalize_path
PROGRAM_NAME = "trains-agent"
PROGRAM_NAME = "clearml-agent"
FROM_FILE_PREFIX_CHARS = "@"
CONFIG_DIR = normalize_path("~/.trains")
TOKEN_CACHE_FILE = normalize_path("~/.trains.trains_agent.tmp")
CONFIG_DIR = normalize_path("~/.clearml")
TOKEN_CACHE_FILE = normalize_path("~/.clearml.clearml_agent.tmp")
CONFIG_FILE_CANDIDATES = ["~/trains.conf"]
CONFIG_FILE_CANDIDATES = ["~/clearml.conf"]
def find_config_path():
@@ -40,6 +40,14 @@ class EnvironmentConfig(object):
self.vars = names
self.type = kwargs.pop("type", six.text_type)
def pop(self):
for k in self.vars:
environ.pop(k, None)
def set(self, value):
for k in self.vars:
environ[k] = str(value)
def convert(self, value):
return self.conversions.get(self.type, self.type)(value)
@@ -55,23 +63,25 @@ class EnvironmentConfig(object):
ENVIRONMENT_CONFIG = {
"api.api_server": EnvironmentConfig("TRAINS_API_HOST", ),
"api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
"api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
"api.web_server": EnvironmentConfig("CLEARML_WEB_HOST", "TRAINS_WEB_HOST", ),
"api.credentials.access_key": EnvironmentConfig(
"TRAINS_API_ACCESS_KEY",
"CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
),
"api.credentials.secret_key": EnvironmentConfig(
"TRAINS_API_SECRET_KEY",
"CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY",
),
"agent.worker_name": EnvironmentConfig("TRAINS_WORKER_NAME", ),
"agent.worker_id": EnvironmentConfig("TRAINS_WORKER_ID", ),
"agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
"agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
"agent.cuda_version": EnvironmentConfig(
"TRAINS_CUDA_VERSION", "CUDA_VERSION"
"CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"
),
"agent.cudnn_version": EnvironmentConfig(
"TRAINS_CUDNN_VERSION", "CUDNN_VERSION"
"CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"
),
"agent.cpu_only": EnvironmentConfig(
"TRAINS_CPU_ONLY", "CPU_ONLY", type=bool
names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
),
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
"sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
@@ -82,13 +92,14 @@ ENVIRONMENT_CONFIG = {
}
ENVIRONMENT_SDK_PARAMS = {
"task_id": ("TRAINS_TASK_ID", ),
"config_file": ("TRAINS_CONFIG_FILE", ),
"log_level": ("TRAINS_LOG_LEVEL", ),
"log_to_backend": ("TRAINS_LOG_TASK_TO_BACKEND", ),
"task_id": ("CLEARML_TASK_ID", "TRAINS_TASK_ID", ),
"config_file": ("CLEARML_CONFIG_FILE", "TRAINS_CONFIG_FILE", ),
"log_level": ("CLEARML_LOG_LEVEL", "TRAINS_LOG_LEVEL", ),
"log_to_backend": ("CLEARML_LOG_TASK_TO_BACKEND", "TRAINS_LOG_TASK_TO_BACKEND", ),
}
ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig("TRAINS_AGENT_ALG_ENV", type=bool)
ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig(
names=("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV"), type=bool)
VIRTUAL_ENVIRONMENT_PATH = {
"python2": normalize_path(CONFIG_DIR, "py2venv"),
@@ -96,7 +107,7 @@ VIRTUAL_ENVIRONMENT_PATH = {
}
DEFAULT_BASE_DIR = normalize_path(CONFIG_DIR, "data_cache")
DEFAULT_HOST = "https://demoapi.trains.allegro.ai"
DEFAULT_HOST = "https://demoapi.demo.clear.ml"
MAX_DATASET_SOURCES_COUNT = 50000
INVALID_WORKER_ID = (400, 1001)
@@ -105,11 +116,6 @@ WORKER_ALREADY_REGISTERED = (400, 1003)
API_VERSION = "v1.5"
TOKEN_EXPIRATION_SECONDS = int(timedelta(days=2).total_seconds())
HTTP_HEADERS = {
"worker": "X-Trains-Worker",
"act-as": "X-Trains-Act-As",
"client": "X-Trains-Agent",
}
METADATA_EXTENSION = ".json"
DEFAULT_VENV_UPDATE_URL = (
@@ -120,11 +126,16 @@ DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
PIP_EXTRA_INDICES = [
]
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
ENV_AGENT_GIT_USER = EnvironmentConfig('TRAINS_AGENT_GIT_USER')
ENV_AGENT_GIT_PASS = EnvironmentConfig('TRAINS_AGENT_GIT_PASS')
ENV_TASK_EXECUTE_AS_USER = 'TRAINS_AGENT_EXEC_USER'
ENV_TASK_EXTRA_PYTHON_PATH = 'TRAINS_AGENT_EXTRA_PYTHON_PATH'
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAGE')
ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
'TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
class FileBuffering(IntEnum):

View File

@@ -0,0 +1,22 @@
from .parser import parse # noqa
_MAJOR = 0
_MINOR = 2
_PATCH = 0
def version_tuple():
'''
Returns a 3-tuple of ints that represent the version
'''
return (_MAJOR, _MINOR, _PATCH)
def version():
'''
Returns a string representation of the version
'''
return '%d.%d.%d' % (version_tuple())
__version__ = version()

View File

@@ -0,0 +1,44 @@
import re
# Copied from pip
# https://github.com/pypa/pip/blob/281eb61b09d87765d7c2b92f6982b3fe76ccb0af/pip/index.py#L947
HASH_ALGORITHMS = set(['sha1', 'sha224', 'sha384', 'sha256', 'sha512', 'md5'])
extras_require_search = re.compile(
r'(?P<name>.+)\[(?P<extras>[^\]]+)\]').search
def parse_fragment(fragment_string):
"""Takes a fragment string nd returns a dict of the components"""
fragment_string = fragment_string.lstrip('#')
try:
return dict(
key_value_string.split('=')
for key_value_string in fragment_string.split('&')
)
except ValueError:
raise ValueError(
'Invalid fragment string {fragment_string}'.format(
fragment_string=fragment_string
)
)
def get_hash_info(d):
"""Returns the first matching hashlib name and value from a dict"""
for key in d.keys():
if key.lower() in HASH_ALGORITHMS:
return key, d[key]
return None, None
def parse_extras_require(egg):
if egg is not None:
match = extras_require_search(egg)
if match is not None:
name = match.group('name')
extras = match.group('extras')
return name, [extra.strip() for extra in extras.split(',')]
return egg, []

View File

@@ -0,0 +1,50 @@
import os
import warnings
from .requirement import Requirement
def parse(reqstr):
"""
Parse a requirements file into a list of Requirements
See: pip/req.py:parse_requirements()
:param reqstr: a string or file like object containing requirements
:returns: a *generator* of Requirement objects
"""
filename = getattr(reqstr, 'name', None)
try:
# Python 2.x compatibility
if not isinstance(reqstr, basestring): # noqa
reqstr = reqstr.read()
except NameError:
# Python 3.x only
if not isinstance(reqstr, str):
reqstr = reqstr.read()
for line in reqstr.splitlines():
line = line.strip()
if line == '':
continue
elif not line or line.startswith('#'):
# comments are lines that start with # only
continue
elif line.startswith('-r') or line.startswith('--requirement'):
_, new_filename = line.split()
new_file_path = os.path.join(os.path.dirname(filename or '.'),
new_filename)
with open(new_file_path) as f:
for requirement in parse(f):
yield requirement
elif line.startswith('-f') or line.startswith('--find-links') or \
line.startswith('-i') or line.startswith('--index-url') or \
line.startswith('--extra-index-url') or \
line.startswith('--no-index'):
warnings.warn('Private repos not supported. Skipping.')
continue
elif line.startswith('-Z') or line.startswith('--always-unzip'):
warnings.warn('Unused option --always-unzip. Skipping.')
continue
else:
yield Requirement.parse(line)

View File

@@ -0,0 +1,241 @@
from __future__ import unicode_literals
import re
from pkg_resources import Requirement as Req
from .fragment import get_hash_info, parse_fragment, parse_extras_require
from .vcs import VCS, VCS_SCHEMES
URI_REGEX = re.compile(
r'^(?P<scheme>https?|file|ftps?)://(?P<path>[^#]+)'
r'(#(?P<fragment>\S+))?'
)
VCS_REGEX = re.compile(
r'^(?P<scheme>{0})://'.format(r'|'.join(
[scheme.replace('+', r'\+') for scheme in VCS_SCHEMES])) +
r'((?P<login>[^/@]+)@)?'
r'(?P<path>[^#@]+)'
r'(@(?P<revision>[^#]+))?'
r'(#(?P<fragment>\S+))?'
)
# This matches just about everyting
LOCAL_REGEX = re.compile(
r'^((?P<scheme>file)://)?'
r'(?P<path>[^#]+)' +
r'(#(?P<fragment>\S+))?'
)
class Requirement(object):
"""
Represents a single requirementfrom clearml_agent.external.requirements_parser.requirement import Requirement
Typically instances of this class are created with ``Requirement.parse``.
For local file requirements, there's no verification that the file
exists. This class attempts to be *dict-like*.
See: http://www.pip-installer.org/en/latest/logic.html
**Members**:
* ``line`` - the actual requirement line being parsed
* ``editable`` - a boolean whether this requirement is "editable"
* ``local_file`` - a boolean whether this requirement is a local file/path
* ``specifier`` - a boolean whether this requirement used a requirement
specifier (eg. "django>=1.5" or "requirements")
* ``vcs`` - a string specifying the version control system
* ``revision`` - a version control system specifier
* ``name`` - the name of the requirement
* ``uri`` - the URI if this requirement was specified by URI
* ``subdirectory`` - the subdirectory fragment of the URI
* ``path`` - the local path to the requirement
* ``hash_name`` - the type of hashing algorithm indicated in the line
* ``hash`` - the hash value indicated by the requirement line
* ``extras`` - a list of extras for this requirement
(eg. "mymodule[extra1, extra2]")
* ``specs`` - a list of specs for this requirement
(eg. "mymodule>1.5,<1.6" => [('>', '1.5'), ('<', '1.6')])
"""
def __init__(self, line):
# Do not call this private method
self.line = line
self.editable = False
self.local_file = False
self.specifier = False
self.vcs = None
self.name = None
self.subdirectory = None
self.uri = None
self.path = None
self.revision = None
self.hash_name = None
self.hash = None
self.extras = []
self.specs = []
def __repr__(self):
return '<Requirement: "{0}">'.format(self.line)
def __getitem__(self, key):
return getattr(self, key)
def keys(self):
return self.__dict__.keys()
@classmethod
def parse_editable(cls, line):
"""
Parses a Requirement from an "editable" requirement which is either
a local project path or a VCS project URI.
See: pip/req.py:from_editable()
:param line: an "editable" requirement
:returns: a Requirement instance for the given line
:raises: ValueError on an invalid requirement
"""
req = cls('-e {0}'.format(line))
req.editable = True
vcs_match = VCS_REGEX.match(line)
local_match = LOCAL_REGEX.match(line)
if vcs_match is not None:
groups = vcs_match.groupdict()
if groups.get('login'):
req.uri = '{scheme}://{login}@{path}'.format(**groups)
else:
req.uri = '{scheme}://{path}'.format(**groups)
req.revision = groups['revision']
if groups['fragment']:
fragment = parse_fragment(groups['fragment'])
egg = fragment.get('egg')
req.name, req.extras = parse_extras_require(egg)
req.hash_name, req.hash = get_hash_info(fragment)
req.subdirectory = fragment.get('subdirectory')
for vcs in VCS:
if req.uri.startswith(vcs):
req.vcs = vcs
else:
assert local_match is not None, 'This should match everything'
groups = local_match.groupdict()
req.local_file = True
if groups['fragment']:
fragment = parse_fragment(groups['fragment'])
egg = fragment.get('egg')
req.name, req.extras = parse_extras_require(egg)
req.hash_name, req.hash = get_hash_info(fragment)
req.subdirectory = fragment.get('subdirectory')
req.path = groups['path']
return req
@classmethod
def parse_line(cls, line):
"""
Parses a Requirement from a non-editable requirement.
See: pip/req.py:from_line()
:param line: a "non-editable" requirement
:returns: a Requirement instance for the given line
:raises: ValueError on an invalid requirement
"""
req = cls(line)
vcs_match = VCS_REGEX.match(line)
uri_match = URI_REGEX.match(line)
local_match = LOCAL_REGEX.match(line)
if vcs_match is not None:
groups = vcs_match.groupdict()
if groups.get('login'):
req.uri = '{scheme}://{login}@{path}'.format(**groups)
else:
req.uri = '{scheme}://{path}'.format(**groups)
req.revision = groups['revision']
if groups['fragment']:
fragment = parse_fragment(groups['fragment'])
egg = fragment.get('egg')
req.name, req.extras = parse_extras_require(egg)
req.hash_name, req.hash = get_hash_info(fragment)
req.subdirectory = fragment.get('subdirectory')
for vcs in VCS:
if req.uri.startswith(vcs):
req.vcs = vcs
elif uri_match is not None:
groups = uri_match.groupdict()
req.uri = '{scheme}://{path}'.format(**groups)
if groups['fragment']:
fragment = parse_fragment(groups['fragment'])
egg = fragment.get('egg')
req.name, req.extras = parse_extras_require(egg)
req.hash_name, req.hash = get_hash_info(fragment)
req.subdirectory = fragment.get('subdirectory')
if groups['scheme'] == 'file':
req.local_file = True
elif '#egg=' in line:
# Assume a local file match
assert local_match is not None, 'This should match everything'
groups = local_match.groupdict()
req.local_file = True
if groups['fragment']:
fragment = parse_fragment(groups['fragment'])
egg = fragment.get('egg')
name, extras = parse_extras_require(egg)
req.name = fragment.get('egg')
req.hash_name, req.hash = get_hash_info(fragment)
req.subdirectory = fragment.get('subdirectory')
req.path = groups['path']
else:
# This is a requirement specifier.
# Delegate to pkg_resources and hope for the best
req.specifier = True
pkg_req = Req.parse(line)
req.name = pkg_req.unsafe_name
req.extras = list(pkg_req.extras)
req.specs = pkg_req.specs
return req
@classmethod
def parse(cls, line):
"""
Parses a Requirement from a line of a requirement file.
:param line: a line of a requirement file
:returns: a Requirement instance for the given line
:raises: ValueError on an invalid requirement
"""
line = line.lstrip()
if line.startswith('-e') or line.startswith('--editable'):
# Editable installs are either a local project path
# or a VCS project URI
return cls.parse_editable(
re.sub(r'^(-e|--editable=?)\s*', '', line))
elif '@' in line and ('#' not in line or line.index('#') > line.index('@')):
# Allegro bug fix: support 'name @ git+' entries
name, uri = line.split('@', 1)
name = name.strip()
uri = uri.strip()
# noinspection PyBroadException
try:
# check if the name is valid & parsed
Req.parse(name)
# if we are here, name is a valid package name, check if the vcs part is valid
if VCS_REGEX.match(uri):
req = cls.parse_line(uri)
req.name = name
return req
elif URI_REGEX.match(uri):
req = cls.parse_line(uri)
req.name = name
req.line = line
return req
except Exception:
pass
return cls.parse_line(line)

View File

@@ -0,0 +1,30 @@
from __future__ import unicode_literals
VCS = [
'git',
'hg',
'svn',
'bzr',
]
VCS_SCHEMES = [
'git',
'git+https',
'git+ssh',
'git+git',
'hg+http',
'hg+https',
'hg+static-http',
'hg+ssh',
'svn',
'svn+svn',
'svn+http',
'svn+https',
'svn+ssh',
'bzr+http',
'bzr+https',
'bzr+ssh',
'bzr+sftp',
'bzr+ftp',
'bzr+lp',
]

593
clearml_agent/glue/k8s.py Normal file
View File

@@ -0,0 +1,593 @@
from __future__ import print_function, division, unicode_literals
import base64
import functools
import json
import logging
import os
import re
import subprocess
import tempfile
from copy import deepcopy
from pathlib import Path
from threading import Thread
from time import sleep
from typing import Text, List
import yaml
from clearml_agent.commands.events import Events
from clearml_agent.commands.worker import Worker
from clearml_agent.definitions import ENV_DOCKER_IMAGE
from clearml_agent.errors import APIError
from clearml_agent.helper.base import safe_remove_file
from clearml_agent.helper.dicts import merge_dicts
from clearml_agent.helper.process import get_bash_output
from clearml_agent.helper.resource_monitor import ResourceMonitor
from clearml_agent.interface.base import ObjectID
class K8sIntegration(Worker):
K8S_PENDING_QUEUE = "k8s_scheduler"
K8S_DEFAULT_NAMESPACE = "clearml"
KUBECTL_APPLY_CMD = "kubectl apply -f"
KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
"--image {docker_image} " \
"--restart=Never " \
"--namespace={namespace}"
KUBECTL_DELETE_CMD = "kubectl delete pods " \
"--selector=TRAINS=agent " \
"--field-selector=status.phase!=Pending,status.phase!=Running " \
"--namespace={namespace}"
BASH_INSTALL_SSH_CMD = [
"apt-get install -y openssh-server",
"mkdir -p /var/run/sshd",
"echo 'root:training' | chpasswd",
"echo 'PermitRootLogin yes' >> /etc/ssh/sshd_config",
"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
r"sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd",
"echo 'AcceptEnv TRAINS_API_ACCESS_KEY TRAINS_API_SECRET_KEY CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY' "
">> /etc/ssh/sshd_config",
'echo "export VISIBLE=now" >> /etc/profile',
'echo "export PATH=$PATH" >> /etc/profile',
'echo "ldconfig" >> /etc/profile',
"/usr/sbin/sshd -p {port}"]
CONTAINER_BASH_SCRIPT = [
"export DEBIAN_FRONTEND='noninteractive'",
"echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
"chown -R root /root/.cache/pip",
"apt-get update",
"apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
"declare LOCAL_PYTHON",
"for i in {{10..5}}; do which python3.$i && python3.$i -m pip --version && "
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
"$LOCAL_PYTHON -m pip install clearml-agent",
"{extra_bash_init_cmd}",
"$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
]
AGENT_LABEL = "TRAINS=agent"
LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
_edit_hyperparams_version = "2.9"
def __init__(
self,
k8s_pending_queue_name=None,
kubectl_cmd=None,
container_bash_script=None,
debug=False,
ports_mode=False,
num_of_services=20,
base_pod_num=1,
user_props_cb=None,
overrides_yaml=None,
template_yaml=None,
clearml_conf_file=None,
extra_bash_init_script=None,
namespace=None,
**kwargs
):
"""
Initialize the k8s integration glue layer daemon
:param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
:param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
example: "task={task_id} image={docker_image} queue_id={queue_id}"
or a callable function: kubectl_cmd(task_id, docker_image, queue_id, task_data)
:param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
Notice this string will use format() call, if you have curly brackets they should be doubled { -> {{
Format arguments passed: {task_id} and {extra_bash_init_cmd}
:param bool debug: Switch logging on
:param bool ports_mode: Adds a label to each pod which can be used in services in order to expose ports.
Requires the `num_of_services` parameter.
:param int num_of_services: Number of k8s services configured in the cluster. Required if `port_mode` is True.
(default: 20)
:param int base_pod_num: Used when `ports_mode` is True, sets the base pod number to a given value (default: 1)
:param callable user_props_cb: An Optional callable allowing additional user properties to be specified
when scheduling a task to run in a pod. Callable can receive an optional pod number and should return
a dictionary of user properties (name and value). Signature is [[Optional[int]], Dict[str,str]]
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
:param str template_yaml: YAML file containing the template for the pod (optional).
If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
:param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional)
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
:param str namespace: K8S namespace to be used when creating the new pods (default: clearml)
"""
super(K8sIntegration, self).__init__()
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
# Always do system packages, because by we will be running inside a docker
self._session.config.put("agent.package_manager.system_site_packages", True)
# Add debug logging
if debug:
self.log.logger.disabled = False
self.log.logger.setLevel(logging.INFO)
self.ports_mode = ports_mode
self.num_of_services = num_of_services
self.base_pod_num = base_pod_num
self._edit_hyperparams_support = None
self._user_props_cb = user_props_cb
self.conf_file_content = None
self.overrides_json_string = None
self.template_dict = None
self.extra_bash_init_script = extra_bash_init_script or None
if self.extra_bash_init_script and not isinstance(self.extra_bash_init_script, str):
self.extra_bash_init_script = ' ; '.join(self.extra_bash_init_script) # noqa
self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE
self.pod_limits = []
self.pod_requests = []
if overrides_yaml:
with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
if overrides:
containers = overrides.get('spec', {}).get('containers', [])
for c in containers:
resources = {str(k).lower(): v for k, v in c.get('resources', {}).items()}
if not resources:
continue
if resources.get('limits'):
self.pod_limits += ['{}={}'.format(k, v) for k, v in resources['limits'].items()]
if resources.get('requests'):
self.pod_requests += ['{}={}'.format(k, v) for k, v in resources['requests'].items()]
# remove double entries
self.pod_limits = list(set(self.pod_limits))
self.pod_requests = list(set(self.pod_requests))
if self.pod_limits or self.pod_requests:
self.log.warning('Found pod container requests={} limits={}'.format(
self.pod_limits, self.pod_requests))
if containers:
self.log.warning('Removing containers section: {}'.format(overrides['spec'].pop('containers')))
self.overrides_json_string = json.dumps(overrides)
if template_yaml:
with open(os.path.expandvars(os.path.expanduser(str(template_yaml))), 'rt') as f:
self.template_dict = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
clearml_conf_file = clearml_conf_file or kwargs.get('trains_conf_file')
if clearml_conf_file:
with open(os.path.expandvars(os.path.expanduser(str(clearml_conf_file))), 'rt') as f:
self.conf_file_content = f.read()
# make sure we use system packages!
self.conf_file_content += '\nagent.package_manager.system_site_packages=true\n'
self._monitor_hanging_pods()
def _monitor_hanging_pods(self):
_check_pod_thread = Thread(target=self._monitor_hanging_pods_daemon)
_check_pod_thread.daemon = True
_check_pod_thread.start()
def _monitor_hanging_pods_daemon(self):
while True:
output = get_bash_output('kubectl get pods -n {namespace} -o=JSON'.format(
namespace=self.namespace
))
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
try:
output_config = json.loads(output)
except Exception as ex:
self.log.warning('K8S Glue pods monitor: Failed parsing kubectl output:\n{}\nEx: {}'.format(output, ex))
sleep(self._polling_interval)
continue
pods = output_config.get('items', [])
for pod in pods:
try:
reason = functools.reduce(
lambda a, b: a[b], ('status', 'containerStatuses', 0, 'state', 'waiting', 'reason'), pod
)
except (IndexError, KeyError):
continue
if reason == 'ImagePullBackOff':
pod_name = pod.get('metadata', {}).get('name', None)
if pod_name:
task_id = pod_name.rpartition('-')[-1]
delete_pod_cmd = 'kubectl delete pods {} -n {}'.format(pod_name, self.namespace)
get_bash_output(delete_pod_cmd)
try:
self._session.api_client.tasks.failed(
task=task_id,
status_reason="K8S glue error due to ImagePullBackOff",
status_message="Changed by K8S glue",
force=True
)
except Exception as ex:
self.log.warning(
'K8S Glue pods monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
)
sleep(self._polling_interval)
def _set_task_user_properties(self, task_id: str, **properties: str):
if self._edit_hyperparams_support is not True:
# either not supported or never tested
if self._edit_hyperparams_support == self._session.api_version:
# tested against latest api_version, not supported
return
if not self._session.check_min_api_version(self._edit_hyperparams_version):
# not supported due to insufficient api_version
self._edit_hyperparams_support = self._session.api_version
return
try:
self._session.get(
service="tasks",
action="edit_hyper_params",
task=task_id,
hyperparams=[
{
"section": "properties",
"name": k,
"value": str(v),
}
for k, v in properties.items()
],
)
# definitely supported
self._runtime_props_support = True
except APIError as error:
if error.code == 404:
self._edit_hyperparams_support = self._session.api_version
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
# push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
try:
print('Pushing task {} into temporary pending queue'.format(task_id))
self._session.api_client.tasks.reset(task_id)
self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
status_reason='k8s pending scheduler')
except Exception as e:
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
task_id, self.k8s_pending_queue_name, e))
return
if task_data.execution.docker_cmd:
docker_cmd = task_data.execution.docker_cmd
else:
docker_cmd = str(ENV_DOCKER_IMAGE.get() or
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
# take the first part, this is the docker image name (not arguments)
docker_parts = docker_cmd.split()
docker_image = docker_parts[0]
docker_args = docker_parts[1:] if len(docker_parts) > 1 else []
# get the clearml.conf encoded file
# noinspection PyProtectedMember
hocon_config_encoded = (
self.conf_file_content
or Path(self._session._config_file).read_text()
).encode("ascii")
create_clearml_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
base64.b64encode(
hocon_config_encoded
).decode('ascii')
)
if self.ports_mode:
print("Kubernetes looking for available pod to use")
# noinspection PyBroadException
try:
queue_name = self._session.api_client.queues.get_by_id(queue=queue).name
except Exception:
queue_name = 'k8s'
# conform queue name to k8s standards
safe_queue_name = queue_name.lower().strip()
safe_queue_name = re.sub(r'\W+', '', safe_queue_name).replace('_', '').replace('-', '')
# Search for a free pod number
pod_count = 0
pod_number = self.base_pod_num
while self.ports_mode:
pod_number = self.base_pod_num + pod_count
kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
agent_label=self.AGENT_LABEL,
namespace=self.namespace,
)
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate()
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
if not output:
# No such pod exist so we can use the pod_number we found
break
if pod_count >= self.num_of_services - 1:
# All pod numbers are taken, exit
self.log.warning(
"kubectl last result: {}\n{}\nAll k8s services are in use, task '{}' "
"will be enqueued back to queue '{}'".format(
error, output, task_id, queue
)
)
self._session.api_client.tasks.reset(task_id)
self._session.api_client.tasks.enqueue(
task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
return
pod_count += 1
labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
if self.ports_mode:
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
else:
print("Kubernetes scheduling task id={}".format(task_id))
if self.template_dict:
output, error = self._kubectl_apply(
create_clearml_conf=create_clearml_conf,
labels=labels, docker_image=docker_image, docker_args=docker_args,
task_id=task_id, queue=queue, queue_name=safe_queue_name)
else:
output, error = self._kubectl_run(
create_clearml_conf=create_clearml_conf,
labels=labels, docker_image=docker_cmd,
task_data=task_data,
task_id=task_id, queue=queue, queue_name=safe_queue_name)
error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
print('kubectl output:\n{}\n{}'.format(error, output))
if error:
send_log = "Running kubectl encountered an error: {}".format(error)
self.log.error(send_log)
self.send_logs(task_id, send_log.splitlines())
user_props = {"k8s-queue": str(queue_name)}
if self.ports_mode:
user_props.update(
{
"k8s-pod-number": pod_number,
"k8s-pod-label": labels[0],
"k8s-internal-pod-count": pod_count,
}
)
if self._user_props_cb:
# noinspection PyBroadException
try:
custom_props = self._user_props_cb(pod_number) if self.ports_mode else self._user_props_cb()
user_props.update(custom_props)
except Exception:
pass
if user_props:
self._set_task_user_properties(
task_id=task_id,
**user_props
)
def _parse_docker_args(self, docker_args):
# type: (list) -> dict
kube_args = {'env': []}
while docker_args:
cmd = docker_args.pop().strip()
if cmd in ('-e', '--env',):
env = docker_args.pop().strip()
key, value = env.split('=', 1)
kube_args[key] += {key: value}
else:
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
return kube_args
def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id, queue_name):
template = deepcopy(self.template_dict)
template.setdefault('apiVersion', 'v1')
template['kind'] = 'Pod'
template.setdefault('metadata', {})
name = 'clearml-{queue}-id-{task_id}'.format(queue=queue_name, task_id=task_id)
template['metadata']['name'] = name
template.setdefault('spec', {})
template['spec'].setdefault('containers', [])
if labels:
labels_dict = dict(pair.split('=', 1) for pair in labels)
template['metadata'].setdefault('labels', {})
template['metadata']['labels'].update(labels_dict)
container = self._parse_docker_args(docker_args)
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
else self.container_bash_script
script_encoded = '\n'.join(
['#!/bin/bash', ] +
[line.format(extra_bash_init_cmd=self.extra_bash_init_script or '', task_id=task_id)
for line in container_bash_script])
create_init_script = \
"echo '{}' | base64 --decode >> ~/__start_agent__.sh ; " \
"/bin/bash ~/__start_agent__.sh".format(
base64.b64encode(
script_encoded.encode('ascii')
).decode('ascii'))
container = merge_dicts(
container,
dict(name=name, image=docker_image,
command=['/bin/bash'],
args=['-c', '{} ; {}'.format(create_clearml_conf, create_init_script)])
)
if template['spec']['containers']:
template['spec']['containers'][0] = merge_dicts(template['spec']['containers'][0], container)
else:
template['spec']['containers'].append(container)
fp, yaml_file = tempfile.mkstemp(prefix='clearml_k8stmpl_', suffix='.yml')
os.close(fp)
with open(yaml_file, 'wt') as f:
yaml.dump(template, f)
kubectl_cmd = self.KUBECTL_APPLY_CMD.format(
task_id=task_id,
docker_image=docker_image,
queue_id=queue,
namespace=self.namespace
)
# make sure we provide a list
if isinstance(kubectl_cmd, str):
kubectl_cmd = kubectl_cmd.split()
# add the template file at the end
kubectl_cmd += [yaml_file]
try:
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate()
except Exception as ex:
return None, str(ex)
finally:
safe_remove_file(yaml_file)
return output, error
def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id, queue_name):
if callable(self.kubectl_cmd):
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name)
else:
kubectl_cmd = self.kubectl_cmd.format(
queue_name=queue_name,
task_id=task_id,
docker_image=docker_image,
queue_id=queue,
namespace=self.namespace,
)
# make sure we provide a list
if isinstance(kubectl_cmd, str):
kubectl_cmd = kubectl_cmd.split()
if self.overrides_json_string:
kubectl_cmd += ['--overrides=' + self.overrides_json_string]
if self.pod_limits:
kubectl_cmd += ['--limits', ",".join(self.pod_limits)]
if self.pod_requests:
kubectl_cmd += ['--requests', ",".join(self.pod_requests)]
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
else self.container_bash_script
container_bash_script = ' ; '.join(container_bash_script)
kubectl_cmd += [
"--labels=" + ",".join(labels),
"--command",
"--",
"/bin/sh",
"-c",
"{} ; {}".format(create_clearml_conf, container_bash_script.format(
extra_bash_init_cmd=self.extra_bash_init_script, task_id=task_id)),
]
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate()
return output, error
def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
"""
:summary: Pull and run tasks from queues.
:description: 1. Go through ``queues`` by order.
2. Try getting the next task for each and run the first one that returns.
3. Go to step 1
:param queues: IDs of queues to pull tasks from
:type queues: list of ``Text``
:param worker_params: Worker command line arguments
:type worker_params: ``clearml_agent.helper.process.WorkerParams``
"""
events_service = self.get_service(Events)
# make sure we have a k8s pending queue
# noinspection PyBroadException
try:
self._session.api_client.queues.create(self.k8s_pending_queue_name)
except Exception:
pass
# get queue id
self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")
_last_machine_update_ts = 0
while True:
# iterate over queues (priority style, queues[0] is highest)
for queue in queues:
# delete old completed / failed pods
get_bash_output(self.KUBECTL_DELETE_CMD.format(namespace=self.namespace))
# get next task in queue
try:
response = self._session.api_client.queues.get_next_task(queue=queue)
except Exception as e:
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
continue
else:
try:
task_id = response.entry.task
except AttributeError:
print("No tasks in queue {}".format(queue))
continue
events_service.send_log_events(
self.worker_id,
task_id=task_id,
lines="task {} pulled from {} by worker {}".format(
task_id, queue, self.worker_id
),
level="INFO",
)
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
self.run_one_task(queue, task_id, worker_params)
self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
break
else:
# sleep and retry polling
print("No tasks in Queues, sleeping for {:.1f} seconds".format(self._polling_interval))
sleep(self._polling_interval)
if self._session.config["agent.reload_config"]:
self.reload_config()
def k8s_daemon(self, queue):
"""
Start the k8s Glue service.
This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
Notice all scheduled tasks are pushed back into K8S_PENDING_QUEUE,
and popped when execution actually starts. This creates full visibility into the k8s scheduler.
Manually popping a task from the K8S_PENDING_QUEUE,
will cause the k8s scheduler to skip the execution once the scheduled tasks needs to be executed
:param list(str) queue: queue name to pull from
"""
return self.daemon(queues=[ObjectID(name=queue)] if queue else None,
log_level=logging.INFO, foreground=True, docker=False)
@classmethod
def get_ssh_server_bash(cls, ssh_port_number):
return ' ; '.join(line.format(port=ssh_port_number) for line in cls.BASH_INSTALL_SSH_CMD)

View File

@@ -1,4 +1,4 @@
""" TRAINS-AGENT Stdout Helper Functions """
""" CLEARML-AGENT Stdout Helper Functions """
from __future__ import print_function, unicode_literals
import io
@@ -24,12 +24,11 @@ import pyhocon
import yaml
from attr import fields_dict
from pathlib2 import Path
from tqdm import tqdm
import six
from six.moves import reduce
from trains_agent.errors import CommandFailedError
from trains_agent.helper.dicts import filter_keys
from clearml_agent.errors import CommandFailedError
from clearml_agent.helper.dicts import filter_keys
pretty_lines = False
@@ -173,14 +172,32 @@ def normalize_path(*paths):
def safe_remove_file(filename, error_message=None):
# noinspection PyBroadException
try:
os.remove(filename)
if filename:
os.remove(filename)
except Exception:
if error_message:
print(error_message)
def get_python_path(script_dir, entry_point, package_api):
def safe_remove_tree(filename):
if not filename:
return
# noinspection PyBroadException
try:
shutil.rmtree(filename, ignore_errors=True)
except Exception:
pass
# noinspection PyBroadException
try:
os.remove(filename)
except Exception:
pass
def get_python_path(script_dir, entry_point, package_api, is_conda_env=False):
# noinspection PyBroadException
try:
python_path_sep = ';' if is_windows_platform() else ':'
python_path_cmd = package_api.get_python_command(
@@ -192,9 +209,9 @@ def get_python_path(script_dir, entry_point, package_api):
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
python_path_sep=python_path_sep)
if is_windows_platform():
return python_path.replace('/', '\\') + org_python_path
python_path = python_path.replace('/', '\\')
return python_path + org_python_path
return python_path if is_conda_env else (python_path + org_python_path)
except Exception:
return None
@@ -362,11 +379,11 @@ AllDumper.add_multi_representer(object, lambda dumper, data: dumper.represent_st
def error(message):
print('\ntrains_agent: ERROR: {}\n'.format(message))
print('\nclearml_agent: ERROR: {}\n'.format(message))
def warning(message):
print('trains_agent: Warning: {}'.format(message))
print('clearml_agent: Warning: {}'.format(message))
class TqdmStream(object):
@@ -381,12 +398,6 @@ class TqdmStream(object):
self.buffer.write('\n')
class TqdmLog(tqdm):
def __init__(self, iterable=None, file=None, **kwargs):
super(TqdmLog, self).__init__(iterable, file=TqdmStream(file or sys.stderr), **kwargs)
def url_join(first, *rest):
"""
Join url parts similarly to Path.join
@@ -442,9 +453,9 @@ def chain_map(*args):
return reduce(lambda x, y: x.update(y) or x, args, {})
def check_directory_path(path):
def check_directory_path(path, check_whitespace_in_path=True):
message = 'Could not create directory "{}": {}'
if not is_windows_platform():
if not is_windows_platform() and check_whitespace_in_path:
match = re.search(r'\s', path)
if match:
raise CommandFailedError(
@@ -537,6 +548,7 @@ class ExecutionInfo(NonStrictAttrs):
branch = nullable_string
version_num = nullable_string
tag = nullable_string
docker_cmd = nullable_string
@classmethod
def from_task(cls, task_info):
@@ -554,6 +566,12 @@ class ExecutionInfo(NonStrictAttrs):
execution.entry_point = entry_point
execution.working_dir = working_dir or ""
# noinspection PyBroadException
try:
execution.docker_cmd = task_info.execution.docker_cmd
except Exception:
pass
return execution

View File

@@ -21,14 +21,14 @@ def start_check_update_daemon():
def _check_new_version_available():
cur_version = __version__
update_server_releases = requests.get('https://updates.trains.allegro.ai/updates',
data=json.dumps({"versions": {"trains-agent": str(cur_version)}}),
update_server_releases = requests.get('https://updates.clear.ml/updates',
data=json.dumps({"versions": {"clearml-agent": str(cur_version)}}),
timeout=3.0)
if update_server_releases.ok:
update_server_releases = update_server_releases.json()
else:
return None
trains_answer = update_server_releases.get("trains-agent", {})
trains_answer = update_server_releases.get("clearml-agent", {})
latest_version = trains_answer.get("version")
cur_version = cur_version
latest_version = latest_version or ''
@@ -48,7 +48,7 @@ def _check_update_daemon():
if latest_version:
if latest_version[1]:
sep = os.linesep
print('TRAINS-AGENT new package available: UPGRADE to v{} is recommended!\nRelease Notes:\n{}'.format(
print('CLEARML-AGENT new package available: UPGRADE to v{} is recommended!\nRelease Notes:\n{}'.format(
latest_version[0], sep.join(latest_version[2])))
else:
print('TRAINS-SERVER new version available: upgrade to v{} is recommended!'.format(

View File

@@ -9,7 +9,7 @@ from attr import attrs, attrib
import six
from six import binary_type, text_type
from trains_agent.helper.base import nonstrict_in_place_sort, create_tree
from clearml_agent.helper.base import nonstrict_in_place_sort
def print_text(text, newline=True):
@@ -22,15 +22,21 @@ def print_text(text, newline=True):
sys.stdout.write(data)
def decode_binary_lines(binary_lines, encoding='utf-8'):
def decode_binary_lines(binary_lines, encoding='utf-8', replace_cr=False, overwrite_cr=False):
# decode per line, if we failed decoding skip the line
lines = []
for b in binary_lines:
# noinspection PyBroadException
try:
l = b.decode(encoding=encoding, errors='replace').replace('\r', '\n')
except:
l = ''
lines.append(l + '\n' if l and l[-1] != '\n' else l)
line = b.decode(encoding=encoding, errors='replace')
if replace_cr:
line = line.replace('\r', '\n')
elif overwrite_cr:
cr_lines = line.split('\r')
line = cr_lines[-1] if cr_lines[-1] or len(cr_lines) < 2 else cr_lines[-2]
except Exception:
line = ''
lines.append(line + '\n' if not line or line[-1] != '\n' else line)
return lines

View File

@@ -0,0 +1,17 @@
from typing import Callable, Dict, Any
def filter_keys(filter_, dct): # type: (Callable[[Any], bool], Dict) -> Dict
return {key: value for key, value in dct.items() if filter_(key)}
def merge_dicts(dict1, dict2):
""" Recursively merges dict2 into dict1 """
if not isinstance(dict1, dict) or not isinstance(dict2, dict):
return dict2
for k in dict2:
if k in dict1:
dict1[k] = merge_dicts(dict1[k], dict2[k])
else:
dict1[k] = dict2[k]
return dict1

View File

@@ -20,6 +20,7 @@ import platform
import sys
import time
from datetime import datetime
from typing import Optional
import psutil
from ..gpu import pynvml as N
@@ -200,24 +201,30 @@ class GPUStatCollection(object):
GPUStatCollection.global_processes[nv_process.pid] = \
psutil.Process(pid=nv_process.pid)
ps_process = GPUStatCollection.global_processes[nv_process.pid]
process['username'] = ps_process.username()
# cmdline returns full path;
# as in `ps -o comm`, get short cmdnames.
_cmdline = ps_process.cmdline()
if not _cmdline:
# sometimes, zombie or unknown (e.g. [kworker/8:2H])
process['command'] = '?'
process['full_command'] = ['?']
else:
process['command'] = os.path.basename(_cmdline[0])
process['full_command'] = _cmdline
# Bytes to MBytes
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
process['cpu_percent'] = ps_process.cpu_percent()
process['cpu_memory_usage'] = \
round((ps_process.memory_percent() / 100.0) *
psutil.virtual_memory().total)
process['pid'] = nv_process.pid
# noinspection PyBroadException
try:
# we do not actually use these, so no point in collecting them
# process['username'] = ps_process.username()
# # cmdline returns full path;
# # as in `ps -o comm`, get short cmdnames.
# _cmdline = ps_process.cmdline()
# if not _cmdline:
# # sometimes, zombie or unknown (e.g. [kworker/8:2H])
# process['command'] = '?'
# process['full_command'] = ['?']
# else:
# process['command'] = os.path.basename(_cmdline[0])
# process['full_command'] = _cmdline
# process['cpu_percent'] = ps_process.cpu_percent()
# process['cpu_memory_usage'] = \
# round((ps_process.memory_percent() / 100.0) *
# psutil.virtual_memory().total)
# Bytes to MBytes
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
except Exception:
# insufficient permissions
pass
return process
if not GPUStatCollection._gpu_device_info.get(index):
@@ -285,12 +292,13 @@ class GPUStatCollection(object):
# e.g. nvidia-smi reset or reboot the system
pass
# TODO: Do not block if full process info is not requested
time.sleep(0.1)
for process in processes:
pid = process['pid']
cache_process = GPUStatCollection.global_processes[pid]
process['cpu_percent'] = cache_process.cpu_percent()
# we do not actually use these, so no point in collecting them
# # TODO: Do not block if full process info is not requested
# time.sleep(0.1)
# for process in processes:
# pid = process['pid']
# cache_process = GPUStatCollection.global_processes[pid]
# process['cpu_percent'] = cache_process.cpu_percent()
index = N.nvmlDeviceGetIndex(handle)
gpu_info = {
@@ -383,3 +391,34 @@ def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
'''
return GPUStatCollection.new_query(shutdown=shutdown, per_process_stats=per_process_stats,
get_driver_info=get_driver_info)
def get_driver_cuda_version():
# type: () -> Optional[str]
"""
:return: Return detected CUDA version from driver. On fail return value is None.
Example: `110` is cuda version 11.0
"""
# noinspection PyBroadException
try:
N.nvmlInit()
except BaseException:
return None
# noinspection PyBroadException
try:
cuda_version = str(N.nvmlSystemGetCudaDriverVersion())
except BaseException:
# noinspection PyBroadException
try:
cuda_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
except BaseException:
cuda_version = ''
# noinspection PyBroadException
try:
N.nvmlShutdown()
except BaseException:
return None
return cuda_version[:3] if cuda_version else None

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,218 @@
import os
import shutil
from logging import warning
from random import random
from time import time
from typing import List, Optional, Sequence
import psutil
from pathlib2 import Path
from .locks import FileLock
class FolderCache(object):
_lock_filename = '.clearml.lock'
_lock_timeout_seconds = 30
_temp_entry_prefix = '_temp.'
def __init__(self, cache_folder, max_cache_entries=5, min_free_space_gb=None):
self._cache_folder = Path(os.path.expandvars(cache_folder)).expanduser().absolute()
self._cache_folder.mkdir(parents=True, exist_ok=True)
self._max_cache_entries = max_cache_entries
self._last_copied_entry_folder = None
self._min_free_space_gb = min_free_space_gb if min_free_space_gb and min_free_space_gb > 0 else None
self._lock = FileLock((self._cache_folder / self._lock_filename).as_posix())
def get_cache_folder(self):
# type: () -> Path
"""
:return: Return the base cache folder
"""
return self._cache_folder
def copy_cached_entry(self, keys, destination):
# type: (List[str], Path) -> Optional[Path]
"""
Copy a cached entry into a destination directory, if the cached entry does not exist return None
:param keys:
:param destination:
:return: Target path, None if cached entry does not exist
"""
self._last_copied_entry_folder = None
if not keys:
return None
# lock so we make sure no one deletes it before we copy it
# noinspection PyBroadException
try:
self._lock.acquire(timeout=self._lock_timeout_seconds)
except BaseException as ex:
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
return None
src = None
try:
src = self.get_entry(keys)
if src:
destination = Path(destination).absolute()
destination.mkdir(parents=True, exist_ok=True)
shutil.rmtree(destination.as_posix())
shutil.copytree(src.as_posix(), dst=destination.as_posix(), symlinks=True)
except BaseException as ex:
warning('Could not copy cache folder {} to {}: {}'.format(src, destination, ex))
self._lock.release()
return None
# release Lock
self._lock.release()
self._last_copied_entry_folder = src
return destination if src else None
def get_entry(self, keys):
# type: (List[str]) -> Optional[Path]
"""
Return a folder (a sub-folder of inside the cache_folder) matching one of the keys
:param keys: List of keys, return the first match to one of the keys, notice keys cannot contain '.'
:return: Path to the sub-folder or None if none was found
"""
if not keys:
return None
# conform keys
keys = [keys] if isinstance(keys, str) else keys
keys = sorted([k.replace('.', '_') for k in keys])
for cache_folder in self._cache_folder.glob('*'):
if cache_folder.is_dir() and any(True for k in cache_folder.name.split('.') if k in keys):
cache_folder.touch()
return cache_folder
return None
def add_entry(self, keys, source_folder, exclude_sub_folders=None):
# type: (List[str], Path, Optional[Sequence[str]]) -> bool
"""
Add a local folder into the cache, copy all sub-folders inside `source_folder`
excluding folders matching `exclude_sub_folders` list
:param keys: Cache entry keys list (str)
:param source_folder: Folder to copy into the cache
:param exclude_sub_folders: List of sub-folders to exclude from the copy operation
:return: return True is new entry was added to cache
"""
if not keys:
return False
keys = [keys] if isinstance(keys, str) else keys
keys = sorted([k.replace('.', '_') for k in keys])
# If entry already exists skip it
cached_entry = self.get_entry(keys)
if cached_entry:
# make sure the entry contains all keys
cached_keys = cached_entry.name.split('.')
if set(keys) - set(cached_keys):
# noinspection PyBroadException
try:
self._lock.acquire(timeout=self._lock_timeout_seconds)
except BaseException as ex:
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
# failed locking do nothing
return True
keys = sorted(list(set(keys) | set(cached_keys)))
dst = cached_entry.parent / '.'.join(keys)
# rename
try:
shutil.move(src=cached_entry.as_posix(), dst=dst.as_posix())
except BaseException as ex:
warning('Could not rename cache entry {} to {}: ex'.format(
cached_entry.as_posix(), dst.as_posix(), ex))
# release lock
self._lock.release()
return True
# make sure we remove old entries
self._remove_old_entries()
# if we do not have enough free space, do nothing.
if not self._check_min_free_space():
warning('Could not add cache entry, not enough free space on drive, '
'free space threshold {} GB. Clearing all cache entries!'.format(self._min_free_space_gb))
self._remove_old_entries(max_cache_entries=0)
return False
# create the new entry for us
exclude_sub_folders = exclude_sub_folders or []
source_folder = Path(source_folder).absolute()
# create temp folder
temp_folder = \
self._temp_entry_prefix + \
'{}.{}'.format(str(time()).replace('.', '_'), str(random()).replace('.', '_'))
temp_folder = self._cache_folder / temp_folder
temp_folder.mkdir(parents=True, exist_ok=False)
for f in source_folder.glob('*'):
if f.name in exclude_sub_folders:
continue
shutil.copytree(src=f.as_posix(), dst=(temp_folder / f.name).as_posix(), symlinks=True)
# rename the target folder
target_cache_folder = self._cache_folder / '.'.join(keys)
# if we failed moving it means someone else created the cached entry before us, we can just leave
# noinspection PyBroadException
try:
shutil.move(src=temp_folder.as_posix(), dst=target_cache_folder.as_posix())
except BaseException:
# noinspection PyBroadException
try:
shutil.rmtree(path=temp_folder.as_posix())
except BaseException:
return False
return True
def get_last_copied_entry(self):
# type: () -> Optional[Path]
"""
:return: the last copied cached entry folder inside the cache
"""
return self._last_copied_entry_folder
def _remove_old_entries(self, max_cache_entries=None):
# type: (Optional[int]) -> ()
"""
Notice we only keep self._max_cache_entries-1, assuming we will be adding a new entry soon
:param int max_cache_entries: if not None use instead of self._max_cache_entries
"""
folder_entries = [(cache_folder, cache_folder.stat().st_mtime)
for cache_folder in self._cache_folder.glob('*')
if cache_folder.is_dir() and not cache_folder.name.startswith(self._temp_entry_prefix)]
folder_entries = sorted(folder_entries, key=lambda x: x[1], reverse=True)
# lock so we make sure no one deletes it before we copy it
# noinspection PyBroadException
try:
self._lock.acquire(timeout=self._lock_timeout_seconds)
except BaseException as ex:
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
return
number_of_entries_to_keep = self._max_cache_entries - 1 \
if max_cache_entries is None else max(0, int(max_cache_entries))
for folder, ts in folder_entries[number_of_entries_to_keep:]:
try:
shutil.rmtree(folder.as_posix(), ignore_errors=True)
except BaseException as ex:
warning('Could not delete cache entry {}: {}'.format(folder.as_posix(), ex))
self._lock.release()
def _check_min_free_space(self):
# type: () -> bool
"""
:return: return False if we hit the free space limit.
If not free space limit provided, always return True
"""
if not self._min_free_space_gb or not self._cache_folder:
return True
free_space = float(psutil.disk_usage(self._cache_folder.as_posix()).free)
free_space /= 2**30
return free_space > self._min_free_space_gb

View File

@@ -0,0 +1,211 @@
import os
import time
import tempfile
import contextlib
from .portalocker import constants, exceptions, lock, unlock
current_time = getattr(time, "monotonic", time.time)
DEFAULT_TIMEOUT = 10 ** 8
DEFAULT_CHECK_INTERVAL = 0.25
LOCK_METHOD = constants.LOCK_EX | constants.LOCK_NB
__all__ = [
'FileLock',
'open_atomic',
]
@contextlib.contextmanager
def open_atomic(filename, binary=True):
"""Open a file for atomic writing. Instead of locking this method allows
you to write the entire file and move it to the actual location. Note that
this makes the assumption that a rename is atomic on your platform which
is generally the case but not a guarantee.
http://docs.python.org/library/os.html#os.rename
>>> filename = 'test_file.txt'
>>> if os.path.exists(filename):
... os.remove(filename)
>>> with open_atomic(filename) as fh:
... written = fh.write(b'test')
>>> assert os.path.exists(filename)
>>> os.remove(filename)
"""
assert not os.path.exists(filename), '%r exists' % filename
path, name = os.path.split(filename)
# Create the parent directory if it doesn't exist
if path and not os.path.isdir(path): # pragma: no cover
os.makedirs(path)
temp_fh = tempfile.NamedTemporaryFile(
mode=binary and 'wb' or 'w',
dir=path,
delete=False,
)
yield temp_fh
temp_fh.flush()
os.fsync(temp_fh.fileno())
temp_fh.close()
try:
os.rename(temp_fh.name, filename)
finally:
try:
os.remove(temp_fh.name)
except Exception: # noqa
pass
class FileLock(object):
def __init__(
self, filename, mode='a', timeout=DEFAULT_TIMEOUT,
check_interval=DEFAULT_CHECK_INTERVAL, fail_when_locked=False,
flags=LOCK_METHOD, **file_open_kwargs):
"""Lock manager with build-in timeout
filename -- filename
mode -- the open mode, 'a' or 'ab' should be used for writing
truncate -- use truncate to emulate 'w' mode, None is disabled, 0 is
truncate to 0 bytes
timeout -- timeout when trying to acquire a lock
check_interval -- check interval while waiting
fail_when_locked -- after the initial lock failed, return an error
or lock the file
**file_open_kwargs -- The kwargs for the `open(...)` call
fail_when_locked is useful when multiple threads/processes can race
when creating a file. If set to true than the system will wait till
the lock was acquired and then return an AlreadyLocked exception.
Note that the file is opened first and locked later. So using 'w' as
mode will result in truncate _BEFORE_ the lock is checked.
"""
if 'w' in mode:
truncate = True
mode = mode.replace('w', 'a')
else:
truncate = False
self.fh = None
self.filename = filename
self.mode = mode
self.truncate = truncate
self.timeout = timeout
self.check_interval = check_interval
self.fail_when_locked = fail_when_locked
self.flags = flags
self.file_open_kwargs = file_open_kwargs
def acquire(
self, timeout=None, check_interval=None, fail_when_locked=None):
"""Acquire the locked filehandle"""
if timeout is None:
timeout = self.timeout
if timeout is None:
timeout = 0
if check_interval is None:
check_interval = self.check_interval
if fail_when_locked is None:
fail_when_locked = self.fail_when_locked
# If we already have a filehandle, return it
fh = self.fh
if fh:
return fh
# Get a new filehandler
fh = self._get_fh()
try:
# Try to lock
fh = self._get_lock(fh)
except exceptions.LockException as exception:
# Try till the timeout has passed
timeoutend = current_time() + timeout
while timeoutend > current_time():
# Wait a bit
time.sleep(check_interval)
# Try again
try:
# We already tried to the get the lock
# If fail_when_locked is true, then stop trying
if fail_when_locked:
raise exceptions.AlreadyLocked(exception)
else: # pragma: no cover
# We've got the lock
fh = self._get_lock(fh)
break
except exceptions.LockException:
pass
else:
# We got a timeout... reraising
raise exceptions.LockException(exception)
# Prepare the filehandle (truncate if needed)
fh = self._prepare_fh(fh)
self.fh = fh
return fh
def release(self):
"""Releases the currently locked file handle"""
if self.fh:
# noinspection PyBroadException
try:
unlock(self.fh)
except Exception:
pass
# noinspection PyBroadException
try:
self.fh.close()
except Exception:
pass
self.fh = None
def _get_fh(self):
"""Get a new filehandle"""
return open(self.filename, self.mode, **self.file_open_kwargs)
def _get_lock(self, fh):
"""
Try to lock the given filehandle
returns LockException if it fails"""
lock(fh, self.flags)
return fh
def _prepare_fh(self, fh):
"""
Prepare the filehandle for usage
If truncate is a number, the file will be truncated to that amount of
bytes
"""
if self.truncate:
fh.seek(0)
fh.truncate(0)
return fh
def __enter__(self):
return self.acquire()
def __exit__(self, type_, value, tb):
self.release()
def __delete__(self, instance): # pragma: no cover
instance.release()

View File

@@ -0,0 +1,193 @@
import os
import sys
class exceptions:
class BaseLockException(Exception):
# Error codes:
LOCK_FAILED = 1
def __init__(self, *args, **kwargs):
self.fh = kwargs.pop('fh', None)
Exception.__init__(self, *args, **kwargs)
class LockException(BaseLockException):
pass
class AlreadyLocked(BaseLockException):
pass
class FileToLarge(BaseLockException):
pass
class constants:
# The actual tests will execute the code anyhow so the following code can
# safely be ignored from the coverage tests
if os.name == 'nt': # pragma: no cover
import msvcrt
LOCK_EX = 0x1 #: exclusive lock
LOCK_SH = 0x2 #: shared lock
LOCK_NB = 0x4 #: non-blocking
LOCK_UN = msvcrt.LK_UNLCK #: unlock
LOCKFILE_FAIL_IMMEDIATELY = 1
LOCKFILE_EXCLUSIVE_LOCK = 2
elif os.name == 'posix': # pragma: no cover
import fcntl
LOCK_EX = fcntl.LOCK_EX #: exclusive lock
LOCK_SH = fcntl.LOCK_SH #: shared lock
LOCK_NB = fcntl.LOCK_NB #: non-blocking
LOCK_UN = fcntl.LOCK_UN #: unlock
else: # pragma: no cover
raise RuntimeError('PortaLocker only defined for nt and posix platforms')
if os.name == 'nt': # pragma: no cover
import msvcrt
if sys.version_info.major == 2:
lock_length = -1
else:
lock_length = int(2**31 - 1)
def lock(file_, flags):
if flags & constants.LOCK_SH:
import win32file
import pywintypes
import winerror
__overlapped = pywintypes.OVERLAPPED()
if sys.version_info.major == 2:
if flags & constants.LOCK_NB:
mode = constants.LOCKFILE_FAIL_IMMEDIATELY
else:
mode = 0
else:
if flags & constants.LOCK_NB:
mode = msvcrt.LK_NBRLCK
else:
mode = msvcrt.LK_RLCK
# is there any reason not to reuse the following structure?
hfile = win32file._get_osfhandle(file_.fileno())
try:
win32file.LockFileEx(hfile, mode, 0, -0x10000, __overlapped)
except pywintypes.error as exc_value:
# error: (33, 'LockFileEx', 'The process cannot access the file
# because another process has locked a portion of the file.')
if exc_value.winerror == winerror.ERROR_LOCK_VIOLATION:
raise exceptions.LockException(
exceptions.LockException.LOCK_FAILED,
exc_value.strerror,
fh=file_)
else:
# Q: Are there exceptions/codes we should be dealing with
# here?
raise
else:
mode = constants.LOCKFILE_EXCLUSIVE_LOCK
if flags & constants.LOCK_NB:
mode |= constants.LOCKFILE_FAIL_IMMEDIATELY
if flags & constants.LOCK_NB:
mode = msvcrt.LK_NBLCK
else:
mode = msvcrt.LK_LOCK
# windows locks byte ranges, so make sure to lock from file start
try:
savepos = file_.tell()
if savepos:
# [ ] test exclusive lock fails on seek here
# [ ] test if shared lock passes this point
file_.seek(0)
# [x] check if 0 param locks entire file (not documented in
# Python)
# [x] fails with "IOError: [Errno 13] Permission denied",
# but -1 seems to do the trick
try:
msvcrt.locking(file_.fileno(), mode, lock_length)
except IOError as exc_value:
# [ ] be more specific here
raise exceptions.LockException(
exceptions.LockException.LOCK_FAILED,
exc_value.strerror,
fh=file_)
finally:
if savepos:
file_.seek(savepos)
except IOError as exc_value:
raise exceptions.LockException(
exceptions.LockException.LOCK_FAILED, exc_value.strerror,
fh=file_)
def unlock(file_):
try:
savepos = file_.tell()
if savepos:
file_.seek(0)
try:
msvcrt.locking(file_.fileno(), constants.LOCK_UN, lock_length)
except IOError as exc_value:
if exc_value.strerror == 'Permission denied':
import pywintypes
import win32file
import winerror
__overlapped = pywintypes.OVERLAPPED()
hfile = win32file._get_osfhandle(file_.fileno())
try:
win32file.UnlockFileEx(
hfile, 0, -0x10000, __overlapped)
except pywintypes.error as exc_value:
if exc_value.winerror == winerror.ERROR_NOT_LOCKED:
# error: (158, 'UnlockFileEx',
# 'The segment is already unlocked.')
# To match the 'posix' implementation, silently
# ignore this error
pass
else:
# Q: Are there exceptions/codes we should be
# dealing with here?
raise
else:
raise exceptions.LockException(
exceptions.LockException.LOCK_FAILED,
exc_value.strerror,
fh=file_)
finally:
if savepos:
file_.seek(savepos)
except IOError as exc_value:
raise exceptions.LockException(
exceptions.LockException.LOCK_FAILED, exc_value.strerror,
fh=file_)
elif os.name == 'posix': # pragma: no cover
import fcntl
def lock(file_, flags):
locking_exceptions = IOError,
try: # pragma: no cover
locking_exceptions += BlockingIOError,
except NameError: # pragma: no cover
pass
try:
fcntl.flock(file_.fileno(), flags)
except locking_exceptions as exc_value:
# The exception code varies on different systems so we'll catch
# every IO error
raise exceptions.LockException(exc_value, fh=file_)
def unlock(file_):
fcntl.flock(file_.fileno(), constants.LOCK_UN)
else: # pragma: no cover
raise RuntimeError('PortaLocker only defined for nt and posix platforms')

View File

@@ -0,0 +1,263 @@
from __future__ import unicode_literals
import abc
from collections import OrderedDict
from contextlib import contextmanager
from typing import Text, Iterable, Union, Optional, Dict, List
from pathlib2 import Path
from hashlib import md5
import six
from clearml_agent.helper.base import mkstemp, safe_remove_file, join_lines, select_for_platform
from clearml_agent.helper.console import ensure_binary
from clearml_agent.helper.os.folder_cache import FolderCache
from clearml_agent.helper.process import Executable, Argv, PathLike
@six.add_metaclass(abc.ABCMeta)
class PackageManager(object):
"""
ABC for classes providing python package management interface
"""
_selected_manager = None
_cwd = None
_pip_version = None
_config_cache_folder = 'agent.venvs_cache.path'
_config_cache_max_entries = 'agent.venvs_cache.max_entries'
_config_cache_free_space_threshold = 'agent.venvs_cache.free_space_threshold_gb'
def __init__(self):
self._cache_manager = None
@abc.abstractproperty
def bin(self):
# type: () -> PathLike
pass
@abc.abstractmethod
def create(self):
pass
@abc.abstractmethod
def remove(self):
pass
@abc.abstractmethod
def install_from_file(self, path):
pass
@abc.abstractmethod
def freeze(self):
pass
@abc.abstractmethod
def load_requirements(self, requirements):
pass
@abc.abstractmethod
def install_packages(self, *packages):
# type: (Iterable[Text]) -> None
"""
Install packages, upgrading depends on config
"""
pass
@abc.abstractmethod
def _install(self, *packages):
# type: (Iterable[Text]) -> None
"""
Run install command
"""
pass
@abc.abstractmethod
def uninstall_packages(self, *packages):
# type: (Iterable[Text]) -> None
pass
def upgrade_pip(self):
result = self._install(
select_for_platform(windows='pip{}', linux='pip{}').format(self.get_pip_version()), "--upgrade")
packages = self.run_with_env(('list',), output=True).splitlines()
# p.split is ('pip', 'x.y.z')
pip = [p.split() for p in packages if len(p.split()) == 2 and p.split()[0] == 'pip']
if pip:
# noinspection PyBroadException
try:
from .requirements import MarkerRequirement
pip = pip[0][1].split('.')
MarkerRequirement.pip_new_version = bool(int(pip[0]) >= 20)
except Exception:
pass
return result
def get_python_command(self, extra=()):
# type: (...) -> Executable
return Argv(self.bin, *extra)
@contextmanager
def temp_file(self, prefix, contents, suffix=".txt"):
# type: (Union[Text, Iterable[Text]], Iterable[Text], Text) -> Text
"""
Write contents to a temporary file, yielding its path. Finally, delete it.
:param prefix: file name prefix
:param contents: text lines to write
:param suffix: file name suffix
"""
f, temp_path = mkstemp(suffix=suffix, prefix=prefix)
with f:
f.write(
contents
if isinstance(contents, six.text_type)
else join_lines(contents)
)
try:
yield temp_path
finally:
if not self.session.debug_mode:
safe_remove_file(temp_path)
def set_selected_package_manager(self):
# set this instance as the selected package manager
# this is helpful when we want out of context requirement installations
PackageManager._selected_manager = self
@property
def cwd(self):
return self._cwd
@cwd.setter
def cwd(self, value):
self._cwd = value
@classmethod
def out_of_scope_install_package(cls, package_name, *args):
if PackageManager._selected_manager is not None:
try:
result = PackageManager._selected_manager._install(package_name, *args)
if result not in (0, None, True):
return False
except Exception:
return False
return True
@classmethod
def out_of_scope_freeze(cls):
if PackageManager._selected_manager is not None:
try:
return PackageManager._selected_manager.freeze()
except Exception:
pass
return []
@classmethod
def set_pip_version(cls, version):
if not version:
return
version = version.replace(' ', '')
if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
cls._pip_version = version
else:
cls._pip_version = "=="+version
@classmethod
def get_pip_version(cls):
return cls._pip_version or ''
def get_cached_venv(self, requirements, docker_cmd, python_version, cuda_version, destination_folder):
# type: (Dict, Optional[Union[dict, str]], Optional[str], Optional[str], Path) -> Optional[Path]
"""
Copy a cached copy of the venv (based on the requirements) into destination_folder.
Return None if failed or cached entry does not exist
"""
if not self._get_cache_manager():
return None
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
return self._get_cache_manager().copy_cached_entry(keys, destination_folder)
def add_cached_venv(
self,
requirements, # type: Union[Dict, List[Dict]]
docker_cmd, # type: Optional[Union[dict, str]]
python_version, # type: Optional[str]
cuda_version, # type: Optional[str]
source_folder, # type: Path
exclude_sub_folders=None # type: Optional[List[str]]
):
# type: (...) -> ()
"""
Copy the local venv folder into the venv cache (keys are based on the requirements+python+docker).
"""
if not self._get_cache_manager():
return
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
return self._get_cache_manager().add_entry(
keys=keys, source_folder=source_folder, exclude_sub_folders=exclude_sub_folders)
def get_cache_folder(self):
# type: () -> Optional[Path]
if not self._get_cache_manager():
return
return self._get_cache_manager().get_cache_folder()
def get_last_used_entry_cache(self):
# type: () -> Optional[Path]
"""
:return: the last used cached folder entry
"""
if not self._get_cache_manager():
return
return self._get_cache_manager().get_last_copied_entry()
@classmethod
def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
# type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
requirements_list = requirements_list or dict()
if not isinstance(requirements_list, (list, tuple)):
requirements_list = [requirements_list]
docker_cmd = dict(docker_cmd=docker_cmd) if isinstance(docker_cmd, str) else docker_cmd or dict()
docker_cmd = OrderedDict(sorted(docker_cmd.items(), key=lambda t: t[0]))
if 'docker_cmd' in docker_cmd:
# we only take the first part of the docker_cmd which is the docker image name
docker_cmd['docker_cmd'] = docker_cmd['docker_cmd'].strip('\r\n\t ').split(' ')[0]
keys = []
strip_chars = '\n\r\t '
for requirements in requirements_list:
pip, conda = ('pip', 'conda')
pip_reqs = requirements.get(pip, '')
conda_reqs = requirements.get(conda, '')
if isinstance(pip_reqs, str):
pip_reqs = pip_reqs.split('\n')
if isinstance(conda_reqs, str):
conda_reqs = conda_reqs.split('\n')
pip_reqs = sorted([p.strip(strip_chars) for p in pip_reqs
if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
conda_reqs = sorted([p.strip(strip_chars) for p in conda_reqs
if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
if not pip_reqs and not conda_reqs:
continue
hash_text = '{class_type}\n{docker_cmd}\n{cuda_ver}\n{python_version}\n{pip_reqs}\n{conda_reqs}'.format(
class_type=str(cls),
docker_cmd=str(docker_cmd or ''),
cuda_ver=str(cuda_version or ''),
python_version=str(python_version or ''),
pip_reqs=str(pip_reqs or ''),
conda_reqs=str(conda_reqs or ''),
)
keys.append(md5(ensure_binary(hash_text)).hexdigest())
return sorted(list(set(keys)))
def _get_cache_manager(self):
if not self._cache_manager:
cache_folder = self.session.config.get(self._config_cache_folder, None)
if not cache_folder:
return None
max_entries = int(self.session.config.get(self._config_cache_max_entries, 10))
free_space_threshold = float(self.session.config.get(self._config_cache_free_space_threshold, 0))
self._cache_manager = FolderCache(
cache_folder, max_cache_entries=max_entries, min_free_space_gb=free_space_threshold)
return self._cache_manager

View File

@@ -2,8 +2,9 @@ from __future__ import unicode_literals
import json
import re
import shutil
import os
import subprocess
from collections import OrderedDict
from distutils.spawn import find_executable
from functools import partial
from itertools import chain
@@ -14,17 +15,18 @@ import yaml
from time import time
from attr import attrs, attrib, Factory
from pathlib2 import Path
from requirements import parse
from requirements.requirement import Requirement
from clearml_agent.external.requirements_parser import parse
from clearml_agent.external.requirements_parser.requirement import Requirement
from trains_agent.errors import CommandFailedError
from trains_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform
from trains_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
from trains_agent.helper.package.requirements import SimpleVersion
from trains_agent.session import Session
from clearml_agent.errors import CommandFailedError
from clearml_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo
from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
from clearml_agent.helper.package.requirements import SimpleVersion
from clearml_agent.session import Session
from .base import PackageManager
from .pip_api.venv import VirtualenvPip
from .requirements import RequirementsManager, MarkerRequirement
from ...backend_api.session.defs import ENV_CONDA_ENV_PACKAGE
package_normalize = partial(re.compile(r"""\[version=['"](.*)['"]\]""").sub, r"\1")
@@ -40,8 +42,8 @@ def _package_diff(path, packages):
class CondaPip(VirtualenvPip):
def __init__(self, source=None, *args, **kwargs):
super(CondaPip, self).__init__(*args, interpreter=Path(kwargs.get('path'), "python.exe") \
if is_windows_platform() and kwargs.get('path') else None, **kwargs)
super(CondaPip, self).__init__(*args, interpreter=Path(kwargs.get('path'), "python.exe")
if is_windows_platform() and kwargs.get('path') else None, **kwargs)
self.source = source
def run_with_env(self, command, output=False, **kwargs):
@@ -61,18 +63,27 @@ class CondaAPI(PackageManager):
MINIMUM_VERSION = "4.3.30"
def __init__(self, session, path, python, requirements_manager):
# type: (Session, PathLike, float, RequirementsManager) -> None
def __init__(self, session, path, python, requirements_manager, execution_info=None, **kwargs):
# type: (Session, PathLike, float, RequirementsManager, ExecutionInfo, Any) -> None
"""
:param python: base python version to use (e.g python3.6)
:param path: path of env
"""
super(CondaAPI, self).__init__()
self.session = session
self.python = python
self.source = None
self.requirements_manager = requirements_manager
self.path = path
self.env_read_only = False
self.extra_channels = self.session.config.get('agent.package_manager.conda_channels', [])
self.conda_env_as_base_docker = \
self.session.config.get('agent.package_manager.conda_env_as_base_docker', None) or \
bool(ENV_CONDA_ENV_PACKAGE.get())
if ENV_CONDA_ENV_PACKAGE.get():
self.conda_pre_build_env_path = ENV_CONDA_ENV_PACKAGE.get()
else:
self.conda_pre_build_env_path = execution_info.docker_cmd if execution_info else None
self.pip = CondaPip(
session=self.session,
source=self.source,
@@ -80,10 +91,15 @@ class CondaAPI(PackageManager):
requirements_manager=self.requirements_manager,
path=self.path,
)
self.conda = (
find_executable("conda")
or Argv(select_for_platform(windows="where", linux="which"), "conda").get_output(shell=True).strip()
)
try:
self.conda = (
find_executable("conda") or
Argv(select_for_platform(windows="where", linux="which"), "conda").get_output(
shell=select_for_platform(windows=True, linux=False)).strip()
)
except Exception:
raise ValueError("ERROR: package manager \"conda\" selected, "
"but \'conda\' executable could not be located")
try:
output = Argv(self.conda, "--version").get_output(stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as ex:
@@ -111,13 +127,46 @@ class CondaAPI(PackageManager):
def bin(self):
return self.pip.bin
# noinspection SpellCheckingInspection
def upgrade_pip(self):
return self._install("pip" + self.pip.get_pip_version())
# do not change pip version if pre built environement is used
if self.env_read_only:
print('Conda environment in read-only mode, skipping pip upgrade.')
return ''
return self._install(select_for_platform(windows='pip{}', linux='pip{}').format(self.pip.get_pip_version()))
def create(self):
"""
Create a new environment
"""
if self.conda_env_as_base_docker and self.conda_pre_build_env_path:
if Path(self.conda_pre_build_env_path).is_dir():
self._init_existing_environment(self.conda_pre_build_env_path)
return self
elif Path(self.conda_pre_build_env_path).is_file():
print("Restoring Conda environment from {}".format(self.conda_pre_build_env_path))
tar_path = find_executable("tar")
self.path.mkdir(parents=True, exist_ok=True)
output = Argv(
tar_path,
"-xzf",
self.conda_pre_build_env_path,
"-C",
self.path,
).get_output()
self.source = self.pip.source = ("conda", "activate", self.path.as_posix())
conda_env = self._get_conda_sh()
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
# unpack cleanup
print("Fixing prefix in Conda environment {}".format(self.path))
CommandSequence(('source', conda_env.as_posix()),
((self.path / 'bin' / 'conda-unpack').as_posix(), )).get_output()
return self
else:
raise ValueError("Could not restore Conda environment, cannot find {}".format(
self.conda_pre_build_env_path))
output = Argv(
self.conda,
"create",
@@ -133,13 +182,15 @@ class CondaAPI(PackageManager):
self.source = self.pip.source = (
tuple(match.group(1).split()) + (match.group(2),)
if match
else ("activate", self.path)
else ("conda", "activate", self.path.as_posix())
)
conda_env = Path(self.conda).parent.parent / 'etc' / 'profile.d' / 'conda.sh'
conda_env = self._get_conda_sh()
if conda_env.is_file() and not is_windows_platform():
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
# install cuda toolkit
# noinspection PyBroadException
try:
cuda_version = float(int(self.session.config['agent.cuda_version'])) / 10.0
if cuda_version > 0:
@@ -148,6 +199,21 @@ class CondaAPI(PackageManager):
pass
return self
def _init_existing_environment(self, conda_pre_build_env_path):
print("Using pre-existing Conda environment from {}".format(conda_pre_build_env_path))
self.path = Path(conda_pre_build_env_path)
self.source = ("conda", "activate", self.path.as_posix())
self.pip = CondaPip(
session=self.session,
source=self.source,
python=self.python,
requirements_manager=self.requirements_manager,
path=self.path,
)
conda_env = self._get_conda_sh()
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
self.env_read_only = True
def remove(self):
"""
Delete a conda environment.
@@ -181,6 +247,10 @@ class CondaAPI(PackageManager):
def _install(self, *args):
# type: (*PathLike) -> ()
# if we are in read only mode, do not install anything
if self.env_read_only:
print('Conda environment in read-only mode, skipping package installing: {}'.format(args))
return
channels_args = tuple(
chain.from_iterable(("-c", channel) for channel in self.extra_channels)
)
@@ -208,41 +278,174 @@ class CondaAPI(PackageManager):
return self._install(*packages)
def uninstall_packages(self, *packages):
# if we are in read only mode, do not uninstall anything
if self.env_read_only:
print('Conda environment in read-only mode, skipping package uninstalling: {}'.format(packages))
return ''
return self._run_command(("uninstall", "-p", self.path))
def install_from_file(self, path):
"""
Try to install packages from conda. Install packages which are not available from conda with pip.
"""
try:
self._install_from_file(path)
return
except PackageNotFoundError as e:
pip_packages = [e.pkg]
except PackagesNotFoundError as e:
pip_packages = package_set(e.packages)
with self.temp_file("conda_reqs", _package_diff(path, pip_packages)) as reqs:
self.install_from_file(reqs)
with self.temp_file("pip_reqs", pip_packages) as reqs:
self.pip.install_from_file(reqs)
requirements = {}
# assume requirements.txt
with open(path, 'rt') as f:
requirements['pip'] = f.read()
self.load_requirements(requirements)
def freeze(self):
def freeze(self, freeze_full_environment=False):
requirements = self.pip.freeze()
req_lines = []
conda_lines = []
# noinspection PyBroadException
try:
conda_packages = json.loads(self._run_command((self.conda, "list", "--json", "-p", self.path), raw=True))
conda_packages_txt = []
requirements_pip = [r.split('==')[0].strip().lower() for r in requirements['pip']]
for pkg in conda_packages:
# skip if this is a pypi package or it is not a python package at all
if pkg['channel'] == 'pypi' or pkg['name'].lower() not in requirements_pip:
pip_lines = requirements['pip']
conda_packages_json = json.loads(
self._run_command((self.conda, "list", "--json", "-p", self.path), raw=True))
for r in conda_packages_json:
# check if this is a pypi package, if it is, leave it outside
if not r.get('channel') or r.get('channel') == 'pypi':
name = (r['name'].replace('-', '_'), r['name'])
pip_req_line = [l for l in pip_lines
if l.split('==', 1)[0].strip() in name or l.split('@', 1)[0].strip() in name]
if pip_req_line and \
('@' not in pip_req_line[0] or
not pip_req_line[0].split('@', 1)[1].strip().startswith('file://')):
req_lines.append(pip_req_line[0])
continue
req_lines.append(
'{}=={}'.format(name[1], r['version']) if r.get('version') else '{}'.format(name[1]))
continue
conda_packages_txt.append('{0}{1}{2}'.format(pkg['name'], '==', pkg['version']))
requirements['conda'] = conda_packages_txt
except:
# check if we have it in our required packages
name = r['name']
# hack support pytorch/torch different naming convention
if name == 'pytorch':
name = 'torch'
# skip over packages with _
if name.startswith('_'):
continue
conda_lines.append('{}=={}'.format(name, r['version']) if r.get('version') else '{}'.format(name))
# make sure we see the conda packages, put them into the pip as well
if conda_lines:
req_lines = ['# Conda Packages', ''] + conda_lines + ['', '# pip Packages', ''] + req_lines
requirements['pip'] = req_lines
requirements['conda'] = conda_lines
except Exception:
pass
if freeze_full_environment:
# noinspection PyBroadException
try:
conda_env_json = json.loads(
self._run_command((self.conda, "env", "export", "--json", "-p", self.path), raw=True))
conda_env_json.pop('name', None)
conda_env_json.pop('prefix', None)
conda_env_json.pop('channels', None)
requirements['conda_env_json'] = json.dumps(conda_env_json)
except Exception:
pass
return requirements
def _load_conda_full_env(self, conda_env_dict, requirements):
# noinspection PyBroadException
try:
cuda_version = int(self.session.config.get('agent.cuda_version', 0))
except Exception:
cuda_version = 0
conda_env_dict['channels'] = self.extra_channels
if 'dependencies' not in conda_env_dict:
conda_env_dict['dependencies'] = []
new_dependencies = OrderedDict()
pip_requirements = None
for line in conda_env_dict['dependencies']:
if isinstance(line, dict):
pip_requirements = line.pop('pip', None)
continue
name = line.strip().split('=', 1)[0].lower()
if name == 'pip':
continue
elif name == 'python':
line = 'python={}'.format('.'.join(line.split('=')[1].split('.')[:2]))
elif name == 'tensorflow-gpu' and cuda_version == 0:
line = 'tensorflow={}'.format(line.split('=')[1])
elif name == 'tensorflow' and cuda_version > 0:
line = 'tensorflow-gpu={}'.format(line.split('=')[1])
elif name in ('cupti', 'cudnn'):
# cudatoolkit should pull them based on the cudatoolkit version
continue
elif name.startswith('_'):
continue
new_dependencies[line.split('=', 1)[0].strip()] = line
# fix packages:
conda_env_dict['dependencies'] = list(new_dependencies.values())
with self.temp_file("conda_env", yaml.dump(conda_env_dict), suffix=".yml") as name:
print('Conda: Trying to install requirements:\n{}'.format(conda_env_dict['dependencies']))
result = self._run_command(
("env", "update", "-p", self.path, "--file", name)
)
# check if we need to remove specific packages
bad_req = self._parse_conda_result_bad_packges(result)
if bad_req:
print('failed installing the following conda packages: {}'.format(bad_req))
return False
if pip_requirements:
# create a list of vcs packages that we need to replace in the pip section
vcs_reqs = {}
if 'pip' in requirements:
pip_lines = requirements['pip'].splitlines() \
if isinstance(requirements['pip'], six.string_types) else requirements['pip']
for line in pip_lines:
try:
marker = list(parse(line))
except Exception:
marker = None
if not marker:
continue
m = MarkerRequirement(marker[0])
if m.vcs:
vcs_reqs[m.name] = m
try:
pip_req_str = [str(vcs_reqs.get(r.split('=', 1)[0], r)) for r in pip_requirements
if not r.startswith('pip=') and not r.startswith('virtualenv=')]
print('Conda: Installing requirements: step 2 - using pip:\n{}'.format(pip_req_str))
PackageManager._selected_manager = self.pip
self.pip.load_requirements({'pip': '\n'.join(pip_req_str)})
except Exception as e:
print(e)
raise e
finally:
PackageManager._selected_manager = self
self.requirements_manager.post_install(self.session)
def load_requirements(self, requirements):
# if we are in read only mode, do not uninstall anything
if self.env_read_only:
print('Conda environment in read-only mode, skipping requirements installation.')
return None
# if we have a full conda environment, use it and pass the pip to pip
if requirements.get('conda_env_json'):
# noinspection PyBroadException
try:
conda_env_json = json.loads(requirements.get('conda_env_json'))
print('Conda restoring full yaml environment')
return self._load_conda_full_env(conda_env_json, requirements)
except Exception:
print('Could not load fully stored conda environment, falling back to requirements')
# create new environment file
conda_env = dict()
conda_env['channels'] = self.extra_channels
@@ -276,6 +479,15 @@ class CondaAPI(PackageManager):
if m.vcs:
pip_requirements.append(m)
continue
# Skip over pip
if m.name in ('pip', 'virtualenv', ):
continue
# python version, only major.minor
if m.name == 'python' and m.specs:
m.specs = [(m.specs[0][0], '.'.join(m.specs[0][1].split('.')[:2])), ]
if '.' not in m.specs[0][1]:
continue
conda_supported_req_names.append(m.name.lower())
if m.req.name.lower() == 'matplotlib':
has_matplotlib = True
@@ -293,6 +505,8 @@ class CondaAPI(PackageManager):
reqs.append(m)
# if we have a conda list, the rest should be installed with pip,
# this means any experiment that was executed with pip environment,
# will be installed using pip
if requirements.get('conda', None) is not None:
for r in requirements['pip']:
try:
@@ -303,15 +517,20 @@ class CondaAPI(PackageManager):
continue
m = MarkerRequirement(marker[0])
m_name = m.name.lower()
# skip over local files (we cannot change the version to a local file)
if m.local_file:
continue
m_name = (m.name or '').lower()
if m_name in conda_supported_req_names:
# this package is in the conda list,
# make sure that if we changed version and we match it in conda
conda_supported_req_names.remove(m_name)
## conda_supported_req_names.remove(m_name)
for cr in reqs:
if m_name == cr.name.lower():
if m_name.lower().replace('_', '-') == cr.name.lower().replace('_', '-'):
# match versions
cr.specs = m.specs
# # conda always likes "-" not "_" but only on pypi packages
# cr.name = cr.name.lower().replace('_', '-')
break
else:
# not in conda, it is a pip package
@@ -319,29 +538,39 @@ class CondaAPI(PackageManager):
if m_name == 'matplotlib':
has_matplotlib = True
# remove any leftover conda packages (they were removed from the pip list)
if conda_supported_req_names:
reqs = [r for r in reqs if r.name.lower() not in conda_supported_req_names]
# Conda requirements Hacks:
if has_matplotlib:
reqs.append(MarkerRequirement(Requirement.parse('graphviz')))
reqs.append(MarkerRequirement(Requirement.parse('python-graphviz')))
reqs.append(MarkerRequirement(Requirement.parse('kiwisolver')))
# remove specific cudatoolkit, it should have being preinstalled.
# allow to override default cudatoolkit, but not the derivative packages, cudatoolkit should pull them
reqs = [r for r in reqs if r.name not in ('cudnn', 'cupti')]
if has_torch and cuda_version == 0:
reqs.append(MarkerRequirement(Requirement.parse('cpuonly')))
# make sure we have no double entries
reqs = list(OrderedDict((r.name, r) for r in reqs).values())
# conform conda packages (version/name)
for r in reqs:
# change _ to - in name but not the prefix _ (as this is conda prefix)
if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
r.name = r.name.replace('_', '-')
# remove .post from version numbers, it fails ~= version, and change == to ~=
if r.specs and r.specs[0]:
r.specs = [(r.specs[0][0].replace('==', '~='), r.specs[0][1].split('.post')[0])]
# conda always likes "-" not "_"
r.req.name = r.req.name.replace('_', '-')
while reqs:
# notice, we give conda more freedom in version selection, to help it choose best combination
conda_env['dependencies'] = [r.tostr() for r in reqs]
def clean_ver(ar):
if not ar.specs:
return ar.tostr()
ar.specs = [(ar.specs[0][0], ar.specs[0][1] + '.0' if '.' not in ar.specs[0][1] else ar.specs[0][1])]
return ar.tostr()
conda_env['dependencies'] = [clean_ver(r) for r in reqs]
with self.temp_file("conda_env", yaml.dump(conda_env), suffix=".yml") as name:
print('Conda: Trying to install requirements:\n{}'.format(conda_env['dependencies']))
result = self._run_command(
@@ -371,12 +600,15 @@ class CondaAPI(PackageManager):
if pip_requirements:
try:
pip_req_str = [r.tostr() for r in pip_requirements]
pip_req_str = [r.tostr() for r in pip_requirements if r.name not in ('pip', 'virtualenv', )]
print('Conda: Installing requirements: step 2 - using pip:\n{}'.format(pip_req_str))
self.pip.load_requirements('\n'.join(pip_req_str))
PackageManager._selected_manager = self.pip
self.pip.load_requirements({'pip': '\n'.join(pip_req_str)})
except Exception as e:
print(e)
raise e
finally:
PackageManager._selected_manager = self
self.requirements_manager.post_install(self.session)
return True
@@ -439,10 +671,26 @@ class CondaAPI(PackageManager):
return result
def get_python_command(self, extra=()):
if not self.source:
self._init_existing_environment(self.path)
return CommandSequence(self.source, self.pip.get_python_command(extra=extra))
def _get_conda_sh(self):
# type () -> Path
base_conda_env = Path(self.conda).parent.parent / 'etc' / 'profile.d' / 'conda.sh'
if base_conda_env.is_file():
return base_conda_env
for path in os.environ.get('PATH', '').split(select_for_platform(windows=';', linux=':')):
conda = find_executable("conda", path=path)
if not conda:
continue
conda_env = Path(conda).parent.parent / 'etc' / 'profile.d' / 'conda.sh'
if conda_env.is_file():
return conda_env
return base_conda_env
# enable hashing with cmp=False because pdb fails on unhashable exceptions
# enable hashing with cmp=False because pdb fails on un-hashable exceptions
exception = attrs(str=True, cmp=False)

View File

@@ -0,0 +1,133 @@
import re
from collections import OrderedDict
from typing import Text
from .base import PackageManager
from .requirements import SimpleSubstitution
from ..base import safe_furl as furl
class ExternalRequirements(SimpleSubstitution):
name = "external_link"
def __init__(self, *args, **kwargs):
super(ExternalRequirements, self).__init__(*args, **kwargs)
self.post_install_req = []
self.post_install_req_lookup = OrderedDict()
def match(self, req):
# match local folder building:
# noinspection PyBroadException
try:
if not req.name and req.req and not req.req.editable and not req.req.vcs and \
req.req.line and req.req.line.strip().split('#')[0] and \
not req.req.line.strip().split('#')[0].lower().endswith('.whl'):
return True
except Exception:
pass
# match both editable or code or unparsed
if not (not req.name or req.req and (req.req.editable or req.req.vcs)):
return False
if not req.req or not req.req.line or not req.req.line.strip() or req.req.line.strip().startswith('#'):
return False
if req.pip_new_version and not (req.req.editable or req.req.vcs):
return False
return True
def post_install(self, session):
post_install_req = self.post_install_req
self.post_install_req = []
for req in post_install_req:
try:
freeze_base = PackageManager.out_of_scope_freeze() or ''
except:
freeze_base = ''
req_line = req.tostr(markers=False)
if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
if req.req.vcs and req_line.startswith('git+'):
try:
url_no_frag = furl(req_line)
url_no_frag.set(fragment=None)
# reverse replace
fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
vcs_url = req_line[4:]
# reverse replace
vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
from ..repo import Git
vcs = Git(session=session, url=vcs_url, location=None, revision=None)
vcs._set_ssh_url()
new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)
if new_req_line != req_line:
furl_line = furl(new_req_line)
print('Replacing original pip vcs \'{}\' with \'{}\''.format(
req_line,
furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
req_line = new_req_line
except Exception:
print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
# if we have older pip version we have to make sure we replace back the package name with the
# git repository link. In new versions this is supported and we get "package @ git+https://..."
if not req.pip_new_version:
PackageManager.out_of_scope_install_package(req_line, "--no-deps")
# noinspection PyBroadException
try:
freeze_post = PackageManager.out_of_scope_freeze() or ''
package_name = list(set(freeze_post['pip']) - set(freeze_base['pip']))
if package_name and package_name[0] not in self.post_install_req_lookup:
self.post_install_req_lookup[package_name[0]] = req.req.line
except Exception:
pass
# no need to force reinstall, pip will always rebuilt if the package comes from git
# and make sure the required packages are installed (if they are not it will install them)
if not PackageManager.out_of_scope_install_package(req_line):
raise ValueError("Failed installing GIT/HTTPs package \'{}\'".format(req_line))
def replace(self, req):
"""
Replace a requirement
:raises: ValueError if version is pre-release
"""
# Store in post req install, and return nothing
self.post_install_req.append(req)
# mark skip package, we will install it in post install hook
return Text('')
def replace_back(self, list_of_requirements):
if not list_of_requirements:
return list_of_requirements
for k in list_of_requirements:
# k is either pip/conda
if k not in ('pip', 'conda'):
continue
original_requirements = list_of_requirements[k]
list_of_requirements[k] = [r for r in original_requirements
if r not in self.post_install_req_lookup]
list_of_requirements[k] += [self.post_install_req_lookup.get(r, '')
for r in self.post_install_req_lookup.keys() if r in original_requirements]
return list_of_requirements
class OnlyExternalRequirements(ExternalRequirements):
def __init__(self, *args, **kwargs):
super(OnlyExternalRequirements, self).__init__(*args, **kwargs)
def match(self, req):
return not super(OnlyExternalRequirements, self).match(req)
def replace(self, req):
"""
Replace a requirement
:raises: ValueError if version is pre-release
"""
# Do not store the skipped requirements
# mark skip package
return Text('')

View File

@@ -2,10 +2,10 @@ import sys
from itertools import chain
from typing import Text, Optional
from trains_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
from trains_agent.helper.package.base import PackageManager
from trains_agent.helper.process import Argv, DEVNULL
from trains_agent.session import Session
from clearml_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
from clearml_agent.helper.package.base import PackageManager
from clearml_agent.helper.process import Argv, DEVNULL
from clearml_agent.session import Session
class SystemPip(PackageManager):
@@ -17,6 +17,7 @@ class SystemPip(PackageManager):
"""
Program interface to the system pip.
"""
super(SystemPip, self).__init__()
self._bin = interpreter or sys.executable
self.session = session

View File

@@ -1,16 +1,18 @@
from typing import Any
from pathlib2 import Path
from trains_agent.helper.base import select_for_platform, rm_tree
from trains_agent.helper.package.base import PackageManager
from trains_agent.helper.process import Argv, PathLike
from trains_agent.session import Session
from clearml_agent.helper.base import select_for_platform, rm_tree, ExecutionInfo
from clearml_agent.helper.package.base import PackageManager
from clearml_agent.helper.process import Argv, PathLike
from clearml_agent.session import Session
from ..pip_api.system import SystemPip
from ..requirements import RequirementsManager
class VirtualenvPip(SystemPip, PackageManager):
def __init__(self, session, python, requirements_manager, path, interpreter=None):
# type: (Session, float, RequirementsManager, PathLike, PathLike) -> ()
def __init__(self, session, python, requirements_manager, path, interpreter=None, execution_info=None, **kwargs):
# type: (Session, float, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
"""
Program interface to virtualenv pip.
Must be given either path to virtualenv or source command.

View File

@@ -5,8 +5,8 @@ import attr
import sys
import os
from pathlib2 import Path
from trains_agent.helper.process import Argv, DEVNULL, check_if_command_exists
from trains_agent.session import Session, POETRY
from clearml_agent.helper.process import Argv, DEVNULL, check_if_command_exists
from clearml_agent.session import Session, POETRY
def prop_guard(prop, log_prop=None):

View File

@@ -0,0 +1,48 @@
from typing import Text
from .base import PackageManager
from .requirements import SimpleSubstitution
class PostRequirement(SimpleSubstitution):
name = ("horovod", )
optional_package_names = tuple()
def __init__(self, *args, **kwargs):
super(PostRequirement, self).__init__(*args, **kwargs)
self.post_install_req = []
# check if we need to replace the packages:
post_packages = self.config.get('agent.package_manager.post_packages', None)
if post_packages:
self.__class__.name = post_packages
post_optional_packages = self.config.get('agent.package_manager.post_optional_packages', None)
if post_optional_packages:
self.__class__.optional_package_names = post_optional_packages
def match(self, req):
# match both horovod
return req.name and (req.name.lower() in self.name or req.name.lower() in self.optional_package_names)
def post_install(self, session):
for req in self.post_install_req:
if req.name in self.optional_package_names:
# noinspection PyBroadException
try:
PackageManager.out_of_scope_install_package(req.tostr(markers=False))
except Exception:
pass
else:
PackageManager.out_of_scope_install_package(req.tostr(markers=False))
self.post_install_req = []
def replace(self, req):
"""
Replace a requirement
:raises: ValueError if version is pre-release
"""
# Store in post req install, and return nothing
self.post_install_req.append(req)
# mark skip package, we will install it in post install hook
return Text('')

Some files were not shown because too many files have changed in this diff Show More