mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Compare commits
420 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
293cbc0ac6 | ||
|
|
4387ed73b6 | ||
|
|
43443ccf08 | ||
|
|
3d43240c8f | ||
|
|
fc58ba947b | ||
|
|
22672d2444 | ||
|
|
6a4fcda1bf | ||
|
|
a4ebf8293d | ||
|
|
10fb157d58 | ||
|
|
56058beec2 | ||
|
|
9f207d5155 | ||
|
|
8a2bea3c14 | ||
|
|
f1f9278928 | ||
|
|
2de1c926bf | ||
|
|
e1104e60bb | ||
|
|
8b2970350c | ||
|
|
a2758250b2 | ||
|
|
01e8ffd854 | ||
|
|
74edf6aa36 | ||
|
|
09c5ef99af | ||
|
|
17ae28a62f | ||
|
|
059a9385e9 | ||
|
|
9a321a410f | ||
|
|
919013d4fe | ||
|
|
05530b712b | ||
|
|
8d15fd8798 | ||
|
|
b34329934b | ||
|
|
85049d8705 | ||
|
|
6fbd70786e | ||
|
|
05a65548da | ||
|
|
6657003d65 | ||
|
|
95dde6ca0c | ||
|
|
c9fc092f4e | ||
|
|
432ee395e1 | ||
|
|
98fc4f0fb9 | ||
|
|
111e774c21 | ||
|
|
3dd8d783e1 | ||
|
|
7c3e420df4 | ||
|
|
55b065a114 | ||
|
|
faa97b6cc2 | ||
|
|
f5861b1e4a | ||
|
|
030cbb69f1 | ||
|
|
564f769ff7 | ||
|
|
2c7f091e57 | ||
|
|
dd5d24b0ca | ||
|
|
996bb797c3 | ||
|
|
9ad49a0d21 | ||
|
|
ba4fee7b19 | ||
|
|
0131db8b7d | ||
|
|
d2384a9a95 | ||
|
|
5b86c230c1 | ||
|
|
21e4be966f | ||
|
|
9c6cb421b3 | ||
|
|
52405c343d | ||
|
|
46f0c991c8 | ||
|
|
0254279ed5 | ||
|
|
0e1750f90e | ||
|
|
58e0dc42ec | ||
|
|
d16825029d | ||
|
|
fb639afcb9 | ||
|
|
eefb94d1bc | ||
|
|
f1e9266075 | ||
|
|
e1e3c84a8d | ||
|
|
ed1356976b | ||
|
|
2b815354e0 | ||
|
|
edae380a9e | ||
|
|
946e9d9ce9 | ||
|
|
a56343ffc7 | ||
|
|
159a6e9a5a | ||
|
|
6b7ee12dc1 | ||
|
|
3838247716 | ||
|
|
6e7d35a42a | ||
|
|
4c056a17b9 | ||
|
|
21d98afca5 | ||
|
|
6a1bf11549 | ||
|
|
7115a9b9a7 | ||
|
|
450df2f8d3 | ||
|
|
ccf752c4e4 | ||
|
|
3ed63e2154 | ||
|
|
a535f93cd6 | ||
|
|
b380ec54c6 | ||
|
|
a1274299ce | ||
|
|
c77224af68 | ||
|
|
95dadca45c | ||
|
|
685918fd9b | ||
|
|
bc85ddf78d | ||
|
|
5b5fb0b8a6 | ||
|
|
fec0ce1756 | ||
|
|
1e09b88b7a | ||
|
|
b6ca0fa6a5 | ||
|
|
307ec9213e | ||
|
|
a78a25d966 | ||
|
|
ebb6231f5a | ||
|
|
e1d65cb280 | ||
|
|
3fe92a92ba | ||
|
|
154db59ce6 | ||
|
|
afffa83063 | ||
|
|
787c7d88bb | ||
|
|
667c2ced3d | ||
|
|
7f5b3c8df4 | ||
|
|
46ded2864d | ||
|
|
40456be948 | ||
|
|
8d51aed679 | ||
|
|
bfc4ba38cd | ||
|
|
3cedc104df | ||
|
|
b367c80477 | ||
|
|
262b6d3a00 | ||
|
|
95e996bfda | ||
|
|
b6d132b226 | ||
|
|
4f17a2c17d | ||
|
|
00e8e9eb5a | ||
|
|
af6a77918f | ||
|
|
855622fd30 | ||
|
|
8cd12810f3 | ||
|
|
ebb955187d | ||
|
|
85e1fadf9b | ||
|
|
249b51a31b | ||
|
|
da19ef26c4 | ||
|
|
f69e16ea9d | ||
|
|
efa1f71dac | ||
|
|
692cb8cf13 | ||
|
|
ebdc215632 | ||
|
|
b2da639582 | ||
|
|
71fdb43f10 | ||
|
|
ca2791c65e | ||
|
|
dd75cedaab | ||
|
|
669fb1a6e5 | ||
|
|
5d517c91b5 | ||
|
|
6be75abc86 | ||
|
|
4c777fa2ee | ||
|
|
dc5e0033c8 | ||
|
|
3dd5973734 | ||
|
|
53d379205f | ||
|
|
57cde21c48 | ||
|
|
396abf13b6 | ||
|
|
6e7fb5f331 | ||
|
|
1d5c118b70 | ||
|
|
18612aac4d | ||
|
|
76c533a2e8 | ||
|
|
9eee213683 | ||
|
|
e4861fc0fb | ||
|
|
53ef984065 | ||
|
|
26e62da1a8 | ||
|
|
d2f3614ab0 | ||
|
|
c6d767bd64 | ||
|
|
efb06891a8 | ||
|
|
70771b12a9 | ||
|
|
3f7a4840cc | ||
|
|
e28048dc25 | ||
|
|
2ef5d38b32 | ||
|
|
d216d70cdf | ||
|
|
0de10345f7 | ||
|
|
a243fa211f | ||
|
|
d794b047be | ||
|
|
f0fd62a28f | ||
|
|
e8493d3807 | ||
|
|
5353e9c44d | ||
|
|
75f5814f9f | ||
|
|
94b8b5520d | ||
|
|
42450dcbc4 | ||
|
|
ef47225d41 | ||
|
|
e61accefb9 | ||
|
|
5c1543d112 | ||
|
|
7ff6aee20c | ||
|
|
37ea381d98 | ||
|
|
67fc884895 | ||
|
|
1e3646b57c | ||
|
|
ba2db4e727 | ||
|
|
077148be00 | ||
|
|
594ee5842e | ||
|
|
a69766bd8b | ||
|
|
857a750eb1 | ||
|
|
26aa50f1b5 | ||
|
|
8b4f1eefc2 | ||
|
|
97c2e21dcc | ||
|
|
918dd39b87 | ||
|
|
7776e906c4 | ||
|
|
1bf865ec08 | ||
|
|
3f1ce847dc | ||
|
|
9006c2d28f | ||
|
|
ec216198a0 | ||
|
|
fe6adbf110 | ||
|
|
2693c565ba | ||
|
|
9054ea37c2 | ||
|
|
7292263f86 | ||
|
|
f8a6cd697f | ||
|
|
ec9d027678 | ||
|
|
48a145a8bd | ||
|
|
71d2ab4ce7 | ||
|
|
12a8872b27 | ||
|
|
820ab4dc0c | ||
|
|
1d1ffd17fb | ||
|
|
d96b8ff906 | ||
|
|
e687418194 | ||
|
|
a5a797ec5e | ||
|
|
ff6cee4a44 | ||
|
|
9acbad28f7 | ||
|
|
560e689ccd | ||
|
|
f66e42ddb1 | ||
|
|
d9856d5de5 | ||
|
|
24177cc5a9 | ||
|
|
178af0dee8 | ||
|
|
51eb0a713c | ||
|
|
249aa006cb | ||
|
|
c08e2ac0bb | ||
|
|
335ef91d8e | ||
|
|
6c7a639673 | ||
|
|
5f77cad5ac | ||
|
|
0228ae0494 | ||
|
|
165677e800 | ||
|
|
2e5298b737 | ||
|
|
c9ffb8a053 | ||
|
|
2466eed23f | ||
|
|
6e31171d31 | ||
|
|
592254709e | ||
|
|
e43f31eb80 | ||
|
|
f50ba005b5 | ||
|
|
1011544533 | ||
|
|
6572023173 | ||
|
|
9c7e2aacd0 | ||
|
|
715f102f6d | ||
|
|
5446aed9cf | ||
|
|
b94ec85461 | ||
|
|
f55f4f7535 | ||
|
|
c87da3a079 | ||
|
|
c3590a53a8 | ||
|
|
a4315722ab | ||
|
|
c901bd331c | ||
|
|
df97f170a2 | ||
|
|
a30a2dad66 | ||
|
|
2432f5bb68 | ||
|
|
341086d86a | ||
|
|
1163c96438 | ||
|
|
4c120d7cd0 | ||
|
|
966a9758b8 | ||
|
|
f58071fc74 | ||
|
|
8712c5e636 | ||
|
|
a51f9bed49 | ||
|
|
531e514003 | ||
|
|
2cd9e706c8 | ||
|
|
e3e6a1dda8 | ||
|
|
92b5ce61a0 | ||
|
|
36073ad488 | ||
|
|
d89d0f9ff5 | ||
|
|
14c48d0a78 | ||
|
|
b1ee3e105b | ||
|
|
1f53c4fd1b | ||
|
|
bfed3ccf4d | ||
|
|
d521482409 | ||
|
|
53eba5658f | ||
|
|
bb64e4a850 | ||
|
|
771690d5c0 | ||
|
|
d39e30995a | ||
|
|
363aaeaba8 | ||
|
|
fa1307e62c | ||
|
|
e7c9e9695b | ||
|
|
bf07b7f76d | ||
|
|
5afb604e3d | ||
|
|
b3e8be6296 | ||
|
|
2cb452b1c2 | ||
|
|
938fcc4530 | ||
|
|
73625bf00f | ||
|
|
f41ed09dc1 | ||
|
|
f03c4576f7 | ||
|
|
6c5087e425 | ||
|
|
5a6caf6399 | ||
|
|
a07053d961 | ||
|
|
aa9a9a25fb | ||
|
|
cd4a39d8fc | ||
|
|
92e3f00435 | ||
|
|
a890e36a36 | ||
|
|
bed94ee431 | ||
|
|
175e99b12b | ||
|
|
2a941e3abf | ||
|
|
3c8e0ae5db | ||
|
|
e416ab526b | ||
|
|
e17246d8ea | ||
|
|
f6f043d1ca | ||
|
|
db57441c5d | ||
|
|
31d90be0a1 | ||
|
|
5a080798cb | ||
|
|
21c4857795 | ||
|
|
4149afa896 | ||
|
|
b196ab5793 | ||
|
|
b39b54bbaf | ||
|
|
26d76f52ac | ||
|
|
2fff28845d | ||
|
|
5e4c495d62 | ||
|
|
5c5802c089 | ||
|
|
06010ef1b7 | ||
|
|
bd411a1984 | ||
|
|
29d24e3eaa | ||
|
|
0fbbe774fa | ||
|
|
aede6f4bac | ||
|
|
84706ba66d | ||
|
|
6b602889a5 | ||
|
|
cd046927f3 | ||
|
|
5ed47d2d2c | ||
|
|
fd068c0933 | ||
|
|
9456e493ac | ||
|
|
3b08a73245 | ||
|
|
42606d9247 | ||
|
|
499b3dfa66 | ||
|
|
ca360b7d43 | ||
|
|
6470b16b70 | ||
|
|
4c9410c5fe | ||
|
|
351f0657c3 | ||
|
|
382604e923 | ||
|
|
b48f25a7f9 | ||
|
|
b76e4fc02b | ||
|
|
27cf7dd67f | ||
|
|
05ec45352c | ||
|
|
0e7546f248 | ||
|
|
e3c8bd5666 | ||
|
|
3ae1741343 | ||
|
|
53c106c3af | ||
|
|
44fc7dffe6 | ||
|
|
aaa6b32f9f | ||
|
|
821a0c4a2b | ||
|
|
6373237960 | ||
|
|
1caf7b104f | ||
|
|
176b4a4cde | ||
|
|
29bf993be7 | ||
|
|
eda597dea5 | ||
|
|
8c56777125 | ||
|
|
7e90ebd5db | ||
|
|
3a07bfe1d7 | ||
|
|
0694b9e8af | ||
|
|
742cbf5767 | ||
|
|
e93384b99b | ||
|
|
3c4e976093 | ||
|
|
1e795beec8 | ||
|
|
4f7407084d | ||
|
|
ae3d034531 | ||
|
|
a2db1f5ab5 | ||
|
|
cec6420c8f | ||
|
|
4f18bb7ea0 | ||
|
|
3ec2a3a92e | ||
|
|
823b67a3ce | ||
|
|
24dc59e31f | ||
|
|
08ff5e6db7 | ||
|
|
e60a6f9d14 | ||
|
|
161656d9e4 | ||
|
|
8569c02b33 | ||
|
|
35e714d8d9 | ||
|
|
6f8d5710d6 | ||
|
|
a671692832 | ||
|
|
5c8675e43a | ||
|
|
60a58f6fad | ||
|
|
948fc4c6ce | ||
|
|
5be5f3209d | ||
|
|
537b67e0cd | ||
|
|
82c5e55fe4 | ||
|
|
5f0d51d485 | ||
|
|
945dd816ad | ||
|
|
45009e6cc2 | ||
|
|
8eace6d57b | ||
|
|
3774fa6abd | ||
|
|
e71e6865d2 | ||
|
|
0e8f1528b1 | ||
|
|
c331babf51 | ||
|
|
c59d268995 | ||
|
|
9e9fcb0ba9 | ||
|
|
f33e0b2f78 | ||
|
|
0e4b99351f | ||
|
|
81edd2860f | ||
|
|
14ac584577 | ||
|
|
9ce6baf074 | ||
|
|
92a1e07b33 | ||
|
|
cb6bdece39 | ||
|
|
2ea38364bb | ||
|
|
cf6fdc0d81 | ||
|
|
91eec99563 | ||
|
|
f8cbaa9a06 | ||
|
|
d9b9b4984b | ||
|
|
8a46dc6b03 | ||
|
|
205f9dd816 | ||
|
|
9dfa1294e2 | ||
|
|
f019905720 | ||
|
|
9c257858dd | ||
|
|
2006ab20dd | ||
|
|
0caf31719c | ||
|
|
5da7184276 | ||
|
|
50fccdab96 | ||
|
|
77d6ff6630 | ||
|
|
99614702ea | ||
|
|
58cb344ee6 | ||
|
|
22d5892b12 | ||
|
|
f619969efc | ||
|
|
ca242424ab | ||
|
|
407deb84e9 | ||
|
|
14589aa094 | ||
|
|
1260e3d942 | ||
|
|
b22d926d94 | ||
|
|
410cc8c7be | ||
|
|
784c676f5b | ||
|
|
296f7970df | ||
|
|
cd59933c9c | ||
|
|
b95d3f5300 | ||
|
|
fa0d5d8469 | ||
|
|
8229843018 | ||
|
|
c578b37c6d | ||
|
|
8ea062c0bd | ||
|
|
5d8bbde434 | ||
|
|
0462af6a3d | ||
|
|
5a94a4048e | ||
|
|
2602301e1d | ||
|
|
161993f66f | ||
|
|
b7f87fb8d3 | ||
|
|
8fdb87f1f5 | ||
|
|
a9a68d230e | ||
|
|
a1f2941ffd | ||
|
|
c548eeacfc | ||
|
|
428781af86 | ||
|
|
72efe2e9fe | ||
|
|
a455003c7f | ||
|
|
8c46cc55a3 | ||
|
|
d1e3d93332 | ||
|
|
b4d143812e | ||
|
|
6e1f74402e |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,3 +11,6 @@ build/
|
||||
dist/
|
||||
*.egg-info
|
||||
|
||||
# VSCode
|
||||
.vscode
|
||||
|
||||
|
||||
431
README.md
431
README.md
@@ -1,299 +1,348 @@
|
||||
# Allegro Trains Agent
|
||||
## Deep Learning DevOps For Everyone - Now supporting all platforms (Linux, macOS, and Windows)
|
||||
<div align="center">
|
||||
|
||||
"All the Deep-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
|
||||
<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_agent_logo.png?raw=true" width="250px">
|
||||
|
||||
[](https://img.shields.io/github/license/allegroai/trains-agent.svg)
|
||||
[](https://img.shields.io/pypi/pyversions/trains-agent.svg)
|
||||
[](https://img.shields.io/pypi/v/trains-agent.svg)
|
||||
[](https://pypi.python.org/pypi/trains-agent/)
|
||||
**ClearML Agent - MLOps/LLMOps made easy
|
||||
MLOps/LLMOps scheduler & orchestration solution supporting Linux, macOS and Windows**
|
||||
|
||||
### Help improve Trains by filling our 2-min [user survey](https://allegro.ai/lp/trains-user-survey/)
|
||||
[![GitHub license](https://img.shields.io/github/license/allegroai/clearml-agent.svg)](https://img.shields.io/github/license/allegroai/clearml-agent.svg)
|
||||
[![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml-agent.svg)](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
|
||||
[![PyPI version](https://img.shields.io/pypi/v/clearml-agent.svg)](https://img.shields.io/pypi/v/clearml-agent.svg)
|
||||
[](https://pypi.org/project/clearml-agent/)
|
||||
[](https://artifacthub.io/packages/search?repo=allegroai)
|
||||
|
||||
**Trains Agent is an AI experiment cluster solution.**
|
||||
`🌟 ClearML is open-source - Leave a star to support the project! 🌟`
|
||||
|
||||
It is a zero configuration fire-and-forget execution agent, which combined with trains-server provides a full AI cluster solution.
|
||||
</div>
|
||||
|
||||
**Full AutoML in 5 steps**
|
||||
1. Install the [Trains Server](https://github.com/allegroai/trains-agent) (or use our [open server](https://demoapp.trains.allegro.ai))
|
||||
2. `pip install trains-agent` ([install](#installing-the-trains-agent) the Trains Agent on any GPU machine: on-premises / cloud / ...)
|
||||
3. Add [Trains](https://github.com/allegroai/trains) to your code with just 2 lines & run it once (on your machine / laptop)
|
||||
4. Change the [parameters](#using-the-trains-agent) in the UI & schedule for [execution](#using-the-trains-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
|
||||
---
|
||||
|
||||
### ClearML-Agent
|
||||
|
||||
#### *Formerly known as Trains Agent*
|
||||
|
||||
* Run jobs (experiments) on any local or cloud based resource
|
||||
* Implement optimized resource utilization policies
|
||||
* Deploy execution environments with either virtualenv or fully docker containerized with zero effort
|
||||
* Launch-and-Forget service containers
|
||||
* [Cloud autoscaling](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler)
|
||||
* [Customizable cleanup](https://clear.ml/docs/latest/docs/guides/services/cleanup_service)
|
||||
* Advanced [pipeline building and execution](https://clear.ml/docs/latest/docs/guides/frameworks/pytorch/notebooks/table/tabular_training_pipeline)
|
||||
|
||||
It is a zero configuration fire-and-forget execution agent, providing a full ML/DL cluster solution.
|
||||
|
||||
**Full Automation in 5 steps**
|
||||
|
||||
1. ClearML Server [self-hosted](https://github.com/allegroai/clearml-server)
|
||||
or [free tier hosting](https://app.clear.ml)
|
||||
2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine:
|
||||
on-premises / cloud / ...)
|
||||
3. Create a [job](https://clear.ml/docs/latest/docs/apps/clearml_task) or
|
||||
add [ClearML](https://github.com/allegroai/clearml) to your code with just 2 lines of code
|
||||
4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or
|
||||
automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
|
||||
5. :chart_with_downwards_trend: :chart_with_upwards_trend: :eyes: :beer:
|
||||
|
||||
"All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
|
||||
|
||||
**Using the Trains Agent, you can now set up a dynamic cluster with \*epsilon DevOps**
|
||||
**Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server)
|
||||
or [Free tier Hosting](https://app.clear.ml)
|
||||
<a href="https://app.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>
|
||||
|
||||
*epsilon - Because we are scientists :triangular_ruler: and nothing is really zero work
|
||||
### Simple, Flexible Experiment Orchestration
|
||||
|
||||
(Experience Trains live at [https://demoapp.trains.allegro.ai](https://demoapp.trains.allegro.ai))
|
||||
<a href="https://demoapp.trains.allegro.ai"><img src="https://raw.githubusercontent.com/allegroai/trains-agent/9f1e86c1ca45c984ee13edc9353c7b10c55d7257/docs/screenshots.gif" width="100%"></a>
|
||||
|
||||
## Simple, Flexible Experiment Orchestration
|
||||
**The Trains Agent was built to address the DL/ML R&D DevOps needs:**
|
||||
**The ClearML Agent was built to address the DL/ML R&D DevOps needs:**
|
||||
|
||||
* Easily add & remove machines from the cluster
|
||||
* Reuse machines without the need for any dedicated containers or images
|
||||
* **Combine GPU resources across any cloud and on-prem**
|
||||
* **No need for yaml/json/template configuration of any kind**
|
||||
* **No need for yaml / json / template configuration of any kind**
|
||||
* **User friendly UI**
|
||||
* Manageable resource allocation that can be used by researchers and engineers
|
||||
* Flexible and controllable scheduler with priority support
|
||||
* Automatic instance spinning in the cloud **(coming soon)**
|
||||
* Automatic instance spinning in the cloud
|
||||
|
||||
**Using the ClearML Agent, you can now set up a dynamic cluster with \*epsilon DevOps**
|
||||
|
||||
## But ... K8S?
|
||||
We think Kubernetes is awesome.
|
||||
Combined with KubeFlow it is a robust solution for production-grade DevOps.
|
||||
We've observed, however, that it can be a bit of an overkill as an R&D DL/ML solution.
|
||||
If you are considering K8S for your research, also consider that you will soon be managing **hundreds** of containers...
|
||||
*epsilon - Because we are scientists :triangular_ruler: and nothing is really zero work
|
||||
|
||||
In our experience, handling and building the environments, having to package every experiment in a docker, managing those hundreds (or more) containers and building pipelines on top of it all, is very complicated (also, it’s usually out of scope for the research team, and overwhelming even for the DevOps team).
|
||||
### Kubernetes Integration (Optional)
|
||||
|
||||
We feel there has to be a better way, that can be just as powerful for R&D and at the same time allow integration with K8S **when the need arises**.
|
||||
(If you already have a K8S cluster for AI, detailed instructions on how to integrate Trains into your K8S cluster are [here](https://github.com/allegroai/trains-server-k8s/tree/master/trains-server-chart) with included [helm chart](https://github.com/allegroai/trains-server-helm))
|
||||
We think Kubernetes is awesome, but it is not a must to get started with remote execution agents and cluster management.
|
||||
We designed `clearml-agent` so you can run both bare-metal and on top of Kubernetes, in any combination that fits your environment.
|
||||
|
||||
You can find the Dockerfiles in the [docker folder](./docker) and the helm Chart in https://github.com/allegroai/clearml-helm-charts
|
||||
|
||||
#### Benefits of integrating existing Kubernetes cluster with ClearML
|
||||
|
||||
- ClearML-Agent adds the missing scheduling capabilities to your Kubernetes cluster
|
||||
- Users do not need to have direct Kubernetes access!
|
||||
- Easy learning curve with UI and CLI requiring no DevOps knowledge from end users
|
||||
- Unlike other solutions, ClearML-Agents work in tandem with other customers of your Kubernetes cluster
|
||||
- Allows for more flexible automation from code, building pipelines and visibility
|
||||
- A programmatic interface for easy CI/CD workflows, enabling GitOps to trigger jobs inside your cluster
|
||||
- Seamless integration with the ClearML ML/DL/GenAI experiment manager
|
||||
- Web UI for customization, scheduling & prioritization of jobs
|
||||
- **Enterprise Features**: RBAC, vault, multi-tenancy, scheduler, quota management, fractional GPU support
|
||||
|
||||
**Run the agent in Kubernetes Glue mode and map ClearML jobs directly to K8s jobs:**
|
||||
- Use the [ClearML Agent Helm Chart](https://github.com/allegroai/clearml-helm-charts/tree/main/charts/clearml-agent) to spin an agent pod acting as a controller
|
||||
- Or run the [clearml-k8s glue](https://github.com/allegroai/clearml-agent/blob/master/examples/k8s_glue_example.py) on
|
||||
a Kubernetes cpu node
|
||||
- The clearml-k8s glue pulls jobs from the ClearML job execution queue and prepares a Kubernetes job (based on provided
|
||||
yaml template)
|
||||
- Inside each pod the clearml-agent will install the job (experiment) environment and spin and monitor the
|
||||
experiment's process, fully visible in the clearml UI
|
||||
- Benefits: Kubernetes full view of all running jobs in the system
|
||||
- **Enterprise Features**
|
||||
- Full scheduler features added on top of Kubernetes, with quota/over-quota management, priorities and order.
|
||||
- Fractional GPU support, allowing multiple isolated containers sharing the same GPU with memory/compute limit per container
|
||||
|
||||
### SLURM (Optional)
|
||||
|
||||
Yes! Slurm integration is available; check the [documentation](https://clear.ml/docs/latest/docs/clearml_agent/#slurm) for further details.
|
||||
|
||||
### Using the ClearML Agent
|
||||
|
||||
## Using the Trains Agent
|
||||
**Full scale HPC with a click of a button**
|
||||
|
||||
The Trains Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the job and monitors its progress.
|
||||
The ClearML Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the
|
||||
job and monitors its progress.
|
||||
|
||||
Any 'Draft' experiment can be scheduled for execution by a Trains agent.
|
||||
Any 'Draft' experiment can be scheduled for execution by a ClearML agent.
|
||||
|
||||
A previously run experiment can be put into 'Draft' state by either of two methods:
|
||||
* Using the **'Reset'** action from the experiment right-click context menu in the
|
||||
Trains UI - This will clear any results and artifacts the previous run had created.
|
||||
* Using the **'Clone'** action from the experiment right-click context menu in the
|
||||
Trains UI - This will create a new 'Draft' experiment with the same configuration as the original experiment.
|
||||
|
||||
An experiment is scheduled for execution using the **'Enqueue'** action from the experiment
|
||||
right-click context menu in the Trains UI and selecting the execution queue.
|
||||
* Using the **'Reset'** action from the experiment right-click context menu in the ClearML UI - This will clear any
|
||||
results and artifacts the previous run had created.
|
||||
* Using the **'Clone'** action from the experiment right-click context menu in the ClearML UI - This will create a new
|
||||
'Draft' experiment with the same configuration as the original experiment.
|
||||
|
||||
An experiment is scheduled for execution using the **'Enqueue'** action from the experiment right-click context menu in
|
||||
the ClearML UI and selecting the execution queue.
|
||||
|
||||
See [creating an experiment and enqueuing it for execution](#from-scratch).
|
||||
|
||||
Once an experiment is enqueued, it will be picked up and executed by a Trains agent monitoring this queue.
|
||||
Once an experiment is enqueued, it will be picked up and executed by a ClearML Agent monitoring this queue.
|
||||
|
||||
The Trains UI Workers & Queues page provides ongoing execution information:
|
||||
- Workers Tab: Monitor your cluster
|
||||
The ClearML UI Workers & Queues page provides ongoing execution information:
|
||||
|
||||
- Workers Tab: Monitor your cluster
|
||||
- Review available resources
|
||||
- Monitor machines statistics (CPU / GPU / Disk / Network)
|
||||
- Queues Tab:
|
||||
- Queues Tab:
|
||||
- Control the scheduling order of jobs
|
||||
- Cancel or abort job execution
|
||||
- Move jobs between execution queues
|
||||
|
||||
### What The Trains Agent Actually Does
|
||||
The Trains Agent executes experiments using the following process:
|
||||
- Create a new virtual environment (or launch the selected docker image)
|
||||
- Clone the code into the virtual-environment (or inside the docker)
|
||||
- Install python packages based on the package requirements listed for the experiment
|
||||
- Special note for PyTorch: The Trains Agent will automatically select the
|
||||
torch packages based on the CUDA_VERSION environment variable of the machine
|
||||
- Execute the code, while monitoring the process
|
||||
- Log all stdout/stderr in the Trains UI, including the cloning and installation process, for easy debugging
|
||||
- Monitor the execution and allow you to manually abort the job using the Trains UI (or, in the unfortunate case of a code crash, catch the error and signal the experiment has failed)
|
||||
#### What The ClearML Agent Actually Does
|
||||
|
||||
### System Design & Flow
|
||||
```text
|
||||
+-----------------+
|
||||
| GPU Machine |
|
||||
Development Machine | |
|
||||
+------------------------+ | +-------------+ |
|
||||
| Data Scientist's | +--------------+ | |Trains Agent | |
|
||||
| DL/ML Code | | WEB UI | | | | |
|
||||
| | | | | | +---------+ | |
|
||||
| | | | | | | DL/ML | | |
|
||||
| | +--------------+ | | | Code | | |
|
||||
| | User Clones Exp #1 / . . . . . . . / | | | | | |
|
||||
| +-------------------+ | into Exp #2 / . . . . . . . / | | +---------+ | |
|
||||
| | Trains | | +---------------/-_____________-/ | | | |
|
||||
| +---------+---------+ | | | | ^ | |
|
||||
+-----------|------------+ | | +------|------+ |
|
||||
| | +--------|--------+
|
||||
Auto-Magically | |
|
||||
Creates Exp #1 | The Trains Agent
|
||||
\ User Change Hyper-Parameters Pulls Exp #2, setup the
|
||||
| | environment & clone code.
|
||||
| | Start execution with the
|
||||
+------------|------------+ | +--------------------+ new set of Hyper-Parameters.
|
||||
| +---------v---------+ | | | Trains Server | |
|
||||
| | Experiment #1 | | | | | |
|
||||
| +-------------------+ | | | Execution Queue | |
|
||||
| || | | | | |
|
||||
| +-------------------+<----------+ | | |
|
||||
| | | | | | |
|
||||
| | Experiment #2 | | | | |
|
||||
| +-------------------<------------\ | | |
|
||||
| | ------------->---------------+ | |
|
||||
| | User Send Exp #2 | |Execute Exp #2 +--------------------+
|
||||
| | For Execution | +---------------+ |
|
||||
| Trains Server | | |
|
||||
+-------------------------+ +--------------------+
|
||||
```
|
||||
The ClearML Agent executes experiments using the following process:
|
||||
|
||||
### Installing the Trains Agent
|
||||
- Create a new virtual environment (or launch the selected docker image)
|
||||
- Clone the code into the virtual-environment (or inside the docker)
|
||||
- Install python packages based on the package requirements listed for the experiment
|
||||
- Special note for PyTorch: The ClearML Agent will automatically select the torch packages based on the CUDA_VERSION
|
||||
environment variable of the machine
|
||||
- Execute the code, while monitoring the process
|
||||
- Log all stdout/stderr in the ClearML UI, including the cloning and installation process, for easy debugging
|
||||
- Monitor the execution and allow you to manually abort the job using the ClearML UI (or, in the unfortunate case of a
|
||||
code crash, catch the error and signal the experiment has failed)
|
||||
|
||||
#### System Design & Flow
|
||||
|
||||
<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_architecture.png" width="100%" alt="clearml-architecture">
|
||||
|
||||
#### Installing the ClearML Agent
|
||||
|
||||
```bash
|
||||
pip install trains-agent
|
||||
pip install clearml-agent
|
||||
```
|
||||
|
||||
### Trains Agent Usage Examples
|
||||
#### ClearML Agent Usage Examples
|
||||
|
||||
Full Interface and capabilities are available with
|
||||
```bash
|
||||
trains-agent --help
|
||||
trains-agent daemon --help
|
||||
```
|
||||
|
||||
### Configuring the Trains Agent
|
||||
|
||||
```bash
|
||||
trains-agent init
|
||||
clearml-agent --help
|
||||
clearml-agent daemon --help
|
||||
```
|
||||
|
||||
Note: The Trains Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default Trains Agent cache folder is `~/.trains`
|
||||
#### Configuring the ClearML Agent
|
||||
|
||||
See full details in your configuration file at `~/trains.conf`
|
||||
|
||||
Note: The **Trains agent** extends the **Trains** configuration file `~/trains.conf`
|
||||
They are designed to share the same configuration file, see example [here](docs/trains.conf)
|
||||
|
||||
### Running the Trains Agent
|
||||
|
||||
For debug and experimentation, start the Trains agent in `foreground` mode, where all the output is printed to screen
|
||||
```bash
|
||||
trains-agent daemon --queue default --foreground
|
||||
clearml-agent init
|
||||
```
|
||||
|
||||
For actual service mode, all the stdout will be stored automatically into a temporary file (no need to pipe)
|
||||
Notice: with `--detached` flag, the *trains-agent* will be running in the background
|
||||
Note: The ClearML Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default
|
||||
ClearML Agent cache folder is `~/.clearml`.
|
||||
|
||||
See full details in your configuration file at `~/clearml.conf`.
|
||||
|
||||
Note: The **ClearML Agent** extends the **ClearML** configuration file `~/clearml.conf`.
|
||||
They are designed to share the same configuration file, see example [here](docs/clearml.conf)
|
||||
|
||||
#### Running the ClearML Agent
|
||||
|
||||
For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen:
|
||||
|
||||
```bash
|
||||
trains-agent daemon --detached --queue default
|
||||
clearml-agent daemon --queue default --foreground
|
||||
```
|
||||
|
||||
GPU allocation is controlled via the standard OS environment `NVIDIA_VISIBLE_DEVICES` or `--gpus` flag (or disabled with `--cpu-only`).
|
||||
For actual service mode, all the stdout will be stored automatically into a temporary file (no need to pipe).
|
||||
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
|
||||
|
||||
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPUs will be allocated for the `trains-agent` <br>
|
||||
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for the `trains-agent`
|
||||
|
||||
Example: spin two agents, one per gpu on the same machine:
|
||||
Notice: with `--detached` flag, the *trains-agent* will be running in the background
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0 --queue default
|
||||
trains-agent daemon --detached --gpus 1 --queue default
|
||||
clearml-agent daemon --detached --queue default
|
||||
```
|
||||
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two GPUs per agent
|
||||
GPU allocation is controlled via the standard OS environment `NVIDIA_VISIBLE_DEVICES` or `--gpus` flag (or disabled
|
||||
with `--cpu-only`).
|
||||
|
||||
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPUs will be allocated for
|
||||
the `clearml-agent`. <br>
|
||||
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES="none"`, no gpu will be allocated for
|
||||
the `clearml-agent`.
|
||||
|
||||
Example: spin two agents, one per GPU on the same machine:
|
||||
|
||||
Notice: with `--detached` flag, the *clearml-agent* will run in the background
|
||||
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu
|
||||
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu
|
||||
clearml-agent daemon --detached --gpus 0 --queue default
|
||||
clearml-agent daemon --detached --gpus 1 --queue default
|
||||
```
|
||||
|
||||
#### Starting the Trains Agent in docker mode
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two GPUs per agent
|
||||
|
||||
For debug and experimentation, start the Trains agent in `foreground` mode, where all the output is printed to screen
|
||||
```bash
|
||||
trains-agent daemon --queue default --docker --foreground
|
||||
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu
|
||||
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu
|
||||
```
|
||||
|
||||
For actual service mode, all the stdout will be stored automatically into a file (no need to pipe)
|
||||
Notice: with `--detached` flag, the *trains-agent* will be running in the background
|
||||
##### Starting the ClearML Agent in docker mode
|
||||
|
||||
For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen
|
||||
|
||||
```bash
|
||||
trains-agent daemon --detached --queue default --docker
|
||||
clearml-agent daemon --queue default --docker --foreground
|
||||
```
|
||||
|
||||
Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda docker:
|
||||
For actual service mode, all the stdout will be stored automatically into a file (no need to pipe).
|
||||
Notice: with `--detached` flag, the *clearml-agent* will run in the background
|
||||
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
|
||||
trains-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --queue default --docker
|
||||
```
|
||||
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two GPUs per agent, with default nvidia/cuda docker:
|
||||
Example: spin two agents, one per gpu on the same machine, with default `nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04`
|
||||
docker:
|
||||
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
|
||||
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
|
||||
clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
|
||||
```
|
||||
|
||||
#### Starting the Trains Agent - Priority Queues
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two GPUs per agent, with default
|
||||
`nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04` docker:
|
||||
|
||||
```bash
|
||||
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
|
||||
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
|
||||
```
|
||||
|
||||
##### Starting the ClearML Agent - Priority Queues
|
||||
|
||||
Priority Queues are also supported, example use case:
|
||||
|
||||
High priority queue: `important_jobs` Low priority queue: `default`
|
||||
High priority queue: `important_jobs`, low priority queue: `default`
|
||||
|
||||
```bash
|
||||
trains-agent daemon --queue important_jobs default
|
||||
```
|
||||
The **Trains Agent** will first try to pull jobs from the `important_jobs` queue, only then it will fetch a job from the `default` queue.
|
||||
|
||||
Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [open server](https://demoapp.trains.allegro.ai/workers-and-queues/queues)
|
||||
|
||||
#### Stopping the Trains Agent
|
||||
|
||||
To stop a **Trains Agent** running in the background, run the same command line used to start the agent with `--stop` appended.
|
||||
For example, to stop the first of the above shown same machine, single gpu agents:
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda --stop
|
||||
clearml-agent daemon --queue important_jobs default
|
||||
```
|
||||
|
||||
## How do I create an experiment on the Trains Server? <a name="from-scratch"></a>
|
||||
* Integrate [Trains](https://github.com/allegroai/trains) with your code
|
||||
The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, and only if it is empty, the agent
|
||||
will try to pull from the `default` queue.
|
||||
|
||||
Adding queues, managing job order within a queue, and moving jobs between queues, is available using the Web UI, see
|
||||
example on our [free server](https://app.clear.ml/workers-and-queues/queues)
|
||||
|
||||
##### Stopping the ClearML Agent
|
||||
|
||||
To stop a **ClearML Agent** running in the background, run the same command line used to start the agent with `--stop`
|
||||
appended. For example, to stop the first of the same-machine, single-GPU agents shown above:
|
||||
|
||||
```bash
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04 --stop
|
||||
```
|
||||
|
||||
### How do I create an experiment on the ClearML Server? <a name="from-scratch"></a>
|
||||
|
||||
* Integrate [ClearML](https://github.com/allegroai/clearml) with your code
|
||||
* Execute the code on your machine (Manually / PyCharm / Jupyter Notebook)
|
||||
* As your code is running, **Trains** creates an experiment logging all the necessary execution information:
|
||||
- Git repository link and commit ID (or an entire jupyter notebook)
|
||||
- Git diff (we’re not saying you never commit and push, but still...)
|
||||
- Python packages used by your code (including specific versions used)
|
||||
- Hyper-Parameters
|
||||
- Input Artifacts
|
||||
* As your code is running, **ClearML** creates an experiment logging all the necessary execution information:
|
||||
- Git repository link and commit ID (or an entire jupyter notebook)
|
||||
- Git diff (we’re not saying you never commit and push, but still...)
|
||||
- Python packages used by your code (including specific versions used)
|
||||
- Hyperparameters
|
||||
- Input artifacts
|
||||
|
||||
You now have a 'template' of your experiment with everything required for automated execution
|
||||
|
||||
* In the Trains UI, Right click on the experiment and select 'clone'. A copy of your experiment will be created.
|
||||
* In the ClearML UI, right-click on the experiment and select 'clone'. A copy of your experiment will be created.
|
||||
* You now have a new draft experiment cloned from your original experiment, feel free to edit it
|
||||
- Change the Hyper-Parameters
|
||||
- Switch to the latest code base of the repository
|
||||
- Update package versions
|
||||
- Select a specific docker image to run in (see docker execution mode section)
|
||||
- Or simply change nothing to run the same experiment again...
|
||||
* Schedule the newly created experiment for execution: Right-click the experiment and select 'enqueue'
|
||||
- Change the hyperparameters
|
||||
- Switch to the latest code base of the repository
|
||||
- Update package versions
|
||||
- Select a specific docker image to run in (see docker execution mode section)
|
||||
- Or simply change nothing to run the same experiment again...
|
||||
* Schedule the newly created experiment for execution: right-click the experiment and select 'enqueue'
|
||||
|
||||
## Trains-Agent Services Mode <a name="services"></a>
|
||||
### ClearML-Agent Services Mode <a name="services"></a>
|
||||
|
||||
Trains-Agent Services is a special mode of Trains-Agent that provides the ability to launch long-lasting jobs
|
||||
that previously had to be executed on local / dedicated machines. It allows a single agent to
|
||||
launch multiple dockers (Tasks) for different use cases. To name a few use cases, auto-scaler service (spinning instances
|
||||
when the need arises and the budget allows), Controllers (Implementing pipelines and more sophisticated DevOps logic),
|
||||
Optimizer (such as Hyper-parameter Optimization or sweeping), and Application (such as interactive Bokeh apps for
|
||||
increased data transparency)
|
||||
ClearML-Agent Services is a special mode of ClearML-Agent that provides the ability to launch long-lasting jobs that
|
||||
previously had to be executed on local / dedicated machines. It allows a single agent to launch multiple dockers (Tasks)
|
||||
for different use cases:
|
||||
* Auto-scaler service (spinning instances when the need arises and the budget allows)
|
||||
* Controllers (Implementing pipelines and more sophisticated DevOps logic)
|
||||
* Optimizer (such as Hyperparameter Optimization or sweeping)
|
||||
* Application (such as interactive Bokeh apps for increased data transparency)
|
||||
|
||||
Trains-Agent Services mode will spin **any** task enqueued into the specified queue.
|
||||
Every task launched by Trains-Agent Services will be registered as a new node in the system,
|
||||
providing tracking and transparency capabilities.
|
||||
Currently trains-agent in services-mode supports cpu only configuration. Trains-agent services mode can be launched alongside GPU agents.
|
||||
ClearML-Agent Services mode will spin **any** task enqueued into the specified queue. Every task launched by
|
||||
ClearML-Agent Services will be registered as a new node in the system, providing tracking and transparency capabilities.
|
||||
Currently, clearml-agent in services-mode supports CPU-only configurations. ClearML-Agent services mode can be launched
|
||||
alongside GPU agents.
|
||||
|
||||
```bash
|
||||
trains-agent daemon --services-mode --detached --queue services --create-queue --docker ubuntu:18.04 --cpu-only
|
||||
clearml-agent daemon --services-mode --detached --queue services --create-queue --docker ubuntu:18.04 --cpu-only
|
||||
```
|
||||
|
||||
**Note**: It is the user's responsibility to make sure the proper tasks are pushed into the specified queue.
|
||||
**Note**: It is the user's responsibility to make sure the proper tasks are pushed into the specified queue.
|
||||
|
||||
### AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
|
||||
|
||||
## AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
|
||||
The Trains Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the Trains package.
|
||||
The ClearML Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the
|
||||
ClearML package.
|
||||
|
||||
Sample AutoML & Orchestration examples can be found in the Trains [example/automation](https://github.com/allegroai/trains/tree/master/examples/automation) folder.
|
||||
Sample AutoML & Orchestration examples can be found in the
|
||||
ClearML [example/automation](https://github.com/allegroai/clearml/tree/master/examples/automation) folder.
|
||||
|
||||
AutoML examples
|
||||
- [Toy Keras training experiment](https://github.com/allegroai/trains/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
|
||||
AutoML examples:
|
||||
|
||||
- [Toy Keras training experiment](https://github.com/allegroai/clearml/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
|
||||
- In order to create an experiment-template in the system, this code must be executed once manually
|
||||
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/trains/blob/master/examples/automation/manual_random_param_search_example.py)
|
||||
- This example will create multiple copies of the Keras experiment-template, with different hyper-parameter combinations
|
||||
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/clearml/blob/master/examples/automation/manual_random_param_search_example.py)
|
||||
- This example will create multiple copies of the Keras experiment-template, with different hyperparameter
|
||||
combinations
|
||||
|
||||
Experiment Pipeline examples
|
||||
- [First step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/task_piping_example.py)
|
||||
Experiment Pipeline examples:
|
||||
|
||||
- [First step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/task_piping_example.py)
|
||||
- This example will "process data", and once done, will launch a copy of the 'second step' experiment-template
|
||||
- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/toy_base_task.py)
|
||||
- [Second step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/toy_base_task.py)
|
||||
- In order to create an experiment-template in the system, this code must be executed once manually
|
||||
|
||||
## License
|
||||
### License
|
||||
|
||||
Apache License, Version 2.0 (see the [LICENSE](https://www.apache.org/licenses/LICENSE-2.0.html) for more information)
|
||||
|
||||
@@ -4,15 +4,15 @@ import argparse
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
from trains_agent.backend_api.session.datamodel import UnusedKwargsWarning
|
||||
from clearml_agent.backend_api.session.datamodel import UnusedKwargsWarning
|
||||
|
||||
import trains_agent
|
||||
from trains_agent.config import get_config
|
||||
from trains_agent.definitions import FileBuffering, CONFIG_FILE
|
||||
from trains_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
|
||||
from trains_agent.helper.process import ExitStatus
|
||||
import clearml_agent
|
||||
from clearml_agent.config import get_config
|
||||
from clearml_agent.definitions import FileBuffering, CONFIG_FILE
|
||||
from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
|
||||
from clearml_agent.helper.process import ExitStatus
|
||||
from . import interface, session, definitions, commands
|
||||
from .errors import ConfigFileNotFound, Sigterm, APIError
|
||||
from .errors import ConfigFileNotFound, Sigterm, APIError, CustomBuildScriptFailed
|
||||
from .helper.trace import PackageTrace
|
||||
from .interface import get_parser
|
||||
|
||||
@@ -44,10 +44,12 @@ def run_command(parser, args, command_name):
|
||||
debug = command._session.debug_mode
|
||||
func = getattr(command, command_name)
|
||||
return func(**args_dict)
|
||||
except CustomBuildScriptFailed as e:
|
||||
command_class.exit(e.message, e.errno)
|
||||
except ConfigFileNotFound:
|
||||
message = 'Cannot find configuration file in "{}".\n' \
|
||||
'To create a configuration file, run:\n' \
|
||||
'$ trains_agent init'.format(reverse_home_folder_expansion(CONFIG_FILE))
|
||||
'$ clearml_agent init'.format(reverse_home_folder_expansion(CONFIG_FILE))
|
||||
command_class.exit(message)
|
||||
except APIError as api_error:
|
||||
if not debug:
|
||||
424
clearml_agent/backend_api/config/default/agent.conf
Normal file
424
clearml_agent/backend_api/config/default/agent.conf
Normal file
@@ -0,0 +1,424 @@
|
||||
{
|
||||
# unique name of this worker, if None, created based on hostname:process_id
|
||||
# Override with os environment: CLEARML_WORKER_ID
|
||||
# worker_id: "clearml-agent-machine1:gpu0"
|
||||
worker_id: ""
|
||||
|
||||
# worker name, replaces the hostname when creating a unique name for this worker
|
||||
# Override with os environment: CLEARML_WORKER_NAME
|
||||
# worker_name: "clearml-agent-machine1"
|
||||
worker_name: ""
|
||||
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# **Notice**: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# To learn how to generate git token GitHub/Bitbucket/GitLab:
|
||||
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
|
||||
# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
|
||||
# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
|
||||
# git_user: ""
|
||||
# git_pass: ""
|
||||
# Limit credentials to a single domain, for example: github.com,
|
||||
# all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
|
||||
# git_host: ""
|
||||
|
||||
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: 0
|
||||
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||
# force_git_ssh_user: git
|
||||
|
||||
# Set the python version to use when creating the virtual environment and launching the experiment
|
||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||
# The default is the python executing the clearml_agent
|
||||
python_binary: ""
|
||||
# ignore any requested python version (Default: False, if a Task was using a
|
||||
# specific python version and the system supports multiple python versions, the agent will use the requested python version)
|
||||
# ignore_requested_python_version: true
|
||||
|
||||
# Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
|
||||
# default false, only the working directory will be added to the PYTHONPATH
|
||||
# force_git_root_python_path: false
|
||||
|
||||
# if set, use GIT_ASKPASS to pass user/pass when cloning / fetch repositories
|
||||
# it solves passing user/token to git submodules.
|
||||
# this is a safer way to ensure multiple users using the same repository will
|
||||
# not accidentally leak credentials
|
||||
# Note: this is only supported on Linux systems
|
||||
# enable_git_ask_pass: true
|
||||
|
||||
# in docker mode, if container's entrypoint automatically activated a virtual environment
|
||||
# use the activated virtual environment and install everything there
|
||||
# set to False to disable, and always create a new venv inheriting from the system_site_packages
|
||||
# docker_use_activated_venv: true
|
||||
|
||||
# select python package manager:
|
||||
# currently supported: pip, conda and poetry
|
||||
# if "pip" or "conda" are used, the agent installs the required packages
|
||||
# based on the "installed packages" section of the Task. If the "installed packages" is empty,
|
||||
# it will revert to using `requirements.txt` from the repository's root directory.
|
||||
# If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
|
||||
# the "installed packages" section is ignored, and poetry is used.
|
||||
# If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
|
||||
package_manager: {
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
|
||||
pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10'"],
|
||||
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
|
||||
# poetry_version: "<2",
|
||||
# poetry_install_extra_args: ["-v"]
|
||||
|
||||
# virtual environment inherits packages from system
|
||||
system_site_packages: false,
|
||||
|
||||
# install with --upgrade
|
||||
force_upgrade: false,
|
||||
|
||||
# additional artifact repositories to use when installing python packages
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
|
||||
|
||||
# control the pytorch wheel resolving algorithm, options are: "pip", "direct", "none"
|
||||
# Override with environment variable CLEARML_AGENT_PACKAGE_PYTORCH_RESOLVE
|
||||
# "pip" (default): would automatically detect the cuda version, and supply pip with the correct
|
||||
# extra-index-url, based on pytorch.org tables
|
||||
# "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
|
||||
# and matching the automatically detected cuda version with the required pytorch wheel.
|
||||
# if the exact cuda version is not found for the required pytorch wheel, it will try
|
||||
# a lower cuda version until a match is found
|
||||
# "none": No resolver used, install pytorch like any other package
|
||||
# pytorch_resolve: "pip"
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["pytorch", "conda-forge", "nvidia", "defaults", ]
|
||||
|
||||
# If set to true, Task's "installed packages" are ignored,
|
||||
# and the repository's "requirements.txt" is used instead
|
||||
# force_repo_requirements_txt: false
|
||||
|
||||
# set the priority packages to be installed before the rest of the required packages
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||
|
||||
# set the optional priority packages to be installed before the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
priority_optional_packages: ["pygobject", ]
|
||||
|
||||
# set the post packages to be installed after all the rest of the required packages
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# post_packages: ["horovod", ]
|
||||
|
||||
# set the optional post packages to be installed after all the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
|
||||
# post_optional_packages: []
|
||||
|
||||
# set to True to support torch nightly build installation,
|
||||
# notice: torch nightly builds are ephemeral and are deleted from time to time
|
||||
torch_nightly: false,
|
||||
|
||||
# if set to true, the agent will look for the "poetry.lock" file
|
||||
# in the passed current working directory instead of the repository's root directory.
|
||||
poetry_files_from_repo_working_dir: false
|
||||
},
|
||||
|
||||
# target folder for virtual environments builds, created when executing experiment
|
||||
venvs_dir = ~/.clearml/venvs-builds
|
||||
|
||||
# cached virtual environment folder
|
||||
venvs_cache: {
|
||||
# maximum number of cached venvs
|
||||
max_entries: 10
|
||||
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
|
||||
free_space_threshold_gb: 2.0
|
||||
# unmark to enable virtual environment caching
|
||||
path: ~/.clearml/venvs-cache
|
||||
},
|
||||
|
||||
# cached git clone folder
|
||||
vcs_cache: {
|
||||
enabled: true,
|
||||
path: ~/.clearml/vcs-cache
|
||||
},
|
||||
|
||||
# use venv-update in order to accelerate python virtual environment building
|
||||
# Still in beta, turned off by default
|
||||
venv_update: {
|
||||
enabled: false,
|
||||
},
|
||||
|
||||
# cached folder for specific python package download (used for pytorch package caching)
|
||||
pip_download_cache {
|
||||
enabled: true,
|
||||
path: ~/.clearml/pip-download-cache
|
||||
},
|
||||
|
||||
translate_ssh: true,
|
||||
|
||||
# set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
|
||||
# default is false, automatically mounts ~/.ssh
|
||||
# Must be set to True if using "clearml-session" with this agent!
|
||||
# disable_ssh_mount: false
|
||||
|
||||
# reload configuration file every daemon execution
|
||||
reload_config: false,
|
||||
|
||||
# pip cache folder mapped into docker, used for python package caching
|
||||
docker_pip_cache = ~/.clearml/pip-cache
|
||||
# apt cache folder mapped into docker, used for ubuntu package caching
|
||||
docker_apt_cache = ~/.clearml/apt-cache
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
|
||||
# Allow the extra docker arg to override task level docker arg (if the same argument is passed on both),
|
||||
# if set to False, a task docker arg will override the docker extra arg
|
||||
# docker_args_extra_precedes_task: true
|
||||
|
||||
# allows the following task docker args to be overridden by the extra_docker_arguments
|
||||
# protected_docker_extra_args: ["privileged", "security-opt", "network", "ipc"]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
|
||||
# for backwards compatibility reasons, true as default,
|
||||
# change to false to skip installation and decrease docker spin up time
|
||||
# docker_install_opencv_libs: true
|
||||
|
||||
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
|
||||
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will be idle.
|
||||
# Defined using a list of items of the format: "<hours> <days>".
|
||||
# hours - use values 0-23, single values would count as start hour and end at midnight.
|
||||
# days - use days in abbreviated format (SUN-SAT)
|
||||
# use '-' for ranges and ',' to separate singular values.
|
||||
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
|
||||
# uptime: ["17-20 SUN,TUE"]
|
||||
|
||||
# optional downtime configuration, can be used only when uptime is not used.
|
||||
# If downtime is specified, agent will be idle in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
|
||||
# Use the same format as described above for uptime
|
||||
# downtime: []
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host", ]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# set the initial bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||
# docker_init_bash_script = [
|
||||
# "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
|
||||
# "chown -R root /root/.cache/pip",
|
||||
# "apt-get update",
|
||||
# "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
|
||||
# ]
|
||||
|
||||
# set the preprocessing bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# docker_preprocess_bash_script = [
|
||||
# "echo \"starting docker\"",
|
||||
#]
|
||||
|
||||
# If False replace \r with \n and display full console output
|
||||
# default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
|
||||
# suppress_carriage_return: true
|
||||
|
||||
# CUDA versions used for Conda setup & solving PyTorch wheel packages
|
||||
# Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
# cudnn_version: 7.6
|
||||
|
||||
# Sanitize configuration printout using these settings
|
||||
sanitize_config_printout {
|
||||
# Hide values of configuration keys matching these regexps
|
||||
hide_secrets: ["^sanitize_config_printout$", "secret", "pass", "token", "account_key", "contents"]
|
||||
# As above, only show field's value keys if value is a dictionary
|
||||
hide_secrets_recursive: ["^environment$"]
|
||||
# Do not hide for keys matching these regexps
|
||||
dont_hide_secrets: ["^enable_git_ask_pass$"]
|
||||
# Hide secrets in docker commands, according to the 'agent.hide_docker_command_env_vars' settings
|
||||
docker_commands: ["^extra_docker_arguments$"]
|
||||
# Hide password in URLs found in keys matching these regexps (handles single URLs, lists and dictionaries)
|
||||
urls: ["^extra_index_url$"]
|
||||
}
|
||||
|
||||
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||
# docker command, set:
|
||||
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||
hide_docker_command_env_vars {
|
||||
enabled: true
|
||||
extra_keys: []
|
||||
parse_embedded_urls: true
|
||||
}
|
||||
|
||||
# Maximum execution time (in seconds) for Task's abort function call
|
||||
abort_callback_max_timeout: 1800
|
||||
|
||||
# allow to set internal mount points inside the docker,
|
||||
# especially useful for non-root docker container images.
|
||||
docker_internal_mounts {
|
||||
sdk_cache: "/clearml_agent_cache"
|
||||
apt_cache: "/var/cache/apt/archives"
|
||||
ssh_folder: "~/.ssh"
|
||||
ssh_ro_folder: "/.ssh"
|
||||
pip_cache: "/root/.cache/pip"
|
||||
poetry_cache: "/root/.cache/pypoetry"
|
||||
vcs_cache: "/root/.clearml/vcs-cache"
|
||||
venv_build: "~/.clearml/venvs-builds"
|
||||
pip_download: "/root/.clearml/pip-download-cache"
|
||||
}
|
||||
|
||||
# Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
|
||||
# Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
|
||||
# Custom variables may be specified using the docker_container_name_format_fields option.
|
||||
# Note: resulting name must start with an alphanumeric character and
|
||||
# continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
|
||||
# docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
|
||||
|
||||
# Specify custom variables for the docker_container_name_format option using a mapping of variable name
|
||||
# to a (nested) task field (using "." as a task field separator, digits specify array index)
|
||||
# docker_container_name_format_fields: { foo: "bar.moo" }
|
||||
|
||||
# Apply top-level environment section from configuration into os.environ
|
||||
apply_environment: true
|
||||
# Top-level environment section is in the form of:
|
||||
# environment {
|
||||
# key: value
|
||||
# ...
|
||||
# }
|
||||
# and is applied to the OS environment as `key=value` for each key/value pair
|
||||
|
||||
# Apply top-level files section from configuration into local file system
|
||||
apply_files: true
|
||||
# Top-level files section allows auto-generating files at designated paths with predefined contents
|
||||
# and target format. Options include:
|
||||
# contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
|
||||
# format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
|
||||
# base64-encoded contents string, otherwise ignored
|
||||
# path: the target file's path, may include ~ and inplace env vars
|
||||
# target_format: format used to encode contents before writing into the target file. Supported values are json,
|
||||
# yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
|
||||
# overwrite: overwrite the target file in case it exists. Default is true.
|
||||
# mode: file-system mode to be applied to the file after its creation. The mode string will be parsed into an
|
||||
# integer (e.g. "0o777" for -rwxrwxrwx)
|
||||
#
|
||||
# Example:
|
||||
# files {
|
||||
# myfile1 {
|
||||
# contents: "The quick brown fox jumped over the lazy dog"
|
||||
# path: "/tmp/fox.txt"
|
||||
# }
|
||||
# myjsonfile {
|
||||
# contents: {
|
||||
# some {
|
||||
# nested {
|
||||
# value: [1, 2, 3, 4]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# path: "/tmp/test.json"
|
||||
# target_format: json
|
||||
# }
|
||||
# }
|
||||
|
||||
# Specifies a custom environment setup script to be executed instead of installing a virtual environment.
|
||||
# If provided, this script is executed following Git cloning. Script command may include environment variables and
|
||||
# will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
|
||||
# The script can also be specified using the CLEARML_AGENT_CUSTOM_BUILD_SCRIPT environment variable.
|
||||
#
|
||||
# When running the script, the following environment variables will be set:
|
||||
# - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
|
||||
# contents in JSON format
|
||||
# - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
|
||||
# - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
|
||||
# - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
|
||||
# - CLEARML_GIT_ROOT: path to the cloned Git repository
|
||||
# - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
|
||||
# this file must be in the following JSON format:
|
||||
# ```json
|
||||
# {
|
||||
# "binary": "/absolute/path/to/python-executable",
|
||||
# "entry_point": "/absolute/path/to/task-entrypoint-script",
|
||||
# "working_dir": "/absolute/path/to/task-working/dir"
|
||||
# }
|
||||
# ```
|
||||
# If provided, the agent will use these instead of the predefined task script section to execute the task and will
|
||||
# skip virtual environment creation.
|
||||
#
|
||||
# In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
|
||||
# In case the custom script is specified but does not exist, or if the custom script does not write valid content
|
||||
# into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
|
||||
# standard flow.
|
||||
custom_build_script: ""
|
||||
|
||||
# Crash on exception: by default when encountering an exception while running a task,
|
||||
# the agent will catch the exception, log it and continue running.
|
||||
# Set this to `true` to propagate exceptions and crash the agent.
|
||||
# crash_on_exception: true
|
||||
|
||||
# Disable task docker override. If true, the agent will use the default docker image and ignore any docker image
|
||||
# and arguments specified in the task's container section (setup shell script from the task container section will
|
||||
# be used in any case, if specified).
|
||||
disable_task_docker_override: false
|
||||
|
||||
# Choose the default docker based on the Task properties,
|
||||
# Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project'
|
||||
# Notice: Matching is done via regular expression, for example "^searchme$" will match exactly the "searchme" string
|
||||
#
|
||||
# "default_docker": {
|
||||
# "image": "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04",
|
||||
# # optional arguments to pass to docker image
|
||||
# # arguments: ["--ipc=host", ]
|
||||
# "match_rules": [
|
||||
# {
|
||||
# "image": "sample_container:tag",
|
||||
# "arguments": "-e VALUE=1 --ipc=host",
|
||||
# "match": {
|
||||
# "script": {
|
||||
# "requirements": {
|
||||
# "pip": {
|
||||
# "tensorflow": "~=1.6"
|
||||
# }
|
||||
# },
|
||||
# "repository": "",
|
||||
# "branch": "master"
|
||||
# },
|
||||
# "project": "example"
|
||||
# }
|
||||
# },
|
||||
# {
|
||||
# "image": "another_container:tag",
|
||||
# "arguments": "",
|
||||
# "match": {
|
||||
# "project": "^examples", # anything that starts with "examples", e.g. "examples", "examples/sub_project"
|
||||
# }
|
||||
# }
|
||||
# ]
|
||||
# },
|
||||
#
|
||||
}
|
||||
@@ -28,10 +28,15 @@
|
||||
|
||||
pool_maxsize: 512
|
||||
pool_connections: 512
|
||||
|
||||
# Override the default http method, use "put" if working behind GCP load balancer (default: "get")
|
||||
# default_method: "get"
|
||||
}
|
||||
|
||||
auth {
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token
|
||||
token_expiration_threshold_sec = 360
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token. Default 12 hours
|
||||
token_expiration_threshold_sec: 43200
|
||||
# When requesting a token, request specific expiration time. Server default (and maximum) is 30 days
|
||||
# request_token_expiration_sec: 2592000
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
# TRAINS - default SDK configuration
|
||||
# ClearML - default SDK configuration
|
||||
|
||||
storage {
|
||||
cache {
|
||||
# Defaults to system temp folder / cache
|
||||
default_base_dir: "~/.trains/cache"
|
||||
# Defaults to <system_temp_folder>/clearml_cache
|
||||
default_base_dir: "~/.clearml/cache"
|
||||
size {
|
||||
# max_used_bytes = -1
|
||||
min_free_bytes = 10GB
|
||||
@@ -98,7 +98,7 @@
|
||||
google.storage {
|
||||
# # Default project and credentials file
|
||||
# # Will be used when no bucket configuration is found
|
||||
# project: "trains"
|
||||
# project: "clearml"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
|
||||
# # Specific credentials per bucket and sub directory
|
||||
@@ -106,7 +106,7 @@
|
||||
# {
|
||||
# bucket: "my-bucket"
|
||||
# subdir: "path/in/bucket" # Not required
|
||||
# project: "trains"
|
||||
# project: "clearml"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
# },
|
||||
# ]
|
||||
@@ -114,7 +114,7 @@
|
||||
azure.storage {
|
||||
# containers: [
|
||||
# {
|
||||
# account_name: "trains"
|
||||
# account_name: "clearml"
|
||||
# account_key: "secret"
|
||||
# # container_name:
|
||||
# }
|
||||
@@ -140,7 +140,7 @@
|
||||
vcs_repo_detect_async: true
|
||||
|
||||
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
|
||||
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
|
||||
# This stores "git diff" into the experiment's "script.requirements.diff" section
|
||||
store_uncommitted_code_diff: true
|
||||
|
||||
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
|
||||
@@ -155,8 +155,8 @@
|
||||
# do not analyze the entire repository.
|
||||
force_analyze_entire_repo: false
|
||||
|
||||
# If set to true, *trains* update message will not be printed to the console
|
||||
# this value can be overwritten with os environment variable TRAINS_SUPPRESS_UPDATE_MESSAGE=1
|
||||
# If set to true, *clearml* update message will not be printed to the console
|
||||
# this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
|
||||
suppress_update_message: false
|
||||
|
||||
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
|
||||
@@ -4,7 +4,7 @@ import re
|
||||
import attr
|
||||
import six
|
||||
|
||||
import pyhocon
|
||||
from clearml_agent.external import pyhocon
|
||||
|
||||
from .action import Action
|
||||
|
||||
@@ -106,15 +106,15 @@ class StrictSession(Session):
|
||||
init()
|
||||
return
|
||||
|
||||
original = os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR, None)
|
||||
original = LOCAL_CONFIG_FILE_OVERRIDE_VAR.get() or None
|
||||
try:
|
||||
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = str(config_file)
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR.set(str(config_file))
|
||||
init()
|
||||
finally:
|
||||
if original is None:
|
||||
os.environ.pop(LOCAL_CONFIG_FILE_OVERRIDE_VAR, None)
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR.pop()
|
||||
else:
|
||||
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = original
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR.set(original)
|
||||
|
||||
def send(self, request, *args, **kwargs):
|
||||
result = super(StrictSession, self).send(request, *args, **kwargs)
|
||||
@@ -222,7 +222,7 @@ class TableResponse(Response):
|
||||
return "" if result is None else result
|
||||
|
||||
fields = fields or self.fields
|
||||
from trains_agent.helper.base import create_table
|
||||
from clearml_agent.helper.base import create_table
|
||||
return create_table(
|
||||
(dict((attr, getter(item, attr)) for attr in fields) for item in self),
|
||||
titles=fields, columns=fields, headers=True,
|
||||
@@ -66,11 +66,16 @@ class DataModel(object):
|
||||
}
|
||||
|
||||
def validate(self, schema=None):
|
||||
jsonschema.validate(
|
||||
self.to_dict(),
|
||||
schema or self._schema,
|
||||
types=dict(array=(list, tuple), integer=six.integer_types),
|
||||
schema = schema or self._schema
|
||||
validator = jsonschema.validators.validator_for(schema)
|
||||
validator_cls = jsonschema.validators.extend(
|
||||
validator=validator,
|
||||
type_checker=validator.TYPE_CHECKER.redefine_many({
|
||||
"array": lambda s, instance: isinstance(instance, (list, tuple)),
|
||||
"integer": lambda s, instance: isinstance(instance, six.integer_types),
|
||||
}),
|
||||
)
|
||||
jsonschema.validate(self.to_dict(), schema, cls=validator_cls)
|
||||
|
||||
def __repr__(self):
|
||||
return '<{}.{}: {}>'.format(
|
||||
33
clearml_agent/backend_api/session/defs.py
Normal file
33
clearml_agent/backend_api/session/defs.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from clearml_agent.helper.environment import EnvEntry
|
||||
from clearml_agent.helper.environment.converters import safe_text_to_bool
|
||||
|
||||
|
||||
ENV_HOST = EnvEntry("CLEARML_API_HOST", "TRAINS_API_HOST")
|
||||
ENV_WEB_HOST = EnvEntry("CLEARML_WEB_HOST", "TRAINS_WEB_HOST")
|
||||
ENV_FILES_HOST = EnvEntry("CLEARML_FILES_HOST", "TRAINS_FILES_HOST")
|
||||
ENV_ACCESS_KEY = EnvEntry("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY")
|
||||
ENV_SECRET_KEY = EnvEntry("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||
ENV_AUTH_TOKEN = EnvEntry("CLEARML_AUTH_TOKEN")
|
||||
ENV_VERBOSE = EnvEntry("CLEARML_API_VERBOSE", "TRAINS_API_VERBOSE", type=bool, default=False)
|
||||
ENV_HOST_VERIFY_CERT = EnvEntry("CLEARML_API_HOST_VERIFY_CERT", "TRAINS_API_HOST_VERIFY_CERT", type=bool, default=True)
|
||||
ENV_CONDA_ENV_PACKAGE = EnvEntry("CLEARML_CONDA_ENV_PACKAGE", "TRAINS_CONDA_ENV_PACKAGE")
|
||||
ENV_USE_CONDA_BASE_ENV = EnvEntry("CLEARML_USE_CONDA_BASE_ENV", type=bool)
|
||||
ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT_SERVER", type=bool, default=True)
|
||||
ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
|
||||
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
|
||||
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
|
||||
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
|
||||
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
|
||||
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
||||
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
||||
)
|
||||
ENV_FORCE_MAX_API_VERSION = EnvEntry("CLEARML_AGENT_FORCE_MAX_API_VERSION", type=str)
|
||||
|
||||
"""
|
||||
Experimental option to set the request method for all API requests and auth login.
|
||||
This could be useful when GET requests with payloads are blocked by a server as
|
||||
POST requests can be used instead.
|
||||
|
||||
However this has not been rigorously tested and may have unintended consequences.
|
||||
"""
|
||||
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD", default="GET")
|
||||
@@ -5,10 +5,18 @@ import six
|
||||
|
||||
from .apimodel import ApiModel
|
||||
from .datamodel import DataModel
|
||||
from .defs import ENV_API_DEFAULT_REQ_METHOD
|
||||
|
||||
|
||||
# Reject unsupported values of the default request method when this module is imported.
# GET, POST and PUT are accepted (case-insensitive), so the message must list all three.
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST", "PUT"):
    raise ValueError(
        "CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get', 'post' or 'put' "
        "(any case is allowed)."
    )
|
||||
|
||||
|
||||
class Request(ApiModel):
|
||||
_method = 'get'
|
||||
def_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
|
||||
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs:
|
||||
@@ -1,17 +1,26 @@
|
||||
|
||||
import json as json_lib
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import types
|
||||
from random import SystemRandom
|
||||
from socket import gethostname
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
from typing import Optional
|
||||
|
||||
import jwt
|
||||
import requests
|
||||
import six
|
||||
from pyhocon import ConfigTree
|
||||
from requests import RequestException
|
||||
from requests.auth import HTTPBasicAuth
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
|
||||
from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
|
||||
from .callresult import CallResult
|
||||
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST
|
||||
from .defs import (
|
||||
ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
|
||||
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD,
|
||||
ENV_FORCE_MAX_API_VERSION)
|
||||
from .request import Request, BatchRequest
|
||||
from .token_manager import TokenManager
|
||||
from ..config import load
|
||||
@@ -19,6 +28,8 @@ from ..utils import get_http_session_with_retry, urllib_log_warning_setup
|
||||
from ...backend_config.environment import backward_compatibility_support
|
||||
from ...version import __version__
|
||||
|
||||
sys_random = SystemRandom()
|
||||
|
||||
|
||||
class LoginError(Exception):
|
||||
pass
|
||||
@@ -29,27 +40,30 @@ class MaxRequestSizeError(Exception):
|
||||
|
||||
|
||||
class Session(TokenManager):
|
||||
""" TRAINS API Session class. """
|
||||
""" ClearML API Session class. """
|
||||
|
||||
_AUTHORIZATION_HEADER = "Authorization"
|
||||
_WORKER_HEADER = "X-Trains-Worker"
|
||||
_ASYNC_HEADER = "X-Trains-Async"
|
||||
_CLIENT_HEADER = "X-Trains-Agent"
|
||||
_WORKER_HEADER = ("X-ClearML-Worker", "X-Trains-Worker", )
|
||||
_ASYNC_HEADER = ("X-ClearML-Async", "X-Trains-Async", )
|
||||
_CLIENT_HEADER = ("X-ClearML-Agent", "X-Trains-Agent", )
|
||||
|
||||
_async_status_code = 202
|
||||
_session_requests = 0
|
||||
_session_initial_timeout = (3.0, 10.)
|
||||
_session_timeout = (10.0, 30.)
|
||||
_session_initial_connect_retry = 4
|
||||
_session_initial_retry_connect_override = 4
|
||||
_write_session_data_size = 15000
|
||||
_write_session_timeout = (30.0, 30.)
|
||||
_request_exception_retry_timeout = (2.0, 3.0)
|
||||
|
||||
api_version = '2.1'
|
||||
default_host = "https://demoapi.trains.allegro.ai"
|
||||
default_web = "https://demoapp.trains.allegro.ai"
|
||||
default_files = "https://demofiles.trains.allegro.ai"
|
||||
feature_set = 'basic'
|
||||
default_host = "https://demoapi.demo.clear.ml"
|
||||
default_web = "https://demoapp.demo.clear.ml"
|
||||
default_files = "https://demofiles.demo.clear.ml"
|
||||
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
|
||||
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
|
||||
force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()
|
||||
|
||||
# TODO: add requests.codes.gateway_timeout once we support async commits
|
||||
_retry_codes = [
|
||||
@@ -99,42 +113,51 @@ class Session(TokenManager):
|
||||
if initialize_logging:
|
||||
self.config.initialize_logging(debug=kwargs.get('debug', False))
|
||||
|
||||
token_expiration_threshold_sec = self.config.get(
|
||||
"auth.token_expiration_threshold_sec", 60
|
||||
)
|
||||
|
||||
super(Session, self).__init__(
|
||||
token_expiration_threshold_sec=token_expiration_threshold_sec, **kwargs
|
||||
)
|
||||
super(Session, self).__init__(config=config, **kwargs)
|
||||
|
||||
self._verbose = verbose if verbose is not None else ENV_VERBOSE.get()
|
||||
self._logger = logger
|
||||
self.__auth_token = None
|
||||
self._propagate_exceptions_on_send = True
|
||||
|
||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key)
|
||||
)
|
||||
if not self.access_key:
|
||||
raise ValueError(
|
||||
"Missing access_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
self.update_default_api_method()
|
||||
|
||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret)
|
||||
)
|
||||
if not self.secret_key:
|
||||
raise ValueError(
|
||||
"Missing secret_key. Please set in configuration file or pass in session init."
|
||||
if ENV_AUTH_TOKEN.get(
|
||||
value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
|
||||
):
|
||||
self.set_auth_token(ENV_AUTH_TOKEN.get())
|
||||
else:
|
||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key),
|
||||
value_cb=lambda key, value: print("Using environment access key {}={}".format(key, value))
|
||||
)
|
||||
if not self.access_key:
|
||||
raise ValueError(
|
||||
"Missing access_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
|
||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
|
||||
value_cb=lambda key, value: print("Using environment secret key {}=********".format(key))
|
||||
)
|
||||
if not self.secret_key:
|
||||
raise ValueError(
|
||||
"Missing secret_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
|
||||
if self.access_key == self.default_key and self.secret_key == self.default_secret:
|
||||
print("Using built-in ClearML default key/secret")
|
||||
|
||||
host = host or self.get_api_server_host(config=self.config)
|
||||
if not host:
|
||||
raise ValueError("host is required in init or config")
|
||||
raise ValueError(
|
||||
"Could not find host server definition "
|
||||
"(missing `~/clearml.conf` or Environment CLEARML_API_HOST)\n"
|
||||
"To get started with ClearML: setup your own `clearml-server`, "
|
||||
"or create a free account at https://app.clear.ml and run `clearml-agent init`"
|
||||
)
|
||||
|
||||
self.__host = host.strip("/")
|
||||
http_retries_config = http_retries_config or self.config.get(
|
||||
"api.http.retries", ConfigTree()
|
||||
).as_plain_ordered_dict()
|
||||
http_retries_config["status_forcelist"] = self._retry_codes
|
||||
|
||||
self.__worker = worker or gethostname()
|
||||
|
||||
@@ -145,22 +168,29 @@ class Session(TokenManager):
|
||||
self.client = client or "api-{}".format(__version__)
|
||||
|
||||
# limit the reconnect retries, so we get an error if we are starting the session
|
||||
http_no_retries_config = dict(**http_retries_config)
|
||||
http_no_retries_config['connect'] = self._session_initial_connect_retry
|
||||
self.__http_session = get_http_session_with_retry(**http_no_retries_config)
|
||||
_, self.__http_session = self._setup_session(
|
||||
http_retries_config,
|
||||
initial_session=True,
|
||||
default_initial_connect_override=(False if kwargs.get("command") == "execute" else None)
|
||||
)
|
||||
# try to connect with the server
|
||||
self.refresh_token()
|
||||
|
||||
# for resilience, from now on we won't allow propagating exceptions when sending requests
|
||||
self._propagate_exceptions_on_send = False
|
||||
|
||||
# create the default session with many retries
|
||||
self.__http_session = get_http_session_with_retry(**http_retries_config)
|
||||
http_retries_config, self.__http_session = self._setup_session(http_retries_config)
|
||||
|
||||
# update api version from server response
|
||||
try:
|
||||
token_dict = jwt.decode(self.token, verify=False)
|
||||
token_dict = TokenManager.get_decoded_token(self.token, verify=False)
|
||||
api_version = token_dict.get('api_version')
|
||||
if not api_version:
|
||||
api_version = '2.2' if token_dict.get('env', '') == 'prod' else Session.api_version
|
||||
|
||||
Session.api_version = str(api_version)
|
||||
Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
|
||||
except (jwt.DecodeError, ValueError):
|
||||
pass
|
||||
|
||||
@@ -169,17 +199,109 @@ class Session(TokenManager):
|
||||
# notice: this is across the board warning omission
|
||||
urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)
|
||||
|
||||
if self.force_max_api_version and self.check_min_api_version(self.force_max_api_version):
|
||||
print("Using forced API version {}".format(self.force_max_api_version))
|
||||
Session.max_api_version = Session.api_version = str(self.force_max_api_version)
|
||||
|
||||
self.pre_vault_config = None
|
||||
|
||||
def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
    # type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
    """
    Build an HTTP session from a retry configuration.

    :param http_retries_config: retry settings; when empty, the "api.http.retries"
        configuration section is used instead
    :param initial_session: when True, the 'connect' retry count may be overridden,
        as controlled by ENV_INITIAL_CONNECT_RETRY_OVERRIDE
    :param default_initial_connect_override: optional default passed when reading the override entry
    :return: tuple of (retry configuration that was used, HTTP session)
    """
    if not http_retries_config:
        http_retries_config = self.config.get("api.http.retries", ConfigTree()).as_plain_ordered_dict()
    http_retries_config["status_forcelist"] = self._retry_codes

    if initial_session:
        env_kwargs = {}
        if default_initial_connect_override is not None:
            env_kwargs["default"] = default_initial_connect_override

        if ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(**env_kwargs):
            # start from the class default; an explicit numeric value replaces it
            connect_retries = self._session_initial_retry_connect_override
            try:
                raw_value = ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(converter=str)
                if not isinstance(raw_value, bool):
                    connect_retries = abs(int(raw_value))
            except ValueError:
                # the value is not a number, keep the class default
                pass

            # work on a copy so the 'connect' override is not written into the dict we were given
            http_retries_config = dict(**http_retries_config)
            http_retries_config['connect'] = connect_retries

    return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)
|
||||
|
||||
def update_default_api_method(self):
    """
    Keep the default request method consistent between the environment, the configuration and Request.

    If ENV_API_DEFAULT_REQ_METHOD is set, its value is stored under "api.http.default_method".
    Otherwise, a configured "api.http.default_method" is validated and applied to
    Request.def_method and Request._method.

    :raise ValueError: if the configured method is not get / post / put (case-insensitive)
    """
    if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
        # Update the config object, so the value is passed into new containers when we map them.
        # Notice: the default settings of Request.def_method are already set from the OS environment.
        self.config.put("api.http.default_method", ENV_API_DEFAULT_REQ_METHOD.get())
        return

    configured = self.config.get("api.http.default_method", None)
    if not configured:
        return

    def_method = str(configured).strip()
    if def_method.upper() not in ("GET", "POST", "PUT"):
        raise ValueError(
            "api.http.default_method variable must be 'get', 'post' or 'put' (any case is allowed)."
        )

    Request.def_method = def_method
    Request._method = Request.def_method
|
||||
|
||||
def load_vaults(self):
    # type: () -> Optional[bool]
    """
    Fetch the enabled configuration vaults from the server and apply them as configuration overrides.

    Nothing is done when the server API is older than 2.15, the feature set is "basic",
    or ENV_DISABLE_VAULT_SUPPORT is set. Failures are printed and not raised.

    :return: True if at least one vault was applied, None otherwise
    """
    if not self.check_min_api_version("2.15") or self.feature_set == "basic":
        return None

    if ENV_DISABLE_VAULT_SUPPORT.get():
        print("Vault support is disabled")
        return None

    def _parse_vault(vault):
        """Return the parsed vault content, or None if it is empty or cannot be parsed."""
        # noinspection PyBroadException
        try:
            scope = vault.get("scope", "")
            title = (vault.get("description", None) or "")[:50] or vault.get("id", "")
            print("Loaded {} vault: {}".format(scope, title))

            content = vault.get("data", None)
            if not content:
                return None
            parsed = ConfigFactory.parse_string(content)
            return parsed if isinstance(parsed, (ConfigTree, dict)) else None
        except Exception as e:
            print("Failed parsing vault {}: {}".format(vault.get("description", "<unknown>"), e))
        return None

    # noinspection PyBroadException
    try:
        # Use params and not data/json, otherwise the payload might be dropped
        # if we're using GET with a strict firewall
        res = self.send_request("users", "get_vaults", params="enabled=true&types=config&types=config")
        if res.ok:
            vaults = res.json().get("data", {}).get("vaults", [])
            overrides = [cfg for cfg in map(_parse_vault, vaults) if cfg]
            if overrides:
                # keep a copy of the configuration as it was before the vaults were applied
                self.pre_vault_config = self.config.copy()
                self.config.set_overrides(*overrides)
                return True
        elif res.status_code != 404:
            raise Exception(res.json().get("meta", {}).get("result_msg", res.text))
    except Exception as ex:
        print("Failed getting vaults: {}".format(ex))

    return None
|
||||
|
||||
def verify_feature_set(self, feature_set):
    """
    Make sure the session's feature set is one of the requested feature sets.

    :param feature_set: a single feature set name, or a collection of acceptable names
    :raise ValueError: if the session's feature set is not among the requested ones
    """
    requested = [feature_set] if isinstance(feature_set, str) else feature_set
    if self.feature_set in requested:
        return
    raise ValueError('ClearML-server does not support requested feature set {}'.format(requested))
|
||||
|
||||
def _send_request(
|
||||
self,
|
||||
service,
|
||||
action,
|
||||
version=None,
|
||||
method="get",
|
||||
method=Request.def_method,
|
||||
headers=None,
|
||||
auth=None,
|
||||
data=None,
|
||||
json=None,
|
||||
refresh_token_if_unauthorized=True,
|
||||
params=None,
|
||||
):
|
||||
""" Internal implementation for making a raw API request.
|
||||
- Constructs the api endpoint name
|
||||
@@ -192,8 +314,10 @@ class Session(TokenManager):
|
||||
"""
|
||||
host = self.host
|
||||
headers = headers.copy() if headers else {}
|
||||
headers[self._WORKER_HEADER] = self.worker
|
||||
headers[self._CLIENT_HEADER] = self.client
|
||||
for h in self._WORKER_HEADER:
|
||||
headers[h] = self.worker
|
||||
for h in self._CLIENT_HEADER:
|
||||
headers[h] = self.client
|
||||
|
||||
token_refreshed_on_error = False
|
||||
url = (
|
||||
@@ -201,6 +325,7 @@ class Session(TokenManager):
|
||||
if version
|
||||
else "{host}/{service}.{action}"
|
||||
).format(**locals())
|
||||
|
||||
while True:
|
||||
if data and len(data) > self._write_session_data_size:
|
||||
timeout = self._write_session_timeout
|
||||
@@ -208,16 +333,29 @@ class Session(TokenManager):
|
||||
timeout = self._session_initial_timeout
|
||||
else:
|
||||
timeout = self._session_timeout
|
||||
res = self.__http_session.request(
|
||||
method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout)
|
||||
|
||||
try:
|
||||
res = self.__http_session.request(
|
||||
method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout, params=params)
|
||||
except RequestException as ex:
|
||||
if self._propagate_exceptions_on_send:
|
||||
raise
|
||||
sleep_time = sys_random.uniform(*self._request_exception_retry_timeout)
|
||||
self._logger.error(
|
||||
"{} exception sending {} {}: {} (retrying in {:.1f}sec)".format(
|
||||
type(ex).__name__, method.upper(), url, str(ex), sleep_time
|
||||
)
|
||||
)
|
||||
time.sleep(sleep_time)
|
||||
continue
|
||||
|
||||
if (
|
||||
refresh_token_if_unauthorized
|
||||
and res.status_code == requests.codes.unauthorized
|
||||
and not token_refreshed_on_error
|
||||
):
|
||||
# it seems we're unauthorized, so we'll try to refresh our token once in case permissions changed since
|
||||
# the last time we got the token, and try again
|
||||
# it seems we're unauthorized, so we'll try to refresh our token once in case permissions changed
|
||||
# since the last time we got the token, and try again
|
||||
self.refresh_token()
|
||||
token_refreshed_on_error = True
|
||||
# try again
|
||||
@@ -240,16 +378,21 @@ class Session(TokenManager):
|
||||
headers[self._AUTHORIZATION_HEADER] = "Bearer {}".format(self.token)
|
||||
return headers
|
||||
|
||||
def set_auth_token(self, auth_token):
    """
    Switch the session to the given authentication token.

    Clears the stored access key and secret key, then installs the token.

    :param auth_token: the token to install
    """
    # drop the key/secret pair before installing the token
    self.__access_key = None
    self.__secret_key = None
    self._set_token(auth_token)
|
||||
|
||||
def send_request(
|
||||
self,
|
||||
service,
|
||||
action,
|
||||
version=None,
|
||||
method="get",
|
||||
method=Request.def_method,
|
||||
headers=None,
|
||||
data=None,
|
||||
json=None,
|
||||
async_enable=False,
|
||||
params=None,
|
||||
):
|
||||
"""
|
||||
Send a raw API request.
|
||||
@@ -262,13 +405,15 @@ class Session(TokenManager):
|
||||
content type will be application/json)
|
||||
:param data: Dictionary, bytes, or file-like object to send in the request body
|
||||
:param async_enable: whether request is asynchronous
|
||||
:param params: additional query parameters
|
||||
:return: requests Response instance
|
||||
"""
|
||||
headers = self.add_auth_headers(
|
||||
headers.copy() if headers else {}
|
||||
)
|
||||
if async_enable:
|
||||
headers[self._ASYNC_HEADER] = "1"
|
||||
for h in self._ASYNC_HEADER:
|
||||
headers[h] = "1"
|
||||
return self._send_request(
|
||||
service=service,
|
||||
action=action,
|
||||
@@ -277,6 +422,7 @@ class Session(TokenManager):
|
||||
headers=headers,
|
||||
data=data,
|
||||
json=json,
|
||||
params=params,
|
||||
)
|
||||
|
||||
def send_request_batch(
|
||||
@@ -287,7 +433,7 @@ class Session(TokenManager):
|
||||
headers=None,
|
||||
data=None,
|
||||
json=None,
|
||||
method="get",
|
||||
method=Request.def_method,
|
||||
):
|
||||
"""
|
||||
Send a raw batch API request. Batch requests always use application/json-lines content type.
|
||||
@@ -436,8 +582,11 @@ class Session(TokenManager):
|
||||
if not config:
|
||||
return None
|
||||
|
||||
return ENV_HOST.get(default=(config.get("api.api_server", None) or
|
||||
config.get("api.host", None) or cls.default_host))
|
||||
default = config.get("api.api_server", None) or config.get("api.host", None)
|
||||
if not ENV_NO_DEFAULT_SERVER.get():
|
||||
default = default or cls.default_host
|
||||
|
||||
return ENV_HOST.get(default=default)
|
||||
|
||||
@classmethod
|
||||
def get_app_server_host(cls, config=None):
|
||||
@@ -464,7 +613,7 @@ class Session(TokenManager):
|
||||
if parsed.port == 8008:
|
||||
return host.replace(':8008', ':8080', 1)
|
||||
|
||||
raise ValueError('Could not detect TRAINS web application server')
|
||||
raise ValueError('Could not detect ClearML web application server')
|
||||
|
||||
@classmethod
|
||||
def get_files_server_host(cls, config=None):
|
||||
@@ -505,7 +654,7 @@ class Session(TokenManager):
|
||||
return v + (0,) * max(0, 3 - len(v))
|
||||
return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
|
||||
|
||||
def _do_refresh_token(self, old_token, exp=None):
|
||||
def _do_refresh_token(self, current_token, exp=None):
|
||||
""" TokenManager abstract method implementation.
|
||||
Obtains a new token, authenticating with the access/secret key pair if available, otherwise with the current token.
|
||||
"""
|
||||
@@ -517,16 +666,23 @@ class Session(TokenManager):
|
||||
)
|
||||
)
|
||||
|
||||
auth = HTTPBasicAuth(self.access_key, self.secret_key)
|
||||
auth = None
|
||||
headers = None
|
||||
if self.access_key and self.secret_key:
|
||||
auth = HTTPBasicAuth(self.access_key, self.secret_key)
|
||||
elif current_token:
|
||||
headers = dict(Authorization="Bearer {}".format(current_token))
|
||||
|
||||
res = None
|
||||
try:
|
||||
data = {"expiration_sec": exp} if exp else {}
|
||||
res = self._send_request(
|
||||
method=Request.def_method,
|
||||
service="auth",
|
||||
action="login",
|
||||
auth=auth,
|
||||
json=data,
|
||||
headers=headers,
|
||||
refresh_token_if_unauthorized=False,
|
||||
params={"expiration_sec": exp} if exp else {},
|
||||
)
|
||||
try:
|
||||
resp = res.json()
|
||||
@@ -541,20 +697,23 @@ class Session(TokenManager):
|
||||
)
|
||||
if verbose:
|
||||
self._logger.info("Received new token")
|
||||
return resp["data"]["token"]
|
||||
token = resp["data"]["token"]
|
||||
if ENV_AUTH_TOKEN.get():
|
||||
os.environ[ENV_AUTH_TOKEN.key] = token
|
||||
return token
|
||||
except LoginError:
|
||||
six.reraise(*sys.exc_info())
|
||||
except KeyError as ex:
|
||||
# check if this is a misconfigured api server (getting 200 without the data section)
|
||||
if res and res.status_code == 200:
|
||||
raise ValueError('It seems *api_server* is misconfigured. '
|
||||
'Is this the TRAINS API server {} ?'.format(self.get_api_server_host()))
|
||||
'Is this the ClearML API server {} ?'.format(self.get_api_server_host()))
|
||||
else:
|
||||
raise LoginError("Response data mismatch: No 'token' in 'data' value from res, receive : {}, "
|
||||
"exception: {}".format(res, ex))
|
||||
except requests.ConnectionError as ex:
|
||||
raise ValueError('Connection Error: it seems *api_server* is misconfigured. '
|
||||
'Is this the TRAINS API server {} ?'.format('/'.join(ex.request.url.split('/')[:3])))
|
||||
'Is this the ClearML API server {} ?'.format('/'.join(ex.request.url.split('/')[:3])))
|
||||
except Exception as ex:
|
||||
raise LoginError('Unrecognized Authentication Error: {} {}'.format(type(ex), ex))
|
||||
|
||||
@@ -562,3 +721,13 @@ class Session(TokenManager):
|
||||
return "{self.__class__.__name__}[{self.host}, {self.access_key}/{secret_key}]".format(
|
||||
self=self, secret_key=self.secret_key[:5] + "*" * (len(self.secret_key) - 5)
|
||||
)
|
||||
|
||||
@property
def propagate_exceptions_on_send(self):
    # type: () -> bool
    """Return the current value of the propagate-exceptions-on-send flag."""
    return self._propagate_exceptions_on_send

@propagate_exceptions_on_send.setter
def propagate_exceptions_on_send(self, value):
    # type: (bool) -> None
    """Set the propagate-exceptions-on-send flag."""
    self._propagate_exceptions_on_send = value
|
||||
@@ -3,11 +3,14 @@ from abc import ABCMeta, abstractmethod
|
||||
from time import time
|
||||
|
||||
import jwt
|
||||
from jwt.algorithms import get_default_algorithms
|
||||
import six
|
||||
|
||||
|
||||
@six.add_metaclass(ABCMeta)
|
||||
class TokenManager(object):
|
||||
_default_token_exp_threshold_sec = 12 * 60 * 60
|
||||
_default_req_token_expiration_sec = None
|
||||
|
||||
@property
|
||||
def token_expiration_threshold_sec(self):
|
||||
@@ -40,17 +43,30 @@ class TokenManager(object):
|
||||
return self.__token
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
token=None,
|
||||
req_token_expiration_sec=None,
|
||||
token_history=None,
|
||||
token_expiration_threshold_sec=60,
|
||||
**kwargs
|
||||
self,
|
||||
token=None,
|
||||
req_token_expiration_sec=None,
|
||||
token_history=None,
|
||||
token_expiration_threshold_sec=None,
|
||||
config=None,
|
||||
**kwargs
|
||||
):
|
||||
super(TokenManager, self).__init__()
|
||||
assert isinstance(token_history, (type(None), dict))
|
||||
self.token_expiration_threshold_sec = token_expiration_threshold_sec
|
||||
self.req_token_expiration_sec = req_token_expiration_sec
|
||||
if config:
|
||||
req_token_expiration_sec = req_token_expiration_sec or config.get(
|
||||
"api.auth.request_token_expiration_sec", None
|
||||
)
|
||||
token_expiration_threshold_sec = (
|
||||
token_expiration_threshold_sec
|
||||
or config.get("api.auth.token_expiration_threshold_sec", None)
|
||||
)
|
||||
self.token_expiration_threshold_sec = (
|
||||
token_expiration_threshold_sec or self._default_token_exp_threshold_sec
|
||||
)
|
||||
self.req_token_expiration_sec = (
|
||||
req_token_expiration_sec or self._default_req_token_expiration_sec
|
||||
)
|
||||
self._set_token(token)
|
||||
|
||||
def _calc_token_valid_period_sec(self, token, exp=None, at_least_sec=None):
|
||||
@@ -58,7 +74,9 @@ class TokenManager(object):
|
||||
try:
|
||||
exp = exp or self._get_token_exp(token)
|
||||
if at_least_sec:
|
||||
at_least_sec = max(at_least_sec, self.token_expiration_threshold_sec)
|
||||
at_least_sec = max(
|
||||
at_least_sec, self.token_expiration_threshold_sec
|
||||
)
|
||||
else:
|
||||
at_least_sec = self.token_expiration_threshold_sec
|
||||
return max(0, (exp - time() - at_least_sec))
|
||||
@@ -66,10 +84,26 @@ class TokenManager(object):
|
||||
pass
|
||||
return 0
|
||||
|
||||
@classmethod
def get_decoded_token(cls, token, verify=False):
    """
    Decode a JWT token and return its payload.

    Two call conventions of the `jwt` package are supported: the 1.x one, taking a
    `verify` argument, and the newer one, taking `options=dict(verify_signature=...)`.

    :param token: the encoded token
    :param verify: whether the token signature should be verified (default: False)
    :return: the decoded token payload
    """
    # Compare the whole major version component, not only the first character,
    # so that a major version such as "10" is not mistaken for "1".
    major_version = str(getattr(jwt, "__version__", "")).split(".")[0]
    if major_version == "1":
        return jwt.decode(
            token,
            verify=verify,
            algorithms=get_default_algorithms(),
        )

    return jwt.decode(
        token,
        options=dict(verify_signature=verify),
        algorithms=get_default_algorithms(),
    )
|
||||
|
||||
@classmethod
|
||||
def _get_token_exp(cls, token):
|
||||
""" Get token expiration time. If not present, assume forever """
|
||||
return jwt.decode(token, verify=False).get('exp', sys.maxsize)
|
||||
return cls.get_decoded_token(token).get("exp", sys.maxsize)
|
||||
|
||||
def _set_token(self, token):
|
||||
if token:
|
||||
@@ -80,7 +114,9 @@ class TokenManager(object):
|
||||
self.__token_expiration_sec = 0
|
||||
|
||||
def get_token_valid_period_sec(self):
|
||||
return self._calc_token_valid_period_sec(self.__token, self.token_expiration_sec)
|
||||
return self._calc_token_valid_period_sec(
|
||||
self.__token, self.token_expiration_sec
|
||||
)
|
||||
|
||||
def _get_token(self):
|
||||
if self.get_token_valid_period_sec() <= 0:
|
||||
@@ -92,4 +128,6 @@ class TokenManager(object):
|
||||
pass
|
||||
|
||||
def refresh_token(self):
|
||||
self._set_token(self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec))
|
||||
self._set_token(
|
||||
self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec)
|
||||
)
|
||||
@@ -6,16 +6,9 @@ import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util import Retry
|
||||
from urllib3 import PoolManager
|
||||
import six
|
||||
|
||||
from .session.defs import ENV_HOST_VERIFY_CERT
|
||||
|
||||
if six.PY3:
|
||||
from functools import lru_cache
|
||||
elif six.PY2:
|
||||
# python 2 support
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
|
||||
|
||||
__disable_certificate_verification_warning = 0
|
||||
|
||||
@@ -93,7 +86,10 @@ def get_http_session_with_retry(
|
||||
session = requests.Session()
|
||||
|
||||
if backoff_max is not None:
|
||||
Retry.BACKOFF_MAX = backoff_max
|
||||
if "BACKOFF_MAX" in vars(Retry):
|
||||
Retry.BACKOFF_MAX = backoff_max
|
||||
else:
|
||||
Retry.DEFAULT_BACKOFF_MAX = backoff_max
|
||||
|
||||
retry = Retry(
|
||||
total=total, connect=connect, read=read, redirect=redirect, status=status,
|
||||
@@ -107,7 +103,7 @@ def get_http_session_with_retry(
|
||||
if not session.verify and __disable_certificate_verification_warning < 2:
|
||||
# show warning
|
||||
__disable_certificate_verification_warning += 1
|
||||
logging.getLogger('TRAINS').warning(
|
||||
logging.getLogger('ClearML').warning(
|
||||
msg='InsecureRequestWarning: Certificate verification is disabled! Adding '
|
||||
'certificate verification is strongly advised. See: '
|
||||
'https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings')
|
||||
@@ -1,4 +1,3 @@
|
||||
from .defs import Environment
|
||||
from .config import Config, ConfigEntry
|
||||
from .errors import ConfigurationError
|
||||
from .environment import EnvEntry
|
||||
@@ -4,15 +4,11 @@ import functools
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from fnmatch import fnmatch
|
||||
from os.path import expanduser
|
||||
from typing import Any
|
||||
|
||||
import pyhocon
|
||||
import six
|
||||
from pathlib2 import Path
|
||||
from pyhocon import ConfigTree
|
||||
from pyparsing import (
|
||||
ParseFatalException,
|
||||
ParseException,
|
||||
@@ -20,6 +16,9 @@ from pyparsing import (
|
||||
ParseSyntaxException,
|
||||
)
|
||||
|
||||
from clearml_agent.external import pyhocon
|
||||
from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
|
||||
|
||||
from .defs import (
|
||||
Environment,
|
||||
DEFAULT_CONFIG_FOLDER,
|
||||
@@ -71,6 +70,10 @@ class Config(object):
|
||||
|
||||
# used in place of None in Config.get as default value because None is a valid value
|
||||
_MISSING = object()
|
||||
extra_config_values_env_key_sep = "__"
|
||||
extra_config_values_env_key_prefix = [
|
||||
"CLEARML_AGENT" + extra_config_values_env_key_sep,
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -90,6 +93,7 @@ class Config(object):
|
||||
self._env = env or os.environ.get("TRAINS_ENV", Environment.default)
|
||||
self.config_paths = set()
|
||||
self.is_server = is_server
|
||||
self._overrides_configs = None
|
||||
|
||||
if self._verbose:
|
||||
print("Config env:%s" % str(self._env))
|
||||
@@ -100,6 +104,7 @@ class Config(object):
|
||||
)
|
||||
if self._env not in get_options(Environment):
|
||||
raise ValueError("Invalid environment %s" % env)
|
||||
|
||||
if relative_to is not None:
|
||||
self.load_relative_to(relative_to)
|
||||
|
||||
@@ -138,7 +143,7 @@ class Config(object):
|
||||
else:
|
||||
env_config_paths = []
|
||||
|
||||
env_config_path_override = os.environ.get(ENV_CONFIG_PATH_OVERRIDE_VAR)
|
||||
env_config_path_override = ENV_CONFIG_PATH_OVERRIDE_VAR.get()
|
||||
if env_config_path_override:
|
||||
env_config_paths = [expanduser(env_config_path_override)]
|
||||
|
||||
@@ -158,14 +163,16 @@ class Config(object):
|
||||
if LOCAL_CONFIG_PATHS:
|
||||
config = functools.reduce(
|
||||
lambda cfg, path: ConfigTree.merge_configs(
|
||||
cfg, self._read_recursive(path, verbose=self._verbose), copy_trees=True
|
||||
cfg,
|
||||
self._read_recursive(path, verbose=self._verbose),
|
||||
copy_trees=True,
|
||||
),
|
||||
LOCAL_CONFIG_PATHS,
|
||||
config,
|
||||
)
|
||||
|
||||
local_config_files = LOCAL_CONFIG_FILES
|
||||
local_config_override = os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR)
|
||||
local_config_override = LOCAL_CONFIG_FILE_OVERRIDE_VAR.get()
|
||||
if local_config_override:
|
||||
local_config_files = [expanduser(local_config_override)]
|
||||
|
||||
@@ -181,9 +188,42 @@ class Config(object):
|
||||
config,
|
||||
)
|
||||
|
||||
config = ConfigTree.merge_configs(
|
||||
config, self._read_extra_env_config_values(), copy_trees=True
|
||||
)
|
||||
|
||||
config = self.resolve_override_configs(config)
|
||||
|
||||
config["env"] = env
|
||||
return config
|
||||
|
||||
def resolve_override_configs(self, initial=None):
|
||||
if not self._overrides_configs:
|
||||
return initial
|
||||
return functools.reduce(
|
||||
lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
|
||||
self._overrides_configs,
|
||||
initial or ConfigTree(),
|
||||
)
|
||||
|
||||
def _read_extra_env_config_values(self) -> ConfigTree:
|
||||
""" Loads extra configuration from environment-injected values """
|
||||
result = ConfigTree()
|
||||
|
||||
for prefix in self.extra_config_values_env_key_prefix:
|
||||
keys = sorted(k for k in os.environ if k.startswith(prefix))
|
||||
for key in keys:
|
||||
path = (
|
||||
key[len(prefix) :]
|
||||
.replace(self.extra_config_values_env_key_sep, ".")
|
||||
.lower()
|
||||
)
|
||||
result = ConfigTree.merge_configs(
|
||||
result, ConfigFactory.parse_string("{}: {}".format(path, os.environ[key]))
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
def replace(self, config):
|
||||
self._config = config
|
||||
|
||||
@@ -254,6 +294,12 @@ class Config(object):
|
||||
)
|
||||
return value
|
||||
|
||||
def put(self, key, value):
|
||||
self._config.put(key, value)
|
||||
|
||||
def pop(self, key, default=None):
|
||||
return self._config.pop(key, default=default)
|
||||
|
||||
def to_dict(self):
|
||||
return self._config.as_plain_ordered_dict()
|
||||
|
||||
@@ -340,3 +386,10 @@ class Config(object):
|
||||
except Exception as ex:
|
||||
print("Failed loading %s: %s" % (file_path, ex))
|
||||
raise
|
||||
|
||||
def set_overrides(self, *dicts):
|
||||
""" Set several override dictionaries or ConfigTree objects which should be merged onto the configuration """
|
||||
self._overrides_configs = [
|
||||
d if isinstance(d, ConfigTree) else pyhocon.ConfigFactory.from_dict(d) for d in dicts
|
||||
]
|
||||
self.reload()
|
||||
8
clearml_agent/backend_config/converters.py
Normal file
8
clearml_agent/backend_config/converters.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from clearml_agent.helper.environment.converters import (
|
||||
base64_to_text,
|
||||
text_to_bool,
|
||||
text_to_int,
|
||||
safe_text_to_bool,
|
||||
any_to_bool,
|
||||
or_,
|
||||
)
|
||||
@@ -1,6 +1,8 @@
|
||||
from os.path import expanduser
|
||||
from pathlib2 import Path
|
||||
|
||||
from ..backend_config.environment import EnvEntry
|
||||
|
||||
ENV_VAR = 'TRAINS_ENV'
|
||||
""" Name of system environment variable that can be used to specify the config environment name """
|
||||
|
||||
@@ -17,23 +19,24 @@ ENV_CONFIG_PATHS = [
|
||||
|
||||
|
||||
LOCAL_CONFIG_PATHS = [
|
||||
# '/etc/opt/trains', # used by servers for docker-generated configuration
|
||||
# expanduser('~/.trains/config'),
|
||||
# '/etc/opt/clearml', # used by servers for docker-generated configuration
|
||||
# expanduser('~/.clearml/config'),
|
||||
]
|
||||
""" Local config paths, not related to environment """
|
||||
|
||||
|
||||
LOCAL_CONFIG_FILES = [
|
||||
expanduser('~/trains.conf'), # used for workstation configuration (end-users, workers)
|
||||
expanduser('~/clearml.conf'), # used for workstation configuration (end-users, workers)
|
||||
]
|
||||
""" Local config files (not paths) """
|
||||
|
||||
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR = 'TRAINS_CONFIG_FILE'
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR = EnvEntry('CLEARML_CONFIG_FILE', 'TRAINS_CONFIG_FILE', )
|
||||
""" Local config file override environment variable. If this is set, no other local config files will be used. """
|
||||
|
||||
|
||||
ENV_CONFIG_PATH_OVERRIDE_VAR = 'TRAINS_CONFIG_PATH'
|
||||
ENV_CONFIG_PATH_OVERRIDE_VAR = EnvEntry('CLEARML_CONFIG_PATH', 'TRAINS_CONFIG_PATH', )
|
||||
"""
|
||||
Environment-related config path override environment variable. If this is set, no other env config path will be used.
|
||||
"""
|
||||
6
clearml_agent/backend_config/entry.py
Normal file
6
clearml_agent/backend_config/entry.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from clearml_agent.helper.environment import Entry, NotSet
|
||||
|
||||
__all__ = [
|
||||
"Entry",
|
||||
"NotSet"
|
||||
]
|
||||
46
clearml_agent/backend_config/environment.py
Normal file
46
clearml_agent/backend_config/environment.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from os import environ
|
||||
|
||||
from clearml_agent.helper.environment import EnvEntry
|
||||
|
||||
|
||||
def backward_compatibility_support():
|
||||
from ..definitions import ENVIRONMENT_CONFIG, ENVIRONMENT_SDK_PARAMS, ENVIRONMENT_BACKWARD_COMPATIBLE
|
||||
if ENVIRONMENT_BACKWARD_COMPATIBLE.get():
|
||||
# Add TRAINS_ prefix on every CLEARML_ os environment we support
|
||||
for k, v in ENVIRONMENT_CONFIG.items():
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
trains_vars = [var for var in v.vars if var.startswith('CLEARML_')]
|
||||
if not trains_vars:
|
||||
continue
|
||||
alg_var = trains_vars[0].replace('CLEARML_', 'TRAINS_', 1)
|
||||
if alg_var not in v.vars:
|
||||
v.vars = tuple(list(v.vars) + [alg_var])
|
||||
except:
|
||||
continue
|
||||
for k, v in ENVIRONMENT_SDK_PARAMS.items():
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
trains_vars = [var for var in v if var.startswith('CLEARML_')]
|
||||
if not trains_vars:
|
||||
continue
|
||||
alg_var = trains_vars[0].replace('CLEARML_', 'TRAINS_', 1)
|
||||
if alg_var not in v:
|
||||
ENVIRONMENT_SDK_PARAMS[k] = tuple(list(v) + [alg_var])
|
||||
except:
|
||||
continue
|
||||
|
||||
# set OS environ:
|
||||
keys = list(environ.keys())
|
||||
for k in keys:
|
||||
if not k.startswith('CLEARML_'):
|
||||
continue
|
||||
backwards_k = k.replace('CLEARML_', 'TRAINS_', 1)
|
||||
if backwards_k not in keys:
|
||||
environ[backwards_k] = environ[k]
|
||||
|
||||
|
||||
__all__ = [
|
||||
"EnvEntry",
|
||||
"backward_compatibility_support"
|
||||
]
|
||||
@@ -4,11 +4,11 @@ from pathlib2 import Path
|
||||
|
||||
|
||||
def logger(path=None):
|
||||
name = "trains"
|
||||
name = "clearml"
|
||||
if path:
|
||||
p = Path(path)
|
||||
module = (p.parent if p.stem.startswith('_') else p).stem
|
||||
name = "trains.%s" % module
|
||||
name = "clearml.%s" % module
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
125
clearml_agent/backend_config/utils.py
Normal file
125
clearml_agent/backend_config/utils.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import base64
|
||||
import os
|
||||
from os.path import expandvars, expanduser
|
||||
from pathlib import Path
|
||||
from typing import List, TYPE_CHECKING
|
||||
|
||||
from clearml_agent.external.pyhocon import HOCONConverter, ConfigTree
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .config import Config
|
||||
|
||||
|
||||
def get_items(cls):
|
||||
""" get key/value items from an enum-like class (members represent enumeration key/value) """
|
||||
return {k: v for k, v in vars(cls).items() if not k.startswith('_')}
|
||||
|
||||
|
||||
def get_options(cls):
|
||||
""" get options from an enum-like class (members represent enumeration key/value) """
|
||||
return get_items(cls).values()
|
||||
|
||||
|
||||
def apply_environment(config):
|
||||
# type: (Config) -> List[str]
|
||||
env_vars = config.get("environment", None)
|
||||
if not env_vars:
|
||||
return []
|
||||
if isinstance(env_vars, (list, tuple)):
|
||||
env_vars = dict(env_vars)
|
||||
|
||||
keys = list(filter(None, env_vars.keys()))
|
||||
|
||||
for key in keys:
|
||||
value = env_vars[key]
|
||||
os.environ[str(key)] = str(value if value is not None else "")
|
||||
|
||||
return keys
|
||||
|
||||
|
||||
def apply_files(config):
|
||||
# type: (Config) -> None
|
||||
files = config.get("files", None)
|
||||
if not files:
|
||||
return
|
||||
|
||||
if isinstance(files, (list, tuple)):
|
||||
files = dict(files)
|
||||
|
||||
print("Creating files from configuration")
|
||||
for key, data in files.items():
|
||||
path = data.get("path")
|
||||
fmt = data.get("format", "string")
|
||||
target_fmt = data.get("target_format", "string")
|
||||
overwrite = bool(data.get("overwrite", True))
|
||||
contents = data.get("contents")
|
||||
mode = data.get("mode")
|
||||
|
||||
target = Path(expanduser(expandvars(path)))
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if target.is_dir():
|
||||
print("Skipped [{}]: is a directory {}".format(key, target))
|
||||
continue
|
||||
|
||||
if not overwrite and target.is_file():
|
||||
print("Skipped [{}]: file exists {}".format(key, target))
|
||||
continue
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: can't access {} ({})".format(key, target, ex))
|
||||
continue
|
||||
|
||||
if contents:
|
||||
try:
|
||||
if fmt == "base64":
|
||||
contents = base64.b64decode(contents)
|
||||
if target_fmt != "bytes":
|
||||
contents = contents.decode("utf-8")
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed decoding {} ({})".format(key, fmt, ex))
|
||||
continue
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed creating path {} ({})".format(key, target.parent, ex))
|
||||
continue
|
||||
|
||||
try:
|
||||
if target_fmt == "bytes":
|
||||
try:
|
||||
target.write_bytes(contents)
|
||||
except TypeError:
|
||||
# simpler error so the user won't get confused
|
||||
raise TypeError("a bytes-like object is required")
|
||||
else:
|
||||
try:
|
||||
if target_fmt == "json":
|
||||
text = HOCONConverter.to_json(contents)
|
||||
elif target_fmt in ("yaml", "yml"):
|
||||
text = HOCONConverter.to_yaml(contents)
|
||||
else:
|
||||
if isinstance(contents, ConfigTree):
|
||||
contents = contents.as_plain_ordered_dict()
|
||||
text = str(contents)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed encoding to {} ({})".format(key, target_fmt, ex))
|
||||
continue
|
||||
target.write_text(text)
|
||||
print("Saved [{}]: {}".format(key, target))
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed saving file {} ({})".format(key, target, ex))
|
||||
continue
|
||||
|
||||
try:
|
||||
if mode:
|
||||
if isinstance(mode, int):
|
||||
mode = int(str(mode), 8)
|
||||
else:
|
||||
mode = int(mode, 8)
|
||||
target.chmod(mode)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed setting mode {} for {} ({})".format(key, mode, target, ex))
|
||||
continue
|
||||
@@ -9,16 +9,16 @@ from operator import attrgetter
|
||||
from traceback import print_exc
|
||||
from typing import Text
|
||||
|
||||
from trains_agent.helper.console import ListFormatter, print_text
|
||||
from trains_agent.helper.dicts import filter_keys
|
||||
from clearml_agent.helper.console import ListFormatter, print_text
|
||||
from clearml_agent.helper.dicts import filter_keys
|
||||
|
||||
import six
|
||||
from trains_agent.backend_api import services
|
||||
from clearml_agent.backend_api import services
|
||||
|
||||
from trains_agent.errors import APIError, CommandFailedError
|
||||
from trains_agent.helper.base import Singleton, return_list, print_parameters, dump_yaml, load_yaml, error, warning
|
||||
from trains_agent.interface.base import ObjectID
|
||||
from trains_agent.session import Session
|
||||
from clearml_agent.errors import APIError, CommandFailedError
|
||||
from clearml_agent.helper.base import Singleton, return_list, print_parameters, dump_yaml, load_yaml, error, warning
|
||||
from clearml_agent.interface.base import ObjectID
|
||||
from clearml_agent.session import Session
|
||||
|
||||
|
||||
class NameResolutionError(CommandFailedError):
|
||||
@@ -74,7 +74,7 @@ class BaseCommandSection(object):
|
||||
|
||||
@staticmethod
|
||||
def log(message, *args):
|
||||
print("trains-agent: {}".format(message % args))
|
||||
print("clearml-agent: {}".format(message % args))
|
||||
|
||||
@classmethod
|
||||
def exit(cls, message, code=1): # type: (Text, int) -> ()
|
||||
@@ -118,11 +118,15 @@ class ServiceCommandSection(BaseCommandSection):
|
||||
""" The name of the REST service used by this command """
|
||||
pass
|
||||
|
||||
def get(self, endpoint, *args, **kwargs):
|
||||
return self._session.get(service=self.service, action=endpoint, *args, **kwargs)
|
||||
def get(self, endpoint, *args, service=None, session=None, **kwargs):
|
||||
session = session or self._session
|
||||
service = service or self.service
|
||||
return session.get(service=service, action=endpoint, *args, **kwargs)
|
||||
|
||||
def post(self, endpoint, *args, **kwargs):
|
||||
return self._session.post(service=self.service, action=endpoint, *args, **kwargs)
|
||||
def post(self, endpoint, *args, service=None, session=None, **kwargs):
|
||||
session = session or self._session
|
||||
service = service or self.service
|
||||
return session.post(service=service, action=endpoint, *args, **kwargs)
|
||||
|
||||
def get_with_act_as(self, endpoint, *args, **kwargs):
|
||||
return self._session.get_with_act_as(service=self.service, action=endpoint, *args, **kwargs)
|
||||
@@ -345,7 +349,7 @@ class ServiceCommandSection(BaseCommandSection):
|
||||
except AttributeError:
|
||||
raise NameResolutionError('Name resolution unavailable for {}'.format(service))
|
||||
|
||||
request = request_cls.from_dict(dict(name=name, only_fields=['name', 'id']))
|
||||
request = request_cls.from_dict(dict(name=re.escape(name), only_fields=['name', 'id']))
|
||||
# from_dict will ignore unrecognised keyword arguments - not all GetAll's have only_fields
|
||||
response = getattr(self._session.send_api(request), service)
|
||||
matches = [db_object for db_object in response if name.lower() == db_object.name.lower()]
|
||||
@@ -1,4 +1,4 @@
|
||||
from trains_agent.commands.base import ServiceCommandSection
|
||||
from clearml_agent.commands.base import ServiceCommandSection
|
||||
|
||||
|
||||
class Config(ServiceCommandSection):
|
||||
@@ -1,18 +1,21 @@
|
||||
from __future__ import print_function
|
||||
|
||||
from six.moves import input
|
||||
from pyhocon import ConfigFactory, ConfigMissingException
|
||||
from typing import Dict, Optional
|
||||
|
||||
from pathlib2 import Path
|
||||
from six.moves import input
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from trains_agent.backend_api.session import Session
|
||||
from trains_agent.backend_api.session.defs import ENV_HOST
|
||||
from trains_agent.backend_config.defs import LOCAL_CONFIG_FILES
|
||||
|
||||
from clearml_agent.backend_api.session import Session
|
||||
from clearml_agent.backend_api.session.defs import ENV_HOST
|
||||
from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES
|
||||
from clearml_agent.external.pyhocon import ConfigFactory, ConfigMissingException
|
||||
|
||||
description = """
|
||||
Please create new trains credentials through the profile page in your trains web app (e.g. https://demoapp.trains.allegro.ai/profile)
|
||||
In the profile page, press "Create new credentials", then press "Copy to clipboard".
|
||||
Please create new clearml credentials through the settings page in your `clearml-server` web app,
|
||||
or create a free account at https://app.clear.ml/settings/webapp-configuration
|
||||
|
||||
In the settings > workspace page, press "Create new credentials", then press "Copy to clipboard".
|
||||
|
||||
Paste copied configuration here:
|
||||
"""
|
||||
@@ -25,24 +28,33 @@ except Exception:
|
||||
|
||||
host_description = """
|
||||
Editing configuration file: {CONFIG_FILE}
|
||||
Enter the url of the trains-server's Web service, for example: {HOST}
|
||||
Enter the url of the clearml-server's Web service, for example: {HOST} or https://app.clear.ml
|
||||
""".format(
|
||||
CONFIG_FILE=LOCAL_CONFIG_FILES[0],
|
||||
CONFIG_FILE=LOCAL_CONFIG_FILES[-1],
|
||||
HOST=def_host,
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
print('TRAINS-AGENT setup process')
|
||||
conf_file = Path(LOCAL_CONFIG_FILES[0]).absolute()
|
||||
print('CLEARML-AGENT setup process')
|
||||
for f in LOCAL_CONFIG_FILES:
|
||||
conf_file = Path(f).absolute()
|
||||
if conf_file.exists():
|
||||
break
|
||||
|
||||
if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
|
||||
print('Configuration file already exists: {}'.format(str(conf_file)))
|
||||
print('Leaving setup, feel free to edit the configuration file.')
|
||||
print('Leaving setup. If you\'ve previously initialized the ClearML SDK on this machine, manually add an \'agent\' section to this file.')
|
||||
return
|
||||
|
||||
print(description, end='')
|
||||
sentinel = ''
|
||||
parse_input = '\n'.join(iter(input, sentinel))
|
||||
parse_input = ''
|
||||
for line in iter(input, sentinel):
|
||||
parse_input += line+'\n'
|
||||
if line.rstrip() == '}':
|
||||
break
|
||||
|
||||
credentials = None
|
||||
api_server = None
|
||||
web_server = None
|
||||
@@ -73,7 +85,7 @@ def main():
|
||||
host = input_url('API Host', api_server)
|
||||
else:
|
||||
print(host_description)
|
||||
host = input_url('WEB Host', '')
|
||||
host = input_url('WEB Host', 'https://app.clear.ml')
|
||||
|
||||
parsed_host = verify_url(host)
|
||||
api_host, files_host, web_host = parse_host(parsed_host, allow_input=True)
|
||||
@@ -86,7 +98,7 @@ def main():
|
||||
|
||||
files_host = input_url('File Store Host', files_host)
|
||||
|
||||
print('\nTRAINS Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
|
||||
print('\nClearML Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
|
||||
web_host, api_host, files_host))
|
||||
|
||||
retry = 1
|
||||
@@ -101,13 +113,34 @@ def main():
|
||||
print('Exiting setup without creating configuration file')
|
||||
return
|
||||
|
||||
selection = input_options(
|
||||
'Default Output URI (used to automatically store models and artifacts)',
|
||||
{'N': 'None', 'S': 'ClearML Server', 'C': 'Custom'},
|
||||
default='None'
|
||||
)
|
||||
if selection == 'Custom':
|
||||
print('Custom Default Output URI: ', end='')
|
||||
default_output_uri = input().strip()
|
||||
elif selection == "ClearML Server":
|
||||
default_output_uri = files_host
|
||||
else:
|
||||
default_output_uri = None
|
||||
|
||||
print('\nDefault Output URI: {}'.format(default_output_uri if default_output_uri else 'not set'))
|
||||
|
||||
# get GIT User/Pass for cloning
|
||||
print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
|
||||
git_user = input()
|
||||
if git_user.strip():
|
||||
print('Enter password for user \'{}\': '.format(git_user), end='')
|
||||
print(
|
||||
"Git personal token is equivalent to a password, to learn how to generate a token:\n"
|
||||
" GitHub: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token\n" # noqa
|
||||
" Bitbucket: https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/\n"
|
||||
" GitLab: https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html\n"
|
||||
)
|
||||
print('Enter git personal token for user \'{}\': '.format(git_user), end='')
|
||||
git_pass = input()
|
||||
print('Git repository cloning will be using user={} password={}'.format(git_user, git_pass))
|
||||
print('Git repository cloning will be using user={} token={}'.format(git_user, git_pass))
|
||||
else:
|
||||
git_user = None
|
||||
git_pass = None
|
||||
@@ -140,14 +173,14 @@ def main():
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
with open(str(conf_file), 'wt') as f:
|
||||
header = '# TRAINS-AGENT configuration file\n' \
|
||||
header = '# CLEARML-AGENT configuration file\n' \
|
||||
'api {\n' \
|
||||
' # Notice: \'host\' is the api server (default port 8008), not the web server.\n' \
|
||||
' api_server: %s\n' \
|
||||
' web_server: %s\n' \
|
||||
' files_server: %s\n' \
|
||||
' # Credentials are generated using the webapp, %s/profile\n' \
|
||||
' # Override with os environment: TRAINS_API_ACCESS_KEY / TRAINS_API_SECRET_KEY\n' \
|
||||
' # Credentials are generated using the webapp, %s/settings\n' \
|
||||
' # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY\n' \
|
||||
' credentials {"access_key": "%s", "secret_key": "%s"}\n' \
|
||||
'}\n\n' % (api_host, web_host, files_host,
|
||||
web_host, credentials['access_key'], credentials['secret_key'])
|
||||
@@ -158,17 +191,24 @@ def main():
|
||||
'agent.git_pass=\"{}\"\n' \
|
||||
'\n'.format(git_user or '', git_pass or '')
|
||||
f.write(git_credentials)
|
||||
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]\n' \
|
||||
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]\n' \
|
||||
'agent.package_manager.extra_index_url= ' \
|
||||
'[\n{}\n]\n\n'.format("\n".join(map("\"{}\"".format, extra_index_urls)))
|
||||
f.write(extra_index_str)
|
||||
if default_output_uri:
|
||||
default_output_url_str = '# Default Task output_uri. if output_uri is not provided to Task.init, ' \
|
||||
'default_output_uri will be used instead.\n' \
|
||||
'sdk.development.default_output_uri="{}"\n' \
|
||||
'\n'.format(default_output_uri.strip('"'))
|
||||
f.write(default_output_url_str)
|
||||
default_conf = default_conf.replace('default_output_uri: ""', '# default_output_uri: ""')
|
||||
f.write(default_conf)
|
||||
except Exception:
|
||||
print('Error! Could not write configuration file at: {}'.format(str(conf_file)))
|
||||
return
|
||||
|
||||
print('\nNew configuration stored in {}'.format(str(conf_file)))
|
||||
print('TRAINS-AGENT setup completed successfully.')
|
||||
print('CLEARML-AGENT setup completed successfully.')
|
||||
|
||||
|
||||
def parse_host(parsed_host, allow_input=True):
|
||||
@@ -288,6 +328,25 @@ def input_url(host_type, host=None):
|
||||
return host
|
||||
|
||||
|
||||
def input_options(message, options, default=None):
|
||||
# type: (str, Dict[str, str], Optional[str]) -> str
|
||||
options_msg = "/".join(
|
||||
"".join(('(' + c.upper() + ')') if c == o else c for c in option)
|
||||
for o, option in options.items()
|
||||
)
|
||||
if default:
|
||||
options_msg += " [{}]".format(default)
|
||||
while True:
|
||||
print('{}: {} '.format(message, options_msg), end='')
|
||||
res = input().strip()
|
||||
if not res:
|
||||
return default
|
||||
elif res.lower() in options:
|
||||
return options[res.lower()]
|
||||
elif res.upper() in options:
|
||||
return options[res.upper()]
|
||||
|
||||
|
||||
def input_host_port(host_type, parsed_host):
|
||||
print('Enter port for {} host '.format(host_type), end='')
|
||||
replace_port = input().lower()
|
||||
@@ -309,7 +368,7 @@ def verify_url(parse_input):
|
||||
parsed_host = None
|
||||
except Exception:
|
||||
parsed_host = None
|
||||
print('Could not parse url {}\nEnter your trains-server host: '.format(parse_input), end='')
|
||||
print('Could not parse url {}\nEnter your clearml-server host: '.format(parse_input), end='')
|
||||
return parsed_host
|
||||
|
||||
|
||||
@@ -2,11 +2,10 @@ from __future__ import print_function
|
||||
|
||||
import json
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
|
||||
from future.builtins import super
|
||||
|
||||
from trains_agent.commands.base import ServiceCommandSection
|
||||
from trains_agent.helper.base import return_list
|
||||
from clearml_agent.commands.base import ServiceCommandSection
|
||||
from clearml_agent.helper.base import return_list
|
||||
|
||||
|
||||
class Events(ServiceCommandSection):
|
||||
@@ -21,14 +20,16 @@ class Events(ServiceCommandSection):
|
||||
""" Events command service endpoint """
|
||||
return 'events'
|
||||
|
||||
def send_events(self, list_events):
|
||||
def send_events(self, list_events, session=None):
|
||||
def send_packet(jsonlines):
|
||||
if not jsonlines:
|
||||
return 0
|
||||
num_lines = len(jsonlines)
|
||||
jsonlines = '\n'.join(jsonlines)
|
||||
|
||||
new_events = self.post('add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'})
|
||||
new_events = self.post(
|
||||
'add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'}, session=session
|
||||
)
|
||||
if new_events['added'] != num_lines:
|
||||
print('Error (%s) sending events only %d of %d registered' %
|
||||
(new_events['errors'], new_events['added'], num_lines))
|
||||
@@ -57,7 +58,43 @@ class Events(ServiceCommandSection):
|
||||
# print('Sending events done: %d / %d events sent' % (sent_events, len(list_events)))
|
||||
return sent_events
|
||||
|
||||
def send_log_events(self, worker_id, task_id, lines, level='DEBUG'):
|
||||
def send_log_events_with_timestamps(
|
||||
self, worker_id, task_id, lines_with_ts: List[Tuple[str, str]], level="DEBUG", session=None
|
||||
):
|
||||
log_events = []
|
||||
|
||||
# break log lines into event packets
|
||||
for ts, line in return_list(lines_with_ts):
|
||||
# HACK ignore terminal reset ANSI code
|
||||
if line == '\x1b[0m':
|
||||
continue
|
||||
while line:
|
||||
if len(line) <= self.max_event_size:
|
||||
msg = line
|
||||
line = None
|
||||
else:
|
||||
msg = line[:self.max_event_size]
|
||||
line = line[self.max_event_size:]
|
||||
|
||||
log_events.append(
|
||||
{
|
||||
"type": "log",
|
||||
"level": level,
|
||||
"task": task_id,
|
||||
"worker": worker_id,
|
||||
"msg": msg,
|
||||
"timestamp": ts,
|
||||
}
|
||||
)
|
||||
|
||||
if line and ts is not None:
|
||||
# advance timestamp in case we break a line to more than one part
|
||||
ts += 1
|
||||
|
||||
# now send the events
|
||||
return self.send_events(list_events=log_events, session=session)
|
||||
|
||||
def send_log_events(self, worker_id, task_id, lines, level='DEBUG', session=None):
|
||||
log_events = []
|
||||
base_timestamp = int(time.time() * 1000)
|
||||
base_log_items = {
|
||||
@@ -94,4 +131,4 @@ class Events(ServiceCommandSection):
|
||||
log_events.append(get_event(count))
|
||||
|
||||
# now send the events
|
||||
return self.send_events(list_events=log_events)
|
||||
return self.send_events(list_events=log_events, session=session)
|
||||
168
clearml_agent/commands/resolver.py
Normal file
168
clearml_agent/commands/resolver.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import json
|
||||
import re
|
||||
import shlex
|
||||
|
||||
from clearml_agent.backend_api.session import Request
|
||||
from clearml_agent.helper.package.requirements import (
|
||||
RequirementsManager, MarkerRequirement,
|
||||
compare_version_rules, )
|
||||
|
||||
|
||||
def _rule_fields_match(match, entry, task_fields):
    """Return True when every regular expression in the rule matches its task field."""
    for field_name, field_value in task_fields:
        pattern = match.get(field_name, None)
        if not pattern:
            # the rule does not constrain this field
            continue
        # noinspection PyBroadException
        try:
            if not re.search(pattern, field_value):
                return False
        except Exception:
            # an invalid expression disqualifies the rule instead of aborting the lookup
            print('Failed parsing regular expression \"{}\" in rule: {}'.format(pattern, entry))
            return False
    return True


def _rule_requirements_match(match, task_info, task_packages_lookup):
    """Return True when every package requirement in the rule is satisfied by the task's packages."""
    from clearml_agent.external.requirements_parser.requirement import Requirement

    for req_section in ('script.requirements.pip', 'script.requirements.conda'):
        rule_packages = match.get(req_section, None)
        if not rule_packages:
            continue

        rule_reqs = [MarkerRequirement(Requirement.parse('{} {}'.format(k, v)))
                     for k, v in rule_packages.items()]

        # parse the task's own requirements once per section and reuse for the following rules
        if req_section not in task_packages_lookup:
            parts = req_section.split('.')
            section = task_info
            for part in parts[:-1]:
                # a missing or null sub-section is treated as empty
                section = section.get(part) or {}
            task_packages_lookup[req_section] = \
                RequirementsManager.parse_requirements_section_to_marker_requirements(
                    requirements=section.get(parts[-1], None)
                ) or []

        task_reqs = task_packages_lookup[req_section]
        for rule_req in rule_reqs:
            satisfied = any(
                rule_req.req.name == task_req.req.name and compare_version_rules(rule_req.specs, task_req.specs)
                for task_req in task_reqs
            )
            if not satisfied:
                # one unmatched package is enough to reject the rule
                return False

    return True


def resolve_default_container(session, task_id, container_config):
    """
    Fill in a default container for a task from the `agent.default_docker.match_rules` configuration.

    The rules are tested in order; the first rule whose `match` section agrees with the task
    (project name, repository, branch, binary and pip/conda requirements) provides the `image` and
    `arguments`, each only when `container_config` has no value of its own. Arguments taken from a
    rule are normalised to a list: strings are shlex-split, lists are kept item by item.

    :param session: backend session (configuration access, API version checks and requests)
    :param task_id: ID of the task to resolve the container for
    :param container_config: dict with optional 'image' / 'arguments' keys, updated in place
    :return: the same `container_config` object, updated if a rule matched
    """
    container_lookup = session.config.get('agent.default_docker.match_rules', None)
    if not session.check_min_api_version("2.13") or not container_lookup:
        return container_config

    # check backend support before sending any more requests (because they will fail and crash the Task)
    try:
        session.verify_feature_set('advanced')
    except ValueError:
        return container_config

    result = session.send_request(
        service='tasks',
        action='get_all',
        version='2.14',
        json={'id': [task_id],
              'only_fields': ['script.requirements', 'script.binary',
                              'script.repository', 'script.branch',
                              'project', 'container'],
              'search_hidden': True},
        method=Request.def_method,
        async_enable=False,
    )
    try:
        task_info = result.json()['data']['tasks'][0] if result.ok else {}
    except (ValueError, TypeError, KeyError, IndexError):
        # malformed response or task not found: leave the container untouched
        return container_config

    # a null 'script' section is treated as empty
    script = task_info.get('script') or {}
    repository = script.get('repository') or ''
    branch = script.get('branch') or ''
    binary = script.get('binary') or ''

    # get project full path
    project_full_name = ''
    if task_info.get('project', None):
        result = session.send_request(
            service='projects',
            action='get_all',
            version='2.13',
            json={
                'id': [task_info.get('project')],
                'only_fields': ['name'],
            },
            method=Request.def_method,
            async_enable=False,
        )
        try:
            if result.ok:
                project_full_name = result.json()['data']['projects'][0]['name'] or ''
        except (ValueError, TypeError, KeyError, IndexError):
            # best effort: without a project name, project rules simply will not match
            pass

    task_fields = (
        ('project', project_full_name),
        ('script.repository', repository),
        ('script.branch', branch),
        ('script.binary', binary),
    )

    task_packages_lookup = {}
    for entry in container_lookup:
        match = entry.get('match', None)
        if not match:
            continue
        if not _rule_fields_match(match, entry, task_fields):
            continue
        if not _rule_requirements_match(match, task_info, task_packages_lookup):
            continue

        # first matching rule wins; values already set on the container are kept
        if not container_config.get('image'):
            container_config['image'] = entry.get('image', None)
        if not container_config.get('arguments'):
            arguments = entry.get('arguments', None)
            if isinstance(arguments, (list, tuple)):
                container_config['arguments'] = [str(a) for a in arguments]
            else:
                container_config['arguments'] = shlex.split(str(arguments or '').strip())
        print('Matching default container with rule:\n{}'.format(json.dumps(entry)))
        return container_config

    return container_config
|
||||
|
||||
4540
clearml_agent/commands/worker.py
Normal file
4540
clearml_agent/commands/worker.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,8 +1,8 @@
|
||||
"""
|
||||
Script for generating command-line completion.
|
||||
Called by trains_agent/utilities/complete.sh (or a copy of it) like so:
|
||||
Called by clearml_agent/utilities/complete.sh (or a copy of it) like so:
|
||||
|
||||
python -m trains_agent.complete "current command line"
|
||||
python -m clearml_agent.complete "current command line"
|
||||
|
||||
And writes line-separated completion targets to stdout.
|
||||
Results are line-separated in order to enable other whitespace in results.
|
||||
@@ -13,7 +13,7 @@ from __future__ import print_function
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from trains_agent.interface import get_parser
|
||||
from clearml_agent.interface import get_parser
|
||||
|
||||
|
||||
def is_argument_required(action):
|
||||
@@ -1,7 +1,7 @@
|
||||
from pyhocon import ConfigTree
|
||||
|
||||
import six
|
||||
from trains_agent.helper.base import Singleton
|
||||
|
||||
from clearml_agent.external.pyhocon import ConfigTree
|
||||
from clearml_agent.helper.base import Singleton
|
||||
|
||||
|
||||
@six.add_metaclass(Singleton)
|
||||
266
clearml_agent/definitions.py
Normal file
266
clearml_agent/definitions.py
Normal file
@@ -0,0 +1,266 @@
|
||||
import shlex
|
||||
from datetime import timedelta
|
||||
from enum import IntEnum
|
||||
from os import getenv, environ
|
||||
from typing import Text, Optional, Union, Tuple, Any
|
||||
|
||||
import six
|
||||
from pathlib2 import Path
|
||||
|
||||
from clearml_agent.helper.base import normalize_path
|
||||
from clearml_agent.helper.environment.converters import strtobool
|
||||
|
||||
PROGRAM_NAME = "clearml-agent"
|
||||
FROM_FILE_PREFIX_CHARS = "@"
|
||||
|
||||
CONFIG_DIR = normalize_path("~/.clearml")
|
||||
TOKEN_CACHE_FILE = normalize_path("~/.clearml.clearml_agent.tmp")
|
||||
|
||||
CONFIG_FILE_CANDIDATES = ["~/clearml.conf"]
|
||||
|
||||
|
||||
def find_config_path():
    """
    Pick the configuration file path to use.

    Returns the first entry of CONFIG_FILE_CANDIDATES that exists once "~" is expanded
    (the entry is returned as written, not expanded), or the first entry when none exists.
    """
    existing = (
        path for path in CONFIG_FILE_CANDIDATES
        if Path(path).expanduser().exists()
    )
    return next(existing, CONFIG_FILE_CANDIDATES[0])
|
||||
|
||||
|
||||
CONFIG_FILE = normalize_path(find_config_path())
|
||||
|
||||
|
||||
class EnvironmentConfig(object):
    """
    A configuration value read from one or more environment variables.

    The variable names are tried in the order given; the first one holding a non-empty
    value wins and its value is converted according to `type`.
    """

    # converters for the types that need more than a plain `type(value)` call
    conversions = {
        bool: lambda value: bool(strtobool(value)),
        six.text_type: lambda s: six.text_type(s).strip(),
        list: lambda s: shlex.split(s.strip()),
    }

    def __init__(self, *names, **kwargs):
        """
        :param names: environment variable names, in lookup order
        :param kwargs: `type` - target type of the value (text by default);
            `names` - variable names given as a keyword (a single name or an iterable of names),
            appended after the positional ones
        """
        # some callers pass the names with a `names=` keyword; honour it instead of dropping it
        keyword_names = kwargs.pop("names", None)
        if keyword_names:
            if isinstance(keyword_names, six.string_types):
                keyword_names = (keyword_names,)
            names = names + tuple(keyword_names)
        self.vars = names
        self.type = kwargs.pop("type", six.text_type)

    def pop(self):
        """Remove all of this entry's variables from the process environment."""
        for k in self.vars:
            environ.pop(k, None)

    def set(self, value):
        """Store `str(value)` in every one of this entry's environment variables."""
        for k in self.vars:
            environ[k] = str(value)

    def convert(self, value):
        """Convert a raw string using the registered converter, or `self.type` itself when none is registered."""
        return self.conversions.get(self.type, self.type)(value)

    def get(self, key=False):  # type: (bool) -> Optional[Union[Any, Tuple[Text, Any]]]
        """
        Return the converted value of the first variable that holds a non-empty value.

        :param key: when true, return a `(variable_name, value)` tuple instead of the value alone
        :return: the value (or tuple), or None when no variable is set to a non-empty value
        """
        for name in self.vars:
            value = getenv(name)
            if value:
                value = self.convert(value)
                if key:
                    return name, value
                return value
        return None
|
||||
|
||||
|
||||
ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||
ENV_AGENT_AUTH_TOKEN = EnvironmentConfig("CLEARML_AUTH_TOKEN")
|
||||
ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
|
||||
ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")
|
||||
|
||||
# Maps configuration keys to the environment variables that override them.
ENVIRONMENT_CONFIG = {
    "api.api_server": EnvironmentConfig(
        "CLEARML_API_HOST",
        "TRAINS_API_HOST",
    ),
    "api.files_server": EnvironmentConfig(
        "CLEARML_FILES_HOST",
        "TRAINS_FILES_HOST",
    ),
    "api.web_server": EnvironmentConfig(
        "CLEARML_WEB_HOST",
        "TRAINS_WEB_HOST",
    ),
    "api.credentials.access_key": EnvironmentConfig(
        "CLEARML_API_ACCESS_KEY",
        "TRAINS_API_ACCESS_KEY",
    ),
    "api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
    "agent.worker_name": EnvironmentConfig(
        "CLEARML_WORKER_NAME",
        "TRAINS_WORKER_NAME",
    ),
    "agent.worker_id": EnvironmentConfig(
        "CLEARML_WORKER_ID",
        "TRAINS_WORKER_ID",
    ),
    "agent.cuda_version": EnvironmentConfig("CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"),
    "agent.cudnn_version": EnvironmentConfig("CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"),
    # names are passed positionally: EnvironmentConfig.__init__ collects names from *names
    "agent.cpu_only": EnvironmentConfig("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY", type=bool),
    # the misspelled "CLEAMRL_..." name is kept as a fallback for existing setups
    "agent.crash_on_exception": EnvironmentConfig(
        "CLEARML_AGENT_CRASH_ON_EXCEPTION",
        "CLEAMRL_AGENT_CRASH_ON_EXCEPTION",
        type=bool,
    ),
    "sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
    "sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
    "sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
    "sdk.azure.storage.containers.0": {
        "account_name": EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
        "account_key": ENV_AZURE_ACCOUNT_KEY,
    },
    "sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
}
|
||||
|
||||
ENVIRONMENT_SDK_PARAMS = {
|
||||
"task_id": (
|
||||
"CLEARML_TASK_ID",
|
||||
"TRAINS_TASK_ID",
|
||||
),
|
||||
"config_file": (
|
||||
"CLEARML_CONFIG_FILE",
|
||||
"TRAINS_CONFIG_FILE",
|
||||
),
|
||||
"log_level": (
|
||||
"CLEARML_LOG_LEVEL",
|
||||
"TRAINS_LOG_LEVEL",
|
||||
),
|
||||
"log_to_backend": (
|
||||
"CLEARML_LOG_TASK_TO_BACKEND",
|
||||
"TRAINS_LOG_TASK_TO_BACKEND",
|
||||
),
|
||||
}
|
||||
|
||||
# names are passed positionally: EnvironmentConfig.__init__ collects names from *names
ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV", type=bool)
|
||||
|
||||
VIRTUAL_ENVIRONMENT_PATH = {
|
||||
"python2": normalize_path(CONFIG_DIR, "py2venv"),
|
||||
"python3": normalize_path(CONFIG_DIR, "py3venv"),
|
||||
}
|
||||
|
||||
DEFAULT_BASE_DIR = normalize_path(CONFIG_DIR, "data_cache")
|
||||
DEFAULT_HOST = "https://demoapi.demo.clear.ml"
|
||||
MAX_DATASET_SOURCES_COUNT = 50000
|
||||
|
||||
INVALID_WORKER_ID = (400, 1001)
|
||||
WORKER_ALREADY_REGISTERED = (400, 1003)
|
||||
|
||||
API_VERSION = "v1.5"
|
||||
TOKEN_EXPIRATION_SECONDS = int(timedelta(days=2).total_seconds())
|
||||
|
||||
METADATA_EXTENSION = ".json"
|
||||
|
||||
DEFAULT_VENV_UPDATE_URL = "https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
|
||||
WORKING_REPOSITORY_DIR = "task_repository"
|
||||
WORKING_STANDALONE_DIR = "code"
|
||||
DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
|
||||
PIP_EXTRA_INDICES = []
|
||||
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
|
||||
ENV_PIP_EXTRA_INSTALL_FLAGS = EnvironmentConfig("CLEARML_EXTRA_PIP_INSTALL_FLAGS", type=list)
|
||||
ENV_DOCKER_IMAGE = EnvironmentConfig("CLEARML_DOCKER_IMAGE", "TRAINS_DOCKER_IMAGE")
|
||||
ENV_WORKER_ID = EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID")
|
||||
ENV_WORKER_TAGS = EnvironmentConfig("CLEARML_WORKER_TAGS")
|
||||
ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig("CLEARML_AGENT_SKIP_PIP_VENV_INSTALL")
|
||||
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig("CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL", type=bool)
|
||||
ENV_AGENT_FORCE_CODE_DIR = EnvironmentConfig("CLEARML_AGENT_FORCE_CODE_DIR")
|
||||
ENV_AGENT_FORCE_EXEC_SCRIPT = EnvironmentConfig("CLEARML_AGENT_FORCE_EXEC_SCRIPT")
|
||||
ENV_AGENT_FORCE_POETRY = EnvironmentConfig("CLEARML_AGENT_FORCE_POETRY", type=bool)
|
||||
ENV_AGENT_FORCE_TASK_INIT = EnvironmentConfig("CLEARML_AGENT_FORCE_TASK_INIT", type=bool)
|
||||
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig("CLEARML_DOCKER_SKIP_GPUS_FLAG", "TRAINS_DOCKER_SKIP_GPUS_FLAG")
|
||||
ENV_AGENT_GIT_USER = EnvironmentConfig("CLEARML_AGENT_GIT_USER", "TRAINS_AGENT_GIT_USER")
|
||||
ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS")
|
||||
ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST")
|
||||
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool)
|
||||
ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK")
|
||||
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER")
|
||||
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH")
|
||||
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig(
|
||||
"CLEARML_AGENT_K8S_HOST_MOUNT",
|
||||
"CLEARML_AGENT_DOCKER_HOST_MOUNT",
|
||||
"TRAINS_AGENT_K8S_HOST_MOUNT",
|
||||
"TRAINS_AGENT_DOCKER_HOST_MOUNT",
|
||||
)
|
||||
ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH")
|
||||
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list)
|
||||
ENV_EXTRA_DOCKER_LABELS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_LABELS", type=list)
|
||||
ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
|
||||
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
|
||||
ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
|
||||
ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV")
|
||||
ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
|
||||
""" Maintain backwards compatible configuration when launching in standalone mode """
|
||||
|
||||
ENV_FORCE_DOCKER_AGENT_REPO = EnvironmentConfig("FORCE_CLEARML_AGENT_REPO", "CLEARML_AGENT_DOCKER_AGENT_REPO")
|
||||
|
||||
ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
|
||||
"""
|
||||
Specify a restart value for a services agent task containers.
|
||||
Note that when a restart value is provided, task containers will not be run with the '--rm' flag and will
|
||||
not be cleaned up automatically when completed (this will need to be done externally using the
|
||||
'docker container prune' command to free up resources).
|
||||
Value format for this env var is "<restart-value>;<task-selector>", where:
|
||||
- <restart-value> can be any valid restart value for docker-run (see https://docs.docker.com/engine/reference/commandline/run/#restart)
|
||||
- <task-selector> is optional, allowing to restrict this behaviour to specific tasks. The format is:
|
||||
"<path-to-task-field>=<value>" where:
|
||||
* <path-to-task-field> is a dot-separated path to a task field (e.g. "container.image")
|
||||
* <value> is optional. If not provided, the restart policy will be applied for the task container if the
|
||||
path provided exists. If provided, the restart policy will be applied if the value matches the value
|
||||
obtained from the task (value parsing and comparison is based on the type of value obtained from the task)
|
||||
For example:
|
||||
CLEARML_AGENT_SERVICES_DOCKER_RESTART=unless-stopped
|
||||
CLEARML_AGENT_SERVICES_DOCKER_RESTART=unless-stopped;container.image=some-image
|
||||
"""
|
||||
|
||||
ENV_FORCE_SYSTEM_SITE_PACKAGES = EnvironmentConfig("CLEARML_AGENT_FORCE_SYSTEM_SITE_PACKAGES", type=bool)
|
||||
""" Force system_site_packages: true when running tasks in containers (i.e. docker mode or k8s glue) """
|
||||
|
||||
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig("CLEARML_AGENT_CUSTOM_BUILD_SCRIPT")
|
||||
"""
|
||||
Specifies a custom environment setup script to be executed instead of installing a virtual environment.
|
||||
If provided, this script is executed following Git cloning. Script command may include environment variable and
|
||||
will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
|
||||
The script can also be specified using the `agent.custom_build_script` configuration setting.
|
||||
|
||||
When running the script, the following environment variables will be set:
|
||||
- CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
|
||||
contents in JSON format
|
||||
- CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
|
||||
- CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
|
||||
- CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
|
||||
- CLEARML_GIT_ROOT: path to the cloned Git repository
|
||||
- CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
|
||||
this file must be in the following JSON format:
|
||||
```json
|
||||
{
|
||||
"binary": "/absolute/path/to/python-executable",
|
||||
"entry_point": "/absolute/path/to/task-entrypoint-script",
|
||||
"working_dir": "/absolute/path/to/task-working/dir"
|
||||
}
|
||||
```
|
||||
If provided, the agent will use these instead of the predefined task script section to execute the task and will
|
||||
skip virtual environment creation.
|
||||
|
||||
In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
|
||||
In case the custom script is specified but does not exist, or if the custom script does not write valid content
|
||||
into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
|
||||
standard flow.
|
||||
"""
|
||||
|
||||
ENV_PACKAGE_PYTORCH_RESOLVE = EnvironmentConfig("CLEARML_AGENT_PACKAGE_PYTORCH_RESOLVE")
|
||||
|
||||
ENV_TEMP_STDOUT_FILE_DIR = EnvironmentConfig("CLEARML_AGENT_TEMP_STDOUT_FILE_DIR")
|
||||
|
||||
ENV_GIT_CLONE_VERBOSE = EnvironmentConfig("CLEARML_AGENT_GIT_CLONE_VERBOSE", type=bool)
|
||||
|
||||
ENV_GPU_FRACTIONS = EnvironmentConfig("CLEARML_AGENT_GPU_FRACTIONS")
|
||||
|
||||
|
||||
class FileBuffering(IntEnum):
    """
    File buffering options:
    - UNSET: follows the defaults for the type of file,
        line-buffered for interactive (tty) text files and with a default chunk size otherwise
    - UNBUFFERED: no buffering at all
    - LINE_BUFFERING: per-line buffering, only valid for text files
    - values bigger than 1 indicate the size of the buffer in bytes and are not represented by the enum
    """

    UNSET = -1
    UNBUFFERED = 0
    LINE_BUFFERING = 1
|
||||
@@ -84,3 +84,13 @@ class MissingPackageError(CommandFailedError):
|
||||
def __str__(self):
|
||||
return '{self.__class__.__name__}: ' \
|
||||
'"{self.name}" package is required. Please run "pip install {self.name}"'.format(self=self)
|
||||
|
||||
|
||||
class CustomBuildScriptFailed(CommandFailedError):
    """
    Command failure that also carries an `errno` value.

    NOTE(review): `errno` is presumably the exit code of the custom build script - confirm with the caller.
    """

    def __init__(self, errno, *args, **kwargs):
        # everything except `errno` is handed to the base error unchanged
        super(CustomBuildScriptFailed, self).__init__(*args, **kwargs)
        self.errno = errno
|
||||
|
||||
|
||||
class SkippedCustomBuildScript(CommandFailedError):
    """Command failure subclass with no behaviour of its own; it only provides a distinct type."""
|
||||
5
clearml_agent/external/pyhocon/__init__.py
vendored
Normal file
5
clearml_agent/external/pyhocon/__init__.py
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
from .config_parser import ConfigParser, ConfigFactory, ConfigMissingException
|
||||
from .config_tree import ConfigTree
|
||||
from .converter import HOCONConverter
|
||||
|
||||
__all__ = ["ConfigParser", "ConfigFactory", "ConfigMissingException", "ConfigTree", "HOCONConverter"]
|
||||
762
clearml_agent/external/pyhocon/config_parser.py
vendored
Normal file
762
clearml_agent/external/pyhocon/config_parser.py
vendored
Normal file
@@ -0,0 +1,762 @@
|
||||
import itertools
|
||||
import re
|
||||
import os
|
||||
import socket
|
||||
import contextlib
|
||||
import codecs
|
||||
from datetime import timedelta
|
||||
|
||||
from pyparsing import Forward, Keyword, QuotedString, Word, Literal, Suppress, Regex, Optional, SkipTo, ZeroOrMore, \
|
||||
Group, lineno, col, TokenConverter, replaceWith, alphanums, alphas8bit, ParseSyntaxException, StringEnd
|
||||
from pyparsing import ParserElement
|
||||
from .config_tree import ConfigTree, ConfigSubstitution, ConfigList, ConfigValues, ConfigUnquotedString, \
|
||||
ConfigInclude, NoneValue, ConfigQuotedString
|
||||
from .exceptions import ConfigSubstitutionException, ConfigMissingException, ConfigException
|
||||
import logging
|
||||
import copy
|
||||
|
||||
use_urllib2 = False
|
||||
try:
|
||||
# For Python 3.0 and later
|
||||
from urllib.request import urlopen
|
||||
from urllib.error import HTTPError, URLError
|
||||
except ImportError: # pragma: no cover
|
||||
# Fall back to Python 2's urllib2
|
||||
from urllib2 import urlopen, HTTPError, URLError
|
||||
|
||||
use_urllib2 = True
|
||||
try:
|
||||
basestring
|
||||
except NameError: # pragma: no cover
|
||||
basestring = str
|
||||
unicode = str
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
#
|
||||
# Substitution Defaults
|
||||
#
|
||||
|
||||
|
||||
class DEFAULT_SUBSTITUTION(object):
    """Marker class: the default `unresolved_value` of the parsing entry points."""


class MANDATORY_SUBSTITUTION(object):
    """Marker class for unresolved substitutions (its use is not visible in this part of the file)."""


class NO_SUBSTITUTION(object):
    """Marker class passed as `unresolved_value` when parsing included files and URLs."""


class STR_SUBSTITUTION(object):
    """Marker class: replace an unresolved substitution by its own expression (e.g. ${x})."""
|
||||
|
||||
|
||||
def period(period_value, period_unit):
    """
    Build a time span object for `period_value` units of `period_unit`.

    `dateutil.relativedelta.relativedelta` is used when it can be imported, `datetime.timedelta`
    otherwise. Nanoseconds are converted to whole microseconds first, and milliseconds are always
    built with `datetime.timedelta`.

    :param period_value: amount of units
    :param period_unit: keyword name of the unit (e.g. 'seconds', 'milliseconds', 'nanoseconds')
    :return: a `relativedelta` or `timedelta` instance
    """
    try:
        from dateutil.relativedelta import relativedelta as period_impl
    except Exception:
        from datetime import timedelta as period_impl

    unit, amount = period_unit, period_value
    if unit == 'nanoseconds':
        # no nanosecond resolution available: truncate to microseconds
        unit, amount = 'microseconds', int(amount / 1000)

    span_type = timedelta if unit == 'milliseconds' else period_impl
    return span_type(**{unit: amount})
|
||||
|
||||
|
||||
class ConfigFactory(object):
    """Entry points for building a configuration from a file, a URL, a string or a dictionary."""

    @classmethod
    def parse_file(cls, filename, encoding='utf-8', required=True, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
        """Parse file

        :param filename: filename
        :type filename: basestring
        :param encoding: file encoding
        :type encoding: basestring
        :param required: If true, raises an exception if can't load file
        :type required: boolean
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param unresolved_value: value assigned to an unresolved substitution.
        If overridden with a default value, every unresolved value is replaced by that default value.
        If it is set to pyhocon.STR_SUBSTITUTION then the value is replaced by its
        substitution expression (e.g., ${x})
        :type unresolved_value: class
        :return: Config object, or an empty list if the file cannot be read and is not required
        :type return: Config or list
        """
        try:
            with codecs.open(filename, 'r', encoding=encoding) as fd:
                content = fd.read()
                return cls.parse_string(content, os.path.dirname(filename), resolve, unresolved_value)
        except IOError:
            if required:
                raise
            logger.warning('Cannot include file %s. File does not exist or cannot be read.', filename)
            return []

    @classmethod
    def parse_URL(cls, url, timeout=None, resolve=True, required=False, unresolved_value=DEFAULT_SUBSTITUTION):
        """Parse URL

        :param url: url to parse
        :type url: basestring
        :param timeout: socket timeout; the global default socket timeout is used when None
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param required: If true, raises an exception if the URL cannot be loaded
        :type required: boolean
        :param unresolved_value: value assigned to an unresolved substitution.
        If overridden with a default value, every unresolved value is replaced by that default value.
        If it is set to pyhocon.STR_SUBSTITUTION then the value is replaced by
        its substitution expression (e.g., ${x})
        :type unresolved_value: class
        :return: Config object or []
        :type return: Config or list
        """
        socket_timeout = socket._GLOBAL_DEFAULT_TIMEOUT if timeout is None else timeout

        try:
            with contextlib.closing(urlopen(url, timeout=socket_timeout)) as fd:
                # urllib2 (Python 2) content is used as read; Python 3 content is decoded as UTF-8
                content = fd.read() if use_urllib2 else fd.read().decode('utf-8')
                return cls.parse_string(content, os.path.dirname(url), resolve, unresolved_value)
        except (HTTPError, URLError):
            logger.warning('Cannot include url %s. Resource is inaccessible.', url)
            if required:
                raise
            return []

    @classmethod
    def parse_string(cls, content, basedir=None, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
        """Parse string

        :param content: content to parse
        :type content: basestring
        :param basedir: directory passed on to the parser together with the content
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param unresolved_value: value assigned to an unresolved substitution.
        If overridden with a default value, every unresolved value is replaced by that default value.
        If it is set to pyhocon.STR_SUBSTITUTION then the value is replaced by
        its substitution expression (e.g., ${x})
        :type unresolved_value: class
        :return: Config object
        :type return: Config
        """
        return ConfigParser().parse(content, basedir, resolve, unresolved_value)

    @classmethod
    def from_dict(cls, dictionary, root=False):
        """Convert dictionary (and ordered dictionary) into a ConfigTree

        :param dictionary: dictionary to convert
        :type dictionary: dict
        :param root: passed as `root` to every ConfigTree that is created
        :return: Config object
        :type return: Config
        """

        def create_tree(value):
            # dictionaries become trees, lists are converted item by item, anything else is kept
            if isinstance(value, dict):
                res = ConfigTree(root=root)
                for key, child_value in value.items():
                    res.put(key, create_tree(child_value))
                return res
            if isinstance(value, list):
                return [create_tree(v) for v in value]
            return value

        return create_tree(dictionary)
|
||||
|
||||
|
||||
class ConfigParser(object):
|
||||
"""
|
||||
Parse HOCON files: https://github.com/typesafehub/config/blob/master/HOCON.md
|
||||
"""
|
||||
|
||||
REPLACEMENTS = {
|
||||
'\\\\': '\\',
|
||||
'\\\n': '\n',
|
||||
'\\n': '\n',
|
||||
'\\r': '\r',
|
||||
'\\t': '\t',
|
||||
'\\=': '=',
|
||||
'\\#': '#',
|
||||
'\\!': '!',
|
||||
'\\"': '"',
|
||||
}
|
||||
|
||||
period_type_map = {
|
||||
'nanoseconds': ['ns', 'nano', 'nanos', 'nanosecond', 'nanoseconds'],
|
||||
|
||||
'microseconds': ['us', 'micro', 'micros', 'microsecond', 'microseconds'],
|
||||
'milliseconds': ['ms', 'milli', 'millis', 'millisecond', 'milliseconds'],
|
||||
'seconds': ['s', 'second', 'seconds'],
|
||||
'minutes': ['m', 'minute', 'minutes'],
|
||||
'hours': ['h', 'hour', 'hours'],
|
||||
'weeks': ['w', 'week', 'weeks'],
|
||||
'days': ['d', 'day', 'days'],
|
||||
|
||||
}
|
||||
|
||||
optional_period_type_map = {
|
||||
'months': ['mo', 'month', 'months'], # 'm' from hocon spec removed. conflicts with minutes syntax.
|
||||
'years': ['y', 'year', 'years']
|
||||
}
|
||||
|
||||
supported_period_map = None
|
||||
|
||||
@classmethod
|
||||
def get_supported_period_type_map(cls):
|
||||
if cls.supported_period_map is None:
|
||||
cls.supported_period_map = {}
|
||||
cls.supported_period_map.update(cls.period_type_map)
|
||||
|
||||
try:
|
||||
from dateutil import relativedelta
|
||||
|
||||
if relativedelta is not None:
|
||||
cls.supported_period_map.update(cls.optional_period_type_map)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return cls.supported_period_map
|
||||
|
||||
@classmethod
|
||||
def parse(cls, content, basedir=None, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
|
||||
"""parse a HOCON content
|
||||
|
||||
:param content: HOCON content to parse
|
||||
:type content: basestring
|
||||
:param resolve: if true, resolve substitutions
|
||||
:type resolve: boolean
|
||||
:param unresolved_value: value assigned to an unresolved substitution.
|
||||
If overridden with a default value, every unresolved value is replaced by that default value.
|
||||
If it is set to pyhocon.STR_SUBSTITUTION then it will replace the value by
|
||||
its substitution expression (e.g., ${x})
|
||||
:type unresolved_value: boolean
|
||||
:return: a ConfigTree or a list
|
||||
"""
|
||||
|
||||
unescape_pattern = re.compile(r'\\.')
|
||||
|
||||
def replace_escape_sequence(match):
|
||||
value = match.group(0)
|
||||
return cls.REPLACEMENTS.get(value, value)
|
||||
|
||||
def norm_string(value):
|
||||
return unescape_pattern.sub(replace_escape_sequence, value)
|
||||
|
||||
def unescape_string(tokens):
|
||||
return ConfigUnquotedString(norm_string(tokens[0]))
|
||||
|
||||
def parse_multi_string(tokens):
|
||||
# remove the first and last 3 "
|
||||
return tokens[0][3: -3]
|
||||
|
||||
def convert_number(tokens):
|
||||
n = tokens[0]
|
||||
try:
|
||||
return int(n, 10)
|
||||
except ValueError:
|
||||
return float(n)
|
||||
|
||||
def safe_convert_number(tokens):
|
||||
n = tokens[0]
|
||||
try:
|
||||
return int(n, 10)
|
||||
except ValueError:
|
||||
try:
|
||||
return float(n)
|
||||
except ValueError:
|
||||
return n
|
||||
|
||||
def convert_period(tokens):
|
||||
|
||||
period_value = int(tokens.value)
|
||||
period_identifier = tokens.unit
|
||||
|
||||
period_unit = next((single_unit for single_unit, values
|
||||
in cls.get_supported_period_type_map().items()
|
||||
if period_identifier in values))
|
||||
|
||||
return period(period_value, period_unit)
|
||||
|
||||
# ${path} or ${?path} for optional substitution
|
||||
SUBSTITUTION_PATTERN = r"\$\{(?P<optional>\?)?(?P<variable>[^}]+)\}(?P<ws>[ \t]*)"
|
||||
|
||||
def create_substitution(instring, loc, token):
|
||||
# remove the ${ and }
|
||||
match = re.match(SUBSTITUTION_PATTERN, token[0])
|
||||
variable = match.group('variable')
|
||||
ws = match.group('ws')
|
||||
optional = match.group('optional') == '?'
|
||||
substitution = ConfigSubstitution(variable, optional, ws, instring, loc)
|
||||
return substitution
|
||||
|
||||
# "quoted string" (escape sequences allowed) followed by optional trailing spaces/tabs
|
||||
STRING_PATTERN = '"(?P<value>(?:[^"\\\\]|\\\\.)*)"(?P<ws>[ \t]*)'
|
||||
|
||||
def create_quoted_string(instring, loc, token):
|
||||
# remove the ${ and }
|
||||
match = re.match(STRING_PATTERN, token[0])
|
||||
value = norm_string(match.group('value'))
|
||||
ws = match.group('ws')
|
||||
return ConfigQuotedString(value, ws, instring, loc)
|
||||
|
||||
def include_config(instring, loc, token):
    """Parse action for ``include`` statements: load the referenced config.

    After an optional leading ``'required'`` token, the remaining tokens are
    either a single string, or a ``url`` / ``file`` keyword followed by a
    string. Strings may arrive wrapped in ``ConfigQuotedString``.

    :param instring: text being parsed (passed on to the error raised below)
    :param loc: location of the statement inside ``instring``
    :param token: tokens produced by the include grammar
    :return: ConfigInclude built from the loaded config (the list itself, or
        the items of the loaded tree)
    """
    def unquote(entry):
        # quoted strings carry their text in ``.value``; other tokens are used as-is
        return entry.value if isinstance(entry, ConfigQuotedString) else entry

    url = None
    file = None
    required = False

    if token[0] == 'required':
        required = True
        final_tokens = token[1:]
    else:
        final_tokens = token

    if len(final_tokens) == 1:  # include "test"
        value = unquote(final_tokens[0])
        if value.startswith(("http://", "https://", "file://")):
            url = value
        else:
            file = value
    elif len(final_tokens) == 2:  # include url("test") or file("test")
        # Inspect final_tokens[1], not token[1]: with a leading 'required'
        # marker, token[1] is the url/file keyword, so the quoted string was
        # never unwrapped and the wrapper object was used as the path/URL.
        value = unquote(final_tokens[1])
        if final_tokens[0] == 'url':
            url = value
        else:
            file = value

    if url is not None:
        logger.debug('Loading config from url %s', url)
        obj = ConfigFactory.parse_URL(
            url,
            resolve=False,
            required=required,
            unresolved_value=NO_SUBSTITUTION
        )
    elif file is not None:
        # relative includes are resolved against basedir when one was given
        path = file if basedir is None else os.path.join(basedir, file)
        logger.debug('Loading config from file %s', path)
        obj = ConfigFactory.parse_file(
            path,
            resolve=False,
            required=required,
            unresolved_value=NO_SUBSTITUTION
        )
    else:
        raise ConfigException('No file or URL specified at: {loc}: {instring}', loc=loc, instring=instring)

    return ConfigInclude(obj if isinstance(obj, list) else obj.items())
|
||||
|
||||
@contextlib.contextmanager
def set_default_white_spaces():
    """Temporarily set pyparsing's default whitespace characters to space and tab.

    The previous default is put back when the ``with`` block exits, including
    when the block raises, so a failure inside it does not leave the changed
    setting on the shared ``ParserElement`` class.
    """
    default = ParserElement.DEFAULT_WHITE_CHARS
    ParserElement.setDefaultWhitespaceChars(' \t')
    try:
        yield
    finally:
        # always restore: this is class-level state on ParserElement
        ParserElement.setDefaultWhitespaceChars(default)
|
||||
|
||||
with set_default_white_spaces():
|
||||
assign_expr = Forward()
|
||||
true_expr = Keyword("true", caseless=True).setParseAction(replaceWith(True))
|
||||
false_expr = Keyword("false", caseless=True).setParseAction(replaceWith(False))
|
||||
null_expr = Keyword("null", caseless=True).setParseAction(replaceWith(NoneValue()))
|
||||
# key = QuotedString('"', escChar='\\', unquoteResults=False) | Word(alphanums + alphas8bit + '._- /')
|
||||
regexp_numbers = r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))'
|
||||
key = QuotedString('"', escChar='\\', unquoteResults=False) | \
|
||||
Regex(regexp_numbers, re.DOTALL).setParseAction(safe_convert_number) | \
|
||||
Word(alphanums + alphas8bit + '._- /')
|
||||
|
||||
eol = Word('\n\r').suppress()
|
||||
eol_comma = Word('\n\r,').suppress()
|
||||
comment = (Literal('#') | Literal('//')) - SkipTo(eol | StringEnd())
|
||||
comment_eol = Suppress(Optional(eol_comma) + comment)
|
||||
comment_no_comma_eol = (comment | eol).suppress()
|
||||
number_expr = Regex(regexp_numbers, re.DOTALL).setParseAction(convert_number)
|
||||
|
||||
period_types = itertools.chain.from_iterable(cls.get_supported_period_type_map().values())
|
||||
period_expr = Regex(r'(?P<value>\d+)\s*(?P<unit>' + '|'.join(period_types) + ')$'
|
||||
).setParseAction(convert_period)
|
||||
|
||||
# multi line string using """
|
||||
# Using fix described in http://pyparsing.wikispaces.com/share/view/3778969
|
||||
multiline_string = Regex('""".*?"*"""', re.DOTALL | re.UNICODE).setParseAction(parse_multi_string)
|
||||
# single quoted line string
|
||||
quoted_string = Regex(r'"(?:[^"\\\n]|\\.)*"[ \t]*', re.UNICODE).setParseAction(create_quoted_string)
|
||||
# unquoted string that takes the rest of the line until an optional comment
|
||||
# we support .properties multiline support which is like this:
|
||||
# line1 \
|
||||
# line2 \
|
||||
# so a backslash precedes the \n
|
||||
unquoted_string = Regex(r'(?:[^^`+?!@*&"\[\{\s\]\}#,=\$\\]|\\.)+[ \t]*',
|
||||
re.UNICODE).setParseAction(unescape_string)
|
||||
substitution_expr = Regex(r'[ \t]*\$\{[^\}]+\}[ \t]*').setParseAction(create_substitution)
|
||||
string_expr = multiline_string | quoted_string | unquoted_string
|
||||
|
||||
value_expr = period_expr | number_expr | true_expr | false_expr | null_expr | string_expr
|
||||
|
||||
include_content = (quoted_string | ((Keyword('url') | Keyword(
|
||||
'file')) - Literal('(').suppress() - quoted_string - Literal(')').suppress()))
|
||||
include_expr = (
|
||||
Keyword("include", caseless=True).suppress() + (
|
||||
include_content | (
|
||||
Keyword("required") - Literal('(').suppress() - include_content - Literal(')').suppress()
|
||||
)
|
||||
)
|
||||
).setParseAction(include_config)
|
||||
|
||||
root_dict_expr = Forward()
|
||||
dict_expr = Forward()
|
||||
list_expr = Forward()
|
||||
multi_value_expr = ZeroOrMore(comment_eol | include_expr | substitution_expr |
|
||||
dict_expr | list_expr | value_expr | (Literal('\\') - eol).suppress())
|
||||
# for a dictionary : or = is optional
|
||||
# last zeroOrMore is because we can have t = {a:4} {b: 6} {c: 7} which is dictionary concatenation
|
||||
inside_dict_expr = ConfigTreeParser(ZeroOrMore(comment_eol | include_expr | assign_expr | eol_comma))
|
||||
inside_root_dict_expr = ConfigTreeParser(ZeroOrMore(
|
||||
comment_eol | include_expr | assign_expr | eol_comma), root=True)
|
||||
dict_expr << Suppress('{') - inside_dict_expr - Suppress('}')
|
||||
root_dict_expr << Suppress('{') - inside_root_dict_expr - Suppress('}')
|
||||
list_entry = ConcatenatedValueParser(multi_value_expr)
|
||||
list_expr << Suppress('[') - ListParser(list_entry - ZeroOrMore(eol_comma - list_entry)) - Suppress(']')
|
||||
|
||||
# special case when we have a value assignment where the string can potentially be the remainder of the line
|
||||
assign_expr << Group(key - ZeroOrMore(comment_no_comma_eol) -
|
||||
(dict_expr | (Literal('=') | Literal(':') | Literal('+=')) -
|
||||
ZeroOrMore(comment_no_comma_eol) - ConcatenatedValueParser(multi_value_expr)))
|
||||
|
||||
# the file can be { ... } where {} can be omitted or []
|
||||
config_expr = ZeroOrMore(comment_eol | eol) + (list_expr | root_dict_expr |
|
||||
inside_root_dict_expr) + ZeroOrMore(comment_eol | eol_comma)
|
||||
config = config_expr.parseString(content, parseAll=True)[0]
|
||||
|
||||
if resolve:
|
||||
allow_unresolved = resolve and unresolved_value is not DEFAULT_SUBSTITUTION and \
|
||||
unresolved_value is not MANDATORY_SUBSTITUTION
|
||||
has_unresolved = cls.resolve_substitutions(config, allow_unresolved)
|
||||
if has_unresolved and unresolved_value is MANDATORY_SUBSTITUTION:
|
||||
raise ConfigSubstitutionException(
|
||||
'resolve cannot be set to True and unresolved_value to MANDATORY_SUBSTITUTION')
|
||||
|
||||
if unresolved_value is not NO_SUBSTITUTION and unresolved_value is not DEFAULT_SUBSTITUTION:
|
||||
cls.unresolve_substitutions_to_value(config, unresolved_value)
|
||||
return config
|
||||
|
||||
@classmethod
def _resolve_variable(cls, config, substitution):
    """Look up the value a substitution points to.

    The config is consulted first; when the key is missing there, the
    process environment is used as a fallback.

    :param config: config to look the variable up in
    :param substitution: substitution being resolved
    :return: (is_resolved, resolved_variable)
    """
    name = substitution.variable
    try:
        return True, config.get(name)
    except ConfigMissingException:
        # not present in the config: try the environment instead
        env_value = os.environ.get(name)

        if env_value is not None:
            if isinstance(env_value, (ConfigList, ConfigTree)):
                raise ConfigSubstitutionException(
                    "Cannot substitute variable ${{{variable}}} because it does not point to a "
                    "string, int, float, boolean or null {type} (line:{line}, col: {col})".format(
                        variable=name,
                        type=env_value.__class__.__name__,
                        line=lineno(substitution.loc, substitution.instring),
                        col=col(substitution.loc, substitution.instring)))
            return True, env_value

        if substitution.optional:
            # optional substitutions may stay unresolved
            return False, None

        raise ConfigSubstitutionException(
            "Cannot resolve variable ${{{variable}}} (line: {line}, col: {col})".format(
                variable=name,
                line=lineno(substitution.loc, substitution.instring),
                col=col(substitution.loc, substitution.instring)))
|
||||
|
||||
@classmethod
def _fixup_self_references(cls, config, accept_unresolved=False):
    """Resolve substitutions that refer to the key they are assigned to.

    Does nothing unless ``config`` is a root ConfigTree. For every top-level
    key the recorded history of assigned values is walked, and a substitution
    whose first path element equals the key is substituted with the value
    that came before it in that history.

    :param config: config to fix up in place
    :param accept_unresolved: when True, a self-reference whose previous value
        is still a ConfigValues does not raise
    :raises ConfigSubstitutionException: when a self-reference points at a
        previous value that is still a ConfigValues and ``accept_unresolved``
        is False
    """
    if isinstance(config, ConfigTree) and config.root:
        for key in config:  # Traverse history of element
            history = config.history[key]
            previous_item = history[0]
            for current_item in history[1:]:
                for substitution in cls._find_substitutions(current_item):
                    prop_path = ConfigTree.parse_key(substitution.variable)
                    if len(prop_path) > 1 and config.get(substitution.variable, None) is not None:
                        continue  # If value is present in latest version, don't do anything
                    if prop_path[0] == key:
                        if isinstance(previous_item, ConfigValues) and not accept_unresolved:
                            # We hit a dead end, we cannot evaluate
                            raise ConfigSubstitutionException(
                                "Property {variable} cannot be substituted. Check for cycles.".format(
                                    variable=substitution.variable
                                )
                            )
                        else:
                            # a bare self-reference takes the whole previous value,
                            # a dotted one looks up the remainder of the path inside it
                            value = previous_item if len(
                                prop_path) == 1 else previous_item.get(".".join(prop_path[1:]))
                            _, _, current_item = cls._do_substitute(substitution, value)
                # the (possibly substituted) item becomes the base for the next history entry
                previous_item = current_item

            if len(history) == 1:
                # the key was assigned only once, so there is no earlier value to refer back to
                for substitution in cls._find_substitutions(previous_item):
                    prop_path = ConfigTree.parse_key(substitution.variable)
                    if len(prop_path) > 1 and config.get(substitution.variable, None) is not None:
                        continue  # If value is present in latest version, don't do anything
                    if prop_path[0] == key and substitution.optional:
                        cls._do_substitute(substitution, None)
                    # NOTE(review): the check below repeats the one above; for an optional
                    # self-reference with no matching environment variable _do_substitute
                    # is called twice with None - confirm this is intended
                    if prop_path[0] == key:
                        # fall back to an environment variable named like the key
                        value = os.environ.get(key)
                        if value is not None:
                            cls._do_substitute(substitution, value)
                            continue
                        if substitution.optional:  # special case, when self optional referencing without existing
                            cls._do_substitute(substitution, None)
|
||||
|
||||
# traverse config to find all the substitutions
|
||||
@classmethod
def _find_substitutions(cls, item):
    """Collect the substitutions found in ``item`` and everything below it.

    A ConfigValues reports its own substitutions; a ConfigTree is searched
    through its values and a list through its elements. Anything else
    contributes nothing.

    :param item: ConfigValues, ConfigTree, list or any other value
    :return: list of substitutions
    """
    if isinstance(item, ConfigValues):
        return item.get_substitutions()

    if isinstance(item, ConfigTree):
        children = item.values()
    elif isinstance(item, list):
        children = item
    else:
        children = []

    found = []
    for child in children:
        found += cls._find_substitutions(child)
    return found
|
||||
|
||||
@classmethod
def _do_substitute(cls, substitution, resolved_value, is_optional_resolved=True):
    """Replace a substitution token by its resolved value.

    :param substitution: substitution to replace; its ``parent`` is the
        ConfigValues that holds it and ``index`` its position among the tokens
    :param resolved_value: value to put in place of the substitution
    :param is_optional_resolved: when False and the transformed parent is
        None, the parent's ``overriden_value`` is used as the result instead
    :return: tuple ``(unresolved, new_substitutions, result)``
    """
    unresolved = False
    new_substitutions = []
    if isinstance(resolved_value, ConfigValues):
        resolved_value = resolved_value.transform()
    if isinstance(resolved_value, ConfigValues):
        # still a ConfigValues after transform(): nothing is written back yet
        unresolved = True
        result = resolved_value
    else:
        # replace token by substitution
        config_values = substitution.parent
        # if it is a string, then add the extra ws that was present in the original string after the substitution
        formatted_resolved_value = resolved_value \
            if resolved_value is None \
            or isinstance(resolved_value, (dict, list)) \
            or substitution.index == len(config_values.tokens) - 1 \
            else (str(resolved_value) + substitution.ws)
        # use a deepcopy of resolved_value to avoid mutation
        config_values.put(substitution.index, copy.deepcopy(formatted_resolved_value))
        transformation = config_values.transform()
        result = config_values.overriden_value \
            if transformation is None and not is_optional_resolved \
            else transformation

        if result is None and config_values.key in config_values.parent:
            # no value left for this key: drop it from the owning container
            del config_values.parent[config_values.key]
        else:
            config_values.parent[config_values.key] = result
            # the stored result may itself contain substitutions to process later
            s = cls._find_substitutions(result)
            if s:
                new_substitutions = s
                unresolved = True

    return (unresolved, new_substitutions, result)
|
||||
|
||||
@classmethod
def _final_fixup(cls, item):
    """Transform the ConfigValues remaining in ``item``.

    A ConfigValues is replaced by the result of its ``transform()``, a list
    yields a new list of fixed-up elements, and a ConfigTree is updated in
    place and returned. Any other value is returned unchanged.

    :param item: value to fix up
    :return: the fixed-up value
    """
    if isinstance(item, ConfigValues):
        return item.transform()
    if isinstance(item, list):
        return [cls._final_fixup(entry) for entry in item]
    if isinstance(item, ConfigTree):
        # iterate over a snapshot, entries are reassigned while walking
        for name, entry in list(item.items()):
            item[name] = cls._final_fixup(entry)
    return item
|
||||
|
||||
@classmethod
def unresolve_substitutions_to_value(cls, config, unresolved_value=STR_SUBSTITUTION):
    """Replace the substitutions found in ``config`` by a stand-in value.

    :param config: config to update in place
    :param unresolved_value: ``STR_SUBSTITUTION`` to use each substitution's
        ``raw_str()``, ``None`` to use a ``NoneValue()``, or any other value
        to use that value itself
    """
    def stand_in_for(pending):
        if unresolved_value is STR_SUBSTITUTION:
            return pending.raw_str()
        if unresolved_value is None:
            return NoneValue()
        return unresolved_value

    for pending in cls._find_substitutions(config):
        cls._do_substitute(pending, stand_in_for(pending), False)
    cls._final_fixup(config)
|
||||
|
||||
@classmethod
def resolve_substitutions(cls, config, accept_unresolved=False):
    """Resolve the substitutions of ``config`` in place.

    Self-references are fixed up first; the remaining substitutions are then
    processed in passes until none are left or a pass leaves the set of
    pending substitutions unchanged.

    :param config: config to resolve
    :param accept_unresolved: when True, leftover substitutions are reported
        through the return value instead of raising
    :return: True when the last pass left something unresolved
    :raises ConfigSubstitutionException: when something is left unresolved
        and ``accept_unresolved`` is False
    """
    has_unresolved = False
    cls._fixup_self_references(config, accept_unresolved)
    substitutions = cls._find_substitutions(config)
    if len(substitutions) > 0:
        unresolved = True
        any_unresolved = True
        _substitutions = []
        # maps a ConfigValues to the chain made of itself and its overriden_value ancestors
        cache = {}
        # NOTE(review): any_unresolved is set to True before and inside the loop and only
        # ever or-ed afterwards, so it cannot become False; the loop effectively ends on
        # the other two conditions
        while any_unresolved and len(substitutions) > 0 and set(substitutions) != set(_substitutions):
            unresolved = False
            any_unresolved = True
            # iterate over a copy: the pending list is extended and pruned during the pass
            _substitutions = substitutions[:]

            for substitution in _substitutions:
                is_optional_resolved, resolved_value = cls._resolve_variable(config, substitution)

                # if the substitution is optional
                if not is_optional_resolved and substitution.optional:
                    resolved_value = None
                if isinstance(resolved_value, ConfigValues):
                    parents = cache.get(resolved_value)
                    if parents is None:
                        parents = []
                        link = resolved_value
                        while isinstance(link, ConfigValues):
                            parents.append(link)
                            link = link.overriden_value
                        cache[resolved_value] = parents

                if isinstance(resolved_value, ConfigValues) \
                        and substitution.parent in parents \
                        and hasattr(substitution.parent, 'overriden_value') \
                        and substitution.parent.overriden_value:

                    # self resolution, backtrack
                    resolved_value = substitution.parent.overriden_value

                unresolved, new_substitutions, result = cls._do_substitute(
                    substitution, resolved_value, is_optional_resolved)
                any_unresolved = unresolved or any_unresolved
                substitutions.extend(new_substitutions)
                if not isinstance(result, ConfigValues):
                    substitutions.remove(substitution)

        cls._final_fixup(config)
        # `unresolved` holds the outcome of the last _do_substitute call of the final pass
        if unresolved:
            has_unresolved = True
            if not accept_unresolved:
                raise ConfigSubstitutionException("Cannot resolve {variables}. Check for cycles.".format(
                    variables=', '.join('${{{variable}}}: (line: {line}, col: {col})'.format(
                        variable=substitution.variable,
                        line=lineno(substitution.loc, substitution.instring),
                        col=col(substitution.loc, substitution.instring)) for substitution in substitutions)))

    cls._final_fixup(config)
    return has_unresolved
|
||||
|
||||
|
||||
class ListParser(TokenConverter):
    """Token converter producing a ConfigList from ``[elt1, elt2, ...]``."""

    def __init__(self, expr=None):
        super(ListParser, self).__init__(expr)
        self.saveAsList = True

    def postParse(self, instring, loc, token_list):
        """Build a ConfigList out of the parsed tokens.

        Empty-string tokens are dropped and the tokens of an included config
        are spliced in at the position of the include.

        :param instring: text being parsed (unused here)
        :param loc: parse location (unused here)
        :param token_list: tokens matched for the list entries
        :return: single-element list holding the ConfigList
        """
        flattened = []
        for entry in token_list:
            if entry != '':
                if isinstance(entry, ConfigInclude):
                    flattened.extend(entry.tokens)
                else:
                    flattened.append(entry)
        return [ConfigList(flattened)]
|
||||
|
||||
|
||||
class ConcatenatedValueParser(TokenConverter):
    """Token converter that turns a run of value tokens into a single value."""

    def __init__(self, expr=None):
        super(ConcatenatedValueParser, self).__init__(expr)
        self.parent = self.key = None

    def postParse(self, instring, loc, token_list):
        """Wrap the tokens in a ConfigValues and return its transformed form.

        :param instring: text being parsed
        :param loc: parse location
        :param token_list: tokens to concatenate
        :return: single-element list with the result of ``transform()``
        """
        return [ConfigValues(token_list, instring, loc).transform()]
|
||||
|
||||
|
||||
class ConfigTreeParser(TokenConverter):
    """
    Parse a config tree from tokens
    """

    def __init__(self, expr=None, root=False):
        """
        :param expr: wrapped parser expression
        :param root: passed to the ConfigTree created in postParse
        """
        super(ConfigTreeParser, self).__init__(expr)
        self.root = root
        self.saveAsList = True

    def postParse(self, instring, loc, token_list):
        """Create ConfigTree from tokens

        Each element is either a ``[key, value]`` / ``[key, operator, value]``
        group or a ConfigInclude whose tokens are expanded in place.

        :param instring: text being parsed
        :param loc: parse location, stored on the substitutions created for ``+=``
        :param token_list: parsed assignments and includes
        :return: the populated ConfigTree
        :raises ParseSyntaxException: when a group has an unexpected shape
        """
        config_tree = ConfigTree(root=self.root)
        for element in token_list:
            expanded_tokens = element.tokens if isinstance(element, ConfigInclude) else [element]

            for tokens in expanded_tokens:
                # key, value1 (optional), ...
                key = tokens[0].strip() if isinstance(tokens[0], (unicode, basestring)) else tokens[0]
                operator = '='
                if len(tokens) == 3 and tokens[1].strip() in [':', '=', '+=']:
                    operator = tokens[1].strip()
                    values = tokens[2:]
                elif len(tokens) == 2:
                    # no operator token (e.g. `key { ... }`)
                    values = tokens[1:]
                else:
                    raise ParseSyntaxException("Unknown tokens {tokens} received".format(tokens=tokens))
                # empty string
                if len(values) == 0:
                    config_tree.put(key, '')
                else:
                    value = values[0]
                    if isinstance(value, list) and operator == "+=":
                        # `key += [..]` is stored as an optional self-substitution followed by the list
                        value = ConfigValues([ConfigSubstitution(key, True, '', False, loc), value], False, loc)
                        config_tree.put(key, value, False)
                    elif isinstance(value, unicode) and operator == "+=":
                        # `key += text` is stored as an optional self-substitution followed by ' ' + text
                        value = ConfigValues([ConfigSubstitution(key, True, '', True, loc), ' ' + value], True, loc)
                        config_tree.put(key, value, False)
                    elif isinstance(value, list):
                        config_tree.put(key, value, False)
                    else:
                        existing_value = config_tree.get(key, None)
                        if isinstance(value, ConfigTree) and not isinstance(existing_value, list):
                            # Only Tree has to be merged with tree
                            config_tree.put(key, value, True)
                        elif isinstance(value, ConfigValues):
                            conf_value = value
                            value.parent = config_tree
                            value.key = key
                            # append only when the existing value is a list or a tree
                            if isinstance(existing_value, list) or isinstance(existing_value, ConfigTree):
                                config_tree.put(key, conf_value, True)
                            else:
                                config_tree.put(key, conf_value, False)
                        else:
                            config_tree.put(key, value, False)
        return config_tree
|
||||
608
clearml_agent/external/pyhocon/config_tree.py
vendored
Normal file
608
clearml_agent/external/pyhocon/config_tree.py
vendored
Normal file
@@ -0,0 +1,608 @@
|
||||
from collections import OrderedDict
|
||||
from pyparsing import lineno
|
||||
from pyparsing import col
|
||||
try:
|
||||
basestring
|
||||
except NameError: # pragma: no cover
|
||||
basestring = str
|
||||
unicode = str
|
||||
|
||||
import re
|
||||
import copy
|
||||
from .exceptions import ConfigException, ConfigWrongTypeException, ConfigMissingException
|
||||
|
||||
|
||||
class UndefinedKey(object):
    """Marker class used as the "no default supplied" value of the getters."""
|
||||
|
||||
|
||||
class NonExistentKey(object):
    """Marker class used as a lookup default to detect a key that is absent."""
|
||||
|
||||
|
||||
class NoneValue(object):
    """Placeholder stored in a tree for a null value; lookups return it as None."""
|
||||
|
||||
|
||||
class ConfigTree(OrderedDict):
|
||||
KEY_SEP = '.'
|
||||
|
||||
def __init__(self, *args, **kwds):
    """Create a tree, optionally flagged as the root of a configuration.

    :param args: positional arguments forwarded to ``OrderedDict``
    :param kwds: keyword arguments forwarded to ``OrderedDict``; the special
        keyword ``root`` (default False) is consumed here and, when true,
        gives the tree a ``history`` dict
    """
    self.root = kwds.pop('root', False)
    if self.root:
        self.history = {}
    super(ConfigTree, self).__init__(*args, **kwds)
    for key, value in self.items():
        if isinstance(value, ConfigValues):
            value.parent = self
            # merge_configs and _put record the owning key as `.key`;
            # do the same here so initial values are linked consistently
            value.key = key
            value.index = key  # retained for backward compatibility
|
||||
|
||||
@staticmethod
def merge_configs(a, b, copy_trees=False):
    """Merge config b into a

    Keys holding a ConfigTree on both sides are merged recursively; any other
    key of ``b`` overrides the one in ``a``.

    :param a: target config
    :type a: ConfigTree
    :param b: source config
    :type b: ConfigTree
    :param copy_trees: when True, a sub-tree of ``a`` is replaced by a copy
        of itself before being merged into
    :return: merged config a
    """
    for name, incoming in b.items():
        if name in a and isinstance(a[name], ConfigTree) and isinstance(b[name], ConfigTree):
            # trees on both sides: merge instead of overriding
            if copy_trees:
                a[name] = a[name].copy()
            ConfigTree.merge_configs(a[name], b[name], copy_trees=copy_trees)
            continue

        if isinstance(incoming, ConfigValues):
            # re-home the value and remember what it overrides
            incoming.parent = a
            incoming.key = name
            if name in a:
                incoming.overriden_value = a[name]
        a[name] = incoming
        if a.root:
            earlier = a.history.get(name, [])
            later = b.history.get(name, [incoming]) if b.root else [incoming]
            a.history[name] = earlier + later

    return a
|
||||
|
||||
def _put(self, key_path, value, append=False):
    """Store ``value`` under the path ``key_path``, creating sub-trees as needed.

    :param key_path: list of path elements (as produced by ``parse_key``)
    :param value: value to store
    :param append: when True, combine with an existing value instead of
        replacing it (see the branches below)
    :raises ConfigWrongTypeException: when appending to an existing value
        that is none of the handled kinds
    """
    key_elt = key_path[0]
    if len(key_path) == 1:
        # if value to set does not exist, override
        # if they are both configs then merge
        # if not then override
        if key_elt in self and isinstance(self[key_elt], ConfigTree) and isinstance(value, ConfigTree):
            if self.root:
                # merge into a fresh tree so the value already recorded in history is not modified
                new_value = ConfigTree.merge_configs(ConfigTree(), self[key_elt], copy_trees=True)
                new_value = ConfigTree.merge_configs(new_value, value, copy_trees=True)
                self._push_history(key_elt, new_value)
                self[key_elt] = new_value
            else:
                ConfigTree.merge_configs(self[key_elt], value)
        elif append:
            # If we have t=1
            # and we try to put t.a=5 then t is replaced by {a: 5}
            l_value = self.get(key_elt, None)
            if isinstance(l_value, ConfigValues):
                l_value.tokens.append(value)
                l_value.recompute()
            elif isinstance(l_value, ConfigTree) and isinstance(value, ConfigValues):
                # the existing tree becomes the first token of the new value
                value.overriden_value = l_value
                value.tokens.insert(0, l_value)
                value.recompute()
                value.parent = self
                value.key = key_elt
                self._push_history(key_elt, value)
                self[key_elt] = value
            elif isinstance(l_value, list) and isinstance(value, ConfigValues):
                self._push_history(key_elt, value)
                value.overriden_value = l_value
                value.parent = self
                value.key = key_elt
                self[key_elt] = value
            elif isinstance(l_value, list):
                self[key_elt] = l_value + value
                # NOTE(review): this records the previous list, not the concatenated one,
                # unlike the other branches which push the new value - confirm intent
                self._push_history(key_elt, l_value)
            elif l_value is None:
                self._push_history(key_elt, value)
                self[key_elt] = value

            else:
                raise ConfigWrongTypeException(
                    u"Cannot concatenate the list {key}: {value} to {prev_value} of {type}".format(
                        key='.'.join(key_path),
                        value=value,
                        prev_value=l_value,
                        type=l_value.__class__.__name__)
                )
        else:
            # if there was an override keep overide value
            if isinstance(value, ConfigValues):
                value.parent = self
                value.key = key_elt
                value.overriden_value = self.get(key_elt, None)
            self._push_history(key_elt, value)
            self[key_elt] = value
    else:
        # not the last path element yet: descend, using the raw dict lookup
        next_config_tree = super(ConfigTree, self).get(key_elt)
        if not isinstance(next_config_tree, ConfigTree):
            # create a new dictionary or overwrite a previous value
            next_config_tree = ConfigTree()
            self._push_history(key_elt, next_config_tree)
            self[key_elt] = next_config_tree
        next_config_tree._put(key_path[1:], value, append)
|
||||
|
||||
def _push_history(self, key, value):
    """Record ``value`` in the history of ``key``; only root trees keep a history.

    :param key: key whose history is extended
    :param value: value appended to that history
    """
    if not self.root:
        return
    entries = self.history.get(key)
    if entries is None:
        entries = []
        self.history[key] = entries
    entries.append(value)
|
||||
|
||||
def _get(self, key_path, key_index=0, default=UndefinedKey):
    """Walk ``key_path`` starting at ``key_index`` and return the value found.

    :param key_path: list of path elements
    :param key_index: index of the element looked up in this tree
    :param default: value returned when the path cannot be followed; when
        left as ``UndefinedKey`` an exception is raised instead
    :return: the value found; a NoneValue (also inside a list) is returned as None
    :raises ConfigMissingException: key absent and no default given
    :raises ConfigWrongTypeException: an intermediate element is not a
        ConfigTree and no default given
    """
    segment = key_path[key_index]
    node = super(ConfigTree, self).get(segment, UndefinedKey)
    strict = default is UndefinedKey

    if node is UndefinedKey:
        if not strict:
            return default
        raise ConfigMissingException(u"No configuration setting found for key {key}".format(
            key='.'.join(key_path[: key_index + 1])))

    if key_index == len(key_path) - 1:
        # end of the path: hand the value back, hiding the NoneValue placeholder
        if isinstance(node, NoneValue):
            return None
        if isinstance(node, list):
            return [None if isinstance(entry, NoneValue) else entry for entry in node]
        return node

    if isinstance(node, ConfigTree):
        return node._get(key_path, key_index + 1, default)

    if not strict:
        return default
    raise ConfigWrongTypeException(
        u"{key} has type {type} rather than dict".format(key='.'.join(key_path[:key_index + 1]),
                                                         type=type(node).__name__))
|
||||
|
||||
@staticmethod
def parse_key(string):
    """
    Break a key into its path elements:
    - a.b.c => a, b, c
    - a."b.c" => a, "b.c" (quotes are kept when the quoted part contains one
      of the special characters $}[]:=+#`^?!@*&.)
    - "a" => a
    - a.b."c" => a, b, c
    :param string: string key ('.' separates sub-keys) or an int / float, which is used as a single key
    :return: list of path elements
    """
    if isinstance(string, (int, float)):
        return [string]

    special_characters = '$}[]:=+#`^?!@*&.'
    pattern = r'"[^"]+"|[^{special_characters}]+'.format(special_characters=re.escape(special_characters))

    path = []
    for piece in re.findall(pattern, string):
        if any(ch in special_characters for ch in piece):
            # only a quoted piece can contain a special character: keep it verbatim
            path.append(piece)
        else:
            path.append(piece.strip('"'))
    return path
|
||||
|
||||
def put(self, key, value, append=False):
    """Store a value in the tree under a dot separated key.

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param value: value to put
    :param append: forwarded to ``_put``
    """
    key_path = ConfigTree.parse_key(key)
    self._put(key_path, value, append)
|
||||
|
||||
def get(self, key, default=UndefinedKey):
    """Fetch the value stored under a dot separated key.

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: object
    :return: value in the tree located at key
    """
    key_path = ConfigTree.parse_key(key)
    return self._get(key_path, 0, default)
|
||||
|
||||
def get_string(self, key, default=UndefinedKey):
    """Fetch the value at key converted to a string.

    Booleans are rendered in lower case; None is returned as None.

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: basestring
    :return: string value
    :type return: basestring
    """
    value = self.get(key, default)
    if value is None:
        return None

    text = unicode(value)
    return text.lower() if isinstance(value, bool) else text
|
||||
|
||||
def pop(self, key, default=UndefinedKey):
    """Remove specified key and return the corresponding value.
    If key is not found, default is returned if given, otherwise ConfigMissingException is raised

    This method assumes the user wants to remove the last value in the chain so it parses via parse_key
    and pops the last value out of the dict.

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: object
    :return: value in the tree located at key
    """
    # identity test against the sentinel: `!=` would invoke the default's own
    # comparison, which can misbehave for objects with a custom __ne__
    if default is not UndefinedKey and key not in self:
        return default

    value = self.get(key, UndefinedKey)
    path = ConfigTree.parse_key(key)
    parent = self.KEY_SEP.join(path[0:-1])
    child = path[-1]

    if parent:
        # nested key: delete from the tree that directly holds the last element
        self.get(parent).__delitem__(child)
    else:
        self.__delitem__(child)
    return value
|
||||
|
||||
def get_int(self, key, default=UndefinedKey):
    """Fetch the value at key converted to an int (None stays None).

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: int
    :return: int value
    :type return: int
    """
    value = self.get(key, default)
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        raise ConfigException(
            u"{key} has type '{type}' rather than 'int'".format(key=key, type=type(value).__name__))
|
||||
|
||||
def get_float(self, key, default=UndefinedKey):
    """Fetch the value at key converted to a float (None stays None).

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: float
    :return: float value
    :type return: float
    """
    value = self.get(key, default)
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        raise ConfigException(
            u"{key} has type '{type}' rather than 'float'".format(key=key, type=type(value).__name__))
|
||||
|
||||
def get_bool(self, key, default=UndefinedKey):
    """Fetch the value at key converted to a boolean (None stays None).

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: bool
    :return: boolean value
    :type return: bool
    """
    # String conversions as per API-recommendations:
    # https://github.com/typesafehub/config/blob/master/HOCON.md#automatic-type-conversions
    conversions = {None: None}
    conversions.update((word, True) for word in ('true', 'yes', 'on'))
    conversions.update((word, False) for word in ('false', 'no', 'off'))

    text = self.get_string(key, default)
    lookup = None if text is None else text.lower()
    try:
        return conversions[lookup]
    except KeyError:
        raise ConfigException(
            u"{key} does not translate to a Boolean value".format(key=key))
|
||||
|
||||
def get_list(self, key, default=UndefinedKey):
    """Return list representation of value found at key

    A ConfigTree whose keys are all non-negative integer indices ("0", "1",
    ... or the equivalent ints) is converted to a list ordered by numeric
    index.

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: list
    :return: list value
    :type return: list
    :raises ConfigException: when the value is neither a list, None, nor a
        tree made only of integer-index keys
    """
    value = self.get(key, default)
    if isinstance(value, list):
        return value
    elif isinstance(value, ConfigTree):
        indexed = []
        for k, v in value.items():
            if isinstance(k, basestring) and re.match(r'^(?:[1-9][0-9]*|0)$', k):
                # both alternatives are anchored, so "0abc" / "01" are rejected
                index = int(k)
            elif isinstance(k, int) and not isinstance(k, bool) and k >= 0:
                # parse_key keeps numeric keys as numbers
                index = k
            else:
                raise ConfigException(u"{key} does not translate to a list".format(key=key))
            indexed.append((index, v))
        # order by numeric index: sorting the raw strings would put "10" before "2"
        indexed.sort(key=lambda pair: pair[0])
        return [v for _, v in indexed]
    elif value is None:
        return None
    else:
        raise ConfigException(
            u"{key} has type '{type}' rather than 'list'".format(key=key, type=type(value).__name__))
|
||||
|
||||
def get_config(self, key, default=UndefinedKey):
    """Return tree config representation of value found at key

    :param key: key to use (dot separated). E.g., a.b.c
    :type key: basestring
    :param default: default value if key not found
    :type default: config
    :return: config value, or None when the stored value is None
    :type return: ConfigTree
    :raises ConfigException: if the value is neither a dict nor None
    """
    found = self.get(key, default)
    if found is None or isinstance(found, dict):
        return found

    raise ConfigException(
        u"{key} has type '{type}' rather than 'config'".format(key=key, type=type(found).__name__))
|
||||
|
||||
def __getitem__(self, item):
    """Subscript access: return ``self.get(item)``, raising KeyError when it yields UndefinedKey"""
    found = self.get(item)
    if found is not UndefinedKey:
        return found
    raise KeyError(item)
|
||||
|
||||
try:
    from collections import _OrderedDictItemsView
except ImportError:  # pragma: nocover
    # the private view class is not available, keep the inherited items()
    pass
else:
    def items(self):  # pragma: nocover
        """Return an items view built with the view class imported above"""
        view_cls = self._OrderedDictItemsView
        return view_cls(self)
|
||||
|
||||
def __getattr__(self, item):
    """Attribute-style access to config keys

    Looks the name up with ``self.get``; when nothing is found the lookup is
    delegated to the parent class.
    """
    found = self.get(item, NonExistentKey)
    if found is not NonExistentKey:
        return found
    return super(ConfigTree, self).__getattr__(item)
|
||||
|
||||
def __contains__(self, item):
    """Return True when the (possibly dotted) key resolves to something other than the NoneValue marker"""
    path = self.parse_key(item)
    found = self._get(path, default=NoneValue)
    return found is not NoneValue
|
||||
|
||||
def with_fallback(self, config, resolve=True):
    """
    return a new config with fallback on config

    The fallback is used as the merge base and a deep copy of this config is
    merged on top of it, so neither input is modified.

    :param config: config or filename of the config to fallback on
    :param resolve: resolve substitutions
    :return: new config with fallback on config
    """
    if isinstance(config, ConfigTree):
        fallback = copy.deepcopy(config)
    else:
        # anything that is not a ConfigTree is treated as a file name
        from . import ConfigFactory
        fallback = ConfigFactory.parse_file(config, resolve=False)

    merged = ConfigTree.merge_configs(fallback, copy.deepcopy(self))

    if resolve:
        from . import ConfigParser
        ConfigParser.resolve_substitutions(merged)
    return merged
|
||||
|
||||
def as_plain_ordered_dict(self):
    """return a deep copy of this config as a plain OrderedDict

    The config tree should be fully resolved.

    This is useful to get an object with no special semantics such as path expansion for the keys.
    In particular this means that keys that contain dots are not surrounded with '"' in the plain OrderedDict.

    :return: this config as an OrderedDict
    :type return: OrderedDict
    :raises ConfigException: if an unresolved ConfigValues element is encountered
    """
    def unwrap(node):
        # lists and nested trees are converted recursively, anything else is returned unchanged
        if isinstance(node, list):
            return [unwrap(element) for element in node]
        if isinstance(node, ConfigTree):
            return node.as_plain_ordered_dict()
        if isinstance(node, ConfigValues):
            raise ConfigException("The config tree contains unresolved elements")
        return node

    plain = OrderedDict()
    for name, node in self.items():
        if isinstance(name, (unicode, basestring)):
            name = name.strip('"')
        plain[name] = unwrap(node)
    return plain
|
||||
|
||||
|
||||
class ConfigList(list):
    """List that registers itself as the parent of the ConfigValues it contains"""

    def __init__(self, iterable=()):
        """Build the list from ``iterable``

        Every ConfigValues element gets its ``parent`` set to this list and its
        ``key`` set to its index.

        :param iterable: initial elements (defaults to an empty, immutable tuple
            instead of a shared mutable list)
        """
        new_list = list(iterable)
        super(ConfigList, self).__init__(new_list)
        for index, value in enumerate(new_list):
            if isinstance(value, ConfigValues):
                value.parent = self
                value.key = index
|
||||
|
||||
|
||||
class ConfigInclude(object):
    """Plain holder for a sequence of tokens (presumably those of an include directive)"""

    def __init__(self, tokens):
        """
        :param tokens: tokens to keep, stored unchanged on ``self.tokens``
        """
        self.tokens = tokens
|
||||
|
||||
|
||||
class ConfigValues(object):
    """Sequence of tokens that together form a single configuration value

    ``transform`` collapses the tokens into a ConfigTree, a list or a string
    once none of them is a ConfigSubstitution any more.
    """

    def __init__(self, tokens, instring, loc):
        """
        :param tokens: tokens making up the value
        :param instring: source text, used to report line/column in errors
        :param loc: location within ``instring``, used to report line/column in errors
        """
        self.tokens = tokens
        self._instring = instring
        self._loc = loc
        self.parent = None
        self.key = None
        self.overriden_value = None
        self.recompute()

    def recompute(self):
        """Re-link substitution tokens to this object and normalise the token list"""
        for position, tok in enumerate(self.tokens):
            if isinstance(tok, ConfigSubstitution):
                tok.parent = self
                tok.index = position

        # no value return empty string
        if len(self.tokens) == 0:
            self.tokens = ['']

        # if the last token is an unquoted string then right strip it
        tail = self.tokens[-1]
        if isinstance(tail, ConfigUnquotedString):
            # rstrip only whitespaces, not \n\r because they would have been used escaped
            self.tokens[-1] = tail.rstrip(' \t')

    def has_substitution(self):
        """Return True when this value, or a value it overrides, still holds a substitution"""
        return bool(self.get_substitutions())

    def get_substitutions(self):
        """Collect the ConfigSubstitution tokens of this value and of the chain of overridden values

        Tokens of overridden values are placed before those of the values overriding them.
        """
        found = []
        current = self
        while current:
            own = [tok for tok in current.tokens if isinstance(tok, ConfigSubstitution)]
            found = own + found
            if not hasattr(current, 'overriden_value'):
                break
            current = current.overriden_value
            if not isinstance(current, ConfigValues):
                break
        return found

    def transform(self):
        """Collapse the tokens into their final value

        :return: ``self`` while substitutions are pending, None when only None tokens remain,
            otherwise a merged ConfigTree, a flattened list or a string/single token
        :raises ConfigWrongTypeException: if the tokens are not all of the same kind
        """
        def kind_of(tok):
            if isinstance(tok, ConfigTree):
                return ConfigTree
            if isinstance(tok, list):
                return ConfigList
            return str

        def as_text(piece, last=False):
            if isinstance(piece, ConfigQuotedString):
                # trailing whitespace of a quoted string is dropped only for the final token
                return piece.value + ('' if last else piece.ws)
            if piece is None:
                return ''
            return unicode(piece)

        if self.has_substitution():
            return self

        # remove None tokens
        tokens = [tok for tok in self.tokens if tok is not None]
        if not tokens:
            return None

        # check if all tokens are compatible
        expected = kind_of(tokens[0])
        for position, tok in enumerate(tokens[1:], 1):
            actual = kind_of(tok)
            if actual is not expected:
                raise ConfigWrongTypeException(
                    "Token '{token}' of type {tok_type} (index {index}) must be of type {req_tok_type} "
                    "(line: {line}, col: {col})".format(
                        token=tok,
                        index=position,
                        tok_type=actual.__name__,
                        req_tok_type=expected.__name__,
                        line=lineno(self._loc, self._instring),
                        col=col(self._loc, self._instring)))

        if expected is ConfigTree:
            # gather the trees of the overridden values, nearest override first
            overridden = []
            if hasattr(self, 'overriden_value'):
                current = self.overriden_value
                while current:
                    if isinstance(current, ConfigValues):
                        resolved = current.transform()
                        if not isinstance(resolved, ConfigTree):
                            break
                        overridden.append(resolved)
                    elif isinstance(current, ConfigTree):
                        overridden.append(current)
                    else:
                        break

                    if not hasattr(current, 'overriden_value'):
                        break
                    current = current.overriden_value

            # merge oldest first, then this value's own trees on top
            merged = ConfigTree()
            for tree in reversed(overridden):
                ConfigTree.merge_configs(merged, tree, copy_trees=True)
            for tree in tokens:
                ConfigTree.merge_configs(merged, tree, copy_trees=True)
            return merged

        if expected is ConfigList:
            flat = []
            position = 0
            for chunk in tokens:
                staged = ConfigList()
                for element in chunk:
                    if isinstance(element, ConfigValues):
                        # re-parent onto the flattened list at its final index
                        element.parent = flat
                        element.key = position
                    staged.append(element)
                    position += 1
                flat.extend(staged)
            return flat

        if len(tokens) == 1:
            only = tokens[0]
            return only.value if isinstance(only, ConfigQuotedString) else only

        head = ''.join(as_text(piece) for piece in tokens[:-1])
        return head + as_text(tokens[-1], True)

    def put(self, index, value):
        """Replace the token at ``index`` with ``value``"""
        self.tokens[index] = value

    def __repr__(self):  # pragma: no cover
        rendered = ','.join(str(o) for o in self.tokens)
        return '[ConfigValues: ' + rendered + ']'
|
||||
|
||||
|
||||
class ConfigSubstitution(object):
    """A substitution token: the referenced variable plus where it was found"""

    def __init__(self, variable, optional, ws, instring, loc):
        """
        :param variable: name of the referenced variable
        :param optional: optional flag of the substitution
        :param ws: whitespace kept with the token
        :param instring: source text the token came from
        :param loc: location of the token within ``instring``
        """
        self.variable = variable
        self.optional = optional
        self.ws = ws
        self.instring = instring
        self.loc = loc
        # not known at construction time, assigned later by the owning object
        self.parent = None
        self.index = None

    def __repr__(self):  # pragma: no cover
        prefix = '[ConfigSubstitution: '
        return prefix + self.variable + ']'
|
||||
|
||||
|
||||
class ConfigUnquotedString(unicode):
    """Text type used to tag strings as unquoted; behaves like its base string type"""

    def __new__(cls, value):
        instance = super(ConfigUnquotedString, cls).__new__(cls, value)
        return instance
|
||||
|
||||
|
||||
class ConfigQuotedString(object):
    """A quoted string token together with its whitespace and source location"""

    def __init__(self, value, ws, instring, loc):
        """
        :param value: content of the string
        :param ws: whitespace kept with the token
        :param instring: source text the token came from
        :param loc: location of the token within ``instring``
        """
        self.instring = instring
        self.loc = loc
        self.value = value
        self.ws = ws

    def __repr__(self):  # pragma: no cover
        prefix = '[ConfigQuotedString: '
        return prefix + self.value + ']'
|
||||
329
clearml_agent/external/pyhocon/converter.py
vendored
Normal file
329
clearml_agent/external/pyhocon/converter.py
vendored
Normal file
@@ -0,0 +1,329 @@
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
|
||||
from . import ConfigFactory
|
||||
from .config_tree import ConfigQuotedString
|
||||
from .config_tree import ConfigSubstitution
|
||||
from .config_tree import ConfigTree
|
||||
from .config_tree import ConfigValues
|
||||
from .config_tree import NoneValue
|
||||
|
||||
|
||||
# Python 2/3 compatibility: when ``basestring`` does not exist (Python 3),
# alias both legacy text type names to ``str``.
try:
    basestring
except NameError:
    basestring = unicode = str
|
||||
|
||||
|
||||
class HOCONConverter(object):
    """Serialise a parsed configuration to JSON, HOCON, YAML or .properties text"""

    # text that looks like a number; such keys are quoted when written as HOCON
    _number_re = r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))'
    _number_re_matcher = re.compile(_number_re)

    @classmethod
    def to_json(cls, config, compact=False, indent=2, level=0):
        """Convert HOCON input into a JSON output

        :param config: value to convert (ConfigTree, list, string, bool, None or any other value)
        :param compact: passed through to nested calls, not otherwise used here
        :param indent: number of spaces per nesting level
        :param level: current nesting level
        :return: JSON string representation
        :type return: basestring
        """
        lines = ""
        if isinstance(config, ConfigTree):
            if len(config) == 0:
                lines += '{}'
            else:
                lines += '{\n'
                bet_lines = []
                for key, item in config.items():
                    bet_lines.append('{indent}"{key}": {value}'.format(
                        indent=''.rjust((level + 1) * indent, ' '),
                        key=key.strip('"'),  # for dotted keys enclosed with "" to not be interpreted as nested key
                        value=cls.to_json(item, compact, indent, level + 1))
                    )
                lines += ',\n'.join(bet_lines)
                lines += '\n{indent}}}'.format(indent=''.rjust(level * indent, ' '))
        elif isinstance(config, list):
            if len(config) == 0:
                lines += '[]'
            else:
                lines += '[\n'
                bet_lines = []
                for item in config:
                    bet_lines.append('{indent}{value}'.format(
                        indent=''.rjust((level + 1) * indent, ' '),
                        value=cls.to_json(item, compact, indent, level + 1))
                    )
                lines += ',\n'.join(bet_lines)
                lines += '\n{indent}]'.format(indent=''.rjust(level * indent, ' '))
        elif isinstance(config, basestring):
            lines = json.dumps(config)
        elif config is None or isinstance(config, NoneValue):
            lines = 'null'
        elif config is True:
            lines = 'true'
        elif config is False:
            lines = 'false'
        else:
            lines = str(config)
        return lines

    @staticmethod
    def _auto_indent(lines, section):
        """Append ``section`` to ``lines``, aligning its continuation lines

        When the first line of ``section`` is shorter than 3 characters the
        section is appended unchanged. Otherwise every line of ``section`` is
        stripped and continuation lines are indented by a width derived from
        the length of the last line of ``lines``.
        """
        # noinspection PyBroadException
        try:
            indent = len(lines) - lines.rindex('\n')
        except Exception:
            indent = len(lines)
        # noinspection PyBroadException
        try:
            section_indent = section.index('\n')
        except Exception:
            section_indent = len(section)
        if section_indent < 3:
            return lines + section

        indent = '\n' + ''.rjust(indent, ' ')
        return lines + indent.join([sec.strip() for sec in section.split('\n')])

    @classmethod
    def to_hocon(cls, config, compact=False, indent=2, level=0):
        """Convert HOCON input into a HOCON output

        :param config: value to convert
        :param compact: when True, chains of single-key trees are collapsed into one dotted key
        :param indent: number of spaces per nesting level
        :param level: current nesting level (the root tree is written without braces)
        :return: HOCON string representation
        :type return: basestring
        """
        lines = ""
        if isinstance(config, ConfigTree):
            if len(config) == 0:
                lines += '{}'
            else:
                if level > 0:  # don't display { at root level
                    lines += '{\n'
                bet_lines = []

                for key, item in config.items():
                    if compact:
                        full_key = key
                        while isinstance(item, ConfigTree) and len(item) == 1:
                            key, item = next(iter(item.items()))
                            full_key += '.' + key
                    else:
                        full_key = key

                    if isinstance(full_key, float) or \
                            (isinstance(full_key, (basestring, unicode)) and cls._number_re_matcher.match(full_key)):
                        # if key can be casted to float, and it is a string, make sure we quote it
                        full_key = '\"{}\"'.format(full_key)

                    bet_line = ('{indent}{key}{assign_sign} '.format(
                        indent=''.rjust(level * indent, ' '),
                        key=full_key,
                        assign_sign='' if isinstance(item, dict) else ' =',)
                    )
                    value_line = cls.to_hocon(item, compact, indent, level + 1)
                    if isinstance(item, (list, tuple)):
                        bet_lines.append(cls._auto_indent(bet_line, value_line))
                    else:
                        bet_lines.append(bet_line + value_line)
                lines += '\n'.join(bet_lines)

                if level > 0:  # don't display { at root level
                    lines += '\n{indent}}}'.format(indent=''.rjust((level - 1) * indent, ' '))
        elif isinstance(config, (list, tuple)):
            if len(config) == 0:
                lines += '[]'
            else:
                lines += '['
                # length of the output at the start of the current line, used for wrapping
                base_len = len(lines)
                skip_comma = False
                for i, item in enumerate(config):
                    if 0 < i and not skip_comma:
                        lines += ', '

                    skip_comma = False
                    new_line = cls.to_hocon(item, compact, indent, level + 1)
                    lines += new_line
                    # break the line after a multi-line item or once more than 80 chars were written
                    if '\n' in new_line or len(lines) - base_len > 80:
                        if i < len(config) - 1:
                            lines += ',\n{indent}'.format(indent=''.rjust(level * indent, ' '))
                        base_len = len(lines)
                        skip_comma = True

                lines += ']'
        elif isinstance(config, basestring):
            if '\n' in config and len(config) > 1:
                lines = '"""{value}"""'.format(value=config)  # multilines
            else:
                lines = '"{value}"'.format(value=cls.__escape_string(config))
        elif isinstance(config, ConfigValues):
            lines = ''.join(cls.to_hocon(o, compact, indent, level) for o in config.tokens)
        elif isinstance(config, ConfigSubstitution):
            lines = '${'
            if config.optional:
                lines += '?'
            lines += config.variable + '}' + config.ws
        elif isinstance(config, ConfigQuotedString):
            if '\n' in config.value and len(config.value) > 1:
                lines = '"""{value}"""'.format(value=config.value)  # multilines
            else:
                lines = '"{value}"'.format(value=cls.__escape_string(config.value))
        elif config is None or isinstance(config, NoneValue):
            lines = 'null'
        elif config is True:
            lines = 'true'
        elif config is False:
            lines = 'false'
        else:
            lines = str(config)
        return lines

    @staticmethod
    def _yaml_block_scalar(text_lines, width):
        """Render ``text_lines`` as a YAML literal block scalar indented by ``width`` spaces

        Every line is prefixed with the same indentation so the content keeps its
        relative whitespace (``str.rjust`` only pads lines shorter than the width).
        """
        pad = ' ' * width
        return '|\n' + '\n'.join(pad + line for line in text_lines)

    @classmethod
    def to_yaml(cls, config, compact=False, indent=2, level=0):
        """Convert HOCON input into a YAML output

        :param config: value to convert
        :param compact: passed through to nested calls, not otherwise used here
        :param indent: number of spaces per nesting level
        :param level: current nesting level
        :return: YAML string representation
        :type return: basestring
        """
        lines = ""
        if isinstance(config, ConfigTree):
            if len(config) > 0:
                if level > 0:
                    lines += '\n'
                bet_lines = []
                for key, item in config.items():
                    bet_lines.append('{indent}{key}: {value}'.format(
                        indent=''.rjust(level * indent, ' '),
                        key=key.strip('"'),  # for dotted keys enclosed with "" to not be interpreted as nested key,
                        value=cls.to_yaml(item, compact, indent, level + 1))
                    )
                lines += '\n'.join(bet_lines)
        elif isinstance(config, list):
            # None entries are not written
            config_list = [line for line in config if line is not None]
            if len(config_list) == 0:
                lines += '[]'
            else:
                lines += '\n'
                bet_lines = []
                for item in config_list:
                    bet_lines.append('{indent}- {value}'.format(indent=''.rjust(level * indent, ' '),
                                                                value=cls.to_yaml(item, compact, indent, level + 1)))
                lines += '\n'.join(bet_lines)
        elif isinstance(config, basestring):
            # if it contains a \n then it's multiline
            text_lines = config.split('\n')
            if len(text_lines) == 1:
                lines = config
            else:
                lines = cls._yaml_block_scalar(text_lines, level * indent)
        elif config is None or isinstance(config, NoneValue):
            lines = 'null'
        elif config is True:
            lines = 'true'
        elif config is False:
            lines = 'false'
        else:
            lines = str(config)
        return lines

    @classmethod
    def to_properties(cls, config, compact=False, indent=2, key_stack=None):
        """Convert HOCON input into a .properties output

        :param config: value to convert
        :param compact: passed through to nested calls, not otherwise used here
        :param indent: passed through to nested calls, not otherwise used here
        :param key_stack: keys leading to ``config``; joined with '.' to build the property name
        :return: .properties string representation
        :type return: basestring
        """

        def escape_value(value):
            return value.replace('=', '\\=').replace('!', '\\!').replace('#', '\\#').replace('\n', '\\\n')

        stripped_key_stack = [key.strip('"') for key in (key_stack or ())]
        lines = []
        if isinstance(config, ConfigTree):
            for key, item in config.items():
                if item is not None:
                    lines.append(cls.to_properties(item, compact, indent, stripped_key_stack + [key]))
        elif isinstance(config, list):
            for index, item in enumerate(config):
                if item is not None:
                    lines.append(cls.to_properties(item, compact, indent, stripped_key_stack + [str(index)]))
        elif isinstance(config, basestring):
            lines.append('.'.join(stripped_key_stack) + ' = ' + escape_value(config))
        elif config is True:
            lines.append('.'.join(stripped_key_stack) + ' = true')
        elif config is False:
            lines.append('.'.join(stripped_key_stack) + ' = false')
        elif config is None or isinstance(config, NoneValue):
            pass
        else:
            lines.append('.'.join(stripped_key_stack) + ' = ' + str(config))
        return '\n'.join([line for line in lines if len(line) > 0])

    @classmethod
    def convert(cls, config, output_format='json', indent=2, compact=False):
        """Convert ``config`` with the converter selected by ``output_format``

        :param config: value to convert
        :param output_format: 'json', 'properties', 'yaml' or 'hocon'
        :param indent: number of spaces per nesting level
        :param compact: compact flag handed to the converter
        :return: string representation in the requested format
        :raises Exception: if ``output_format`` is not one of the supported formats
        """
        converters = {
            'json': cls.to_json,
            'properties': cls.to_properties,
            'yaml': cls.to_yaml,
            'hocon': cls.to_hocon,
        }

        if output_format in converters:
            return converters[output_format](config, compact, indent)
        else:
            raise Exception("Invalid format '{format}'. Format must be 'json', 'properties', 'yaml' or 'hocon'".format(
                format=output_format))

    @classmethod
    def convert_from_file(cls, input_file=None, output_file=None, output_format='json', indent=2, compact=False):
        """Convert to json, properties, yaml or hocon

        The result is printed or written to ``output_file``; nothing is returned.

        :param input_file: input file, if not specified stdin
        :param output_file: output file, if not specified stdout
        :param output_format: json, properties, yaml or hocon
        :param indent: number of spaces per nesting level
        :param compact: compact flag handed to the converter
        """

        if input_file is None:
            content = sys.stdin.read()
            config = ConfigFactory.parse_string(content)
        else:
            config = ConfigFactory.parse_file(input_file)

        res = cls.convert(config, output_format, indent, compact)
        if output_file is None:
            print(res)
        else:
            with open(output_file, "w") as fd:
                fd.write(res)

    @classmethod
    def __escape_match(cls, match):
        """Return the escape sequence for the single character matched by ``__escape_string``"""
        char = match.group(0)
        return {
            '\b': r'\b',
            '\t': r'\t',
            '\n': r'\n',
            '\f': r'\f',
            '\r': r'\r',
            '"': r'\"',
            '\\': r'\\',
        }.get(char) or (r'\u%04x' % ord(char))

    @classmethod
    def __escape_string(cls, string):
        """Escape control characters, double quotes and backslashes in ``string``"""
        return re.sub(r'[\x00-\x1F"\\]', cls.__escape_match, string)
|
||||
17
clearml_agent/external/pyhocon/exceptions.py
vendored
Normal file
17
clearml_agent/external/pyhocon/exceptions.py
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
class ConfigException(Exception):
    """Base error of this module; optionally carries another exception object"""

    def __init__(self, message, ex=None):
        """
        :param message: error message passed on to ``Exception``
        :param ex: optional exception object, kept on ``self._exception``
        """
        self._exception = ex
        Exception.__init__(self, message)
|
||||
|
||||
|
||||
class ConfigMissingException(ConfigException, KeyError):
    """ConfigException that is also a KeyError, so ``except KeyError`` handlers catch it too"""
|
||||
|
||||
|
||||
class ConfigSubstitutionException(ConfigException):
    """ConfigException subtype for substitution errors; adds no behaviour of its own"""
|
||||
|
||||
|
||||
class ConfigWrongTypeException(ConfigException):
    """ConfigException subtype for type mismatches; adds no behaviour of its own"""
|
||||
@@ -1,22 +1,26 @@
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from clearml_agent.definitions import PIP_EXTRA_INDICES
|
||||
|
||||
from .requirement import Requirement
|
||||
|
||||
|
||||
def parse(reqstr):
|
||||
def parse(reqstr, cwd=None):
|
||||
"""
|
||||
Parse a requirements file into a list of Requirements
|
||||
|
||||
See: pip/req.py:parse_requirements()
|
||||
|
||||
:param reqstr: a string or file like object containing requirements
|
||||
:param cwd: Optional current working dir for -r file.txt loading
|
||||
:returns: a *generator* of Requirement objects
|
||||
"""
|
||||
filename = getattr(reqstr, 'name', None)
|
||||
try:
|
||||
# Python 2.x compatibility
|
||||
if not isinstance(reqstr, basestring):
|
||||
if not isinstance(reqstr, basestring): # noqa
|
||||
reqstr = reqstr.read()
|
||||
except NameError:
|
||||
# Python 3.x only
|
||||
@@ -30,18 +34,25 @@ def parse(reqstr):
|
||||
elif not line or line.startswith('#'):
|
||||
# comments are lines that start with # only
|
||||
continue
|
||||
elif line.startswith('-r') or line.startswith('--requirement'):
|
||||
elif line.startswith('-r ') or line.startswith('--requirement '):
|
||||
_, new_filename = line.split()
|
||||
new_file_path = os.path.join(os.path.dirname(filename or '.'),
|
||||
new_filename)
|
||||
new_file_path = os.path.join(
|
||||
os.path.dirname(filename or '.') if filename or not cwd else cwd, new_filename)
|
||||
if not os.path.exists(new_file_path):
|
||||
continue
|
||||
with open(new_file_path) as f:
|
||||
for requirement in parse(f):
|
||||
yield requirement
|
||||
elif line.startswith('-f') or line.startswith('--find-links') or \
|
||||
line.startswith('-i') or line.startswith('--index-url') or \
|
||||
line.startswith('--extra-index-url') or \
|
||||
line.startswith('--no-index'):
|
||||
warnings.warn('Private repos not supported. Skipping.')
|
||||
elif line.startswith('--extra-index-url'):
|
||||
extra_index = line[len('--extra-index-url'):].strip()
|
||||
extra_index = re.sub(r"\s+#.*$", "", extra_index) # strip comments
|
||||
if extra_index and extra_index not in PIP_EXTRA_INDICES:
|
||||
PIP_EXTRA_INDICES.append(extra_index)
|
||||
print(f"appended {extra_index} to list of extra pip indices")
|
||||
continue
|
||||
elif line.startswith('-Z') or line.startswith('--always-unzip'):
|
||||
warnings.warn('Unused option --always-unzip. Skipping.')
|
||||
@@ -20,6 +20,15 @@ VCS_REGEX = re.compile(
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
VCS_EXT_REGEX = re.compile(
|
||||
r'^(?P<scheme>{0})(@)'.format(r'|'.join(
|
||||
[scheme.replace('+', r'\+') for scheme in ['git+git']])) +
|
||||
r'((?P<login>[^/@]+)@)?'
|
||||
r'(?P<path>[^#@]+)'
|
||||
r'(@(?P<revision>[^#]+))?'
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
# This matches just about everything
|
||||
LOCAL_REGEX = re.compile(
|
||||
r'^((?P<scheme>file)://)?'
|
||||
@@ -30,7 +39,7 @@ LOCAL_REGEX = re.compile(
|
||||
|
||||
class Requirement(object):
|
||||
"""
|
||||
Represents a single requirementfrom trains_agent.external.requirements_parser.requirement import Requirement
|
||||
Represents a single requirement from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
|
||||
Typically instances of this class are created with ``Requirement.parse``.
|
||||
For local file requirements, there's no verification that the file
|
||||
@@ -100,7 +109,7 @@ class Requirement(object):
|
||||
|
||||
req = cls('-e {0}'.format(line))
|
||||
req.editable = True
|
||||
vcs_match = VCS_REGEX.match(line)
|
||||
vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
|
||||
local_match = LOCAL_REGEX.match(line)
|
||||
|
||||
if vcs_match is not None:
|
||||
@@ -147,7 +156,7 @@ class Requirement(object):
|
||||
|
||||
req = cls(line)
|
||||
|
||||
vcs_match = VCS_REGEX.match(line)
|
||||
vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
|
||||
uri_match = URI_REGEX.match(line)
|
||||
local_match = LOCAL_REGEX.match(line)
|
||||
|
||||
@@ -205,6 +214,7 @@ class Requirement(object):
|
||||
def parse(cls, line):
|
||||
"""
|
||||
Parses a Requirement from a line of a requirement file.
|
||||
This is the main entry point for parsing a single requirements line (not parse_line!)
|
||||
|
||||
:param line: a line of a requirement file
|
||||
:returns: a Requirement instance for the given line
|
||||
@@ -217,7 +227,7 @@ class Requirement(object):
|
||||
return cls.parse_editable(
|
||||
re.sub(r'^(-e|--editable=?)\s*', '', line))
|
||||
elif '@' in line and ('#' not in line or line.index('#') > line.index('@')):
|
||||
# Allegro bug fix: support 'name @ git+' entries
|
||||
# ClearML bug fix: support 'name @ git+' entries
|
||||
name, uri = line.split('@', 1)
|
||||
name = name.strip()
|
||||
uri = uri.strip()
|
||||
@@ -226,7 +236,7 @@ class Requirement(object):
|
||||
# check if the name is valid & parsed
|
||||
Req.parse(name)
|
||||
# if we are here, name is a valid package name, check if the vcs part is valid
|
||||
if VCS_REGEX.match(uri):
|
||||
if VCS_REGEX.match(uri) or VCS_EXT_REGEX.match(uri):
|
||||
req = cls.parse_line(uri)
|
||||
req.name = name
|
||||
return req
|
||||
15
clearml_agent/glue/daemon.py
Normal file
15
clearml_agent/glue/daemon.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from threading import Thread
|
||||
from clearml_agent.session import Session
|
||||
|
||||
|
||||
class K8sDaemon(Thread):
    """Daemon thread bound to an agent; subclasses provide the work in ``target``"""

    def __init__(self, agent):
        """
        :param agent: object exposing ``log`` and ``_session``; both are copied onto the thread
        """
        super().__init__(target=self.target)
        self._agent = agent
        self._session: Session = agent._session
        self.log = agent.log
        # run as a daemon thread
        self.daemon = True

    def target(self):
        """Thread body; does nothing in this base class"""
        return None
|
||||
11
clearml_agent/glue/definitions.py
Normal file
11
clearml_agent/glue/definitions.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from clearml_agent.helper.environment import EnvEntry
|
||||
|
||||
ENV_START_AGENT_SCRIPT_PATH = EnvEntry(
    "CLEARML_K8S_GLUE_START_AGENT_SCRIPT_PATH",
    default="~/__start_agent__.sh",
)
"""
Script path to use when creating the bash script to run the agent inside the scheduled pod's docker container.
Script will be appended to the specified file.
"""

# The entries below each bind an environment variable name to a default value.
# NOTE(review): their exact meaning is inferred from the names only - confirm against the k8s glue code.
ENV_DEFAULT_EXECUTION_AGENT_ARGS = EnvEntry(
    "K8S_GLUE_DEF_EXEC_AGENT_ARGS",
    default="--full-monitoring --require-queue",
)
ENV_POD_AGENT_INSTALL_ARGS = EnvEntry(
    "K8S_GLUE_POD_AGENT_INSTALL_ARGS",
    default="",
    lstrip=False,
)
ENV_POD_MONITOR_LOG_BATCH_SIZE = EnvEntry(
    "K8S_GLUE_POD_MONITOR_LOG_BATCH_SIZE",
    default=5,
    converter=int,
)
|
||||
12
clearml_agent/glue/errors.py
Normal file
12
clearml_agent/glue/errors.py
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
class GetPodsError(Exception):
    """Signals that retrieving pods failed"""
|
||||
|
||||
|
||||
class GetJobsError(Exception):
    """Error type for job retrieval failures (usage is not visible in this module)"""
|
||||
|
||||
|
||||
class GetPodCountError(Exception):
    """Error type for pod count failures (usage is not visible in this module)"""
|
||||
|
||||
1140
clearml_agent/glue/k8s.py
Normal file
1140
clearml_agent/glue/k8s.py
Normal file
File diff suppressed because it is too large
Load Diff
236
clearml_agent/glue/pending_pods_daemon.py
Normal file
236
clearml_agent/glue/pending_pods_daemon.py
Normal file
@@ -0,0 +1,236 @@
|
||||
from time import sleep
|
||||
from typing import Dict, Tuple, Optional, List
|
||||
|
||||
from clearml_agent.backend_api.session import Request
|
||||
from clearml_agent.glue.utilities import get_bash_output
|
||||
|
||||
from clearml_agent.helper.process import stringify_bash_output
|
||||
|
||||
from .daemon import K8sDaemon
|
||||
from .utilities import get_path
|
||||
from .errors import GetPodsError
|
||||
|
||||
|
||||
class PendingPodsDaemon(K8sDaemon):
|
||||
def __init__(self, polling_interval: float, agent):
    """
    :param polling_interval: value passed to ``sleep`` between two polling passes
    :param agent: agent handed to the K8sDaemon base class
    """
    super(PendingPodsDaemon, self).__init__(agent=agent)
    # last msg updated for every task
    self._last_tasks_msgs = dict()
    self._polling_interval = polling_interval
|
||||
|
||||
def get_pods(self, pod_name=None):
    """Query the agent for pending pods

    :param pod_name: when given, restrict the query to the pod with this name
    :return: whatever the agent query returns (pods of active jobs in jobs mode,
        plain pods otherwise)
    """
    name_filter = [f"metadata.name={pod_name}"] if pod_name else []
    pending_filters = ["status.phase=Pending"] + name_filter
    debug_template = "Detecting pending pods: {cmd}"

    agent = self._agent
    if not agent.using_jobs:
        return agent.get_pods(filters=pending_filters, debug_msg=debug_template)

    return agent.get_pods_for_jobs(
        job_condition="status.active=1", pod_filters=pending_filters, debug_msg=debug_template
    )
|
||||
|
||||
def _get_pod_name(self, pod: dict):
    """Return the value found at ``metadata.name`` of the pod description"""
    name_path = ("metadata", "name")
    return get_path(pod, *name_path)
|
||||
|
||||
def _get_k8s_resource_name(self, pod: dict):
    """Return the name of the k8s resource behind the pod

    In jobs mode this is the ``job-name`` label of the pod, otherwise the pod name.
    """
    if self._agent.using_jobs:
        name_path = ("metadata", "labels", "job-name")
    else:
        name_path = ("metadata", "name")
    return get_path(pod, *name_path)
|
||||
|
||||
def _get_task_id(self, pod: dict):
    """Return the task id encoded in the k8s resource name

    The task id is the part of the resource name after its last '-'.

    :param pod: pod description
    :return: task id, or None when the resource name is missing, so that a single
        malformed pod is skipped by the caller instead of raising
    """
    resource_name = self._get_k8s_resource_name(pod)
    if not resource_name:
        return None
    return resource_name.rpartition('-')[-1]
|
||||
|
||||
@staticmethod
def _get_k8s_resource_namespace(pod: dict):
    """Return ``metadata.namespace`` of the pod description, or None when absent"""
    metadata = pod.get('metadata', {})
    return metadata.get('namespace', None)
|
||||
|
||||
def target(self):
    """
    Handle pending objects (pods or jobs, depending on the agent mode).
    - Delete any pending objects that are not expected to recover
    - Delete any pending objects for whom the associated task was aborted

    Runs forever: each pass lists the pending pods, updates the pending message of
    their tasks, then sleeps for the polling interval. Errors of a pass are logged
    (or ignored for GetPodsError) and the loop continues.
    """
    while True:
        # noinspection PyBroadException
        try:
            # Get pods (standalone pods if we're in pods mode, or pods associated to jobs if we're in jobs mode)
            pods = self.get_pods()
            if pods is None:
                raise GetPodsError()

            task_id_to_pod = dict()

            for pod in pods:
                pod_name = self._get_pod_name(pod)
                if not pod_name:
                    continue

                task_id = self._get_task_id(pod)
                if not task_id:
                    continue

                namespace = self._get_k8s_resource_namespace(pod)
                if not namespace:
                    continue

                task_id_to_pod[task_id] = pod

                msg = None
                tags = []

                waiting = get_path(pod, 'status', 'containerStatuses', 0, 'state', 'waiting')
                if not waiting:
                    # no waiting container state, fall back to the first pod condition
                    condition = get_path(pod, 'status', 'conditions', 0)
                    if condition:
                        reason = condition.get('reason')
                        if reason == 'Unschedulable':
                            message = condition.get('message')
                            msg = reason + (" ({})".format(message) if message else "")
                else:
                    reason = waiting.get("reason", None)
                    message = waiting.get("message", None)

                    # the reason may be absent; do not let a missing reason abort
                    # the handling of all remaining pods with a TypeError
                    if reason:
                        msg = reason + (" ({})".format(message) if message else "")
                    elif message:
                        msg = message

                    if reason == 'ImagePullBackOff':
                        # not expected to recover: delete the resource and fail the task
                        self.delete_k8s_resource(k8s_resource=pod, msg=reason)
                        try:
                            self._session.api_client.tasks.failed(
                                task=task_id,
                                status_reason="K8S glue error: {}".format(msg),
                                status_message="Changed by K8S glue",
                                force=True
                            )
                            self._agent.send_logs(
                                task_id, ["K8S Error: {}".format(msg)],
                                session=self._session
                            )
                        except Exception as ex:
                            self.log.warning(
                                'K8S Glue pending monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
                            )

                        # clean up any msg for this task
                        self._last_tasks_msgs.pop(task_id, None)
                        continue

                self._update_pending_task_msg(task_id, msg, tags)

            if task_id_to_pod:
                self._process_tasks_for_pending_pods(task_id_to_pod)

            # clean up any last message for a task that wasn't seen as a pod
            self._last_tasks_msgs = {k: v for k, v in self._last_tasks_msgs.items() if k in task_id_to_pod}
        except GetPodsError:
            # listing pods failed, try again on the next pass
            pass
        except Exception:
            self.log.exception("Hanging pods daemon loop")

        sleep(self._polling_interval)
|
||||
|
||||
def delete_k8s_resource(self, k8s_resource: dict, msg: str = None):
    """
    Delete a k8s resource by running ``kubectl delete`` for it.

    :param k8s_resource: resource description; its name and namespace are resolved
        through ``_get_k8s_resource_name`` / ``_get_k8s_resource_namespace``
    :param msg: optional text added to the debug log line
    :return: stripped output of the kubectl command
    """
    resource_kind = self._agent.kind
    command_template = "kubectl delete {kind} {name} -n {namespace} --output name"
    delete_cmd = command_template.format(
        kind=resource_kind,
        name=self._get_k8s_resource_name(k8s_resource),
        namespace=self._get_k8s_resource_namespace(k8s_resource),
    ).strip()

    log_suffix = (" " + msg) if msg else ""
    self.log.debug(" - deleting {} {}: {}".format(self._agent.kind, log_suffix, delete_cmd))

    command_output = get_bash_output(delete_cmd)
    return command_output.strip()
|
||||
|
||||
def _process_tasks_for_pending_pods(self, task_id_to_details: Dict[str, dict]):
|
||||
self._handle_aborted_tasks(task_id_to_details)
|
||||
|
||||
def _handle_aborted_tasks(self, pending_tasks_details: Dict[str, dict]):
|
||||
try:
|
||||
result = self._session.get(
|
||||
service='tasks',
|
||||
action='get_all',
|
||||
json={
|
||||
"id": list(pending_tasks_details),
|
||||
"status": ["stopped"],
|
||||
"only_fields": ["id"]
|
||||
}
|
||||
)
|
||||
aborted_task_ids = list(filter(None, (task.get("id") for task in result["tasks"])))
|
||||
|
||||
for task_id in aborted_task_ids:
|
||||
pod = pending_tasks_details.get(task_id)
|
||||
if not pod:
|
||||
self.log.error("Failed locating aborted task {} in pending pods list".format(task_id))
|
||||
continue
|
||||
|
||||
pod_name = self._get_pod_name(pod)
|
||||
if not self.get_pods(pod_name=pod_name):
|
||||
self.log.debug("K8S Glue pending monitor: pod {} is no longer pending, skipping".format(pod_name))
|
||||
continue
|
||||
|
||||
resource_name = self._get_k8s_resource_name(pod)
|
||||
self.log.info(
|
||||
"K8S Glue pending monitor: task {} was aborted but the k8s resource {} is still pending, "
|
||||
"deleting pod".format(task_id, resource_name)
|
||||
)
|
||||
|
||||
result = self._session.get(
|
||||
service='tasks',
|
||||
action='get_all',
|
||||
json={"id": [task_id], "status": ["stopped"], "only_fields": ["id"]},
|
||||
)
|
||||
if not result["tasks"]:
|
||||
self.log.debug("K8S Glue pending monitor: task {} is no longer aborted, skipping".format(task_id))
|
||||
continue
|
||||
|
||||
output = self.delete_k8s_resource(k8s_resource=pod, msg="Pending resource of an aborted task")
|
||||
if not output:
|
||||
self.log.warning("K8S Glue pending monitor: failed deleting resource {}".format(resource_name))
|
||||
except Exception as ex:
|
||||
self.log.warning(
|
||||
'K8S Glue pending monitor: failed checking aborted tasks for pending resources: {}'.format(ex)
|
||||
)
|
||||
|
||||
def _update_pending_task_msg(self, task_id: str, msg: str, tags: List[str] = None):
    """
    Publish a pending-pod status message (and optional tags) on a task.

    Nothing is sent when ``msg`` is empty or when the same ``(msg, tags)`` pair was
    already reported successfully for this task. If the task is found "in_progress"
    it is first force-enqueued into the agent's k8s pending queue. Failures are
    logged as warnings and not propagated.

    :param task_id: id of the task to update
    :param msg: status text; sent as "K8S glue status: <msg>"
    :param tags: optional tags to set on the task together with the message
    """
    if not msg or self._last_tasks_msgs.get(task_id, None) == (msg, tags):
        return
    try:
        # Make sure the task is queued
        result = self._session.send_request(
            service='tasks',
            action='get_all',
            json={"id": task_id, "only_fields": ["status"]},
            method=Request.def_method,
            async_enable=False,
        )
        if result.ok:
            status = get_path(result.json(), 'data', 'tasks', 0, 'status')
            # if task is in progress, change its status to enqueued
            if status == "in_progress":
                result = self._session.send_request(
                    service='tasks', action='enqueue',
                    json={
                        "task": task_id, "force": True, "queue": self._agent.k8s_pending_queue_id
                    },
                    method=Request.def_method,
                    async_enable=False,
                )
                if not result.ok:
                    result_msg = get_path(result.json(), 'meta', 'result_msg')
                    self.log.debug(
                        "K8S Glue pods monitor: failed forcing task status change"
                        " for pending task {}: {}".format(task_id, result_msg)
                    )

        # Update task status message
        payload = {"task": task_id, "status_message": "K8S glue status: {}".format(msg)}
        if tags:
            payload["tags"] = tags
        result = self._session.send_request('tasks', 'update', json=payload, method=Request.def_method)
        if not result.ok:
            result_msg = get_path(result.json(), 'meta', 'result_msg')
            raise Exception(result_msg or result.text)

        # Remember what was reported, in the same (msg, tags) shape the guard above
        # compares against; storing only `msg` made the guard never match, so the
        # update was re-sent on every polling cycle.
        self._last_tasks_msgs[task_id] = (msg, tags)
    except Exception as ex:
        self.log.warning(
            'K8S Glue pods monitor: Failed setting status message for task "{}"\nMSG: {}\nEX: {}'.format(
                task_id, msg, ex
            )
        )
|
||||
18
clearml_agent/glue/utilities.py
Normal file
18
clearml_agent/glue/utilities.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import functools
|
||||
|
||||
from subprocess import DEVNULL
|
||||
|
||||
from clearml_agent.helper.process import get_bash_output as _get_bash_output
|
||||
|
||||
|
||||
def get_path(d, *path, default=None):
    """
    Walk nested containers and return the value found at ``path``.

    Each element of ``path`` is applied with ``[]`` to the result of the previous
    step, starting from ``d`` (so keys and list indices can be mixed). An empty
    path returns ``d`` itself.

    :param d: root container (dict / list / any subscriptable object)
    :param path: sequence of keys / indices to follow
    :param default: value returned when any step cannot be resolved
    :return: the value at the path, or ``default``
    """
    try:
        return functools.reduce(
            lambda a, b: a[b], path, d
        )
    except (IndexError, KeyError, TypeError):
        # TypeError covers a non-subscriptable intermediate value (e.g. None) or a
        # key of the wrong kind (e.g. a string used on a list)
        return default
|
||||
|
||||
|
||||
def get_bash_output(cmd, stderr=DEVNULL, raise_error=False):
    """
    Thin wrapper around the process helper's ``get_bash_output``.

    :param cmd: command to run
    :param stderr: passed through as ``stderr``; defaults to ``subprocess.DEVNULL``
    :param raise_error: passed through as ``raise_error``; defaults to False
    :return: whatever the wrapped helper returns
    """
    output = _get_bash_output(cmd, stderr=stderr, raise_error=raise_error)
    return output
|
||||
@@ -1,4 +1,4 @@
|
||||
""" TRAINS-AGENT Stdout Helper Functions """
|
||||
""" CLEARML-AGENT Stdout Helper Functions """
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import io
|
||||
@@ -14,29 +14,30 @@ import sys
|
||||
import tempfile
|
||||
from abc import ABCMeta
|
||||
from collections import OrderedDict
|
||||
from distutils.spawn import find_executable
|
||||
from functools import total_ordering
|
||||
from typing import Text, Dict, Any, Optional, AnyStr, IO, Union
|
||||
|
||||
import attr
|
||||
import furl
|
||||
import pyhocon
|
||||
import six
|
||||
import yaml
|
||||
from attr import fields_dict
|
||||
from pathlib2 import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
import six
|
||||
from six.moves import reduce
|
||||
from trains_agent.errors import CommandFailedError
|
||||
from trains_agent.helper.dicts import filter_keys
|
||||
|
||||
from clearml_agent.errors import CommandFailedError
|
||||
from clearml_agent.external import pyhocon
|
||||
from clearml_agent.helper.dicts import filter_keys
|
||||
|
||||
pretty_lines = False
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
use_powershell = os.getenv("CLEARML_AGENT_USE_POWERSHELL", None)
|
||||
|
||||
|
||||
def which(cmd, path=None):
|
||||
from clearml_agent.helper.process import find_executable
|
||||
result = find_executable(cmd, path)
|
||||
if not result:
|
||||
raise ValueError('command "{}" not found'.format(cmd))
|
||||
@@ -53,7 +54,7 @@ def select_for_platform(linux, windows):
|
||||
|
||||
|
||||
def bash_c():
|
||||
return 'bash -c' if not is_windows_platform() else 'cmd /c'
|
||||
return 'bash -c' if not is_windows_platform() else ('powershell -Command' if use_powershell else 'cmd /c')
|
||||
|
||||
|
||||
def return_list(arg):
|
||||
@@ -205,10 +206,13 @@ def get_python_path(script_dir, entry_point, package_api, is_conda_env=False):
|
||||
["-c", "import sys; print('{}'.join(sys.path))".format(python_path_sep)])
|
||||
org_python_path = python_path_cmd.get_output(cwd=script_dir)
|
||||
# Add path of the script directory and executable directory
|
||||
python_path = '{}{python_path_sep}{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(),
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
python_path = '{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(), python_path_sep=python_path_sep)
|
||||
if entry_point:
|
||||
python_path += '{}{python_path_sep}'.format(
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
|
||||
if is_windows_platform():
|
||||
python_path = python_path.replace('/', '\\')
|
||||
|
||||
@@ -380,11 +384,11 @@ AllDumper.add_multi_representer(object, lambda dumper, data: dumper.represent_st
|
||||
|
||||
|
||||
def error(message):
|
||||
print('\ntrains_agent: ERROR: {}\n'.format(message))
|
||||
print('\nclearml_agent: ERROR: {}\n'.format(message))
|
||||
|
||||
|
||||
def warning(message):
|
||||
print('trains_agent: Warning: {}'.format(message))
|
||||
print('clearml_agent: Warning: {}'.format(message))
|
||||
|
||||
|
||||
class TqdmStream(object):
|
||||
@@ -399,12 +403,6 @@ class TqdmStream(object):
|
||||
self.buffer.write('\n')
|
||||
|
||||
|
||||
class TqdmLog(tqdm):
|
||||
|
||||
def __init__(self, iterable=None, file=None, **kwargs):
|
||||
super(TqdmLog, self).__init__(iterable, file=TqdmStream(file or sys.stderr), **kwargs)
|
||||
|
||||
|
||||
def url_join(first, *rest):
|
||||
"""
|
||||
Join url parts similarly to Path.join
|
||||
@@ -422,6 +420,7 @@ def mkstemp(
|
||||
open_kwargs=None, # type: Optional[Dict[Text, Any]]
|
||||
text=True, # type: bool
|
||||
name_only=False, # type: bool
|
||||
mode=None, # type: str
|
||||
*args,
|
||||
**kwargs):
|
||||
# type: (...) -> Union[(IO[AnyStr], Text), Text]
|
||||
@@ -431,12 +430,14 @@ def mkstemp(
|
||||
:param open_kwargs: keyword arguments for ``io.open``
|
||||
:param text: open in text mode
|
||||
:param name_only: close the file and return its name
|
||||
:param mode: open file mode
|
||||
:param args: tempfile.mkstemp args
|
||||
:param kwargs: tempfile.mkstemp kwargs
|
||||
"""
|
||||
fd, name = tempfile.mkstemp(text=text, *args, **kwargs)
|
||||
mode = 'w+'
|
||||
if not text:
|
||||
if not mode:
|
||||
mode = 'w+'
|
||||
if not text and 'b' not in mode:
|
||||
mode += 'b'
|
||||
if name_only:
|
||||
os.close(fd)
|
||||
@@ -510,6 +511,38 @@ def is_conda(config):
|
||||
return config['agent.package_manager.type'].lower() == 'conda'
|
||||
|
||||
|
||||
def convert_cuda_version_to_float_single_digit_str(cuda_version):
    """
    Normalize a cuda version (string/float/int) to a "<major>.<minor>" string, e.g. "11.4".

    Dotted input ("11.4", "10.2.89") uses the first two components. A bare integer
    below 60 is read as a major version ("11" -> "11.0"); a bare integer of 60 or
    more is read as the base-10 form (112 -> "11.2"). Empty/None yields "0.0".

    :return str: version formatted with exactly one decimal digit
    """
    components = [int(token) for token in str(cuda_version or 0).split('.')]
    major = components[0]
    has_minor = len(components) > 1

    if not has_minor and major >= 60:
        # already in base-10 integer form (e.g. 112)
        scaled = major
    else:
        scaled = 10 * major
        if has_minor:
            scaled += float(".{:d}".format(components[1])) * 10

    return "{:.1f}".format(float(scaled) / 10.)
|
||||
|
||||
|
||||
def convert_cuda_version_to_int_10_base_str(cuda_version):
    """
    Convert a cuda version (string/float/int) to its base-10 integer form as a string,
    e.g. "112" for cuda 11.2.

    The input is first normalized with convert_cuda_version_to_float_single_digit_str.

    :return str:
    """
    single_digit_version = convert_cuda_version_to_float_single_digit_str(cuda_version)
    scaled = float(single_digit_version) * 10
    return str(int(scaled))
|
||||
|
||||
|
||||
class NonStrictAttrs(object):
|
||||
|
||||
@classmethod
|
||||
@@ -21,14 +21,14 @@ def start_check_update_daemon():
|
||||
|
||||
def _check_new_version_available():
|
||||
cur_version = __version__
|
||||
update_server_releases = requests.get('https://updates.trains.allegro.ai/updates',
|
||||
data=json.dumps({"versions": {"trains-agent": str(cur_version)}}),
|
||||
update_server_releases = requests.get('https://updates.clear.ml/updates',
|
||||
data=json.dumps({"versions": {"clearml-agent": str(cur_version)}}),
|
||||
timeout=3.0)
|
||||
if update_server_releases.ok:
|
||||
update_server_releases = update_server_releases.json()
|
||||
else:
|
||||
return None
|
||||
trains_answer = update_server_releases.get("trains-agent", {})
|
||||
trains_answer = update_server_releases.get("clearml-agent", {})
|
||||
latest_version = trains_answer.get("version")
|
||||
cur_version = cur_version
|
||||
latest_version = latest_version or ''
|
||||
@@ -48,7 +48,7 @@ def _check_update_daemon():
|
||||
if latest_version:
|
||||
if latest_version[1]:
|
||||
sep = os.linesep
|
||||
print('TRAINS-AGENT new package available: UPGRADE to v{} is recommended!\nRelease Notes:\n{}'.format(
|
||||
print('CLEARML-AGENT new package available: UPGRADE to v{} is recommended!\nRelease Notes:\n{}'.format(
|
||||
latest_version[0], sep.join(latest_version[2])))
|
||||
else:
|
||||
print('TRAINS-SERVER new version available: upgrade to v{} is recommended!'.format(
|
||||
@@ -2,14 +2,14 @@ from __future__ import unicode_literals, print_function
|
||||
|
||||
import csv
|
||||
import sys
|
||||
from collections import Iterable
|
||||
from collections.abc import Iterable
|
||||
from typing import List, Dict, Text, Any
|
||||
|
||||
from attr import attrs, attrib
|
||||
|
||||
import six
|
||||
from six import binary_type, text_type
|
||||
from trains_agent.helper.base import nonstrict_in_place_sort
|
||||
from clearml_agent.helper.base import nonstrict_in_place_sort
|
||||
|
||||
|
||||
def print_text(text, newline=True):
|
||||
23
clearml_agent/helper/dicts.py
Normal file
23
clearml_agent/helper/dicts.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
|
||||
_not_set = object()
|
||||
|
||||
|
||||
def filter_keys(filter_, dct):  # type: (Callable[[Any], bool], Dict) -> Dict
    """ Return a new dict holding only the items of dct whose key passes filter_ """
    selected = {}
    for name, item in dct.items():
        if filter_(name):
            selected[name] = item
    return selected
|
||||
|
||||
|
||||
def merge_dicts(dict1, dict2, custom_merge_func=None):
    # type: (Any, Any, Optional[Callable[[str, Any, Any, Any], Any]]) -> Any
    """
    Recursively merges dict2 into dict1

    dict1 is modified in place and returned. If either argument is not a dict,
    dict2 is returned as is (i.e. dict2 wins on conflicts).

    :param dict1: target dictionary
    :param dict2: dictionary merged into dict1
    :param custom_merge_func: optional callable invoked for keys present in both
        dicts as ``custom_merge_func(key, value1, value2, not_set)``. Returning the
        ``not_set`` object it received falls back to the default recursive merge;
        any other return value is stored as the merged value.
    """
    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
        return dict2
    for k in dict2:
        if k not in dict1:
            dict1[k] = dict2[k]
            continue
        if custom_merge_func:
            res = custom_merge_func(k, dict1[k], dict2[k], _not_set)
            if res is not _not_set:
                dict1[k] = res
                continue
        # No custom result: merge recursively. The result used to be pre-set to None
        # (not the sentinel), which overwrote every shared key with None whenever no
        # custom_merge_func was supplied.
        dict1[k] = merge_dicts(dict1[k], dict2[k], custom_merge_func)
    return dict1
|
||||
169
clearml_agent/helper/docker_args.py
Normal file
169
clearml_agent/helper/docker_args.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import re
|
||||
import shlex
|
||||
from typing import Tuple, List, TYPE_CHECKING
|
||||
from urllib.parse import urlunparse, urlparse
|
||||
|
||||
from clearml_agent.definitions import (
|
||||
ENV_AGENT_GIT_PASS,
|
||||
ENV_AGENT_SECRET_KEY,
|
||||
ENV_AWS_SECRET_KEY,
|
||||
ENV_AZURE_ACCOUNT_KEY,
|
||||
ENV_AGENT_AUTH_TOKEN,
|
||||
ENV_DOCKER_IMAGE,
|
||||
ENV_DOCKER_ARGS_HIDE_ENV,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from clearml_agent.session import Session
|
||||
|
||||
|
||||
def sanitize_urls(s: str) -> Tuple[str, bool]:
    """
    Replaces passwords in URLs with asterisks.
    Returns the sanitized string and a boolean indicating whether sanitation was performed.

    The string is split on whitespace and every token containing "@" is parsed as a
    URL. Only a ``user:password@host`` network location is rewritten (to
    ``user:********@host``); tokens without a password, such as ``host:8080`` or
    ``user@host:22``, are left untouched. When something was masked the tokens are
    re-joined with single spaces; otherwise the original string is returned as is.
    """
    tokens = re.split(r"\s", s)
    changed = False
    for k, token in enumerate(tokens):
        if "@" not in token:
            continue
        try:
            res = urlparse(token)
        except ValueError:
            # not a parsable URL (e.g. malformed IPv6 literal) - leave the token untouched
            continue
        # Split on the LAST "@" so a password that itself contains "@" is fully masked,
        # and require an actual "user:password" part so a port number is never masked.
        userinfo, at, host = res.netloc.rpartition("@")
        user, colon, password = userinfo.partition(":")
        if not (at and colon and password):
            continue
        changed = True
        tokens[k] = urlunparse((
            res.scheme,
            "{}:********@{}".format(user, host),
            res.path,
            res.params,
            res.query,
            res.fragment
        ))
    return " ".join(tokens) if changed else s, changed
|
||||
|
||||
|
||||
class DockerArgsSanitizer:
    """Helpers for masking secrets in docker commands and for filtering / merging docker argument lists."""

    @classmethod
    def sanitize_docker_command(cls, session, docker_command):
        # type: (Session, List[str]) -> List[str]
        """
        Return a copy of docker_command with sensitive values masked.

        Masking is applied only when enabled through the
        'agent.hide_docker_command_env_vars.enabled' configuration entry or the
        ENV_DOCKER_ARGS_HIDE_ENV environment entry; otherwise the input list is
        returned unchanged. For every ``-e`` / ``--env`` ``KEY=VALUE`` pair the value is
        replaced with asterisks if KEY is one of the protected keys; a value of a key
        listed in ENV_DOCKER_IMAGE is treated as a full docker command and sanitized
        recursively. Unless 'parse_embedded_urls' is disabled, passwords embedded in
        URLs are masked in other values and in non-switch arguments.

        :param session: session whose ``config`` provides the settings
        :param docker_command: list of docker command tokens
        :return: sanitized list (the original object when nothing needs to be done)
        """
        if not docker_command:
            return docker_command

        enabled = (
            session.config.get('agent.hide_docker_command_env_vars.enabled', False) or ENV_DOCKER_ARGS_HIDE_ENV.get()
        )
        if not enabled:
            return docker_command

        keys = set(session.config.get('agent.hide_docker_command_env_vars.extra_keys', []))
        if ENV_DOCKER_ARGS_HIDE_ENV.get():
            keys.update(shlex.split(ENV_DOCKER_ARGS_HIDE_ENV.get().strip()))
        keys.update(
            ENV_AGENT_GIT_PASS.vars,
            ENV_AGENT_SECRET_KEY.vars,
            ENV_AWS_SECRET_KEY.vars,
            ENV_AZURE_ACCOUNT_KEY.vars,
            ENV_AGENT_AUTH_TOKEN.vars,
        )

        parse_embedded_urls = bool(session.config.get(
            'agent.hide_docker_command_env_vars.parse_embedded_urls', True
        ))

        skip_next = False
        result = docker_command[:]
        for i, item in enumerate(docker_command):
            if skip_next:
                skip_next = False
                continue
            try:
                if item in ("-e", "--env"):
                    if i + 1 >= len(result):
                        # dangling switch at the end of the command - there is no value
                        # to sanitize (indexing it used to raise an uncaught IndexError)
                        continue
                    key, sep, val = result[i + 1].partition("=")
                    if not sep:
                        continue
                    if key in ENV_DOCKER_IMAGE.vars:
                        # special case - this contains a complete docker command
                        val = " ".join(cls.sanitize_docker_command(session, re.split(r"\s", val)))
                    elif key in keys:
                        val = "********"
                    elif parse_embedded_urls:
                        val = sanitize_urls(val)[0]
                    result[i + 1] = "{}={}".format(key, val)
                    skip_next = True
                elif parse_embedded_urls and not item.startswith("-"):
                    item, changed = sanitize_urls(item)
                    if changed:
                        result[i] = item
            except (KeyError, TypeError):
                pass

        return result

    @staticmethod
    def get_list_of_switches(docker_args: List[str]) -> List[str]:
        """
        Return the switch names used in docker_args, without leading dashes and
        without any ``=value`` suffix (e.g. ``--network=host`` -> ``network``).
        """
        args = []
        for token in docker_args:
            if token.strip().startswith("-"):
                args += [token.strip().split("=")[0].lstrip("-")]

        return args

    @staticmethod
    def filter_switches(docker_args: List[str], exclude_switches: List[str]) -> List[str]:
        """
        Return docker_args without the switches named in exclude_switches.

        An excluded switch is dropped together with the non-switch tokens that
        follow it (its arguments). Names in exclude_switches carry no leading dashes.
        If no excluded switch can possibly be present, docker_args is returned as is.
        """
        # shortcut if we are sure we have no matches
        if not exclude_switches:
            return docker_args
        joined_args = " ".join(docker_args)
        if not any("-{}".format(s) in joined_args for s in exclude_switches):
            return docker_args

        args = []
        in_switch_args = True
        for token in docker_args:
            # compare on the stripped token, the same way get_list_of_switches
            # extracts switch names, so padded tokens are matched as well
            stripped = token.strip()
            if stripped.startswith("-"):
                if "=" in token:
                    switch = stripped.split("=")[0]
                    in_switch_args = False
                else:
                    switch = stripped
                    in_switch_args = True

                if switch.lstrip("-") in exclude_switches:
                    # if in excluded, skip the switch and following arguments
                    in_switch_args = False
                else:
                    args += [token]

            elif in_switch_args:
                args += [token]
            else:
                # this is the switch arguments we need to skip
                pass

        return args

    @staticmethod
    def merge_docker_args(config, task_docker_arguments: List[str], extra_docker_arguments: List[str]) -> List[str]:
        """
        Merge the task's docker arguments with the extra docker arguments.

        Switches listed in 'agent.protected_docker_extra_args' (default: privileged,
        security-opt, network, ipc) are taken from one side only. With
        'agent.docker_args_extra_precedes_task' enabled (the default) the extra
        arguments come first and win over the task's; otherwise the task's arguments
        come first and win over the extra ones.

        :param config: configuration object providing ``get(key, default)``
        :param task_docker_arguments: docker arguments requested by the task
        :param extra_docker_arguments: extra docker arguments
        :return: merged list of docker arguments
        """
        base_cmd = []
        # currently only resolving --network, --ipc, --privileged
        override_switches = config.get(
            "agent.protected_docker_extra_args",
            ["privileged", "security-opt", "network", "ipc"]
        )

        if config.get("agent.docker_args_extra_precedes_task", True):
            switches = []
            if extra_docker_arguments:
                switches = DockerArgsSanitizer.get_list_of_switches(extra_docker_arguments)
                switches = list(set(switches) & set(override_switches))
                base_cmd += [str(a) for a in extra_docker_arguments if a]
            if task_docker_arguments:
                docker_arguments = DockerArgsSanitizer.filter_switches(task_docker_arguments, switches)
                base_cmd += [a for a in docker_arguments if a]
        else:
            switches = []
            if task_docker_arguments:
                switches = DockerArgsSanitizer.get_list_of_switches(task_docker_arguments)
                switches = list(set(switches) & set(override_switches))
                base_cmd += [a for a in task_docker_arguments if a]
            if extra_docker_arguments:
                extra_docker_arguments = DockerArgsSanitizer.filter_switches(extra_docker_arguments, switches)
                base_cmd += [a for a in extra_docker_arguments if a]
        return base_cmd
|
||||
8
clearml_agent/helper/environment/__init__.py
Normal file
8
clearml_agent/helper/environment/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from .entry import Entry, NotSet
|
||||
from .environment import EnvEntry
|
||||
|
||||
__all__ = [
|
||||
'Entry',
|
||||
'NotSet',
|
||||
'EnvEntry',
|
||||
]
|
||||
@@ -1,5 +1,4 @@
|
||||
import base64
|
||||
from distutils.util import strtobool
|
||||
from typing import Union, Optional, Any, TypeVar, Callable, Tuple
|
||||
|
||||
import six
|
||||
@@ -19,11 +18,27 @@ def base64_to_text(value):
|
||||
return base64.b64decode(value).decode("utf-8")
|
||||
|
||||
|
||||
def text_to_int(value, default=0):
    # type: (Any, int) -> int
    """ Convert value with int(); return default when int() raises ValueError or TypeError """
    try:
        converted = int(value)
    except (ValueError, TypeError):
        return default
    return converted
|
||||
|
||||
|
||||
def text_to_bool(value):
    # type: (Text) -> bool
    """ Parse a textual truth value with strtobool and return it as a bool """
    truth_flag = strtobool(value)
    return bool(truth_flag)
|
||||
|
||||
|
||||
def safe_text_to_bool(value):
    # type: (Text) -> bool
    """ Like text_to_bool, but falls back to bool(value) when text_to_bool raises ValueError """
    try:
        parsed = text_to_bool(value)
    except ValueError:
        parsed = bool(value)
    return parsed
|
||||
|
||||
|
||||
def any_to_bool(value):
|
||||
# type: (Optional[Union[int, float, Text]]) -> bool
|
||||
if isinstance(value, six.text_type):
|
||||
@@ -31,6 +46,7 @@ def any_to_bool(value):
|
||||
return bool(value)
|
||||
|
||||
|
||||
# noinspection PyIncorrectDocstring
|
||||
def or_(*converters, **kwargs):
|
||||
# type: (ConverterType, Tuple[Exception, ...]) -> ConverterType
|
||||
"""
|
||||
@@ -51,3 +67,20 @@ def or_(*converters, **kwargs):
|
||||
return value
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def strtobool(val):
    """Convert a string representation of truth to true (1) or false (0).

    The comparison is case-insensitive. Accepted true strings: 'y', 'yes',
    't', 'true', 'on', '1'. Accepted false strings: 'n', 'no', 'f', 'false',
    'off', '0'. Any other string raises ValueError.
    """
    normalized = val.lower()
    truthy = ('y', 'yes', 't', 'true', 'on', '1')
    falsy = ('n', 'no', 'f', 'false', 'off', '0')
    if normalized in truthy:
        return 1
    if normalized in falsy:
        return 0
    raise ValueError("invalid truth value %r" % (normalized,))
|
||||
|
||||
@@ -23,17 +23,38 @@ class Entry(object):
|
||||
Configuration entry definition
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def default_conversions(cls):
|
||||
def default_conversions(self):
|
||||
# type: () -> Dict[Any, Converter]
|
||||
|
||||
if self.lstrip and self.rstrip:
|
||||
|
||||
def str_convert(s):
|
||||
return six.text_type(s).strip()
|
||||
|
||||
elif self.lstrip:
|
||||
|
||||
def str_convert(s):
|
||||
return six.text_type(s).lstrip()
|
||||
|
||||
elif self.rstrip:
|
||||
|
||||
def str_convert(s):
|
||||
return six.text_type(s).rstrip()
|
||||
|
||||
else:
|
||||
|
||||
def str_convert(s):
|
||||
return six.text_type(s)
|
||||
|
||||
return {
|
||||
bool: any_to_bool,
|
||||
six.text_type: lambda s: six.text_type(s).strip(),
|
||||
bool: lambda x: any_to_bool(x.strip()),
|
||||
six.text_type: str_convert,
|
||||
}
|
||||
|
||||
def __init__(self, key, *more_keys, **kwargs):
|
||||
# type: (Text, Text, Any) -> None
|
||||
"""
|
||||
:rtype: object
|
||||
:param key: Entry's key (at least one).
|
||||
:param more_keys: More alternate keys for this entry.
|
||||
:param type: Value type. If provided, will be used choosing a default conversion or
|
||||
@@ -49,6 +70,8 @@ class Entry(object):
|
||||
self.converter = kwargs.pop("converter", None)
|
||||
self.default = kwargs.pop("default", None)
|
||||
self.help = kwargs.pop("help", None)
|
||||
self.lstrip = kwargs.pop("lstrip", True)
|
||||
self.rstrip = kwargs.pop("rstrip", True)
|
||||
|
||||
def __str__(self):
|
||||
return str(self.key)
|
||||
@@ -64,8 +87,8 @@ class Entry(object):
|
||||
converter = self.default_conversions().get(self.type, self.type)
|
||||
return converter(value)
|
||||
|
||||
def get_pair(self, default=NotSet, converter=None):
|
||||
# type: (Any, Converter) -> Optional[Tuple[Text, Any]]
|
||||
def get_pair(self, default=NotSet, converter=None, value_cb=None):
|
||||
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]]
|
||||
for key in self.keys:
|
||||
value = self._get(key)
|
||||
if value is NotSet:
|
||||
@@ -75,18 +98,26 @@ class Entry(object):
|
||||
except Exception as ex:
|
||||
self.error("invalid value {key}={value}: {ex}".format(**locals()))
|
||||
break
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if value_cb:
|
||||
value_cb(key, value)
|
||||
except Exception:
|
||||
pass
|
||||
return key, value
|
||||
|
||||
result = self.default if default is NotSet else default
|
||||
return self.key, result
|
||||
|
||||
def get(self, default=NotSet, converter=None):
|
||||
# type: (Any, Converter) -> Optional[Any]
|
||||
return self.get_pair(default=default, converter=converter)[1]
|
||||
def get(self, default=NotSet, converter=None, value_cb=None):
|
||||
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any]
|
||||
return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1]
|
||||
|
||||
def set(self, value):
|
||||
# type: (Any, Any) -> (Text, Any)
|
||||
key, _ = self.get_pair(default=None, converter=None)
|
||||
self._set(key, str(value))
|
||||
# key, _ = self.get_pair(default=None, converter=None)
|
||||
for k in self.keys:
|
||||
self._set(k, str(value))
|
||||
|
||||
def _set(self, key, value):
|
||||
# type: (Text, Text) -> None
|
||||
28
clearml_agent/helper/environment/environment.py
Normal file
28
clearml_agent/helper/environment/environment.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from os import getenv, environ
|
||||
|
||||
from .converters import text_to_bool
|
||||
from .entry import Entry, NotSet
|
||||
|
||||
|
||||
class EnvEntry(Entry):
    """Configuration entry whose value is read from / written to the process environment."""

    def default_conversions(self):
        """Return the base conversions with the bool converter replaced by a text_to_bool based one."""
        def _env_text_to_bool(x):
            return text_to_bool(x.strip())

        conversions = super(EnvEntry, self).default_conversions().copy()
        conversions[bool] = _env_text_to_bool
        return conversions

    def pop(self):
        """Remove all of this entry's keys from the environment (missing keys are ignored)."""
        for env_name in self.keys:
            environ.pop(env_name, None)

    def _get(self, key):
        """Return the environment value for key, or NotSet when it is missing or empty."""
        return getenv(key, "") or NotSet

    def _set(self, key, value):
        """Store value in the environment under key."""
        environ[key] = value

    def __str__(self):
        base_repr = super(EnvEntry, self).__str__()
        return "env:{}".format(base_repr)

    def error(self, message):
        """Report a configuration problem by printing it."""
        print("Environment configuration: {}".format(message))
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user