Mirror of https://github.com/clearml/clearml-agent (synced 2025-06-26 18:16:15 +00:00)
Compare commits: 468 commits
.gitignore (vendored): 5 lines changed
@@ -11,3 +11,8 @@ build/
 dist/
 *.egg-info
+
+# VSCode
+.vscode
+
+# MirrorD
+.mirrord
README.md: 283 lines changed
@@ -2,45 +2,55 @@
 <img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_agent_logo.png?raw=true" width="250px">

-**ClearML Agent - ML-Ops made easy
-ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
+**ClearML Agent - MLOps/LLMOps made easy
+MLOps/LLMOps scheduler & orchestration solution supporting Linux, macOS and Windows**

-[](https://img.shields.io/github/license/allegroai/trains-agent.svg)
+[](https://img.shields.io/github/license/allegroai/clearml-agent.svg)
 [](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
 [](https://img.shields.io/pypi/v/clearml-agent.svg)
 [](https://pypi.python.org/pypi/clearml-agent/)
 [](https://pypi.org/project/clearml-agent/)
 [](https://artifacthub.io/packages/search?repo=allegroai)

+`🌟 ClearML is open-source - Leave a star to support the project! 🌟`

 </div>

 ---

 ### ClearML-Agent

 #### *Formerly known as Trains Agent*

 * Run jobs (experiments) on any local or cloud based resource
 * Implement optimized resource utilization policies
 * Deploy execution environments with either virtualenv or fully docker containerized with zero effort
 * Launch-and-Forget service containers
-* [Cloud autoscaling](https://allegro.ai/clearml/docs/examples/services/aws_autoscaler/aws_autoscaler/)
-* [Customizable cleanup](https://allegro.ai/clearml/docs/examples/services/cleanup/cleanup_service/)
-* Advanced [pipeline building and execution](https://allegro.ai/clearml/docs/examples/frameworks/pytorch/notebooks/table/tabular_training_pipeline/)
+* [Cloud autoscaling](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler)
+* [Customizable cleanup](https://clear.ml/docs/latest/docs/guides/services/cleanup_service)
+* Advanced [pipeline building and execution](https://clear.ml/docs/latest/docs/guides/frameworks/pytorch/notebooks/table/tabular_training_pipeline)

 It is a zero configuration fire-and-forget execution agent, providing a full ML/DL cluster solution.

 **Full Automation in 5 steps**

-1. ClearML Server [self-hosted](https://github.com/allegroai/trains-server) or [free tier hosting](https://app.community.clear.ml)
-2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine: on-premises / cloud / ...)
-3. Create a [job](https://github.com/allegroai/clearml/docs/clearml-task.md) or Add [ClearML](https://github.com/allegroai/trains) to your code with just 2 lines
-4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
+1. ClearML Server [self-hosted](https://github.com/allegroai/clearml-server)
+   or [free tier hosting](https://app.clear.ml)
+2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine:
+   on-premises / cloud / ...)
+3. Create a [job](https://clear.ml/docs/latest/docs/apps/clearml_task) or
+   add [ClearML](https://github.com/allegroai/clearml) to your code with just 2 lines of code
+4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or
+   automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
 5. :chart_with_downwards_trend: :chart_with_upwards_trend: :eyes: :beer:

 "All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"

-**Try ClearML now** [Self Hosted](https://github.com/allegroai/trains-server) or [Free tier Hosting](https://app.community.clear.ml)
-<a href="https://app.community.clear.ml"><img src="https://raw.githubusercontent.com/allegroai/trains-agent/9f1e86c1ca45c984ee13edc9353c7b10c55d7257/docs/screenshots.gif" width="100%"></a>
+**Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server)
+or [Free tier Hosting](https://app.clear.ml)
+<a href="https://app.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>

 ### Simple, Flexible Experiment Orchestration

 **The ClearML Agent was built to address the DL/ML R&D DevOps needs:**

 * Easily add & remove machines from the cluster
@@ -56,75 +66,92 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
 *epsilon - Because we are :triangular_ruler: and nothing is really zero work

 ### Kubernetes Integration (Optional)

-We think Kubernetes is awesome, but it should be a choice.
-We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
-#### Benefits of integrating existing K8s with ClearML-Agent
-- ClearML-Agent adds the missing scheduling capabilities to K8s
-- Allowing for more flexible automation from code
-- A programmatic interface for easier learning curve (and debugging)
-- Seamless integration with ML/DL experiment manager
-- Web UI for customization, scheduling & prioritization of jobs
-
-**Two K8s integration flavours**
-- Spin ClearML-Agent as a long-lasting service pod
-  - use [clearml-agent](https://hub.docker.com/r/allegroai/trains-agent) docker image
-  - map docker socket into the pod (soon replaced by [podman](https://github.com/containers/podman))
-  - allow the clearml-agent to manage sibling dockers
-  - benefits: full use of the ClearML scheduling, no need to worry about wrong container images / lost pods etc.
-  - downside: Sibling containers
-- Kubernetes Glue, map ClearML jobs directly to K8s jobs
-  - Run the [clearml-k8s glue](https://github.com/allegroai/trains-agent/blob/master/examples/k8s_glue_example.py) on a K8s cpu node
-  - The clearml-k8s glue pulls jobs from the ClearML job execution queue and prepares a K8s job (based on provided yaml template)
-  - Inside the pod itself the clearml-agent will install the job (experiment) environment and spin and monitor the experiment's process
-  - benefits: Kubernetes full view of all running jobs in the system
-  - downside: No real scheduling (k8s scheduler), no docker image verification (post-mortem only)
+We think Kubernetes is awesome, but it is not a must to get started with remote execution agents and cluster management.
+We designed `clearml-agent` so you can run both bare-metal and on top of Kubernetes, in any combination that fits your environment.
+
+You can find the Dockerfiles in the [docker folder](./docker) and the helm Chart in https://github.com/allegroai/clearml-helm-charts
+
+#### Benefits of integrating an existing Kubernetes cluster with ClearML
+
+- ClearML-Agent adds the missing scheduling capabilities to your Kubernetes cluster
+- Users do not need to have direct Kubernetes access!
+- Easy learning curve with UI and CLI requiring no DevOps knowledge from end users
+- Unlike other solutions, ClearML-Agents work in tandem with other customers of your Kubernetes cluster
+- Allows for more flexible automation from code, building pipelines and visibility
+- A programmatic interface for easy CI/CD workflows, enabling GitOps to trigger jobs inside your cluster
+- Seamless integration with the ClearML ML/DL/GenAI experiment manager
+- Web UI for customization, scheduling & prioritization of jobs
+- **Enterprise Features**: RBAC, vault, multi-tenancy, scheduler, quota management, fractional GPU support
+
+**Run the agent in Kubernetes Glue mode and map ClearML jobs directly to K8s jobs** (a launch sketch follows the list below):
+- Use the [ClearML Agent Helm Chart](https://github.com/allegroai/clearml-helm-charts/tree/main/charts/clearml-agent) to spin an agent pod acting as a controller
+- Or run the [clearml-k8s glue](https://github.com/allegroai/clearml-agent/blob/master/examples/k8s_glue_example.py) on
+  a Kubernetes cpu node
+- The clearml-k8s glue pulls jobs from the ClearML job execution queue and prepares a Kubernetes job (based on provided
+  yaml template)
+- Inside each pod the clearml-agent will install the job (experiment) environment and spin and monitor the
+  experiment's process, fully visible in the clearml UI
+- Benefits: Kubernetes full view of all running jobs in the system
+- **Enterprise Features**
+  - Full scheduler features added on top of Kubernetes, with quota/over-quota management, priorities and order
+  - Fractional GPU support, allowing multiple isolated containers sharing the same GPU with memory/compute limit per container
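For reference, a minimal sketch of launching the glue layer on a CPU node. The queue name and the `--queue` flag are assumptions used for illustration; check the example script's `--help` for its exact interface:

```bash
# a sketch: run the k8s glue controller against a dedicated ClearML queue
# ("k8s_scheduler" and --queue are assumptions; verify with --help)
pip install clearml-agent
python examples/k8s_glue_example.py --queue k8s_scheduler
```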

 ### SLURM (Optional)

 Yes! Slurm integration is available, check the [documentation](https://clear.ml/docs/latest/docs/clearml_agent/#slurm) for further details

 ### Using the ClearML Agent

 **Full scale HPC with a click of a button**

 The ClearML Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the
 job and monitors its progress.

 Any 'Draft' experiment can be scheduled for execution by a ClearML agent.

 A previously run experiment can be put into 'Draft' state by either of two methods:

 * Using the **'Reset'** action from the experiment right-click context menu in the ClearML UI - This will clear any
   results and artifacts the previous run had created.
 * Using the **'Clone'** action from the experiment right-click context menu in the ClearML UI - This will create a new
   'Draft' experiment with the same configuration as the original experiment.

 An experiment is scheduled for execution using the **'Enqueue'** action from the experiment right-click context menu in
 the ClearML UI and selecting the execution queue.

 See [creating an experiment and enqueuing it for execution](#from-scratch).

-Once an experiment is enqueued, it will be picked up and executed by a ClearML agent monitoring this queue.
+Once an experiment is enqueued, it will be picked up and executed by a ClearML Agent monitoring this queue.

 The ClearML UI Workers & Queues page provides ongoing execution information:

 - Workers Tab: Monitor your cluster
   - Review available resources
   - Monitor machines statistics (CPU / GPU / Disk / Network)
 - Queues Tab:
   - Control the scheduling order of jobs
   - Cancel or abort job execution
   - Move jobs between execution queues

 #### What The ClearML Agent Actually Does

 The ClearML Agent executes experiments using the following process:

 - Create a new virtual environment (or launch the selected docker image)
 - Clone the code into the virtual-environment (or inside the docker)
 - Install python packages based on the package requirements listed for the experiment
   - Special note for PyTorch: The ClearML Agent will automatically select the torch packages based on the CUDA_VERSION
     environment variable of the machine
 - Execute the code, while monitoring the process
 - Log all stdout/stderr in the ClearML UI, including the cloning and installation process, for easy debugging
 - Monitor the execution and allow you to manually abort the job using the ClearML UI (or, in the unfortunate case of a
   code crash, catch the error and signal the experiment has failed)

 #### System Design & Flow

-<img src="https://allegro.ai/clearml/docs/img/ClearML_Architecture.png" width="100%" alt="clearml-architecture">
+<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_architecture.png" width="100%" alt="clearml-architecture">

 #### Installing the ClearML Agent
@@ -135,6 +162,7 @@ pip install clearml-agent
 #### ClearML Agent Usage Examples

 Full Interface and capabilities are available with

 ```bash
 clearml-agent --help
 clearml-agent daemon --help
@@ -146,39 +174,48 @@ clearml-agent daemon --help
 clearml-agent init
 ```

-Note: The ClearML Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default ClearML Agent cache folder is `~/.clearml`
+Note: The ClearML Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default
+ClearML Agent cache folder is `~/.clearml`.

-See full details in your configuration file at `~/clearml.conf`
+See full details in your configuration file at `~/clearml.conf`.

-Note: The **ClearML agent** extends the **ClearML** configuration file `~/clearml.conf`
+Note: The **ClearML Agent** extends the **ClearML** configuration file `~/clearml.conf`.
 They are designed to share the same configuration file, see example [here](docs/clearml.conf)

 #### Running the ClearML Agent

-For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen
+For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen:

 ```bash
 clearml-agent daemon --queue default --foreground
 ```

-For actual service mode, all the stdout will be stored automatically into a temporary file (no need to pipe)
+For actual service mode, all the stdout will be stored automatically into a temporary file (no need to pipe).
 Notice: with `--detached` flag, the *clearml-agent* will run in the background

 ```bash
 clearml-agent daemon --detached --queue default
 ```

-GPU allocation is controlled via the standard OS environment `NVIDIA_VISIBLE_DEVICES` or `--gpus` flag (or disabled with `--cpu-only`).
-If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for the `clearml-agent` <br>
-If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for the `clearml-agent`
+GPU allocation is controlled via the standard OS environment `NVIDIA_VISIBLE_DEVICES` or `--gpus` flag (or disabled
+with `--cpu-only`).
+If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPUs will be allocated for
+the `clearml-agent`. <br>
+If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES="none"`, no gpu will be allocated for
+the `clearml-agent`.
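For illustration, the environment-variable route looks like this (device indices are placeholders):

```bash
# pin this agent to GPU 0 only, using the standard NVIDIA env var
NVIDIA_VISIBLE_DEVICES=0 clearml-agent daemon --queue default --foreground

# explicitly disable GPU allocation for this agent
NVIDIA_VISIBLE_DEVICES=none clearml-agent daemon --queue default --foreground
```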

 Example: spin two agents, one per GPU on the same machine:
 Notice: with `--detached` flag, the *clearml-agent* will run in the background

 ```bash
 clearml-agent daemon --detached --gpus 0 --queue default
 clearml-agent daemon --detached --gpus 1 --queue default
 ```

-Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent
+Example: spin two agents, pulling from dedicated `dual_gpu` queue, two GPUs per agent

 ```bash
 clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu
 clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu
@@ -187,82 +224,95 @@ clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu
 ##### Starting the ClearML Agent in docker mode

 For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen

 ```bash
 clearml-agent daemon --queue default --docker --foreground
 ```

-For actual service mode, all the stdout will be stored automatically into a file (no need to pipe)
-Notice: with `--detached` flag, the *clearml-agent* will be running in the background
+For actual service mode, all the stdout will be stored automatically into a file (no need to pipe).
+Notice: with `--detached` flag, the *clearml-agent* will run in the background

 ```bash
 clearml-agent daemon --detached --queue default --docker
 ```

-Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda docker:
+Example: spin two agents, one per gpu on the same machine, with default `nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04`
+docker:

 ```bash
-clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
-clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
+clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
+clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
 ```

-Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent, with default nvidia/cuda docker:
+Example: spin two agents, pulling from dedicated `dual_gpu` queue, two GPUs per agent, with default
+`nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04` docker:

 ```bash
-clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
-clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
+clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
+clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
 ```

 ##### Starting the ClearML Agent - Priority Queues

 Priority Queues are also supported, example use case:

-High priority queue: `important_jobs` Low priority queue: `default`
+High priority queue: `important_jobs`, low priority queue: `default`

 ```bash
 clearml-agent daemon --queue important_jobs default
 ```

-The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, only then it will fetch a job from the `default` queue.
+The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, and only if it is empty, the agent
+will try to pull from the `default` queue.

-Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [free server](https://app.community.clear.ml/workers-and-queues/queues)
+Adding queues, managing job order within a queue, and moving jobs between queues, is available using the Web UI, see
+example on our [free server](https://app.clear.ml/workers-and-queues/queues)

 ##### Stopping the ClearML Agent

 To stop a **ClearML Agent** running in the background, run the same command line used to start the agent with `--stop`
 appended. For example, to stop the first of the above shown same machine, single gpu agents:

 ```bash
-clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda --stop
+clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04 --stop
 ```

 ### How do I create an experiment on the ClearML Server? <a name="from-scratch"></a>

-* Integrate [ClearML](https://github.com/allegroai/trains) with your code
+* Integrate [ClearML](https://github.com/allegroai/clearml) with your code
 * Execute the code on your machine (Manually / PyCharm / Jupyter Notebook)
 * As your code is running, **ClearML** creates an experiment logging all the necessary execution information:
   - Git repository link and commit ID (or an entire jupyter notebook)
   - Git diff (we’re not saying you never commit and push, but still...)
   - Python packages used by your code (including specific versions used)
-  - Hyper-Parameters
-  - Input Artifacts
+  - Hyperparameters
+  - Input artifacts

   You now have a 'template' of your experiment with everything required for automated execution

-* In the ClearML UI, Right click on the experiment and select 'clone'. A copy of your experiment will be created.
+* In the ClearML UI, right-click on the experiment and select 'clone'. A copy of your experiment will be created.
 * You now have a new draft experiment cloned from your original experiment, feel free to edit it
-  - Change the Hyper-Parameters
+  - Change the hyperparameters
   - Switch to the latest code base of the repository
   - Update package versions
   - Select a specific docker image to run in (see docker execution mode section)
   - Or simply change nothing to run the same experiment again...
-* Schedule the newly created experiment for execution: Right-click the experiment and select 'enqueue'
+* Schedule the newly created experiment for execution: right-click the experiment and select 'enqueue'
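The same create-and-enqueue flow can also be bootstrapped from the command line with the `clearml-task` utility that ships with the `clearml` package. A minimal sketch, where the repository URL, project, script and queue names are placeholders (verify the exact flags with `clearml-task --help`):

```bash
# a sketch: register an existing script as a ClearML experiment and enqueue it
clearml-task --project examples --name remote_run \
    --repo https://github.com/your-org/your-repo.git \
    --script train.py --queue default
```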

 ### ClearML-Agent Services Mode <a name="services"></a>

-ClearML-Agent Services is a special mode of ClearML-Agent that provides the ability to launch long-lasting jobs
-that previously had to be executed on local / dedicated machines. It allows a single agent to
-launch multiple dockers (Tasks) for different use cases. To name a few use cases, auto-scaler service (spinning instances
-when the need arises and the budget allows), Controllers (Implementing pipelines and more sophisticated DevOps logic),
-Optimizer (such as Hyper-parameter Optimization or sweeping), and Application (such as interactive Bokeh apps for
-increased data transparency)
+ClearML-Agent Services is a special mode of ClearML-Agent that provides the ability to launch long-lasting jobs that
+previously had to be executed on local / dedicated machines. It allows a single agent to launch multiple dockers (Tasks)
+for different use cases:
+
+* Auto-scaler service (spinning instances when the need arises and the budget allows)
+* Controllers (Implementing pipelines and more sophisticated DevOps logic)
+* Optimizer (such as Hyperparameter Optimization or sweeping)
+* Application (such as interactive Bokeh apps for increased data transparency)

-ClearML-Agent Services mode will spin **any** task enqueued into the specified queue.
-Every task launched by ClearML-Agent Services will be registered as a new node in the system,
-providing tracking and transparency capabilities.
-Currently clearml-agent in services-mode supports cpu only configuration. ClearML-agent services mode can be launched alongside GPU agents.
+ClearML-Agent Services mode will spin **any** task enqueued into the specified queue. Every task launched by
+ClearML-Agent Services will be registered as a new node in the system, providing tracking and transparency capabilities.
+Currently, clearml-agent in services-mode supports CPU only configuration. ClearML-Agent services mode can be launched
+alongside GPU agents.

 ```bash
 clearml-agent daemon --services-mode --detached --queue services --create-queue --docker ubuntu:18.04 --cpu-only
@@ -270,22 +320,27 @@ clearml-agent daemon --services-mode --detached --queue services --create-queue
 **Note**: It is the user's responsibility to make sure the proper tasks are pushed into the specified queue.

 ### AutoML and Orchestration Pipelines <a name="automl-pipes"></a>

-The ClearML Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the ClearML package.
+The ClearML Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the
+ClearML package.

-Sample AutoML & Orchestration examples can be found in the ClearML [example/automation](https://github.com/allegroai/trains/tree/master/examples/automation) folder.
+Sample AutoML & Orchestration examples can be found in the
+ClearML [example/automation](https://github.com/allegroai/clearml/tree/master/examples/automation) folder.

-AutoML examples
-- [Toy Keras training experiment](https://github.com/allegroai/trains/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
+AutoML examples:
+
+- [Toy Keras training experiment](https://github.com/allegroai/clearml/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
   - In order to create an experiment-template in the system, this code must be executed once manually
-- [Random Search over the above Keras experiment-template](https://github.com/allegroai/trains/blob/master/examples/automation/manual_random_param_search_example.py)
-  - This example will create multiple copies of the Keras experiment-template, with different hyper-parameter combinations
+- [Random Search over the above Keras experiment-template](https://github.com/allegroai/clearml/blob/master/examples/automation/manual_random_param_search_example.py)
+  - This example will create multiple copies of the Keras experiment-template, with different hyperparameter
+    combinations

-Experiment Pipeline examples
-- [First step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/task_piping_example.py)
+Experiment Pipeline examples:
+
+- [First step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/task_piping_example.py)
   - This example will "process data", and once done, will launch a copy of the 'second step' experiment-template
-- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automation/toy_base_task.py)
+- [Second step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/toy_base_task.py)
   - In order to create an experiment-template in the system, this code must be executed once manually

 ### License
@@ -12,7 +12,7 @@ from clearml_agent.definitions import FileBuffering, CONFIG_FILE
 from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
 from clearml_agent.helper.process import ExitStatus
 from . import interface, session, definitions, commands
-from .errors import ConfigFileNotFound, Sigterm, APIError
+from .errors import ConfigFileNotFound, Sigterm, APIError, CustomBuildScriptFailed
 from .helper.trace import PackageTrace
 from .interface import get_parser
@@ -44,6 +44,8 @@ def run_command(parser, args, command_name):
         debug = command._session.debug_mode
         func = getattr(command, command_name)
         return func(**args_dict)
+    except CustomBuildScriptFailed as e:
+        command_class.exit(e.message, e.errno)
     except ConfigFileNotFound:
         message = 'Cannot find configuration file in "{}".\n' \
                   'To create a configuration file, run:\n' \
@@ -11,70 +11,143 @@
 # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
 # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
+# **Notice**: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
+# To learn how to generate git token GitHub/Bitbucket/GitLab:
+# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
+# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
+# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
 # git_user: ""
 # git_pass: ""
 # Limit credentials to a single domain, for example: github.com,
 # all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
 # git_host: ""

 # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
 force_git_ssh_protocol: false
 # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
 # force_git_ssh_port: 0
 # Force a specific SSH username when converting http to ssh links (the default username is 'git')
 # force_git_ssh_user: git

 # Set the python version to use when creating the virtual environment and launching the experiment
 # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
 # The default is the python executing the clearml_agent
 python_binary: ""
+# ignore any requested python version (Default: False, if a Task was using a
+# specific python version and the system supports multiple python the agent will use the requested python version)
+# ignore_requested_python_version: true
+
+# Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
+# default false, only the working directory will be added to the PYTHONPATH
+# force_git_root_python_path: false
+
+# if set, use GIT_ASKPASS to pass user/pass when cloning / fetching repositories
+# it solves passing user/token to git submodules.
+# this is a safer way to ensure multiple users using the same repository will
+# not accidentally leak credentials
+# Note: this is only supported on Linux systems
+# enable_git_ask_pass: true
+
+# in docker mode, if container's entrypoint automatically activated a virtual environment
+# use the activated virtual environment and install everything there
+# set to False to disable, and always create a new venv inheriting from the system_site_packages
+# docker_use_activated_venv: true

 # select python package manager:
-# currently supported pip and conda
-# poetry is used if pip selected and repository contains poetry.lock file
+# currently supported: pip, conda and poetry
+# if "pip" or "conda" are used, the agent installs the required packages
+# based on the "installed packages" section of the Task. If the "installed packages" is empty,
+# it will revert to using `requirements.txt` from the repository's root directory.
+# If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
+# the "installed packages" section is ignored, and poetry is used.
+# If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
 package_manager: {
     # supported options: pip, conda, poetry
     type: pip,

-    # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
-    pip_version: "<20.2",
+    # specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
+    pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10' and python_version <= '3.11'", ">=23,<24.3 ; python_version >= '3.12'"]
+    # specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
+    # poetry_version: "<2",
+    # poetry_install_extra_args: ["-v"]

-    # virtual environment inheres packages from system
+    # virtual environment inherits packages from system
     system_site_packages: false,

     # install with --upgrade
     force_upgrade: false,

     # additional artifact repositories to use when installing python packages
-    # extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
+    # extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]

+    # turn on the "--use-deprecated=legacy-resolver" flag for pip, to avoid package dependency version mismatch
+    # if any version restrictions are matched we add the "--use-deprecated=legacy-resolver" flag
+    # example: pip_legacy_resolver = [">=20.3,<24.3", ">99"]
+    # if pip==20.2 or pip==29.0 is installed we do nothing,
+    # if pip==21.1 or pip==101.1 is installed the flag is added
+    # disable the feature by passing an empty list
+    pip_legacy_resolver = [">=20.3,<24.3"]
+
+    # control the pytorch wheel resolving algorithm, options are: "pip", "direct", "none"
+    # Override with environment variable CLEARML_AGENT_PACKAGE_PYTORCH_RESOLVE
+    # "pip" (default): would automatically detect the cuda version, and supply pip with the correct
+    #       extra-index-url, based on pytorch.org tables
+    # "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
+    #       and matching the automatically detected cuda version with the required pytorch wheel.
+    #       if the exact cuda version is not found for the required pytorch wheel, it will try
+    #       a lower cuda version until a match is found
+    # "none": No resolver used, install pytorch like any other package
+    # pytorch_resolve: "pip"

     # additional conda channels to use when installing with conda package manager
-    conda_channels: ["defaults", "conda-forge", "pytorch", ]
+    conda_channels: ["pytorch", "conda-forge", "nvidia", "defaults", ]

+    # If set to true, Task's "installed packages" are ignored,
+    # and the repository's "requirements.txt" is used instead
+    # force_repo_requirements_txt: false

     # set the priority packages to be installed before the rest of the required packages
+    # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
     # priority_packages: ["cython", "numpy", "setuptools", ]

     # set the optional priority packages to be installed before the rest of the required packages,
     # In case a package installation fails, the package will be ignored,
     # and the virtual environment process will continue
-    # priority_optional_packages: ["pygobject", ]
+    # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
+    priority_optional_packages: ["pygobject", ]

     # set the post packages to be installed after all the rest of the required packages
+    # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
     # post_packages: ["horovod", ]

     # set the optional post packages to be installed after all the rest of the required packages,
     # In case a package installation fails, the package will be ignored,
     # and the virtual environment process will continue
+    # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
     # post_optional_packages: []

     # set to True to support torch nightly build installation,
     # notice: torch nightly builds are ephemeral and are deleted from time to time
     torch_nightly: false,

+    # if set to true, the agent will look for the "poetry.lock" file
+    # in the passed current working directory instead of the repository's root directory.
+    poetry_files_from_repo_working_dir: false
 },
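The list-form `pip_version` above selects a pip constraint per interpreter using standard PEP 508 environment markers. A quick illustration of the same marker syntax with plain pip, outside of the configuration file:

```bash
# on Python 3.10/3.11 this requirement resolves to pip<22.3; elsewhere it is skipped
python3 -m pip install 'pip<22.3 ; python_version >= "3.10" and python_version <= "3.11"'
```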

 # target folder for virtual environments builds, created when executing experiment
 venvs_dir = ~/.clearml/venvs-builds

 # cached virtual environment folder
 venvs_cache: {
     # maximum number of cached venvs
     max_entries: 10
     # minimum required free space to allow for cache entry, disable by passing 0 or negative value
     free_space_threshold_gb: 2.0
     # unmark to enable virtual environment caching
     path: ~/.clearml/venvs-cache
 },

 # cached git clone folder
 vcs_cache: {
     enabled: true,
@@ -94,6 +167,12 @@
 },

 translate_ssh: true,

+# set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
+# default is false, automatically mounts ~/.ssh
+# Must be set to True if using "clearml-session" with this agent!
+# disable_ssh_mount: false

 # reload configuration file every daemon execution
 reload_config: false,
@@ -106,9 +185,21 @@
 # these are local for this agent and will not be updated in the experiment's docker_cmd section
 # extra_docker_arguments: ["--ipc=host", ]

+# Allow the extra docker arg to override task level docker arg (if the same argument is passed on both),
+# if set to False, a task docker arg will override the docker extra arg
+# docker_args_extra_precedes_task: true
+
+# allows the following task docker args to be overridden by the extra_docker_arguments
+# protected_docker_extra_args: ["privileged", "security-opt", "network", "ipc"]

 # optional shell script to run in docker when started before the experiment is started
 # extra_docker_shell_script: ["apt-get install -y bindfs", ]

+# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
+# for backwards compatibility reasons, true as default,
+# change to false to skip installation and decrease docker spin up time
+# docker_install_opencv_libs: true

 # optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
 # If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
 # Outside of the specified time-spans, the agent will be idle.
@@ -131,12 +222,85 @@
 default_docker: {
     # default docker image to use when running in docker mode
-    image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
+    image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"

     # optional arguments to pass to docker image
     # arguments: ["--ipc=host", ]

+    # Choose the default docker based on the Task properties,
+    # Notice: Enterprise feature, ignored otherwise
+    # Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project'
+    # Notice: Matching is done via regular expression, for example "^searchme$" will match exactly "searchme" string
+    "match_rules": [
+        {
+            "image": "python:3.6-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.6$",
+                },
+            }
+        },
+        {
+            "image": "python:3.7-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.7$",
+                },
+            }
+        },
+        {
+            "image": "python:3.8-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.8$",
+                },
+            }
+        },
+        {
+            "image": "python:3.9-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.9$",
+                },
+            }
+        },
+        {
+            "image": "python:3.10-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.10$",
+                },
+            }
+        },
+        {
+            "image": "python:3.11-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.11$",
+                },
+            }
+        },
+        {
+            "image": "python:3.12-bullseye",
+            "arguments": "--ipc=host",
+            "match": {
+                "script": {
+                    "binary": "python3.12$",
+                },
+            }
+        },
+    ]
 }

 # set the OS environments based on the Task's Environment section before launching the Task process.
 enable_task_env: false

 # set the initial bash script to execute at the startup of any docker.
 # all lines will be executed regardless of their exit code.
 # {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
@@ -158,8 +322,182 @@
 # default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
 # suppress_carriage_return: true

-# cuda versions used for solving pytorch wheel packages
-# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
+# CUDA versions used for Conda setup & solving PyTorch wheel packages
+# Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
 # cuda_version: 10.1
 # cudnn_version: 7.6

+# Sanitize configuration printout using these settings
+sanitize_config_printout {
+    # Hide values of configuration keys matching these regexps
+    hide_secrets: ["^sanitize_config_printout$", "secret", "pass", "token", "account_key", "contents"]
+    # As above, only show field's value keys if value is a dictionary
+    hide_secrets_recursive: ["^environment$"]
+    # Do not hide for keys matching these regexps
+    dont_hide_secrets: ["^enable_git_ask_pass$"]
+    # Hide secrets in docker commands, according to the 'agent.hide_docker_command_env_vars' settings
+    docker_commands: ["^extra_docker_arguments$"]
+    # Hide password in URLs found in keys matching these regexps (handles single URLs, lists and dictionaries)
+    urls: ["^extra_index_url$"]
+}
+
+# Hide docker environment variables containing secrets when printing out the docker command by replacing their
+# values with "********". Turning this feature on will hide the following environment variables values:
+# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
+# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
+# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
+# docker command, set:
+# extra_keys: ["MY_SPECIAL_PASSWORD"]
+hide_docker_command_env_vars {
+    enabled: true
+    extra_keys: []
+    parse_embedded_urls: true
+}
+
+# Maximum execution time (in seconds) for Task's abort function call
+abort_callback_max_timeout: 1800
+
+# allow to set internal mount points inside the docker,
+# especially useful for non-root docker container images.
+docker_internal_mounts {
+    sdk_cache: "/clearml_agent_cache"
+    apt_cache: "/var/cache/apt/archives"
+    ssh_folder: "~/.ssh"
+    ssh_ro_folder: "/.ssh"
+    pip_cache: "/root/.cache/pip"
+    poetry_cache: "/root/.cache/pypoetry"
+    vcs_cache: "/root/.clearml/vcs-cache"
+    venvs_cache: "/root/.clearml/venvs-cache"
+    venv_build: "~/.clearml/venvs-builds"
+    pip_download: "/root/.clearml/pip-download-cache"
+}
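These are the in-container target paths; conceptually the agent maps its host-side caches onto them, in the spirit of the following docker mounts (the host paths here are illustrative assumptions, not the agent's actual defaults):

```bash
# a sketch of roughly equivalent docker mounts (host paths are assumptions)
docker run \
  -v "$HOME/.clearml/pip-download-cache:/root/.clearml/pip-download-cache" \
  -v "$HOME/.clearml/vcs-cache:/root/.clearml/vcs-cache" \
  -v "$HOME/.ssh:/.ssh:ro" \
  nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04 nvidia-smi
```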
|
||||
|
||||
# Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
|
||||
# Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
|
||||
# Custom variables may be specified using the docker_container_name_format_fields option.
|
||||
# Note: resulting name must start with an alphanumeric character and
|
||||
# continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
|
||||
# docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
|
||||
|
||||
# Specify custom variables for the docker_container_name_format option using a mapping of variable name
|
||||
# to a (nested) task field (using "." as a task field separator, digits specify array index)
|
||||
# docker_container_name_format_fields: { foo: "bar.moo" }
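For instance, with the commented-out default format above, the generated container name would look as follows (a toy illustration; the `rand_string` value is normally generated by the agent):

```python
# "{rand_string:.8}" uses str.format precision to keep only the first 8 characters
name = "clearml-id-{task_id}-{rand_string:.8}".format(
    task_id="3f52b24a9f3c4e0e9d2b1f3a",
    rand_string="khtrbzpwqlsmvexd",
)
print(name)  # clearml-id-3f52b24a9f3c4e0e9d2b1f3a-khtrbzpw
```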

    # Apply top-level environment section from configuration into os.environ
    apply_environment: true
    # Top-level environment section is in the form of:
    #   environment {
    #     key: value
    #     ...
    #   }
    # and is applied to the OS environment as `key=value` for each key/value pair

    # Apply top-level files section from configuration into local file system
    apply_files: true
    # Top-level files section allows auto-generating files at designated paths with predefined contents
    # and target format. Options include:
    #   contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
    #   format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
    #           base64-encoded contents string, otherwise ignored
    #   path: the target file's path, may include ~ and inplace env vars
    #   target_format: format used to encode contents before writing into the target file. Supported values are json,
    #                  yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
    #   overwrite: overwrite the target file in case it exists. Default is true.
    #   mode: file-system mode to be applied to the file after its creation. The mode string will be parsed into an
    #         integer (e.g. "0o777" for -rwxrwxrwx)
    #
    # Example:
    #   files {
    #     myfile1 {
    #       contents: "The quick brown fox jumped over the lazy dog"
    #       path: "/tmp/fox.txt"
    #     }
    #     myjsonfile {
    #       contents: {
    #         some {
    #           nested {
    #             value: [1, 2, 3, 4]
    #           }
    #         }
    #       }
    #       path: "/tmp/test.json"
    #       target_format: json
    #     }
    #   }

    # Specifies a custom environment setup script to be executed instead of installing a virtual environment.
    # If provided, this script is executed following Git cloning. The script command may include environment variables,
    # which will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
    # The script can also be specified using the CLEARML_AGENT_CUSTOM_BUILD_SCRIPT environment variable.
    #
    # When running the script, the following environment variables will be set:
    # - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
    #   contents in JSON format
    # - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
    # - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
    # - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
    # - CLEARML_GIT_ROOT: path to the cloned Git repository
    # - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
    #   this file must be in the following JSON format:
    #   ```json
    #   {
    #     "binary": "/absolute/path/to/python-executable",
    #     "entry_point": "/absolute/path/to/task-entrypoint-script",
    #     "working_dir": "/absolute/path/to/task-working/dir"
    #   }
    #   ```
    #   If provided, the agent will use these instead of the predefined task script section to execute the task and will
    #   skip virtual environment creation.
    #
    # In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
    # In case the custom script is specified but does not exist, or if the custom script does not write valid content
    # into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
    # standard flow.
    custom_build_script: ""
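A minimal custom build script might look like the sketch below (hypothetical; it only assumes the environment variables documented above and hard-codes the interpreter path for illustration):

```python
#!/usr/bin/env python3
# Hypothetical CLEARML_AGENT_CUSTOM_BUILD_SCRIPT: tell the agent which
# interpreter and entry point to use, so it skips venv creation.
import json
import os

output_path = os.environ["CLEARML_CUSTOM_BUILD_OUTPUT"]  # must not exist yet
git_root = os.environ["CLEARML_GIT_ROOT"]

with open(output_path, "w") as f:
    json.dump(
        {
            "binary": "/usr/bin/python3",
            "entry_point": os.path.join(git_root, os.environ["CLEARML_TASK_SCRIPT_ENTRY"]),
            "working_dir": os.path.join(git_root, os.environ["CLEARML_TASK_WORKING_DIR"]),
        },
        f,
    )
```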

    # Crash on exception: by default, when encountering an exception while running a task,
    # the agent will catch the exception, log it and continue running.
    # Set this to `true` to propagate exceptions and crash the agent.
    # crash_on_exception: true

    # Disable task docker override. If true, the agent will use the default docker image and ignore any docker image
    # and arguments specified in the task's container section (the setup shell script from the task container section
    # will be used in any case, if specified).
    disable_task_docker_override: false

    # Choose the default docker based on the Task properties.
    # Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project'
    # Notice: Matching is done via regular expression; for example "^searchme$" will match only the exact
    # string "searchme"
    #
    # "default_docker": {
    #     "image": "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04",
    #     # optional arguments to pass to docker image
    #     # arguments: ["--ipc=host", ]
    #     "match_rules": [
    #         {
    #             "image": "sample_container:tag",
    #             "arguments": "-e VALUE=1 --ipc=host",
    #             "match": {
    #                 "script": {
    #                     "requirements": {
    #                         "pip": {
    #                             "tensorflow": "~=1.6"
    #                         }
    #                     },
    #                     "repository": "",
    #                     "branch": "master"
    #                 },
    #                 "project": "example"
    #             }
    #         },
    #         {
    #             "image": "another_container:tag",
    #             "arguments": "",
    #             "match": {
    #                 "project": "^examples",  # anything that starts with "examples", e.g. "examples", "examples/sub_project"
    #             }
    #         }
    #     ]
    # },
    #
}
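A much-simplified sketch of how such match rules could be evaluated (plain regex match of rule fields against task properties; this is an illustration of the idea, not the agent's exact matching code, and it does not model pip version-range matching):

```python
import re

def rule_matches(rule_match: dict, task_props: dict) -> bool:
    """Every field in the rule must regex-match the corresponding task property."""
    for field, pattern in rule_match.items():
        if isinstance(pattern, dict):
            if not rule_matches(pattern, task_props.get(field) or {}):
                return False
        elif not re.search(str(pattern), str(task_props.get(field, ""))):
            return False
    return True

rule = {"project": "^examples"}
print(rule_matches(rule, {"project": "examples/sub_project"}))  # True
```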

@@ -28,10 +28,15 @@

        pool_maxsize: 512
        pool_connections: 512

        # Override the default http method, use "put" if working behind GCP load balancer (default: "get")
        # default_method: "get"
    }

    auth {
        # When creating a request, if token will expire in less than this value, try to refresh the token
        token_expiration_threshold_sec = 360
        # When creating a request, if the token will expire in less than this value, try to refresh the token. Default 12 hours
        token_expiration_threshold_sec: 43200
        # When requesting a token, request a specific expiration time. Server default (and maximum) is 30 days
        # request_token_expiration_sec: 2592000
    }
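In other words, a token is due for refresh once its remaining validity drops below the threshold. A worked example of the arithmetic (the same `exp - now - threshold` check used by the token manager later in this diff):

```python
import time

token_exp = time.time() + 6 * 60 * 60   # token expires in 6 hours
threshold = 43200                        # 12-hour refresh threshold
remaining = max(0, token_exp - time.time() - threshold)
print(remaining)  # 0 -> less than 12h of validity left, so the token is refreshed
```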
}

@@ -3,7 +3,7 @@

storage {
    cache {
        # Defaults to system temp folder / cache
        # Defaults to <system_temp_folder>/clearml_cache
        default_base_dir: "~/.clearml/cache"
        size {
            # max_used_bytes = -1

@@ -140,7 +140,7 @@

    vcs_repo_detect_async: true

    # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
    # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
    # This stores "git diff" into the experiment's "script.requirements.diff" section
    store_uncommitted_code_diff: true

    # Support stopping an experiment in case it was externally stopped, status was changed or task was reset

@@ -4,7 +4,7 @@ import re

import attr
import six

import pyhocon
from clearml_agent.external import pyhocon

from .action import Action


@@ -66,11 +66,16 @@ class DataModel(object):

    }

    def validate(self, schema=None):
        jsonschema.validate(
            self.to_dict(),
            schema or self._schema,
            types=dict(array=(list, tuple), integer=six.integer_types),
        schema = schema or self._schema
        validator = jsonschema.validators.validator_for(schema)
        validator_cls = jsonschema.validators.extend(
            validator=validator,
            type_checker=validator.TYPE_CHECKER.redefine_many({
                "array": lambda s, instance: isinstance(instance, (list, tuple)),
                "integer": lambda s, instance: isinstance(instance, six.integer_types),
            }),
        )
        jsonschema.validate(self.to_dict(), schema, cls=validator_cls)

    def __repr__(self):
        return '<{}.{}: {}>'.format(
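The replacement above is the jsonschema >= 3 idiom: the deprecated `types=` keyword is swapped for a validator class with a redefined type checker. A standalone sketch of the same pattern:

```python
import jsonschema

schema = {"type": "array"}
base = jsonschema.validators.validator_for(schema)
extended = jsonschema.validators.extend(
    validator=base,
    type_checker=base.TYPE_CHECKER.redefine(
        "array", lambda checker, instance: isinstance(instance, (list, tuple))
    ),
)
# A tuple now passes "array" validation, which the stock validator would reject
jsonschema.validate((1, 2, 3), schema, cls=extended)
```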

@@ -1,4 +1,5 @@

from ...backend_config.environment import EnvEntry
from clearml_agent.helper.environment import EnvEntry
from clearml_agent.helper.environment.converters import safe_text_to_bool


ENV_HOST = EnvEntry("CLEARML_API_HOST", "TRAINS_API_HOST")

@@ -6,6 +7,30 @@ ENV_WEB_HOST = EnvEntry("CLEARML_WEB_HOST", "TRAINS_WEB_HOST")

ENV_FILES_HOST = EnvEntry("CLEARML_FILES_HOST", "TRAINS_FILES_HOST")
ENV_ACCESS_KEY = EnvEntry("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY")
ENV_SECRET_KEY = EnvEntry("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
ENV_AUTH_TOKEN = EnvEntry("CLEARML_AUTH_TOKEN")
ENV_VERBOSE = EnvEntry("CLEARML_API_VERBOSE", "TRAINS_API_VERBOSE", type=bool, default=False)
ENV_HOST_VERIFY_CERT = EnvEntry("CLEARML_API_HOST_VERIFY_CERT", "TRAINS_API_HOST_VERIFY_CERT", type=bool, default=True)
ENV_CONDA_ENV_PACKAGE = EnvEntry("CLEARML_CONDA_ENV_PACKAGE", "TRAINS_CONDA_ENV_PACKAGE")
ENV_USE_CONDA_BASE_ENV = EnvEntry("CLEARML_USE_CONDA_BASE_ENV", type=bool)
ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT_SERVER", type=bool, default=True)
ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
    'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
)
ENV_FORCE_MAX_API_VERSION = EnvEntry("CLEARML_AGENT_FORCE_MAX_API_VERSION", type=str)
# values are 0/None (task per node), 1/2 (multi-node reporting, colored console), -1 (only report rank 0 node)
ENV_MULTI_NODE_SINGLE_TASK = EnvEntry("CLEARML_MULTI_NODE_SINGLE_TASK", type=int, default=None)


"""
Experimental option to set the request method for all API requests and auth login.
This could be useful when GET requests with payloads are blocked by a server, as
POST requests can be used instead.

However, this has not been rigorously tested and may have unintended consequences.
"""
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD", default="GET")
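With this entry in place, the default HTTP verb can be forced from the environment before the agent starts; for example (the variable is read at import time, as the `Request` class below shows, and only GET/POST/PUT are accepted in any case):

```python
import os

os.environ["CLEARML_API_DEFAULT_REQ_METHOD"] = "put"  # any case is allowed
method = os.environ["CLEARML_API_DEFAULT_REQ_METHOD"]
assert method.upper() in ("GET", "POST", "PUT"), "unsupported HTTP method"
```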

@@ -5,10 +5,18 @@ import six

from .apimodel import ApiModel
from .datamodel import DataModel
from .defs import ENV_API_DEFAULT_REQ_METHOD


if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST", "PUT"):
    raise ValueError(
        "CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get', 'post' or 'put' (any case is allowed)."
    )


class Request(ApiModel):
    _method = 'get'
    def_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
    _method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")

    def __init__(self, **kwargs):
        if kwargs:

@@ -1,17 +1,26 @@

import json as json_lib
import os
import sys
import time
import types
from random import SystemRandom
from socket import gethostname
from six.moves.urllib.parse import urlparse, urlunparse
from typing import Optional

import jwt
import requests
import six
from pyhocon import ConfigTree
from requests import RequestException
from requests.auth import HTTPBasicAuth
from six.moves.urllib.parse import urlparse, urlunparse

from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
from .callresult import CallResult
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST
from .defs import (
    ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
    ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD,
    ENV_FORCE_MAX_API_VERSION)
from .request import Request, BatchRequest
from .token_manager import TokenManager
from ..config import load

@@ -19,6 +28,8 @@ from ..utils import get_http_session_with_retry, urllib_log_warning_setup

from ...backend_config.environment import backward_compatibility_support
from ...version import __version__

sys_random = SystemRandom()


class LoginError(Exception):
    pass

@@ -40,16 +51,21 @@ class Session(TokenManager):

    _session_requests = 0
    _session_initial_timeout = (3.0, 10.)
    _session_timeout = (10.0, 30.)
    _session_initial_connect_retry = 4
    _session_initial_retry_connect_override = 4
    _write_session_data_size = 15000
    _write_session_timeout = (30.0, 30.)
    _request_exception_retry_timeout = (2.0, 3.0)

    api_version = '2.1'
    feature_set = 'basic'
    default_host = "https://demoapi.demo.clear.ml"
    default_web = "https://demoapp.demo.clear.ml"
    default_files = "https://demofiles.demo.clear.ml"
    default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
    default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
    force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()
    server_version = "1.0.0"
    user_id = None

    # TODO: add requests.codes.gateway_timeout once we support async commits
    _retry_codes = [
@@ -99,42 +115,51 @@ class Session(TokenManager):

        if initialize_logging:
            self.config.initialize_logging(debug=kwargs.get('debug', False))

        token_expiration_threshold_sec = self.config.get(
            "auth.token_expiration_threshold_sec", 60
        )

        super(Session, self).__init__(
            token_expiration_threshold_sec=token_expiration_threshold_sec, **kwargs
        )
        super(Session, self).__init__(config=config, **kwargs)

        self._verbose = verbose if verbose is not None else ENV_VERBOSE.get()
        self._logger = logger
        self.__auth_token = None
        self._propagate_exceptions_on_send = True

        self.__access_key = api_key or ENV_ACCESS_KEY.get(
            default=(self.config.get("api.credentials.access_key", None) or self.default_key)
        )
        if not self.access_key:
            raise ValueError(
                "Missing access_key. Please set in configuration file or pass in session init."
            )
        self.update_default_api_method()

        self.__secret_key = secret_key or ENV_SECRET_KEY.get(
            default=(self.config.get("api.credentials.secret_key", None) or self.default_secret)
        )
        if not self.secret_key:
            raise ValueError(
                "Missing secret_key. Please set in configuration file or pass in session init."
        if ENV_AUTH_TOKEN.get(
            value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
        ):
            self.set_auth_token(ENV_AUTH_TOKEN.get())
        else:
            self.__access_key = api_key or ENV_ACCESS_KEY.get(
                default=(self.config.get("api.credentials.access_key", None) or self.default_key),
                value_cb=lambda key, value: print("Using environment access key {}={}".format(key, value))
            )
            if not self.access_key:
                raise ValueError(
                    "Missing access_key. Please set in configuration file or pass in session init."
                )

            self.__secret_key = secret_key or ENV_SECRET_KEY.get(
                default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
                value_cb=lambda key, value: print("Using environment secret key {}=********".format(key))
            )
            if not self.secret_key:
                raise ValueError(
                    "Missing secret_key. Please set in configuration file or pass in session init."
                )

        if self.access_key == self.default_key and self.secret_key == self.default_secret:
            print("Using built-in ClearML default key/secret")

        host = host or self.get_api_server_host(config=self.config)
        if not host:
            raise ValueError("host is required in init or config")
            raise ValueError(
                "Could not find host server definition "
                "(missing `~/clearml.conf` or Environment CLEARML_API_HOST)\n"
                "To get started with ClearML: setup your own `clearml-server`, "
                "or create a free account at https://app.clear.ml and run `clearml-agent init`"
            )

        self.__host = host.strip("/")
        http_retries_config = http_retries_config or self.config.get(
            "api.http.retries", ConfigTree()
        ).as_plain_ordered_dict()
        http_retries_config["status_forcelist"] = self._retry_codes

        self.__worker = worker or gethostname()

@@ -145,22 +170,31 @@

        self.client = client or "api-{}".format(__version__)

        # limit the reconnect retries, so we get an error if we are starting the session
        http_no_retries_config = dict(**http_retries_config)
        http_no_retries_config['connect'] = self._session_initial_connect_retry
        self.__http_session = get_http_session_with_retry(**http_no_retries_config)
        _, self.__http_session = self._setup_session(
            http_retries_config,
            initial_session=True,
            default_initial_connect_override=(False if kwargs.get("command") == "execute" else None)
        )
        # try to connect with the server
        self.refresh_token()

        # for resilience, from now on we won't allow propagating exceptions when sending requests
        self._propagate_exceptions_on_send = False

        # create the default session with many retries
        self.__http_session = get_http_session_with_retry(**http_retries_config)
        http_retries_config, self.__http_session = self._setup_session(http_retries_config)

        # update api version from server response
        try:
            token_dict = jwt.decode(self.token, verify=False)
            token_dict = TokenManager.get_decoded_token(self.token, verify=False)
            api_version = token_dict.get('api_version')
            if not api_version:
                api_version = '2.2' if token_dict.get('env', '') == 'prod' else Session.api_version

            Session.api_version = str(api_version)
            Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
            Session.server_version = token_dict.get('server_version', self.server_version)
            Session.user_id = (token_dict.get("identity") or {}).get("user") or ""
        except (jwt.DecodeError, ValueError):
            pass

@@ -169,17 +203,110 @@

        # notice: this omits urllib warnings across the board
        urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)

        if self.force_max_api_version and self.check_min_api_version(self.force_max_api_version):
            print("Using forced API version {}".format(self.force_max_api_version))
            Session.max_api_version = Session.api_version = str(self.force_max_api_version)

        self.pre_vault_config = None

    def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
        # type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
        http_retries_config = http_retries_config or self.config.get(
            "api.http.retries", ConfigTree()
        ).as_plain_ordered_dict()
        http_retries_config["status_forcelist"] = self._retry_codes

        if initial_session:
            kwargs = {} if default_initial_connect_override is None else {
                "default": default_initial_connect_override
            }
            if ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(**kwargs):
                connect_retries = self._session_initial_retry_connect_override
                try:
                    value = ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(converter=str)
                    if not isinstance(value, bool):
                        connect_retries = abs(int(value))
                except ValueError:
                    pass

                http_retries_config = dict(**http_retries_config)
                http_retries_config['connect'] = connect_retries

        return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)

    def update_default_api_method(self):
        if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
            # Make sure we update the config object, so we pass it into the new containers when we map them
            self.config.put("api.http.default_method", ENV_API_DEFAULT_REQ_METHOD.get())
            # notice the default setting of Request.def_method is already set by the OS environment
        elif self.config.get("api.http.default_method", None):
            def_method = str(self.config.get("api.http.default_method", None)).strip()
            if def_method.upper() not in ("GET", "POST", "PUT"):
                raise ValueError(
                    "api.http.default_method variable must be 'get', 'post' or 'put' (any case is allowed)."
                )
            Request.def_method = def_method
            Request._method = Request.def_method

    def load_vaults(self):
        # type: () -> Optional[bool]
        if not self.check_min_api_version("2.15") or self.feature_set == "basic":
            return

        if ENV_DISABLE_VAULT_SUPPORT.get():
            print("Vault support is disabled")
            return

        def parse(vault):
            # noinspection PyBroadException
            try:
                print("Loaded {} vault{}: {}".format(
                    vault.get("scope", ""),
                    "" if not self.user_id else " for user {}".format(self.user_id),
                    (vault.get("description", None) or "")[:50] or vault.get("id", ""))
                )
                d = vault.get("data", None)
                if d:
                    r = ConfigFactory.parse_string(d)
                    if isinstance(r, (ConfigTree, dict)):
                        return r
            except Exception as e:
                print("Failed parsing vault {}: {}".format(vault.get("description", "<unknown>"), e))

        # noinspection PyBroadException
        try:
            # Use params and not data/json otherwise payload might be dropped if we're using GET with a strict firewall
            res = self.send_request("users", "get_vaults", params="enabled=true&types=config&types=config")
            if res.ok:
                vaults = res.json().get("data", {}).get("vaults", [])
                data = list(filter(None, map(parse, vaults)))
                if data:
                    self.pre_vault_config = self.config.copy()
                    self.config.set_overrides(*data)
                    return True
            elif res.status_code != 404:
                raise Exception(res.json().get("meta", {}).get("result_msg", res.text))
        except Exception as ex:
            print("Failed getting vaults: {}".format(ex))

    def verify_feature_set(self, feature_set):
        if isinstance(feature_set, str):
            feature_set = [feature_set]
        if self.feature_set not in feature_set:
            raise ValueError('ClearML-server does not support requested feature set {}'.format(feature_set))

    def _send_request(
        self,
        service,
        action,
        version=None,
        method="get",
        method=Request.def_method,
        headers=None,
        auth=None,
        data=None,
        json=None,
        refresh_token_if_unauthorized=True,
        params=None,
    ):
        """ Internal implementation for making a raw API request.
        - Constructs the api endpoint name

@@ -203,6 +330,7 @@

            if version
            else "{host}/{service}.{action}"
        ).format(**locals())

        while True:
            if data and len(data) > self._write_session_data_size:
                timeout = self._write_session_timeout

@@ -210,16 +338,30 @@

                timeout = self._session_initial_timeout
            else:
                timeout = self._session_timeout
            res = self.__http_session.request(
                method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout)

            try:
                res = self.__http_session.request(
                    method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout, params=params)
            except RequestException as ex:
                if self._propagate_exceptions_on_send:
                    raise
                sleep_time = sys_random.uniform(*self._request_exception_retry_timeout)
                if self._logger:
                    self._logger.error(
                        "{} exception sending {} {}: {} (retrying in {:.1f}sec)".format(
                            type(ex).__name__, method.upper(), url, str(ex), sleep_time
                        )
                    )
                time.sleep(sleep_time)
                continue

            if (
                refresh_token_if_unauthorized
                and res.status_code == requests.codes.unauthorized
                and not token_refreshed_on_error
            ):
                # it seems we're unauthorized, so we'll try to refresh our token once in case permissions changed since
                # the last time we got the token, and try again
                # it seems we're unauthorized, so we'll try to refresh our token once in case permissions changed
                # since the last time we got the token, and try again
                self.refresh_token()
                token_refreshed_on_error = True
                # try again

@@ -228,11 +370,12 @@

                res.status_code == requests.codes.service_unavailable
                and self.config.get("api.http.wait_on_maintenance_forever", True)
            ):
                self._logger.warning(
                    "Service unavailable: {} is undergoing maintenance, retrying...".format(
                        host
                if self._logger:
                    self._logger.warning(
                        "Service unavailable: {} is undergoing maintenance, retrying...".format(
                            host
                        )
                    )
                )
                continue
            break
        self._session_requests += 1
@@ -242,16 +385,21 @@

            headers[self._AUTHORIZATION_HEADER] = "Bearer {}".format(self.token)
        return headers

    def set_auth_token(self, auth_token):
        self.__access_key = self.__secret_key = None
        self._set_token(auth_token)

    def send_request(
        self,
        service,
        action,
        version=None,
        method="get",
        method=Request.def_method,
        headers=None,
        data=None,
        json=None,
        async_enable=False,
        params=None,
    ):
        """
        Send a raw API request.

@@ -264,6 +412,7 @@

            content type will be application/json)
        :param data: Dictionary, bytes, or file-like object to send in the request body
        :param async_enable: whether request is asynchronous
        :param params: additional query parameters
        :return: requests Response instance
        """
        headers = self.add_auth_headers(

@@ -280,6 +429,7 @@

            headers=headers,
            data=data,
            json=json,
            params=params,
        )

    def send_request_batch(

@@ -290,7 +440,7 @@

        headers=None,
        data=None,
        json=None,
        method="get",
        method=Request.def_method,
    ):
        """
        Send a raw batch API request. Batch requests always use application/json-lines content type.

@@ -439,8 +589,11 @@

        if not config:
            return None

        return ENV_HOST.get(default=(config.get("api.api_server", None) or
                                     config.get("api.host", None) or cls.default_host))
        default = config.get("api.api_server", None) or config.get("api.host", None)
        if not ENV_NO_DEFAULT_SERVER.get():
            default = default or cls.default_host

        return ENV_HOST.get(default=default)
    @classmethod
    def get_app_server_host(cls, config=None):

@@ -503,12 +656,15 @@

        """
        Return True if Session.api_version is greater than or equal (>=) to min_api_version
        """
        def version_tuple(v):
            v = tuple(map(int, (v.split("."))))
            return v + (0,) * max(0, 3 - len(v))
        return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))

    def _do_refresh_token(self, old_token, exp=None):
    @classmethod
    def check_min_server_version(cls, min_server_version):
        """
        Return True if Session.server_version is greater than or equal (>=) to min_server_version
        """
        return version_tuple(cls.server_version) >= version_tuple(str(min_server_version))

    def _do_refresh_token(self, current_token, exp=None):
        """ TokenManager abstract method implementation.
            Here we ignore the old token and simply obtain a new token.
        """

@@ -520,16 +676,23 @@

            )
        )

        auth = HTTPBasicAuth(self.access_key, self.secret_key)
        auth = None
        headers = None
        if self.access_key and self.secret_key:
            auth = HTTPBasicAuth(self.access_key, self.secret_key)
        elif current_token:
            headers = dict(Authorization="Bearer {}".format(current_token))

        res = None
        try:
            data = {"expiration_sec": exp} if exp else {}
            res = self._send_request(
                method=Request.def_method,
                service="auth",
                action="login",
                auth=auth,
                json=data,
                headers=headers,
                refresh_token_if_unauthorized=False,
                params={"expiration_sec": exp} if exp else {},
            )
            try:
                resp = res.json()

@@ -544,7 +707,10 @@

            )
            if verbose:
                self._logger.info("Received new token")
            return resp["data"]["token"]
            token = resp["data"]["token"]
            if ENV_AUTH_TOKEN.get():
                os.environ[ENV_AUTH_TOKEN.key] = token
            return token
        except LoginError:
            six.reraise(*sys.exc_info())
        except KeyError as ex:

@@ -565,3 +731,18 @@

        return "{self.__class__.__name__}[{self.host}, {self.access_key}/{secret_key}]".format(
            self=self, secret_key=self.secret_key[:5] + "*" * (len(self.secret_key) - 5)
        )

    @property
    def propagate_exceptions_on_send(self):
        # type: () -> bool
        return self._propagate_exceptions_on_send

    @propagate_exceptions_on_send.setter
    def propagate_exceptions_on_send(self, value):
        # type: (bool) -> None
        self._propagate_exceptions_on_send = value


def version_tuple(v):
    v = tuple(map(int, (v.split("."))))
    return v + (0,) * max(0, 3 - len(v))
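`version_tuple` pads versions to three components so comparisons are well-defined across different precisions; a quick worked example:

```python
def version_tuple(v):
    v = tuple(map(int, (v.split("."))))
    return v + (0,) * max(0, 3 - len(v))

print(version_tuple("2.15"))                            # (2, 15, 0)
print(version_tuple("2.15") >= version_tuple("2.4.0"))  # True: (2, 15, 0) >= (2, 4, 0)
```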

@@ -3,11 +3,14 @@ from abc import ABCMeta, abstractmethod

from time import time

import jwt
from jwt.algorithms import get_default_algorithms
import six


@six.add_metaclass(ABCMeta)
class TokenManager(object):
    _default_token_exp_threshold_sec = 12 * 60 * 60
    _default_req_token_expiration_sec = None

    @property
    def token_expiration_threshold_sec(self):

@@ -40,17 +43,30 @@

        return self.__token

    def __init__(
        self,
        token=None,
        req_token_expiration_sec=None,
        token_history=None,
        token_expiration_threshold_sec=60,
        **kwargs
        self,
        token=None,
        req_token_expiration_sec=None,
        token_history=None,
        token_expiration_threshold_sec=None,
        config=None,
        **kwargs
    ):
        super(TokenManager, self).__init__()
        assert isinstance(token_history, (type(None), dict))
        self.token_expiration_threshold_sec = token_expiration_threshold_sec
        self.req_token_expiration_sec = req_token_expiration_sec
        if config:
            req_token_expiration_sec = req_token_expiration_sec or config.get(
                "api.auth.request_token_expiration_sec", None
            )
            token_expiration_threshold_sec = (
                token_expiration_threshold_sec
                or config.get("api.auth.token_expiration_threshold_sec", None)
            )
        self.token_expiration_threshold_sec = (
            token_expiration_threshold_sec or self._default_token_exp_threshold_sec
        )
        self.req_token_expiration_sec = (
            req_token_expiration_sec or self._default_req_token_expiration_sec
        )
        self._set_token(token)

    def _calc_token_valid_period_sec(self, token, exp=None, at_least_sec=None):

@@ -58,7 +74,9 @@

        try:
            exp = exp or self._get_token_exp(token)
            if at_least_sec:
                at_least_sec = max(at_least_sec, self.token_expiration_threshold_sec)
                at_least_sec = max(
                    at_least_sec, self.token_expiration_threshold_sec
                )
            else:
                at_least_sec = self.token_expiration_threshold_sec
            return max(0, (exp - time() - at_least_sec))

@@ -66,10 +84,26 @@

            pass
        return 0

    @classmethod
    def get_decoded_token(cls, token, verify=False):
        """ Decode a JWT token, optionally verifying its signature """
        if hasattr(jwt, '__version__') and jwt.__version__[0] == '1':
            return jwt.decode(
                token,
                verify=verify,
                algorithms=get_default_algorithms(),
            )

        return jwt.decode(
            token,
            options=dict(verify_signature=verify),
            algorithms=get_default_algorithms(),
        )
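`get_decoded_token` papers over the PyJWT 1.x/2.x API split (`verify=` vs `options={"verify_signature": ...}`). A quick round-trip under PyJWT 2.x, for illustration:

```python
import jwt  # assumes PyJWT 2.x

token = jwt.encode({"exp": 4102444800, "api_version": "2.23"}, "secret", algorithm="HS256")
claims = jwt.decode(token, options={"verify_signature": False}, algorithms=["HS256"])
print(claims["api_version"])  # 2.23
```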

    @classmethod
    def _get_token_exp(cls, token):
        """ Get token expiration time. If not present, assume forever """
        return jwt.decode(token, verify=False).get('exp', sys.maxsize)
        return cls.get_decoded_token(token).get("exp", sys.maxsize)

    def _set_token(self, token):
        if token:

@@ -80,7 +114,9 @@

        self.__token_expiration_sec = 0

    def get_token_valid_period_sec(self):
        return self._calc_token_valid_period_sec(self.__token, self.token_expiration_sec)
        return self._calc_token_valid_period_sec(
            self.__token, self.token_expiration_sec
        )

    def _get_token(self):
        if self.get_token_valid_period_sec() <= 0:

@@ -92,4 +128,6 @@

            pass

    def refresh_token(self):
        self._set_token(self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec))
        self._set_token(
            self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec)
        )

@@ -6,16 +6,9 @@ import requests

from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from urllib3 import PoolManager
import six

from .session.defs import ENV_HOST_VERIFY_CERT

if six.PY3:
    from functools import lru_cache
elif six.PY2:
    # python 2 support
    from backports.functools_lru_cache import lru_cache


__disable_certificate_verification_warning = 0

@@ -93,7 +86,10 @@ def get_http_session_with_retry(

    session = requests.Session()

    if backoff_max is not None:
        Retry.BACKOFF_MAX = backoff_max
        if "BACKOFF_MAX" in vars(Retry):
            Retry.BACKOFF_MAX = backoff_max
        else:
            Retry.DEFAULT_BACKOFF_MAX = backoff_max

    retry = Retry(
        total=total, connect=connect, read=read, redirect=redirect, status=status,
@@ -4,15 +4,11 @@ import functools

import json
import os
import sys
import warnings
from fnmatch import fnmatch
from os.path import expanduser
from typing import Any

import pyhocon
import six
from pathlib2 import Path
from pyhocon import ConfigTree
from pyparsing import (
    ParseFatalException,
    ParseException,

@@ -20,6 +16,9 @@ from pyparsing import (

    ParseSyntaxException,
)

from clearml_agent.external import pyhocon
from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory

from .defs import (
    Environment,
    DEFAULT_CONFIG_FOLDER,

@@ -71,6 +70,10 @@ class Config(object):

    # used in place of None in Config.get as default value because None is a valid value
    _MISSING = object()
    extra_config_values_env_key_sep = "__"
    extra_config_values_env_key_prefix = [
        "CLEARML_AGENT" + extra_config_values_env_key_sep,
    ]

    def __init__(
        self,

@@ -90,6 +93,7 @@

        self._env = env or os.environ.get("TRAINS_ENV", Environment.default)
        self.config_paths = set()
        self.is_server = is_server
        self._overrides_configs = None

        if self._verbose:
            print("Config env:%s" % str(self._env))

@@ -100,6 +104,7 @@

        )
        if self._env not in get_options(Environment):
            raise ValueError("Invalid environment %s" % env)

        if relative_to is not None:
            self.load_relative_to(relative_to)

@@ -158,7 +163,9 @@

        if LOCAL_CONFIG_PATHS:
            config = functools.reduce(
                lambda cfg, path: ConfigTree.merge_configs(
                    cfg, self._read_recursive(path, verbose=self._verbose), copy_trees=True
                    cfg,
                    self._read_recursive(path, verbose=self._verbose),
                    copy_trees=True,
                ),
                LOCAL_CONFIG_PATHS,
                config,

@@ -181,9 +188,42 @@

                config,
            )

        config = ConfigTree.merge_configs(
            config, self._read_extra_env_config_values(), copy_trees=True
        )

        config = self.resolve_override_configs(config)

        config["env"] = env
        return config

    def resolve_override_configs(self, initial=None):
        if not self._overrides_configs:
            return initial
        return functools.reduce(
            lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
            self._overrides_configs,
            initial or ConfigTree(),
        )

    def _read_extra_env_config_values(self) -> ConfigTree:
        """ Loads extra configuration from environment-injected values """
        result = ConfigTree()

        for prefix in self.extra_config_values_env_key_prefix:
            keys = sorted(k for k in os.environ if k.startswith(prefix))
            for key in keys:
                path = (
                    key[len(prefix) :]
                    .replace(self.extra_config_values_env_key_sep, ".")
                    .lower()
                )
                result = ConfigTree.merge_configs(
                    result, ConfigFactory.parse_string("{}: {}".format(path, os.environ[key]))
                )

        return result
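So an environment variable key maps to a config path by stripping the `CLEARML_AGENT__` prefix, swapping the `__` separator for dots and lowercasing; for example (illustrative, mirroring the code above):

```python
prefix, sep = "CLEARML_AGENT__", "__"
key = "CLEARML_AGENT__API__HTTP__DEFAULT_METHOD"
path = key[len(prefix):].replace(sep, ".").lower()
print(path)  # api.http.default_method
```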

    def replace(self, config):
        self._config = config

@@ -254,6 +294,12 @@

        )
        return value

    def put(self, key, value):
        self._config.put(key, value)

    def pop(self, key, default=None):
        return self._config.pop(key, default=default)

    def to_dict(self):
        return self._config.as_plain_ordered_dict()

@@ -340,3 +386,10 @@

        except Exception as ex:
            print("Failed loading %s: %s" % (file_path, ex))
            raise

    def set_overrides(self, *dicts):
        """ Set several override dictionaries or ConfigTree objects which should be merged onto the configuration """
        self._overrides_configs = [
            d if isinstance(d, ConfigTree) else pyhocon.ConfigFactory.from_dict(d) for d in dicts
        ]
        self.reload()

@@ -1,53 +1,8 @@

import base64
from distutils.util import strtobool
from typing import Union, Optional, Any, TypeVar, Callable, Tuple

import six

try:
    from typing import Text
except ImportError:
    # windows conda-less hack
    Text = Any


ConverterType = TypeVar("ConverterType", bound=Callable[[Any], Any])


def base64_to_text(value):
    # type: (Any) -> Text
    return base64.b64decode(value).decode("utf-8")


def text_to_bool(value):
    # type: (Text) -> bool
    return bool(strtobool(value))


def any_to_bool(value):
    # type: (Optional[Union[int, float, Text]]) -> bool
    if isinstance(value, six.text_type):
        return text_to_bool(value)
    return bool(value)


def or_(*converters, **kwargs):
    # type: (ConverterType, Tuple[Exception, ...]) -> ConverterType
    """
    Wrapper that implements an "optional converter" pattern. Allows specifying a converter
    for which a set of exceptions is ignored (and the original value is returned)
    :param converters: A converter callable
    :param exceptions: A tuple of exception types to ignore
    """
    # noinspection PyUnresolvedReferences
    exceptions = kwargs.get("exceptions", (ValueError, TypeError))

    def wrapper(value):
        for converter in converters:
            try:
                return converter(value)
            except exceptions:
                pass
        return value

    return wrapper
from clearml_agent.helper.environment.converters import (
    base64_to_text,
    text_to_bool,
    text_to_int,
    safe_text_to_bool,
    any_to_bool,
    or_,
)
@@ -1,104 +1,6 @@

import abc
from typing import Optional, Any, Tuple, Callable, Dict
from clearml_agent.helper.environment import Entry, NotSet

import six

from .converters import any_to_bool

try:
    from typing import Text
except ImportError:
    # windows conda-less hack
    Text = Any


NotSet = object()

Converter = Callable[[Any], Any]


@six.add_metaclass(abc.ABCMeta)
class Entry(object):
    """
    Configuration entry definition
    """

    @classmethod
    def default_conversions(cls):
        # type: () -> Dict[Any, Converter]
        return {
            bool: any_to_bool,
            six.text_type: lambda s: six.text_type(s).strip(),
        }

    def __init__(self, key, *more_keys, **kwargs):
        # type: (Text, Text, Any) -> None
        """
        :param key: Entry's key (at least one).
        :param more_keys: More alternate keys for this entry.
        :param type: Value type. If provided, will be used for choosing a default conversion or
            (if none exists) for casting the environment value.
        :param converter: Value converter. If provided, will be used to convert the environment value.
        :param default: Default value. If provided, will be used as the default value on calls to get() and get_pair()
            in case no value is found for any key and no specific default value was provided in the call.
            Default value is None.
        :param help: Help text describing this entry
        """
        self.keys = (key,) + more_keys
        self.type = kwargs.pop("type", six.text_type)
        self.converter = kwargs.pop("converter", None)
        self.default = kwargs.pop("default", None)
        self.help = kwargs.pop("help", None)

    def __str__(self):
        return str(self.key)

    @property
    def key(self):
        return self.keys[0]

    def convert(self, value, converter=None):
        # type: (Any, Converter) -> Optional[Any]
        converter = converter or self.converter
        if not converter:
            converter = self.default_conversions().get(self.type, self.type)
        return converter(value)

    def get_pair(self, default=NotSet, converter=None):
        # type: (Any, Converter) -> Optional[Tuple[Text, Any]]
        for key in self.keys:
            value = self._get(key)
            if value is NotSet:
                continue
            try:
                value = self.convert(value, converter)
            except Exception as ex:
                self.error("invalid value {key}={value}: {ex}".format(**locals()))
                break
            return key, value
        result = self.default if default is NotSet else default
        return self.key, result

    def get(self, default=NotSet, converter=None):
        # type: (Any, Converter) -> Optional[Any]
        return self.get_pair(default=default, converter=converter)[1]

    def set(self, value):
        # type: (Any) -> None
        # key, _ = self.get_pair(default=None, converter=None)
        for k in self.keys:
            self._set(k, str(value))

    def _set(self, key, value):
        # type: (Text, Text) -> None
        pass

    @abc.abstractmethod
    def _get(self, key):
        # type: (Text) -> Any
        pass

    @abc.abstractmethod
    def error(self, message):
        # type: (Text) -> None
        pass
__all__ = [
    "Entry",
    "NotSet"
]
@@ -1,32 +1,6 @@

from os import getenv, environ
from os import environ

from .converters import text_to_bool
from .entry import Entry, NotSet


class EnvEntry(Entry):
    @classmethod
    def default_conversions(cls):
        conversions = super(EnvEntry, cls).default_conversions().copy()
        conversions[bool] = text_to_bool
        return conversions

    def pop(self):
        for k in self.keys:
            environ.pop(k, None)

    def _get(self, key):
        value = getenv(key, "").strip()
        return value or NotSet

    def _set(self, key, value):
        environ[key] = value

    def __str__(self):
        return "env:{}".format(super(EnvEntry, self).__str__())

    def error(self, message):
        print("Environment configuration: {}".format(message))
from clearml_agent.helper.environment import EnvEntry


def backward_compatibility_support():

@@ -34,6 +8,7 @@

    if ENVIRONMENT_BACKWARD_COMPATIBLE.get():
        # Add TRAINS_ prefix on every CLEARML_ os environment we support
        for k, v in ENVIRONMENT_CONFIG.items():
            # noinspection PyBroadException
            try:
                trains_vars = [var for var in v.vars if var.startswith('CLEARML_')]
                if not trains_vars:

@@ -44,6 +19,7 @@

            except:
                continue
        for k, v in ENVIRONMENT_SDK_PARAMS.items():
            # noinspection PyBroadException
            try:
                trains_vars = [var for var in v if var.startswith('CLEARML_')]
                if not trains_vars:

@@ -55,10 +31,16 @@

                continue

    # set OS environ:
    keys = environ.keys()
    keys = list(environ.keys())
    for k in keys:
        if not k.startswith('CLEARML_'):
            continue
        backwards_k = k.replace('CLEARML_', 'TRAINS_', 1)
        if backwards_k not in keys:
            environ[backwards_k] = environ[k]


__all__ = [
    "EnvEntry",
    "backward_compatibility_support"
]
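The net effect is that every `CLEARML_*` variable gains a `TRAINS_*` alias for older code paths; roughly:

```python
import os

os.environ["CLEARML_API_HOST"] = "https://api.clear.ml"
# equivalent to what backward_compatibility_support() does for the CLEARML_ prefix:
for k in list(os.environ):
    if k.startswith("CLEARML_"):
        os.environ.setdefault(k.replace("CLEARML_", "TRAINS_", 1), os.environ[k])
print(os.environ["TRAINS_API_HOST"])  # https://api.clear.ml
```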

@@ -1,3 +1,14 @@

import base64
import os
from os.path import expandvars, expanduser
from pathlib import Path
from typing import List, TYPE_CHECKING

from clearml_agent.external.pyhocon import HOCONConverter, ConfigTree

if TYPE_CHECKING:
    from .config import Config


def get_items(cls):
    """ get key/value items from an enum-like class (members represent enumeration key/value) """

@@ -7,3 +18,108 @@

def get_options(cls):
    """ get options from an enum-like class (members represent enumeration key/value) """
    return get_items(cls).values()


def apply_environment(config):
    # type: (Config) -> List[str]
    env_vars = config.get("environment", None)
    if not env_vars:
        return []
    if isinstance(env_vars, (list, tuple)):
        env_vars = dict(env_vars)

    keys = list(filter(None, env_vars.keys()))

    for key in keys:
        value = env_vars[key]
        os.environ[str(key)] = str(value if value is not None else "")

    return keys


def apply_files(config):
    # type: (Config) -> None
    files = config.get("files", None)
    if not files:
        return

    if isinstance(files, (list, tuple)):
        files = dict(files)

    print("Creating files from configuration")
    for key, data in files.items():
        path = data.get("path")
        fmt = data.get("format", "string")
        target_fmt = data.get("target_format", "string")
        overwrite = bool(data.get("overwrite", True))
        contents = data.get("contents")
        mode = data.get("mode", None)

        target = Path(expanduser(expandvars(path)))

        # noinspection PyBroadException
        try:
            if target.is_dir():
                print("Skipped [{}]: is a directory {}".format(key, target))
                continue

            if not overwrite and target.is_file():
                print("Skipped [{}]: file exists {}".format(key, target))
                continue
        except Exception as ex:
            print("Skipped [{}]: can't access {} ({})".format(key, target, ex))
            continue

        if contents:
            try:
                if fmt == "base64":
                    contents = base64.b64decode(contents)
                    if target_fmt != "bytes":
                        contents = contents.decode("utf-8")
            except Exception as ex:
                print("Skipped [{}]: failed decoding {} ({})".format(key, fmt, ex))
                continue

        # noinspection PyBroadException
        try:
            target.parent.mkdir(parents=True, exist_ok=True)
        except Exception as ex:
            print("Skipped [{}]: failed creating path {} ({})".format(key, target.parent, ex))
            continue

        try:
            if target_fmt == "bytes":
                try:
                    target.write_bytes(contents)
                except TypeError:
                    # simpler error so the user won't get confused
                    raise TypeError("a bytes-like object is required")
            else:
                try:
                    if target_fmt == "json":
                        text = HOCONConverter.to_json(contents)
                    elif target_fmt in ("yaml", "yml"):
                        text = HOCONConverter.to_yaml(contents)
                    else:
                        if isinstance(contents, ConfigTree):
                            contents = contents.as_plain_ordered_dict()
                        text = str(contents)
                except Exception as ex:
                    print("Skipped [{}]: failed encoding to {} ({})".format(key, target_fmt, ex))
                    continue
                target.write_text(text)
            print("Saved [{}]: {}".format(key, target))
        except Exception as ex:
            print("Skipped [{}]: failed saving file {} ({})".format(key, target, ex))
            continue

        try:
            if mode:
                if isinstance(mode, int):
                    mode = int(str(mode), 8)
                else:
                    mode = int(mode, 8)
                target.chmod(mode)
        except Exception as ex:
            print("Skipped [{}]: failed setting mode {} for {} ({})".format(key, mode, target, ex))
            continue
@@ -118,11 +118,15 @@ class ServiceCommandSection(BaseCommandSection):

        """ The name of the REST service used by this command """
        pass

    def get(self, endpoint, *args, **kwargs):
        return self._session.get(service=self.service, action=endpoint, *args, **kwargs)
    def get(self, endpoint, *args, service=None, session=None, **kwargs):
        session = session or self._session
        service = service or self.service
        return session.get(service=service, action=endpoint, *args, **kwargs)

    def post(self, endpoint, *args, **kwargs):
        return self._session.post(service=self.service, action=endpoint, *args, **kwargs)
    def post(self, endpoint, *args, service=None, session=None, **kwargs):
        session = session or self._session
        service = service or self.service
        return session.post(service=service, action=endpoint, *args, **kwargs)

    def get_with_act_as(self, endpoint, *args, **kwargs):
        return self._session.get_with_act_as(service=self.service, action=endpoint, *args, **kwargs)

@@ -345,7 +349,7 @@

        except AttributeError:
            raise NameResolutionError('Name resolution unavailable for {}'.format(service))

        request = request_cls.from_dict(dict(name=name, only_fields=['name', 'id']))
        request = request_cls.from_dict(dict(name=re.escape(name), only_fields=['name', 'id']))
        # from_dict will ignore unrecognised keyword arguments - not all GetAll's have only_fields
        response = getattr(self._session.send_api(request), service)
        matches = [db_object for db_object in response if name.lower() == db_object.name.lower()]

@@ -1,20 +1,21 @@

from __future__ import print_function

from six.moves import input
from pyhocon import ConfigFactory, ConfigMissingException
from typing import Dict, Optional

from pathlib2 import Path
from six.moves import input
from six.moves.urllib.parse import urlparse

from clearml_agent.backend_api.session import Session
from clearml_agent.backend_api.session.defs import ENV_HOST
from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES

from clearml_agent.external.pyhocon import ConfigFactory, ConfigMissingException

description = """
Please create new clearml credentials through the profile page in your clearml web app (e.g. https://demoapp.demo.clear.ml/profile)
Or with the free hosted service at https://app.community.clear.ml/profile
Please create new clearml credentials through the settings page in your `clearml-server` web app,
or create a free account at https://app.clear.ml/settings/webapp-configuration

In the profile page, press "Create new credentials", then press "Copy to clipboard".
In the settings > workspace page, press "Create new credentials", then press "Copy to clipboard".

Paste copied configuration here:
"""

@@ -27,9 +28,9 @@

host_description = """
Editing configuration file: {CONFIG_FILE}
Enter the url of the clearml-server's Web service, for example: {HOST}
Enter the url of the clearml-server's Web service, for example: {HOST} or https://app.clear.ml
""".format(
    CONFIG_FILE=LOCAL_CONFIG_FILES[0],
    CONFIG_FILE=LOCAL_CONFIG_FILES[-1],
    HOST=def_host,
)

@@ -43,7 +44,7 @@ def main():

    if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
        print('Configuration file already exists: {}'.format(str(conf_file)))
        print('Leaving setup, feel free to edit the configuration file.')
        print('Leaving setup. If you\'ve previously initialized the ClearML SDK on this machine, manually add an \'agent\' section to this file.')
        return

    print(description, end='')

@@ -84,7 +85,7 @@

        host = input_url('API Host', api_server)
    else:
        print(host_description)
        host = input_url('WEB Host', '')
        host = input_url('WEB Host', 'https://app.clear.ml')

    parsed_host = verify_url(host)
    api_host, files_host, web_host = parse_host(parsed_host, allow_input=True)
@@ -112,13 +113,34 @@ def main():
|
||||
print('Exiting setup without creating configuration file')
|
||||
return
|
||||
|
||||
selection = input_options(
|
||||
'Default Output URI (used to automatically store models and artifacts)',
|
||||
{'N': 'None', 'S': 'ClearML Server', 'C': 'Custom'},
|
||||
default='None'
|
||||
)
|
||||
if selection == 'Custom':
|
||||
print('Custom Default Output URI: ', end='')
|
||||
default_output_uri = input().strip()
|
||||
elif selection == "ClearML Server":
|
||||
default_output_uri = files_host
|
||||
else:
|
||||
default_output_uri = None
|
||||
|
||||
print('\nDefault Output URI: {}'.format(default_output_uri if default_output_uri else 'not set'))
|
||||
|
||||
# get GIT User/Pass for cloning
|
||||
print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
|
||||
git_user = input()
|
||||
if git_user.strip():
|
||||
print('Enter password for user \'{}\': '.format(git_user), end='')
|
||||
print(
|
||||
"Git personal token is equivalent to a password, to learn how to generate a token:\n"
|
||||
" GitHub: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token\n" # noqa
|
||||
" Bitbucket: https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/\n"
|
||||
" GitLab: https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html\n"
|
||||
)
|
||||
print('Enter git personal token for user \'{}\': '.format(git_user), end='')
|
||||
git_pass = input()
|
||||
print('Git repository cloning will be using user={} password={}'.format(git_user, git_pass))
|
||||
print('Git repository cloning will be using user={} token={}'.format(git_user, git_pass))
|
||||
else:
|
||||
git_user = None
|
||||
git_pass = None
|
||||
@@ -157,7 +179,7 @@ def main():
|
||||
' api_server: %s\n' \
|
||||
' web_server: %s\n' \
|
||||
' files_server: %s\n' \
|
||||
' # Credentials are generated using the webapp, %s/profile\n' \
|
||||
' # Credentials are generated using the webapp, %s/settings\n' \
|
||||
' # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY\n' \
|
||||
' credentials {"access_key": "%s", "secret_key": "%s"}\n' \
|
||||
'}\n\n' % (api_host, web_host, files_host,
|
||||
@@ -173,6 +195,13 @@ def main():
|
||||
'agent.package_manager.extra_index_url= ' \
|
||||
'[\n{}\n]\n\n'.format("\n".join(map("\"{}\"".format, extra_index_urls)))
|
||||
f.write(extra_index_str)
|
||||
if default_output_uri:
|
||||
default_output_url_str = '# Default Task output_uri. if output_uri is not provided to Task.init, ' \
|
||||
'default_output_uri will be used instead.\n' \
|
||||
'sdk.development.default_output_uri="{}"\n' \
|
||||
'\n'.format(default_output_uri.strip('"'))
|
||||
f.write(default_output_url_str)
|
||||
default_conf = default_conf.replace('default_output_uri: ""', '# default_output_uri: ""')
|
||||
f.write(default_conf)
|
||||
except Exception:
|
||||
print('Error! Could not write configuration file at: {}'.format(str(conf_file)))
|
||||
@@ -299,6 +328,25 @@ def input_url(host_type, host=None):
|
||||
return host
|
||||
|
||||
|
||||
def input_options(message, options, default=None):
|
||||
# type: (str, Dict[str, str], Optional[str]) -> str
|
||||
options_msg = "/".join(
|
||||
"".join(('(' + c.upper() + ')') if c == o else c for c in option)
|
||||
for o, option in options.items()
|
||||
)
|
||||
if default:
|
||||
options_msg += " [{}]".format(default)
|
||||
while True:
|
||||
print('{}: {} '.format(message, options_msg), end='')
|
||||
res = input().strip()
|
||||
if not res:
|
||||
return default
|
||||
elif res.lower() in options:
|
||||
return options[res.lower()]
|
||||
elif res.upper() in options:
|
||||
return options[res.upper()]
|
||||
|
||||
|
||||
def input_host_port(host_type, parsed_host):
|
||||
print('Enter port for {} host '.format(host_type), end='')
|
||||
replace_port = input().lower()
|
||||
|
||||
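The `input_options` helper added above is what drives the new Default Output URI prompt. A minimal sketch of how it behaves (the option dict mirrors the call in `main()` above; the rendered prompt line is reconstructed from the formatting logic, not captured output):

```python
# Sketch: single-letter keys map to the returned selection strings.
selection = input_options(
    'Default Output URI (used to automatically store models and artifacts)',
    {'N': 'None', 'S': 'ClearML Server', 'C': 'Custom'},
    default='None',
)
# Rendered prompt (the key letter is parenthesized inside each option name):
#   Default Output URI (...): (N)one/ClearML (S)erver/(C)ustom [None]
# Empty input returns the default 'None'; "s" or "S" returns 'ClearML Server'.
```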
@@ -2,8 +2,7 @@ from __future__ import print_function

import json
import time

from future.builtins import super
from typing import List, Tuple

from clearml_agent.commands.base import ServiceCommandSection
from clearml_agent.helper.base import return_list
@@ -21,14 +20,16 @@ class Events(ServiceCommandSection):
        """ Events command service endpoint """
        return 'events'

    def send_events(self, list_events):
    def send_events(self, list_events, session=None):
        def send_packet(jsonlines):
            if not jsonlines:
                return 0
            num_lines = len(jsonlines)
            jsonlines = '\n'.join(jsonlines)

            new_events = self.post('add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'})
            new_events = self.post(
                'add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'}, session=session
            )
            if new_events['added'] != num_lines:
                print('Error (%s) sending events only %d of %d registered' %
                      (new_events['errors'], new_events['added'], num_lines))
@@ -57,7 +58,43 @@ class Events(ServiceCommandSection):
        # print('Sending events done: %d / %d events sent' % (sent_events, len(list_events)))
        return sent_events

    def send_log_events(self, worker_id, task_id, lines, level='DEBUG'):
    def send_log_events_with_timestamps(
            self, worker_id, task_id, lines_with_ts: List[Tuple[str, str]], level="DEBUG", session=None
    ):
        log_events = []

        # break log lines into event packets
        for ts, line in return_list(lines_with_ts):
            # HACK ignore terminal reset ANSI code
            if line == '\x1b[0m':
                continue
            while line:
                if len(line) <= self.max_event_size:
                    msg = line
                    line = None
                else:
                    msg = line[:self.max_event_size]
                    line = line[self.max_event_size:]

                log_events.append(
                    {
                        "type": "log",
                        "level": level,
                        "task": task_id,
                        "worker": worker_id,
                        "msg": msg,
                        "timestamp": ts,
                    }
                )

                if line and ts is not None:
                    # advance timestamp in case we break a line to more than one part
                    ts += 1

        # now send the events
        return self.send_events(list_events=log_events, session=session)

    def send_log_events(self, worker_id, task_id, lines, level='DEBUG', session=None):
        log_events = []
        base_timestamp = int(time.time() * 1000)
        base_log_items = {
@@ -94,4 +131,4 @@ class Events(ServiceCommandSection):
            log_events.append(get_event(count))

        # now send the events
        return self.send_events(list_events=log_events)
        return self.send_events(list_events=log_events, session=session)
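A hedged usage sketch of the new timestamped variant (the `events` service instance, the ids, and `session` are assumed to exist; note that despite the `Tuple[str, str]` annotation, the `ts += 1` logic above treats timestamps as millisecond integers):

```python
# Illustrative only: split-safe log shipping with explicit timestamps.
lines_with_ts = [
    (1700000000000, "first log line\n"),   # (timestamp in ms, log text)
    (1700000000001, "second log line\n"),
]
sent = events.send_log_events_with_timestamps(
    worker_id="machine-a:0",
    task_id="aabbccdd",
    lines_with_ts=lines_with_ts,
    level="DEBUG",
    session=session,  # reuse one backend session for the add_batch POST
)
print("events sent:", sent)
```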
clearml_agent/commands/resolver.py (new file, 177 lines)
@@ -0,0 +1,177 @@
import json
import re
import shlex
from copy import copy

from clearml_agent.backend_api.session import Request
from clearml_agent.helper.docker_args import DockerArgsSanitizer
from clearml_agent.helper.package.requirements import (
    RequirementsManager, MarkerRequirement,
    compare_version_rules, )


def resolve_default_container(session, task_id, container_config, ignore_match_rules=False):
    container_lookup = session.config.get('agent.default_docker.match_rules', None)
    if not session.check_min_api_version("2.13") or not container_lookup:
        return container_config

    # check backend support before sending any more requests (because they will fail and crash the Task)
    try:
        session.verify_feature_set('advanced')
    except ValueError:
        # ignoring matching rules only supported in higher tiers
        return container_config

    if ignore_match_rules:
        print("INFO: default docker command line override, ignoring default docker container match rules")
        # ignoring matching rules only supported in higher tiers
        return container_config

    result = session.send_request(
        service='tasks',
        action='get_all',
        version='2.14',
        json={'id': [task_id],
              'only_fields': ['script.requirements', 'script.binary',
                              'script.repository', 'script.branch',
                              'project', 'container'],
              'search_hidden': True},
        method=Request.def_method,
        async_enable=False,
    )
    try:
        task_info = result.json()['data']['tasks'][0] if result.ok else {}
    except (ValueError, TypeError):
        return container_config

    from clearml_agent.external.requirements_parser.requirement import Requirement

    # store tasks repository
    repository = task_info.get('script', {}).get('repository') or ''
    branch = task_info.get('script', {}).get('branch') or ''
    binary = task_info.get('script', {}).get('binary') or ''
    requested_container = task_info.get('container', {})

    # get project full path
    project_full_name = ''
    if task_info.get('project', None):
        result = session.send_request(
            service='projects',
            action='get_all',
            version='2.13',
            json={
                'id': [task_info.get('project')],
                'only_fields': ['name'],
            },
            method=Request.def_method,
            async_enable=False,
        )
        try:
            if result.ok:
                project_full_name = result.json()['data']['projects'][0]['name'] or ''
        except (ValueError, TypeError):
            pass

    task_packages_lookup = {}
    for entry in container_lookup:
        match = entry.get('match', None)
        if not match:
            continue
        if match.get('project', None):
            # noinspection PyBroadException
            try:
                if not re.search(match.get('project', None), project_full_name):
                    continue
            except Exception:
                print('Failed parsing regular expression \"{}\" in rule: {}'.format(
                    match.get('project', None), entry))
                continue

        if match.get('script.repository', None):
            # noinspection PyBroadException
            try:
                if not re.search(match.get('script.repository', None), repository):
                    continue
            except Exception:
                print('Failed parsing regular expression \"{}\" in rule: {}'.format(
                    match.get('script.repository', None), entry))
                continue

        if match.get('script.branch', None):
            # noinspection PyBroadException
            try:
                if not re.search(match.get('script.branch', None), branch):
                    continue
            except Exception:
                print('Failed parsing regular expression \"{}\" in rule: {}'.format(
                    match.get('script.branch', None), entry))
                continue

        if match.get('script.binary', None):
            # noinspection PyBroadException
            try:
                if not re.search(match.get('script.binary', None), binary):
                    continue
            except Exception:
                print('Failed parsing regular expression \"{}\" in rule: {}'.format(
                    match.get('script.binary', None), entry))
                continue

        # if match.get('image', None):
        #     # noinspection PyBroadException
        #     try:
        #         if not re.search(match.get('image', None), requested_container.get('image', '')):
        #             continue
        #     except Exception:
        #         print('Failed parsing regular expression \"{}\" in rule: {}'.format(
        #             match.get('image', None), entry))
        #         continue

        matched = True
        for req_section in ['script.requirements.pip', 'script.requirements.conda']:
            if not match.get(req_section, None):
                continue

            match_pip_reqs = [MarkerRequirement(Requirement.parse('{} {}'.format(k, v)))
                              for k, v in match.get(req_section, None).items()]

            if not task_packages_lookup.get(req_section):
                req_section_parts = req_section.split('.')
                task_packages_lookup[req_section] = \
                    RequirementsManager.parse_requirements_section_to_marker_requirements(
                        requirements=task_info.get(req_section_parts[0], {}).get(
                            req_section_parts[1], {}).get(req_section_parts[2], None)
                    )

            matched_all_reqs = True
            for mr in match_pip_reqs:
                matched_req = False
                for pr in task_packages_lookup[req_section]:
                    if mr.req.name != pr.req.name:
                        continue
                    if compare_version_rules(mr.specs, pr.specs):
                        matched_req = True
                        break
                if not matched_req:
                    matched_all_reqs = False
                    break

            # if we have a match, check the next requirements section
            if matched_all_reqs:
                continue
            # no match, stop scanning this rule
            matched = False
            break

        if matched:
            if not container_config.get('image'):
                container_config['image'] = entry.get('image', None)
            if not container_config.get('arguments'):
                container_config['arguments'] = entry.get('arguments', None) or ''
            if isinstance(container_config.get('arguments'), str):
                container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
            print('INFO: Matching default container with rule:\n{}'.format(json.dumps(entry)))
            return container_config

    return container_config
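For reference, `resolve_default_container` iterates `agent.default_docker.match_rules` entries shaped roughly as below (a sketch in parsed-dict form; every value is illustrative, and all `match` fields are regular expressions tested with `re.search` per the code above):

```python
example_rule = {
    # applied only where the task did not specify its own image/arguments
    "image": "nvidia/cuda:11.8.0-runtime-ubuntu20.04",
    "arguments": "--ipc=host",
    "match": {
        "project": r"^examples/",                        # project full name
        "script.repository": r"github\.com/allegroai",   # task repository URL
        "script.branch": r"^main$",
        "script.binary": r"python3\.\d+$",
        "script.requirements.pip": {"torch": ">=1.12"},  # version rules per package
    },
}
```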
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
from pyhocon import ConfigTree

import six

from clearml_agent.external.pyhocon import ConfigTree
from clearml_agent.helper.base import Singleton
@@ -1,14 +1,14 @@
import shlex
from datetime import timedelta
from distutils.util import strtobool
from enum import IntEnum
from os import getenv, environ
from typing import Text, Optional, Union, Tuple, Any

from furl import furl
import six
from pathlib2 import Path

import six
from clearml_agent.helper.base import normalize_path
from clearml_agent.helper.environment.converters import strtobool

PROGRAM_NAME = "clearml-agent"
FROM_FILE_PREFIX_CHARS = "@"
@@ -34,6 +34,7 @@ class EnvironmentConfig(object):
    conversions = {
        bool: lambda value: bool(strtobool(value)),
        six.text_type: lambda s: six.text_type(s).strip(),
        list: lambda s: shlex.split(s.strip()),
    }

    def __init__(self, *names, **kwargs):
@@ -62,42 +63,71 @@
        return None


ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
ENV_AGENT_AUTH_TOKEN = EnvironmentConfig("CLEARML_AUTH_TOKEN")
ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")

ENVIRONMENT_CONFIG = {
    "api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
    "api.api_server": EnvironmentConfig(
        "CLEARML_API_HOST",
        "TRAINS_API_HOST",
    ),
    "api.files_server": EnvironmentConfig(
        "CLEARML_FILES_HOST",
        "TRAINS_FILES_HOST",
    ),
    "api.web_server": EnvironmentConfig(
        "CLEARML_WEB_HOST",
        "TRAINS_WEB_HOST",
    ),
    "api.credentials.access_key": EnvironmentConfig(
        "CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
        "CLEARML_API_ACCESS_KEY",
        "TRAINS_API_ACCESS_KEY",
    ),
    "api.credentials.secret_key": EnvironmentConfig(
        "CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY",
    "api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
    "agent.worker_name": EnvironmentConfig(
        "CLEARML_WORKER_NAME",
        "TRAINS_WORKER_NAME",
    ),
    "agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
    "agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
    "agent.cuda_version": EnvironmentConfig(
        "CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"
    ),
    "agent.cudnn_version": EnvironmentConfig(
        "CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"
    ),
    "agent.cpu_only": EnvironmentConfig(
        names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
    "agent.worker_id": EnvironmentConfig(
        "CLEARML_WORKER_ID",
        "TRAINS_WORKER_ID",
    ),
    "agent.cuda_version": EnvironmentConfig("CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"),
    "agent.cudnn_version": EnvironmentConfig("CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"),
    "agent.cpu_only": EnvironmentConfig(names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool),
    "agent.crash_on_exception": EnvironmentConfig("CLEAMRL_AGENT_CRASH_ON_EXCEPTION", type=bool),
    "sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
    "sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
    "sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
    "sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
    "sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
                                       'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")},
    "sdk.azure.storage.containers.0": {
        "account_name": EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
        "account_key": ENV_AZURE_ACCOUNT_KEY,
    },
    "sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
}
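These mappings let environment variables override configuration keys, with legacy TRAINS_* names kept as fallbacks. A hedged sketch (assuming `EnvironmentConfig.get()` returns the first defined variable, converted via the `conversions` table above):

```python
import os

# Hypothetical usage: the first defined name wins, CLEARML_* before TRAINS_*.
os.environ["CLEARML_WORKER_ID"] = "machine-a:gpu0"
worker_id = EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID").get()
assert worker_id == "machine-a:gpu0"

# type=bool values pass through strtobool, so "1"/"true"/"yes" become True.
os.environ["CPU_ONLY"] = "true"
cpu_only = EnvironmentConfig(names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool).get()
assert cpu_only is True
```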
ENVIRONMENT_SDK_PARAMS = {
    "task_id": ("CLEARML_TASK_ID", "TRAINS_TASK_ID", ),
    "config_file": ("CLEARML_CONFIG_FILE", "TRAINS_CONFIG_FILE", ),
    "log_level": ("CLEARML_LOG_LEVEL", "TRAINS_LOG_LEVEL", ),
    "log_to_backend": ("CLEARML_LOG_TASK_TO_BACKEND", "TRAINS_LOG_TASK_TO_BACKEND", ),
    "task_id": (
        "CLEARML_TASK_ID",
        "TRAINS_TASK_ID",
    ),
    "config_file": (
        "CLEARML_CONFIG_FILE",
        "TRAINS_CONFIG_FILE",
    ),
    "log_level": (
        "CLEARML_LOG_LEVEL",
        "TRAINS_LOG_LEVEL",
    ),
    "log_to_backend": (
        "CLEARML_LOG_TASK_TO_BACKEND",
        "TRAINS_LOG_TASK_TO_BACKEND",
    ),
}

ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig(
    names=("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV"), type=bool)
ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig(names=("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV"), type=bool)

VIRTUAL_ENVIRONMENT_PATH = {
    "python2": normalize_path(CONFIG_DIR, "py2venv"),
@@ -116,24 +146,110 @@ TOKEN_EXPIRATION_SECONDS = int(timedelta(days=2).total_seconds())

METADATA_EXTENSION = ".json"

DEFAULT_VENV_UPDATE_URL = (
    "https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
)
DEFAULT_VENV_UPDATE_URL = "https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
WORKING_REPOSITORY_DIR = "task_repository"
WORKING_STANDALONE_DIR = "code"
DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
PIP_EXTRA_INDICES = [
]
PIP_EXTRA_INDICES = []
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAGE')
ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
                                          'TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
ENV_PIP_EXTRA_INSTALL_FLAGS = EnvironmentConfig("CLEARML_EXTRA_PIP_INSTALL_FLAGS", type=list)
ENV_DOCKER_IMAGE = EnvironmentConfig("CLEARML_DOCKER_IMAGE", "TRAINS_DOCKER_IMAGE")
ENV_WORKER_ID = EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID")
ENV_WORKER_TAGS = EnvironmentConfig("CLEARML_WORKER_TAGS")
ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig("CLEARML_AGENT_SKIP_PIP_VENV_INSTALL")
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig("CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL", type=bool)
ENV_AGENT_FORCE_CODE_DIR = EnvironmentConfig("CLEARML_AGENT_FORCE_CODE_DIR")
ENV_AGENT_FORCE_EXEC_SCRIPT = EnvironmentConfig("CLEARML_AGENT_FORCE_EXEC_SCRIPT")
ENV_AGENT_FORCE_POETRY = EnvironmentConfig("CLEARML_AGENT_FORCE_POETRY", type=bool)
ENV_AGENT_FORCE_TASK_INIT = EnvironmentConfig("CLEARML_AGENT_FORCE_TASK_INIT", type=bool)
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig("CLEARML_DOCKER_SKIP_GPUS_FLAG", "TRAINS_DOCKER_SKIP_GPUS_FLAG")
ENV_AGENT_GIT_USER = EnvironmentConfig("CLEARML_AGENT_GIT_USER", "TRAINS_AGENT_GIT_USER")
ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS")
ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST")
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool)
ENV_AGENT_DEBUG_GET_NEXT_TASK = EnvironmentConfig("CLEARML_AGENT_DEBUG_GET_NEXT_TASK", type=bool)
ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK")
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER")
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH")
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig(
    "CLEARML_AGENT_K8S_HOST_MOUNT",
    "CLEARML_AGENT_DOCKER_HOST_MOUNT",
    "TRAINS_AGENT_K8S_HOST_MOUNT",
    "TRAINS_AGENT_DOCKER_HOST_MOUNT",
)
ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH")
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list)
ENV_EXTRA_DOCKER_LABELS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_LABELS", type=list)
ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV")
ENV_CONFIG_BC_IN_STANDALONE = EnvironmentConfig("CLEARML_AGENT_STANDALONE_CONFIG_BC", type=bool)
""" Maintain backwards compatible configuration when launching in standalone mode """

ENV_FORCE_DOCKER_AGENT_REPO = EnvironmentConfig("FORCE_CLEARML_AGENT_REPO", "CLEARML_AGENT_DOCKER_AGENT_REPO")

ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
"""
Specify a restart value for services agent task containers.
Note that when a restart value is provided, task containers will not be run with the '--rm' flag and will
not be cleaned up automatically when completed (this will need to be done externally using the
'docker container prune' command to free up resources).
Value format for this env var is "<restart-value>;<task-selector>", where:
- <restart-value> can be any valid restart value for docker-run (see https://docs.docker.com/engine/reference/commandline/run/#restart)
- <task-selector> is optional, allowing this behaviour to be restricted to specific tasks. The format is
  "<path-to-task-field>=<value>", where:
  * <path-to-task-field> is a dot-separated path to a task field (e.g. "container.image")
  * <value> is optional. If not provided, the restart policy will be applied to the task container if the
    path provided exists. If provided, the restart policy will be applied if the value matches the value
    obtained from the task (value parsing and comparison is based on the type of value obtained from the task)
For example:
  CLEARML_AGENT_SERVICES_DOCKER_RESTART=unless-stopped
  CLEARML_AGENT_SERVICES_DOCKER_RESTART=unless-stopped;container.image=some-image
"""

ENV_FORCE_SYSTEM_SITE_PACKAGES = EnvironmentConfig("CLEARML_AGENT_FORCE_SYSTEM_SITE_PACKAGES", type=bool)
""" Force system_site_packages: true when running tasks in containers (i.e. docker mode or k8s glue) """

ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig("CLEARML_AGENT_CUSTOM_BUILD_SCRIPT")
"""
Specifies a custom environment setup script to be executed instead of installing a virtual environment.
If provided, this script is executed following Git cloning. The script command may include environment variables and
will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
The script can also be specified using the `agent.custom_build_script` configuration setting.

When running the script, the following environment variables will be set:
- CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
  contents in JSON format
- CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
- CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
- CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
- CLEARML_GIT_ROOT: path to the cloned Git repository
- CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
  this file must be in the following JSON format:
  ```json
  {
    "binary": "/absolute/path/to/python-executable",
    "entry_point": "/absolute/path/to/task-entrypoint-script",
    "working_dir": "/absolute/path/to/task-working/dir"
  }
  ```
If provided, the agent will use these instead of the predefined task script section to execute the task and will
skip virtual environment creation.

In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
In case the custom script is specified but does not exist, or if the custom script does not write valid content
into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
standard flow.
"""
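A minimal sketch of a custom build script honoring this contract (hedged: the environment setup step and file layout are illustrative; only the environment variables documented above are assumed):

```python
#!/usr/bin/env python3
# Hypothetical CLEARML_AGENT_CUSTOM_BUILD_SCRIPT: prepare the environment,
# then hand the chosen interpreter and entry point back to the agent.
import json
import os
import sys

git_root = os.environ["CLEARML_GIT_ROOT"]            # cloned repository root
entry = os.environ["CLEARML_TASK_SCRIPT_ENTRY"]      # task entry point script
workdir = os.environ["CLEARML_TASK_WORKING_DIR"]     # task working directory
output_path = os.environ["CLEARML_CUSTOM_BUILD_OUTPUT"]

# ... set up the environment here (conda, a prebuilt venv, system python, ...)

# Write the output file; if it is missing or invalid, the agent emits a
# warning and falls back to the standard virtual-environment flow.
with open(output_path, "w") as f:
    json.dump(
        {
            "binary": sys.executable,  # illustrative: reuse this interpreter
            "entry_point": os.path.join(git_root, entry),
            "working_dir": os.path.join(git_root, workdir),
        },
        f,
    )
```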
ENV_PACKAGE_PYTORCH_RESOLVE = EnvironmentConfig("CLEARML_AGENT_PACKAGE_PYTORCH_RESOLVE")

ENV_TEMP_STDOUT_FILE_DIR = EnvironmentConfig("CLEARML_AGENT_TEMP_STDOUT_FILE_DIR")

ENV_GIT_CLONE_VERBOSE = EnvironmentConfig("CLEARML_AGENT_GIT_CLONE_VERBOSE", type=bool)

ENV_GPU_FRACTIONS = EnvironmentConfig("CLEARML_AGENT_GPU_FRACTIONS")


class FileBuffering(IntEnum):
@@ -84,3 +84,13 @@ class MissingPackageError(CommandFailedError):
    def __str__(self):
        return '{self.__class__.__name__}: ' \
               '"{self.name}" package is required. Please run "pip install {self.name}"'.format(self=self)


class CustomBuildScriptFailed(CommandFailedError):
    def __init__(self, errno, *args, **kwargs):
        super(CustomBuildScriptFailed, self).__init__(*args, **kwargs)
        self.errno = errno


class SkippedCustomBuildScript(CommandFailedError):
    pass
clearml_agent/external/pyhocon/__init__.py (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
from .config_parser import ConfigParser, ConfigFactory, ConfigMissingException
from .config_tree import ConfigTree
from .converter import HOCONConverter

__all__ = ["ConfigParser", "ConfigFactory", "ConfigMissingException", "ConfigTree", "HOCONConverter"]
clearml_agent/external/pyhocon/config_parser.py (vendored, new file, 762 lines)
@@ -0,0 +1,762 @@
import itertools
import re
import os
import socket
import contextlib
import codecs
from datetime import timedelta

from pyparsing import Forward, Keyword, QuotedString, Word, Literal, Suppress, Regex, Optional, SkipTo, ZeroOrMore, \
    Group, lineno, col, TokenConverter, replaceWith, alphanums, alphas8bit, ParseSyntaxException, StringEnd
from pyparsing import ParserElement
from .config_tree import ConfigTree, ConfigSubstitution, ConfigList, ConfigValues, ConfigUnquotedString, \
    ConfigInclude, NoneValue, ConfigQuotedString
from .exceptions import ConfigSubstitutionException, ConfigMissingException, ConfigException
import logging
import copy

use_urllib2 = False
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError
except ImportError:  # pragma: no cover
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen, HTTPError, URLError

    use_urllib2 = True
try:
    basestring
except NameError:  # pragma: no cover
    basestring = str
    unicode = str

logger = logging.getLogger(__name__)

#
# Substitution Defaults
#


class DEFAULT_SUBSTITUTION(object):
    pass


class MANDATORY_SUBSTITUTION(object):
    pass


class NO_SUBSTITUTION(object):
    pass


class STR_SUBSTITUTION(object):
    pass


def period(period_value, period_unit):
    try:
        from dateutil.relativedelta import relativedelta as period_impl
    except Exception:
        from datetime import timedelta as period_impl

    if period_unit == 'nanoseconds':
        period_unit = 'microseconds'
        period_value = int(period_value / 1000)

    arguments = dict(zip((period_unit,), (period_value,)))

    if period_unit == 'milliseconds':
        return timedelta(**arguments)

    return period_impl(**arguments)


class ConfigFactory(object):

    @classmethod
    def parse_file(cls, filename, encoding='utf-8', required=True, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
        """Parse file

        :param filename: filename
        :type filename: basestring
        :param encoding: file encoding
        :type encoding: basestring
        :param required: If true, raises an exception if can't load file
        :type required: boolean
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param unresolved_value: assigned value value to unresolved substitution.
            If overriden with a default value, it will replace all unresolved value to the default value.
            If it is set to to pyhocon.STR_SUBSTITUTION then it will replace the value by its
            substitution expression (e.g., ${x})
        :type unresolved_value: class
        :return: Config object
        :type return: Config
        """
        try:
            with codecs.open(filename, 'r', encoding=encoding) as fd:
                content = fd.read()
                return cls.parse_string(content, os.path.dirname(filename), resolve, unresolved_value)
        except IOError as e:
            if required:
                raise e
            logger.warn('Cannot include file %s. File does not exist or cannot be read.', filename)
            return []

    @classmethod
    def parse_URL(cls, url, timeout=None, resolve=True, required=False, unresolved_value=DEFAULT_SUBSTITUTION):
        """Parse URL

        :param url: url to parse
        :type url: basestring
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param unresolved_value: assigned value value to unresolved substitution.
            If overriden with a default value, it will replace all unresolved value to the default value.
            If it is set to to pyhocon.STR_SUBSTITUTION then it will replace the value by
            its substitution expression (e.g., ${x})
        :type unresolved_value: boolean
        :return: Config object or []
        :type return: Config or list
        """
        socket_timeout = socket._GLOBAL_DEFAULT_TIMEOUT if timeout is None else timeout

        try:
            with contextlib.closing(urlopen(url, timeout=socket_timeout)) as fd:
                content = fd.read() if use_urllib2 else fd.read().decode('utf-8')
                return cls.parse_string(content, os.path.dirname(url), resolve, unresolved_value)
        except (HTTPError, URLError) as e:
            logger.warn('Cannot include url %s. Resource is inaccessible.', url)
            if required:
                raise e
            else:
                return []

    @classmethod
    def parse_string(cls, content, basedir=None, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
        """Parse URL

        :param content: content to parse
        :type content: basestring
        :param resolve: If true, resolve substitutions
        :param resolve: if true, resolve substitutions
        :type resolve: boolean
        :param unresolved_value: assigned value value to unresolved substitution.
            If overriden with a default value, it will replace all unresolved value to the default value.
            If it is set to to pyhocon.STR_SUBSTITUTION then it will replace the value by
            its substitution expression (e.g., ${x})
        :type unresolved_value: boolean
        :return: Config object
        :type return: Config
        """
        return ConfigParser().parse(content, basedir, resolve, unresolved_value)

    @classmethod
    def from_dict(cls, dictionary, root=False):
        """Convert dictionary (and ordered dictionary) into a ConfigTree
        :param dictionary: dictionary to convert
        :type dictionary: dict
        :return: Config object
        :type return: Config
        """

        def create_tree(value):
            if isinstance(value, dict):
                res = ConfigTree(root=root)
                for key, child_value in value.items():
                    res.put(key, create_tree(child_value))
                return res
            if isinstance(value, list):
                return [create_tree(v) for v in value]
            else:
                return value

        return create_tree(dictionary)
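A short usage sketch of the vendored factory (standard pyhocon behaviour; the HOCON snippet is illustrative):

```python
# Parse an in-memory HOCON snippet; substitutions and value concatenation
# are resolved by ConfigParser.parse() below.
config = ConfigFactory.parse_string("""
server {
  host = localhost
  port = 8008
  url = "http://"${server.host}":"${server.port}
}
""")
assert config.get("server.port") == 8008
assert config.get("server.url") == "http://localhost:8008"
```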
class ConfigParser(object):
|
||||
"""
|
||||
Parse HOCON files: https://github.com/typesafehub/config/blob/master/HOCON.md
|
||||
"""
|
||||
|
||||
REPLACEMENTS = {
|
||||
'\\\\': '\\',
|
||||
'\\\n': '\n',
|
||||
'\\n': '\n',
|
||||
'\\r': '\r',
|
||||
'\\t': '\t',
|
||||
'\\=': '=',
|
||||
'\\#': '#',
|
||||
'\\!': '!',
|
||||
'\\"': '"',
|
||||
}
|
||||
|
||||
period_type_map = {
|
||||
'nanoseconds': ['ns', 'nano', 'nanos', 'nanosecond', 'nanoseconds'],
|
||||
|
||||
'microseconds': ['us', 'micro', 'micros', 'microsecond', 'microseconds'],
|
||||
'milliseconds': ['ms', 'milli', 'millis', 'millisecond', 'milliseconds'],
|
||||
'seconds': ['s', 'second', 'seconds'],
|
||||
'minutes': ['m', 'minute', 'minutes'],
|
||||
'hours': ['h', 'hour', 'hours'],
|
||||
'weeks': ['w', 'week', 'weeks'],
|
||||
'days': ['d', 'day', 'days'],
|
||||
|
||||
}
|
||||
|
||||
optional_period_type_map = {
|
||||
'months': ['mo', 'month', 'months'], # 'm' from hocon spec removed. conflicts with minutes syntax.
|
||||
'years': ['y', 'year', 'years']
|
||||
}
|
||||
|
||||
supported_period_map = None
|
||||
|
||||
@classmethod
|
||||
def get_supported_period_type_map(cls):
|
||||
if cls.supported_period_map is None:
|
||||
cls.supported_period_map = {}
|
||||
cls.supported_period_map.update(cls.period_type_map)
|
||||
|
||||
try:
|
||||
from dateutil import relativedelta
|
||||
|
||||
if relativedelta is not None:
|
||||
cls.supported_period_map.update(cls.optional_period_type_map)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return cls.supported_period_map
|
||||
|
||||
@classmethod
|
||||
def parse(cls, content, basedir=None, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
|
||||
"""parse a HOCON content
|
||||
|
||||
:param content: HOCON content to parse
|
||||
:type content: basestring
|
||||
:param resolve: if true, resolve substitutions
|
||||
:type resolve: boolean
|
||||
:param unresolved_value: assigned value value to unresolved substitution.
|
||||
If overriden with a default value, it will replace all unresolved value to the default value.
|
||||
If it is set to to pyhocon.STR_SUBSTITUTION then it will replace the value by
|
||||
its substitution expression (e.g., ${x})
|
||||
:type unresolved_value: boolean
|
||||
:return: a ConfigTree or a list
|
||||
"""
|
||||
|
||||
unescape_pattern = re.compile(r'\\.')
|
||||
|
||||
def replace_escape_sequence(match):
|
||||
value = match.group(0)
|
||||
return cls.REPLACEMENTS.get(value, value)
|
||||
|
||||
def norm_string(value):
|
||||
return unescape_pattern.sub(replace_escape_sequence, value)
|
||||
|
||||
def unescape_string(tokens):
|
||||
return ConfigUnquotedString(norm_string(tokens[0]))
|
||||
|
||||
def parse_multi_string(tokens):
|
||||
# remove the first and last 3 "
|
||||
return tokens[0][3: -3]
|
||||
|
||||
def convert_number(tokens):
|
||||
n = tokens[0]
|
||||
try:
|
||||
return int(n, 10)
|
||||
except ValueError:
|
||||
return float(n)
|
||||
|
||||
def safe_convert_number(tokens):
|
||||
n = tokens[0]
|
||||
try:
|
||||
return int(n, 10)
|
||||
except ValueError:
|
||||
try:
|
||||
return float(n)
|
||||
except ValueError:
|
||||
return n
|
||||
|
||||
def convert_period(tokens):
|
||||
|
||||
period_value = int(tokens.value)
|
||||
period_identifier = tokens.unit
|
||||
|
||||
period_unit = next((single_unit for single_unit, values
|
||||
in cls.get_supported_period_type_map().items()
|
||||
if period_identifier in values))
|
||||
|
||||
return period(period_value, period_unit)
|
||||
|
||||
# ${path} or ${?path} for optional substitution
|
||||
SUBSTITUTION_PATTERN = r"\$\{(?P<optional>\?)?(?P<variable>[^}]+)\}(?P<ws>[ \t]*)"
|
||||
|
||||
def create_substitution(instring, loc, token):
|
||||
# remove the ${ and }
|
||||
match = re.match(SUBSTITUTION_PATTERN, token[0])
|
||||
variable = match.group('variable')
|
||||
ws = match.group('ws')
|
||||
optional = match.group('optional') == '?'
|
||||
substitution = ConfigSubstitution(variable, optional, ws, instring, loc)
|
||||
return substitution
|
||||
|
||||
# ${path} or ${?path} for optional substitution
|
||||
STRING_PATTERN = '"(?P<value>(?:[^"\\\\]|\\\\.)*)"(?P<ws>[ \t]*)'
|
||||
|
||||
def create_quoted_string(instring, loc, token):
|
||||
# remove the ${ and }
|
||||
match = re.match(STRING_PATTERN, token[0])
|
||||
value = norm_string(match.group('value'))
|
||||
ws = match.group('ws')
|
||||
return ConfigQuotedString(value, ws, instring, loc)
|
||||
|
||||
def include_config(instring, loc, token):
|
||||
url = None
|
||||
file = None
|
||||
required = False
|
||||
|
||||
if token[0] == 'required':
|
||||
required = True
|
||||
final_tokens = token[1:]
|
||||
else:
|
||||
final_tokens = token
|
||||
|
||||
if len(final_tokens) == 1: # include "test"
|
||||
value = final_tokens[0].value if isinstance(final_tokens[0], ConfigQuotedString) else final_tokens[0]
|
||||
if value.startswith("http://") or value.startswith("https://") or value.startswith("file://"):
|
||||
url = value
|
||||
else:
|
||||
file = value
|
||||
elif len(final_tokens) == 2: # include url("test") or file("test")
|
||||
value = final_tokens[1].value if isinstance(token[1], ConfigQuotedString) else final_tokens[1]
|
||||
if final_tokens[0] == 'url':
|
||||
url = value
|
||||
else:
|
||||
file = value
|
||||
|
||||
if url is not None:
|
||||
logger.debug('Loading config from url %s', url)
|
||||
obj = ConfigFactory.parse_URL(
|
||||
url,
|
||||
resolve=False,
|
||||
required=required,
|
||||
unresolved_value=NO_SUBSTITUTION
|
||||
)
|
||||
elif file is not None:
|
||||
path = file if basedir is None else os.path.join(basedir, file)
|
||||
logger.debug('Loading config from file %s', path)
|
||||
obj = ConfigFactory.parse_file(
|
||||
path,
|
||||
resolve=False,
|
||||
required=required,
|
||||
unresolved_value=NO_SUBSTITUTION
|
||||
)
|
||||
else:
|
||||
raise ConfigException('No file or URL specified at: {loc}: {instring}', loc=loc, instring=instring)
|
||||
|
||||
return ConfigInclude(obj if isinstance(obj, list) else obj.items())
|
||||
|
||||
@contextlib.contextmanager
|
||||
def set_default_white_spaces():
|
||||
default = ParserElement.DEFAULT_WHITE_CHARS
|
||||
ParserElement.setDefaultWhitespaceChars(' \t')
|
||||
yield
|
||||
ParserElement.setDefaultWhitespaceChars(default)
|
||||
|
||||
with set_default_white_spaces():
|
||||
assign_expr = Forward()
|
||||
true_expr = Keyword("true", caseless=True).setParseAction(replaceWith(True))
|
||||
false_expr = Keyword("false", caseless=True).setParseAction(replaceWith(False))
|
||||
null_expr = Keyword("null", caseless=True).setParseAction(replaceWith(NoneValue()))
|
||||
# key = QuotedString('"', escChar='\\', unquoteResults=False) | Word(alphanums + alphas8bit + '._- /')
|
||||
regexp_numbers = r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))'
|
||||
key = QuotedString('"', escChar='\\', unquoteResults=False) | \
|
||||
Regex(regexp_numbers, re.DOTALL).setParseAction(safe_convert_number) | \
|
||||
Word(alphanums + alphas8bit + '._- /')
|
||||
|
||||
eol = Word('\n\r').suppress()
|
||||
eol_comma = Word('\n\r,').suppress()
|
||||
comment = (Literal('#') | Literal('//')) - SkipTo(eol | StringEnd())
|
||||
comment_eol = Suppress(Optional(eol_comma) + comment)
|
||||
comment_no_comma_eol = (comment | eol).suppress()
|
||||
number_expr = Regex(regexp_numbers, re.DOTALL).setParseAction(convert_number)
|
||||
|
||||
period_types = itertools.chain.from_iterable(cls.get_supported_period_type_map().values())
|
||||
period_expr = Regex(r'(?P<value>\d+)\s*(?P<unit>' + '|'.join(period_types) + ')$'
|
||||
).setParseAction(convert_period)
|
||||
|
||||
# multi line string using """
|
||||
# Using fix described in http://pyparsing.wikispaces.com/share/view/3778969
|
||||
multiline_string = Regex('""".*?"*"""', re.DOTALL | re.UNICODE).setParseAction(parse_multi_string)
|
||||
# single quoted line string
|
||||
quoted_string = Regex(r'"(?:[^"\\\n]|\\.)*"[ \t]*', re.UNICODE).setParseAction(create_quoted_string)
|
||||
# unquoted string that takes the rest of the line until an optional comment
|
||||
# we support .properties multiline support which is like this:
|
||||
# line1 \
|
||||
# line2 \
|
||||
# so a backslash precedes the \n
|
||||
unquoted_string = Regex(r'(?:[^^`+?!@*&"\[\{\s\]\}#,=\$\\]|\\.)+[ \t]*',
|
||||
re.UNICODE).setParseAction(unescape_string)
|
||||
substitution_expr = Regex(r'[ \t]*\$\{[^\}]+\}[ \t]*').setParseAction(create_substitution)
|
||||
string_expr = multiline_string | quoted_string | unquoted_string
|
||||
|
||||
value_expr = period_expr | number_expr | true_expr | false_expr | null_expr | string_expr
|
||||
|
||||
include_content = (quoted_string | ((Keyword('url') | Keyword(
|
||||
'file')) - Literal('(').suppress() - quoted_string - Literal(')').suppress()))
|
||||
include_expr = (
|
||||
Keyword("include", caseless=True).suppress() + (
|
||||
include_content | (
|
||||
Keyword("required") - Literal('(').suppress() - include_content - Literal(')').suppress()
|
||||
)
|
||||
)
|
||||
).setParseAction(include_config)
|
||||
|
||||
root_dict_expr = Forward()
|
||||
dict_expr = Forward()
|
||||
list_expr = Forward()
|
||||
multi_value_expr = ZeroOrMore(comment_eol | include_expr | substitution_expr |
|
||||
dict_expr | list_expr | value_expr | (Literal('\\') - eol).suppress())
|
||||
# for a dictionary : or = is optional
|
||||
# last zeroOrMore is because we can have t = {a:4} {b: 6} {c: 7} which is dictionary concatenation
|
||||
inside_dict_expr = ConfigTreeParser(ZeroOrMore(comment_eol | include_expr | assign_expr | eol_comma))
|
||||
inside_root_dict_expr = ConfigTreeParser(ZeroOrMore(
|
||||
comment_eol | include_expr | assign_expr | eol_comma), root=True)
|
||||
dict_expr << Suppress('{') - inside_dict_expr - Suppress('}')
|
||||
root_dict_expr << Suppress('{') - inside_root_dict_expr - Suppress('}')
|
||||
list_entry = ConcatenatedValueParser(multi_value_expr)
|
||||
list_expr << Suppress('[') - ListParser(list_entry - ZeroOrMore(eol_comma - list_entry)) - Suppress(']')
|
||||
|
||||
# special case when we have a value assignment where the string can potentially be the remainder of the line
|
||||
assign_expr << Group(key - ZeroOrMore(comment_no_comma_eol) -
|
||||
(dict_expr | (Literal('=') | Literal(':') | Literal('+=')) -
|
||||
ZeroOrMore(comment_no_comma_eol) - ConcatenatedValueParser(multi_value_expr)))
|
||||
|
||||
# the file can be { ... } where {} can be omitted or []
|
||||
config_expr = ZeroOrMore(comment_eol | eol) + (list_expr | root_dict_expr |
|
||||
inside_root_dict_expr) + ZeroOrMore(comment_eol | eol_comma)
|
||||
config = config_expr.parseString(content, parseAll=True)[0]
|
||||
|
||||
if resolve:
|
||||
allow_unresolved = resolve and unresolved_value is not DEFAULT_SUBSTITUTION and \
|
||||
unresolved_value is not MANDATORY_SUBSTITUTION
|
||||
has_unresolved = cls.resolve_substitutions(config, allow_unresolved)
|
||||
if has_unresolved and unresolved_value is MANDATORY_SUBSTITUTION:
|
||||
raise ConfigSubstitutionException(
|
||||
'resolve cannot be set to True and unresolved_value to MANDATORY_SUBSTITUTION')
|
||||
|
||||
if unresolved_value is not NO_SUBSTITUTION and unresolved_value is not DEFAULT_SUBSTITUTION:
|
||||
cls.unresolve_substitutions_to_value(config, unresolved_value)
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
def _resolve_variable(cls, config, substitution):
|
||||
"""
|
||||
:param config:
|
||||
:param substitution:
|
||||
:return: (is_resolved, resolved_variable)
|
||||
"""
|
||||
variable = substitution.variable
|
||||
try:
|
||||
return True, config.get(variable)
|
||||
except ConfigMissingException:
|
||||
# default to environment variable
|
||||
value = os.environ.get(variable)
|
||||
|
||||
if value is None:
|
||||
if substitution.optional:
|
||||
return False, None
|
||||
else:
|
||||
raise ConfigSubstitutionException(
|
||||
"Cannot resolve variable ${{{variable}}} (line: {line}, col: {col})".format(
|
||||
variable=variable,
|
||||
line=lineno(substitution.loc, substitution.instring),
|
||||
col=col(substitution.loc, substitution.instring)))
|
||||
elif isinstance(value, ConfigList) or isinstance(value, ConfigTree):
|
||||
raise ConfigSubstitutionException(
|
||||
"Cannot substitute variable ${{{variable}}} because it does not point to a "
|
||||
"string, int, float, boolean or null {type} (line:{line}, col: {col})".format(
|
||||
variable=variable,
|
||||
type=value.__class__.__name__,
|
||||
line=lineno(substitution.loc, substitution.instring),
|
||||
col=col(substitution.loc, substitution.instring)))
|
||||
return True, value
|
||||
|
||||
@classmethod
|
||||
def _fixup_self_references(cls, config, accept_unresolved=False):
|
||||
if isinstance(config, ConfigTree) and config.root:
|
||||
for key in config: # Traverse history of element
|
||||
history = config.history[key]
|
||||
previous_item = history[0]
|
||||
for current_item in history[1:]:
|
||||
for substitution in cls._find_substitutions(current_item):
|
||||
prop_path = ConfigTree.parse_key(substitution.variable)
|
||||
if len(prop_path) > 1 and config.get(substitution.variable, None) is not None:
|
||||
continue # If value is present in latest version, don't do anything
|
||||
if prop_path[0] == key:
|
||||
if isinstance(previous_item, ConfigValues) and not accept_unresolved:
|
||||
# We hit a dead end, we cannot evaluate
|
||||
raise ConfigSubstitutionException(
|
||||
"Property {variable} cannot be substituted. Check for cycles.".format(
|
||||
variable=substitution.variable
|
||||
)
|
||||
)
|
||||
else:
|
||||
value = previous_item if len(
|
||||
prop_path) == 1 else previous_item.get(".".join(prop_path[1:]))
|
||||
_, _, current_item = cls._do_substitute(substitution, value)
|
||||
previous_item = current_item
|
||||
|
||||
if len(history) == 1:
|
||||
for substitution in cls._find_substitutions(previous_item):
|
||||
prop_path = ConfigTree.parse_key(substitution.variable)
|
||||
if len(prop_path) > 1 and config.get(substitution.variable, None) is not None:
|
||||
continue # If value is present in latest version, don't do anything
|
||||
if prop_path[0] == key and substitution.optional:
|
||||
cls._do_substitute(substitution, None)
|
||||
if prop_path[0] == key:
|
||||
value = os.environ.get(key)
|
||||
if value is not None:
|
||||
cls._do_substitute(substitution, value)
|
||||
continue
|
||||
if substitution.optional: # special case, when self optional referencing without existing
|
||||
cls._do_substitute(substitution, None)
|
||||
|
||||
# traverse config to find all the substitutions
|
||||
@classmethod
|
||||
def _find_substitutions(cls, item):
|
||||
"""Convert HOCON input into a JSON output
|
||||
|
||||
:return: JSON string representation
|
||||
:type return: basestring
|
||||
"""
|
||||
if isinstance(item, ConfigValues):
|
||||
return item.get_substitutions()
|
||||
|
||||
substitutions = []
|
||||
elements = []
|
||||
if isinstance(item, ConfigTree):
|
||||
elements = item.values()
|
||||
elif isinstance(item, list):
|
||||
elements = item
|
||||
|
||||
for child in elements:
|
||||
substitutions += cls._find_substitutions(child)
|
||||
return substitutions
|
||||
|
||||
@classmethod
|
||||
def _do_substitute(cls, substitution, resolved_value, is_optional_resolved=True):
|
||||
unresolved = False
|
||||
new_substitutions = []
|
||||
if isinstance(resolved_value, ConfigValues):
|
||||
resolved_value = resolved_value.transform()
|
||||
if isinstance(resolved_value, ConfigValues):
|
||||
unresolved = True
|
||||
result = resolved_value
|
||||
else:
|
||||
# replace token by substitution
|
||||
config_values = substitution.parent
|
||||
# if it is a string, then add the extra ws that was present in the original string after the substitution
|
||||
formatted_resolved_value = resolved_value \
|
||||
if resolved_value is None \
|
||||
or isinstance(resolved_value, (dict, list)) \
|
||||
or substitution.index == len(config_values.tokens) - 1 \
|
||||
else (str(resolved_value) + substitution.ws)
|
||||
# use a deepcopy of resolved_value to avoid mutation
|
||||
config_values.put(substitution.index, copy.deepcopy(formatted_resolved_value))
|
||||
transformation = config_values.transform()
|
||||
result = config_values.overriden_value \
|
||||
if transformation is None and not is_optional_resolved \
|
||||
else transformation
|
||||
|
||||
if result is None and config_values.key in config_values.parent:
|
||||
del config_values.parent[config_values.key]
|
||||
else:
|
||||
config_values.parent[config_values.key] = result
|
||||
s = cls._find_substitutions(result)
|
||||
if s:
|
||||
new_substitutions = s
|
||||
unresolved = True
|
||||
|
||||
return (unresolved, new_substitutions, result)
|
||||
|
||||
@classmethod
|
||||
def _final_fixup(cls, item):
|
||||
if isinstance(item, ConfigValues):
|
||||
return item.transform()
|
||||
elif isinstance(item, list):
|
||||
return list([cls._final_fixup(child) for child in item])
|
||||
elif isinstance(item, ConfigTree):
|
||||
items = list(item.items())
|
||||
for key, child in items:
|
||||
item[key] = cls._final_fixup(child)
|
||||
return item
|
||||
|
||||
@classmethod
|
||||
def unresolve_substitutions_to_value(cls, config, unresolved_value=STR_SUBSTITUTION):
|
||||
for substitution in cls._find_substitutions(config):
|
||||
if unresolved_value is STR_SUBSTITUTION:
|
||||
value = substitution.raw_str()
|
||||
elif unresolved_value is None:
|
||||
value = NoneValue()
|
||||
else:
|
||||
value = unresolved_value
|
||||
cls._do_substitute(substitution, value, False)
|
||||
cls._final_fixup(config)
|
||||
|
||||
@classmethod
|
||||
def resolve_substitutions(cls, config, accept_unresolved=False):
|
||||
has_unresolved = False
|
||||
cls._fixup_self_references(config, accept_unresolved)
|
||||
substitutions = cls._find_substitutions(config)
|
||||
if len(substitutions) > 0:
|
||||
unresolved = True
|
||||
any_unresolved = True
|
||||
_substitutions = []
|
||||
cache = {}
|
||||
while any_unresolved and len(substitutions) > 0 and set(substitutions) != set(_substitutions):
|
||||
unresolved = False
|
||||
any_unresolved = True
|
||||
_substitutions = substitutions[:]
|
||||
|
||||
for substitution in _substitutions:
|
||||
is_optional_resolved, resolved_value = cls._resolve_variable(config, substitution)
|
||||
|
||||
# if the substitution is optional
|
||||
if not is_optional_resolved and substitution.optional:
|
||||
resolved_value = None
|
||||
if isinstance(resolved_value, ConfigValues):
|
||||
parents = cache.get(resolved_value)
|
||||
if parents is None:
|
||||
parents = []
|
||||
link = resolved_value
|
||||
while isinstance(link, ConfigValues):
|
||||
parents.append(link)
|
||||
link = link.overriden_value
|
||||
cache[resolved_value] = parents
|
||||
|
||||
if isinstance(resolved_value, ConfigValues) \
|
||||
and substitution.parent in parents \
|
||||
and hasattr(substitution.parent, 'overriden_value') \
|
||||
and substitution.parent.overriden_value:
|
||||
|
||||
# self resolution, backtrack
|
||||
resolved_value = substitution.parent.overriden_value
|
||||
|
||||
                    unresolved, new_substitutions, result = cls._do_substitute(
                        substitution, resolved_value, is_optional_resolved)
                    any_unresolved = unresolved or any_unresolved
                    substitutions.extend(new_substitutions)
                    if not isinstance(result, ConfigValues):
                        substitutions.remove(substitution)

            cls._final_fixup(config)
            if unresolved:
                has_unresolved = True
                if not accept_unresolved:
                    raise ConfigSubstitutionException("Cannot resolve {variables}. Check for cycles.".format(
                        variables=', '.join('${{{variable}}}: (line: {line}, col: {col})'.format(
                            variable=substitution.variable,
                            line=lineno(substitution.loc, substitution.instring),
                            col=col(substitution.loc, substitution.instring)) for substitution in substitutions)))

        cls._final_fixup(config)
        return has_unresolved


class ListParser(TokenConverter):
    """Parse a list [elt1, elt2, ...]
    """

    def __init__(self, expr=None):
        super(ListParser, self).__init__(expr)
        self.saveAsList = True

    def postParse(self, instring, loc, token_list):
        """Create a list from the tokens

        :param instring:
        :param loc:
        :param token_list:
        :return:
        """
        cleaned_token_list = [token for tokens in (token.tokens if isinstance(token, ConfigInclude) else [token]
                                                   for token in token_list if token != '')
                              for token in tokens]
        config_list = ConfigList(cleaned_token_list)
        return [config_list]


class ConcatenatedValueParser(TokenConverter):
    def __init__(self, expr=None):
        super(ConcatenatedValueParser, self).__init__(expr)
        self.parent = None
        self.key = None

    def postParse(self, instring, loc, token_list):
        config_values = ConfigValues(token_list, instring, loc)
        return [config_values.transform()]


class ConfigTreeParser(TokenConverter):
    """
    Parse a config tree from tokens
    """

    def __init__(self, expr=None, root=False):
        super(ConfigTreeParser, self).__init__(expr)
        self.root = root
        self.saveAsList = True

    def postParse(self, instring, loc, token_list):
        """Create ConfigTree from tokens

        :param instring:
        :param loc:
        :param token_list:
        :return:
        """
        config_tree = ConfigTree(root=self.root)
        for element in token_list:
            expanded_tokens = element.tokens if isinstance(element, ConfigInclude) else [element]

            for tokens in expanded_tokens:
                # key, operator (optional), value(s)
                key = tokens[0].strip() if isinstance(tokens[0], (unicode, basestring)) else tokens[0]
                operator = '='
                if len(tokens) == 3 and tokens[1].strip() in [':', '=', '+=']:
                    operator = tokens[1].strip()
                    values = tokens[2:]
                elif len(tokens) == 2:
                    values = tokens[1:]
                else:
                    raise ParseSyntaxException("Unknown tokens {tokens} received".format(tokens=tokens))
                # empty string
                if len(values) == 0:
                    config_tree.put(key, '')
                else:
                    value = values[0]
                    if isinstance(value, list) and operator == "+=":
                        value = ConfigValues([ConfigSubstitution(key, True, '', False, loc), value], False, loc)
                        config_tree.put(key, value, False)
                    elif isinstance(value, unicode) and operator == "+=":
                        value = ConfigValues([ConfigSubstitution(key, True, '', True, loc), ' ' + value], True, loc)
                        config_tree.put(key, value, False)
                    elif isinstance(value, list):
                        config_tree.put(key, value, False)
                    else:
                        existing_value = config_tree.get(key, None)
                        if isinstance(value, ConfigTree) and not isinstance(existing_value, list):
                            # only a tree should be merged with a tree
                            config_tree.put(key, value, True)
                        elif isinstance(value, ConfigValues):
                            conf_value = value
                            value.parent = config_tree
                            value.key = key
                            if isinstance(existing_value, list) or isinstance(existing_value, ConfigTree):
                                config_tree.put(key, conf_value, True)
                            else:
                                config_tree.put(key, conf_value, False)
                        else:
                            config_tree.put(key, value, False)
        return config_tree
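To make the `+=` handling above concrete, here is an illustrative (not normative) round trip through the vendored parser; the self-referential substitution that ConfigTreeParser creates for `+=` is what appends to the existing value:

    from clearml_agent.external.pyhocon import ConfigFactory

    conf = ConfigFactory.parse_string('queues = [default]\nqueues += [gpu]')
    print(conf.get('queues'))  # expected: ['default', 'gpu']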
clearml_agent/external/pyhocon/config_tree.py (vendored, new file, 608 lines)
@@ -0,0 +1,608 @@
from collections import OrderedDict
from pyparsing import lineno
from pyparsing import col

try:
    basestring
except NameError:  # pragma: no cover
    basestring = str
    unicode = str

import re
import copy
from .exceptions import ConfigException, ConfigWrongTypeException, ConfigMissingException


class UndefinedKey(object):
    pass


class NonExistentKey(object):
    pass


class NoneValue(object):
    pass


class ConfigTree(OrderedDict):
    KEY_SEP = '.'

    def __init__(self, *args, **kwds):
        self.root = kwds.pop('root') if 'root' in kwds else False
        if self.root:
            self.history = {}
        super(ConfigTree, self).__init__(*args, **kwds)
        for key, value in self.items():
            if isinstance(value, ConfigValues):
                value.parent = self
                value.index = key

    @staticmethod
    def merge_configs(a, b, copy_trees=False):
        """Merge config b into a

        :param a: target config
        :type a: ConfigTree
        :param b: source config
        :type b: ConfigTree
        :return: merged config a
        """
        for key, value in b.items():
            # if the key is in both a and b and both values are dictionaries, merge them; otherwise override
            if key in a and isinstance(a[key], ConfigTree) and isinstance(b[key], ConfigTree):
                if copy_trees:
                    a[key] = a[key].copy()
                ConfigTree.merge_configs(a[key], b[key], copy_trees=copy_trees)
            else:
                if isinstance(value, ConfigValues):
                    value.parent = a
                    value.key = key
                    if key in a:
                        value.overriden_value = a[key]
                a[key] = value
                if a.root:
                    if b.root:
                        a.history[key] = a.history.get(key, []) + b.history.get(key, [value])
                    else:
                        a.history[key] = a.history.get(key, []) + [value]

        return a
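    # Illustrative example (not in the original pyhocon source): nested trees
    # merge recursively, while scalars are overridden by the source config.
    #   >>> a = ConfigTree([('db', ConfigTree([('host', 'localhost')]))])
    #   >>> b = ConfigTree([('db', ConfigTree([('port', 5432)]))])
    #   >>> dict(ConfigTree.merge_configs(a, b)['db'])
    #   {'host': 'localhost', 'port': 5432}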

    def _put(self, key_path, value, append=False):
        key_elt = key_path[0]
        if len(key_path) == 1:
            # if the value to set does not exist, override
            # if both are configs then merge them
            # if not then override
            if key_elt in self and isinstance(self[key_elt], ConfigTree) and isinstance(value, ConfigTree):
                if self.root:
                    new_value = ConfigTree.merge_configs(ConfigTree(), self[key_elt], copy_trees=True)
                    new_value = ConfigTree.merge_configs(new_value, value, copy_trees=True)
                    self._push_history(key_elt, new_value)
                    self[key_elt] = new_value
                else:
                    ConfigTree.merge_configs(self[key_elt], value)
            elif append:
                # if we have t = 1
                # and we try to put t.a = 5, then t is replaced by {a: 5}
                l_value = self.get(key_elt, None)
                if isinstance(l_value, ConfigValues):
                    l_value.tokens.append(value)
                    l_value.recompute()
                elif isinstance(l_value, ConfigTree) and isinstance(value, ConfigValues):
                    value.overriden_value = l_value
                    value.tokens.insert(0, l_value)
                    value.recompute()
                    value.parent = self
                    value.key = key_elt
                    self._push_history(key_elt, value)
                    self[key_elt] = value
                elif isinstance(l_value, list) and isinstance(value, ConfigValues):
                    self._push_history(key_elt, value)
                    value.overriden_value = l_value
                    value.parent = self
                    value.key = key_elt
                    self[key_elt] = value
                elif isinstance(l_value, list):
                    self[key_elt] = l_value + value
                    self._push_history(key_elt, l_value)
                elif l_value is None:
                    self._push_history(key_elt, value)
                    self[key_elt] = value

                else:
                    raise ConfigWrongTypeException(
                        u"Cannot concatenate the list {key}: {value} to {prev_value} of {type}".format(
                            key='.'.join(key_path),
                            value=value,
                            prev_value=l_value,
                            type=l_value.__class__.__name__)
                    )
            else:
                # if there was an override, keep the overridden value
                if isinstance(value, ConfigValues):
                    value.parent = self
                    value.key = key_elt
                    value.overriden_value = self.get(key_elt, None)
                self._push_history(key_elt, value)
                self[key_elt] = value
        else:
            next_config_tree = super(ConfigTree, self).get(key_elt)
            if not isinstance(next_config_tree, ConfigTree):
                # create a new dictionary or overwrite a previous value
                next_config_tree = ConfigTree()
                self._push_history(key_elt, next_config_tree)
                self[key_elt] = next_config_tree
            next_config_tree._put(key_path[1:], value, append)

    def _push_history(self, key, value):
        if self.root:
            hist = self.history.get(key)
            if hist is None:
                hist = self.history[key] = []
            hist.append(value)

    def _get(self, key_path, key_index=0, default=UndefinedKey):
        key_elt = key_path[key_index]
        elt = super(ConfigTree, self).get(key_elt, UndefinedKey)

        if elt is UndefinedKey:
            if default is UndefinedKey:
                raise ConfigMissingException(u"No configuration setting found for key {key}".format(
                    key='.'.join(key_path[: key_index + 1])))
            else:
                return default

        if key_index == len(key_path) - 1:
            if isinstance(elt, NoneValue):
                return None
            elif isinstance(elt, list):
                return [None if isinstance(x, NoneValue) else x for x in elt]
            else:
                return elt
        elif isinstance(elt, ConfigTree):
            return elt._get(key_path, key_index + 1, default)
        else:
            if default is UndefinedKey:
                raise ConfigWrongTypeException(
                    u"{key} has type {type} rather than dict".format(key='.'.join(key_path[:key_index + 1]),
                                                                     type=type(elt).__name__))
            else:
                return default

    @staticmethod
    def parse_key(string):
        """
        Split a key into path elements:
        - a.b.c => a, b, c
        - a."b.c" => a, QuotedKey("b.c") if the quoted part contains any of the special characters: $}[]:=+#`^?!@*&.
        - "a" => a
        - a.b."c" => a, b, c (special case)
        :param string: either a string key ('.' is parsed as a sub-key separator) or an int / float used as a regular key
        :return:
        """
        if isinstance(string, (int, float)):
            return [string]

        special_characters = '$}[]:=+#`^?!@*&.'
        tokens = re.findall(
            r'"[^"]+"|[^{special_characters}]+'.format(special_characters=re.escape(special_characters)),
            string)

        def contains_special_character(token):
            return any((c in special_characters) for c in token)

        return [token if contains_special_character(token) else token.strip('"') for token in tokens]
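    # Illustrative examples of the splitting rules documented above (not in the
    # original pyhocon source):
    #   >>> ConfigTree.parse_key('a.b.c')
    #   ['a', 'b', 'c']
    #   >>> ConfigTree.parse_key('a."b.c"')  # quoted key containing a special character
    #   ['a', '"b.c"']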

    def put(self, key, value, append=False):
        """Put a value in the tree (dot separated)

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param value: value to put
        """
        self._put(ConfigTree.parse_key(key), value, append)

    def get(self, key, default=UndefinedKey):
        """Get a value from the tree

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: object
        :return: value in the tree located at key
        """
        return self._get(ConfigTree.parse_key(key), 0, default)

    def get_string(self, key, default=UndefinedKey):
        """Return string representation of value found at key

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: basestring
        :return: string value
        :type return: basestring
        """
        value = self.get(key, default)
        if value is None:
            return None

        string_value = unicode(value)
        if isinstance(value, bool):
            string_value = string_value.lower()
        return string_value

    def pop(self, key, default=UndefinedKey):
        """Remove the specified key and return the corresponding value.
        If the key is not found, default is returned if given, otherwise ConfigMissingException is raised.

        This method assumes the user wants to remove the last value in the chain, so it parses the key
        via parse_key and pops the last value out of the dict.

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: object
        :return: value in the tree located at key
        """
        if default != UndefinedKey and key not in self:
            return default

        value = self.get(key, UndefinedKey)
        lst = ConfigTree.parse_key(key)
        parent = self.KEY_SEP.join(lst[0:-1])
        child = lst[-1]

        if parent:
            self.get(parent).__delitem__(child)
        else:
            self.__delitem__(child)
        return value

    def get_int(self, key, default=UndefinedKey):
        """Return int representation of value found at key

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: int
        :return: int value
        :type return: int
        """
        value = self.get(key, default)
        try:
            return int(value) if value is not None else None
        except (TypeError, ValueError):
            raise ConfigException(
                u"{key} has type '{type}' rather than 'int'".format(key=key, type=type(value).__name__))

    def get_float(self, key, default=UndefinedKey):
        """Return float representation of value found at key

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: float
        :return: float value
        :type return: float
        """
        value = self.get(key, default)
        try:
            return float(value) if value is not None else None
        except (TypeError, ValueError):
            raise ConfigException(
                u"{key} has type '{type}' rather than 'float'".format(key=key, type=type(value).__name__))

    def get_bool(self, key, default=UndefinedKey):
        """Return boolean representation of value found at key

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: bool
        :return: boolean value
        :type return: bool
        """

        # String conversions as per API recommendations:
        # https://github.com/typesafehub/config/blob/master/HOCON.md#automatic-type-conversions
        bool_conversions = {
            None: None,
            'true': True, 'yes': True, 'on': True,
            'false': False, 'no': False, 'off': False
        }
        string_value = self.get_string(key, default)
        if string_value is not None:
            string_value = string_value.lower()
        try:
            return bool_conversions[string_value]
        except KeyError:
            raise ConfigException(
                u"{key} does not translate to a Boolean value".format(key=key))
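    # Illustrative example of the HOCON boolean conversions above (not in the
    # original pyhocon source):
    #   >>> t = ConfigTree([('enabled', 'yes')])
    #   >>> t.get_bool('enabled')
    #   True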

    def get_list(self, key, default=UndefinedKey):
        """Return list representation of value found at key

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: list
        :return: list value
        :type return: list
        """
        value = self.get(key, default)
        if isinstance(value, list):
            return value
        elif isinstance(value, ConfigTree):
            lst = []
            for k, v in sorted(value.items(), key=lambda kv: kv[0]):
                if re.match('^[1-9][0-9]*$|0', k):
                    lst.append(v)
                else:
                    raise ConfigException(u"{key} does not translate to a list".format(key=key))
            return lst
        elif value is None:
            return None
        else:
            raise ConfigException(
                u"{key} has type '{type}' rather than 'list'".format(key=key, type=type(value).__name__))

    def get_config(self, key, default=UndefinedKey):
        """Return tree config representation of value found at key

        :param key: key to use (dot separated). E.g., a.b.c
        :type key: basestring
        :param default: default value if key not found
        :type default: config
        :return: config value
        :type return: ConfigTree
        """
        value = self.get(key, default)
        if isinstance(value, dict):
            return value
        elif value is None:
            return None
        else:
            raise ConfigException(
                u"{key} has type '{type}' rather than 'config'".format(key=key, type=type(value).__name__))

    def __getitem__(self, item):
        val = self.get(item)
        if val is UndefinedKey:
            raise KeyError(item)
        return val

    try:
        from collections import _OrderedDictItemsView
    except ImportError:  # pragma: nocover
        pass
    else:
        def items(self):  # pragma: nocover
            return self._OrderedDictItemsView(self)

    def __getattr__(self, item):
        val = self.get(item, NonExistentKey)
        if val is NonExistentKey:
            return super(ConfigTree, self).__getattr__(item)
        return val

    def __contains__(self, item):
        return self._get(self.parse_key(item), default=NoneValue) is not NoneValue

    def with_fallback(self, config, resolve=True):
        """
        Return a new config with fallback on config

        :param config: config or filename of the config to fall back on
        :param resolve: resolve substitutions
        :return: new config with fallback on config
        """
        if isinstance(config, ConfigTree):
            result = ConfigTree.merge_configs(copy.deepcopy(config), copy.deepcopy(self))
        else:
            from . import ConfigFactory
            result = ConfigTree.merge_configs(ConfigFactory.parse_file(config, resolve=False), copy.deepcopy(self))

        if resolve:
            from . import ConfigParser
            ConfigParser.resolve_substitutions(result)
        return result

    def as_plain_ordered_dict(self):
        """Return a deep copy of this config as a plain OrderedDict

        The config tree should be fully resolved.

        This is useful to get an object with no special semantics such as path expansion for the keys.
        In particular, this means that keys that contain dots are not surrounded with '"' in the plain OrderedDict.

        :return: this config as an OrderedDict
        :type return: OrderedDict
        """
        def plain_value(v):
            if isinstance(v, list):
                return [plain_value(e) for e in v]
            elif isinstance(v, ConfigTree):
                return v.as_plain_ordered_dict()
            else:
                if isinstance(v, ConfigValues):
                    raise ConfigException("The config tree contains unresolved elements")
                return v

        return OrderedDict((key.strip('"') if isinstance(key, (unicode, basestring)) else key, plain_value(value))
                           for key, value in self.items())
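    # Illustrative example (not in the original pyhocon source): a fully
    # resolved tree flattens to plain OrderedDicts, and quoted keys lose their
    # surrounding '"'.
    #   >>> t = ConfigTree([('a', ConfigTree([('b', 1)]))])
    #   >>> t.as_plain_ordered_dict()
    #   OrderedDict([('a', OrderedDict([('b', 1)]))])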


class ConfigList(list):
    def __init__(self, iterable=[]):
        new_list = list(iterable)
        super(ConfigList, self).__init__(new_list)
        for index, value in enumerate(new_list):
            if isinstance(value, ConfigValues):
                value.parent = self
                value.key = index


class ConfigInclude(object):
    def __init__(self, tokens):
        self.tokens = tokens


class ConfigValues(object):
    def __init__(self, tokens, instring, loc):
        self.tokens = tokens
        self.parent = None
        self.key = None
        self._instring = instring
        self._loc = loc
        self.overriden_value = None
        self.recompute()

    def recompute(self):
        for index, token in enumerate(self.tokens):
            if isinstance(token, ConfigSubstitution):
                token.parent = self
                token.index = index

        # no value: use an empty string
        if len(self.tokens) == 0:
            self.tokens = ['']

        # if the last token is an unquoted string then right-strip it
        if isinstance(self.tokens[-1], ConfigUnquotedString):
            # rstrip only whitespace, not \n\r, because those would have been escaped
            self.tokens[-1] = self.tokens[-1].rstrip(' \t')

    def has_substitution(self):
        return len(self.get_substitutions()) > 0

    def get_substitutions(self):
        lst = []
        node = self
        while node:
            lst = [token for token in node.tokens if isinstance(token, ConfigSubstitution)] + lst
            if hasattr(node, 'overriden_value'):
                node = node.overriden_value
                if not isinstance(node, ConfigValues):
                    break
            else:
                break
        return lst

    def transform(self):
        def determine_type(token):
            return ConfigTree if isinstance(token, ConfigTree) else ConfigList if isinstance(token, list) else str

        def format_str(v, last=False):
            if isinstance(v, ConfigQuotedString):
                return v.value + ('' if last else v.ws)
            else:
                return '' if v is None else unicode(v)

        if self.has_substitution():
            return self

        # remove None tokens
        tokens = [token for token in self.tokens if token is not None]

        if not tokens:
            return None

        # check that all tokens are compatible
        first_tok_type = determine_type(tokens[0])
        for index, token in enumerate(tokens[1:]):
            tok_type = determine_type(token)
            if first_tok_type is not tok_type:
                raise ConfigWrongTypeException(
                    "Token '{token}' of type {tok_type} (index {index}) must be of type {req_tok_type} "
                    "(line: {line}, col: {col})".format(
                        token=token,
                        index=index + 1,
                        tok_type=tok_type.__name__,
                        req_tok_type=first_tok_type.__name__,
                        line=lineno(self._loc, self._instring),
                        col=col(self._loc, self._instring)))

        if first_tok_type is ConfigTree:
            child = []
            if hasattr(self, 'overriden_value'):
                node = self.overriden_value
                while node:
                    if isinstance(node, ConfigValues):
                        value = node.transform()
                        if isinstance(value, ConfigTree):
                            child.append(value)
                        else:
                            break
                    elif isinstance(node, ConfigTree):
                        child.append(node)
                    else:
                        break
                    if hasattr(node, 'overriden_value'):
                        node = node.overriden_value
                    else:
                        break

            result = ConfigTree()
            for conf in reversed(child):
                ConfigTree.merge_configs(result, conf, copy_trees=True)
            for token in tokens:
                ConfigTree.merge_configs(result, token, copy_trees=True)
            return result
        elif first_tok_type is ConfigList:
            result = []
            main_index = 0
            for sublist in tokens:
                sublist_result = ConfigList()
                for token in sublist:
                    if isinstance(token, ConfigValues):
                        token.parent = result
                        token.key = main_index
                    main_index += 1
                    sublist_result.append(token)
                result.extend(sublist_result)
            return result
        else:
            if len(tokens) == 1:
                if isinstance(tokens[0], ConfigQuotedString):
                    return tokens[0].value
                return tokens[0]
            else:
                return ''.join(format_str(token) for token in tokens[:-1]) + format_str(tokens[-1], True)

    def put(self, index, value):
        self.tokens[index] = value

    def __repr__(self):  # pragma: no cover
        return '[ConfigValues: ' + ','.join(str(o) for o in self.tokens) + ']'


class ConfigSubstitution(object):
    def __init__(self, variable, optional, ws, instring, loc):
        self.variable = variable
        self.optional = optional
        self.ws = ws
        self.index = None
        self.parent = None
        self.instring = instring
        self.loc = loc

    def __repr__(self):  # pragma: no cover
        return '[ConfigSubstitution: ' + self.variable + ']'


class ConfigUnquotedString(unicode):
    def __new__(cls, value):
        return super(ConfigUnquotedString, cls).__new__(cls, value)


class ConfigQuotedString(object):
    def __init__(self, value, ws, instring, loc):
        self.value = value
        self.ws = ws
        self.instring = instring
        self.loc = loc

    def __repr__(self):  # pragma: no cover
        return '[ConfigQuotedString: ' + self.value + ']'
clearml_agent/external/pyhocon/converter.py (vendored, new file, 329 lines)
@@ -0,0 +1,329 @@
import json
import re
import sys

from . import ConfigFactory
from .config_tree import ConfigQuotedString
from .config_tree import ConfigSubstitution
from .config_tree import ConfigTree
from .config_tree import ConfigValues
from .config_tree import NoneValue


try:
    basestring
except NameError:
    basestring = str
    unicode = str


class HOCONConverter(object):
    _number_re = r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))'
    _number_re_matcher = re.compile(_number_re)

    @classmethod
    def to_json(cls, config, compact=False, indent=2, level=0):
        """Convert HOCON input into a JSON output

        :return: JSON string representation
        :type return: basestring
        """
        lines = ""
        if isinstance(config, ConfigTree):
            if len(config) == 0:
                lines += '{}'
            else:
                lines += '{\n'
                bet_lines = []
                for key, item in config.items():
                    bet_lines.append('{indent}"{key}": {value}'.format(
                        indent=''.rjust((level + 1) * indent, ' '),
                        key=key.strip('"'),  # so dotted keys enclosed with "" are not interpreted as nested keys
                        value=cls.to_json(item, compact, indent, level + 1))
                    )
                lines += ',\n'.join(bet_lines)
                lines += '\n{indent}}}'.format(indent=''.rjust(level * indent, ' '))
        elif isinstance(config, list):
            if len(config) == 0:
                lines += '[]'
            else:
                lines += '[\n'
                bet_lines = []
                for item in config:
                    bet_lines.append('{indent}{value}'.format(
                        indent=''.rjust((level + 1) * indent, ' '),
                        value=cls.to_json(item, compact, indent, level + 1))
                    )
                lines += ',\n'.join(bet_lines)
                lines += '\n{indent}]'.format(indent=''.rjust(level * indent, ' '))
        elif isinstance(config, basestring):
            lines = json.dumps(config)
        elif config is None or isinstance(config, NoneValue):
            lines = 'null'
        elif config is True:
            lines = 'true'
        elif config is False:
            lines = 'false'
        else:
            lines = str(config)
        return lines

    @staticmethod
    def _auto_indent(lines, section):
        # noinspection PyBroadException
        try:
            indent = len(lines) - lines.rindex('\n')
        except Exception:
            indent = len(lines)
        # noinspection PyBroadException
        try:
            section_indent = section.index('\n')
        except Exception:
            section_indent = len(section)
        if section_indent < 3:
            return lines + section

        indent = '\n' + ''.rjust(indent, ' ')
        return lines + indent.join([sec.strip() for sec in section.split('\n')])

    @classmethod
    def to_hocon(cls, config, compact=False, indent=2, level=0):
        """Convert HOCON input into a HOCON output

        :return: HOCON string representation
        :type return: basestring
        """
        lines = ""
        if isinstance(config, ConfigTree):
            if len(config) == 0:
                lines += '{}'
            else:
                if level > 0:  # don't display { at root level
                    lines += '{\n'
                bet_lines = []

                for key, item in config.items():
                    if compact:
                        full_key = key
                        while isinstance(item, ConfigTree) and len(item) == 1:
                            key, item = next(iter(item.items()))
                            full_key += '.' + key
                    else:
                        full_key = key

                    if isinstance(full_key, float) or \
                            (isinstance(full_key, (basestring, unicode)) and cls._number_re_matcher.match(full_key)):
                        # if the key is a string that could be cast to a number, make sure we quote it
                        full_key = '\"{}\"'.format(full_key)

                    bet_line = ('{indent}{key}{assign_sign} '.format(
                        indent=''.rjust(level * indent, ' '),
                        key=full_key,
                        assign_sign='' if isinstance(item, dict) else ' =',)
                    )
                    value_line = cls.to_hocon(item, compact, indent, level + 1)
                    if isinstance(item, (list, tuple)):
                        bet_lines.append(cls._auto_indent(bet_line, value_line))
                    else:
                        bet_lines.append(bet_line + value_line)
                lines += '\n'.join(bet_lines)

                if level > 0:  # don't display } at root level
                    lines += '\n{indent}}}'.format(indent=''.rjust((level - 1) * indent, ' '))
        elif isinstance(config, (list, tuple)):
            if len(config) == 0:
                lines += '[]'
            else:
                lines += '['
                bet_lines = []
                base_len = len(lines)
                skip_comma = False
                for i, item in enumerate(config):
                    if 0 < i and not skip_comma:
                        lines += ', '

                    skip_comma = False
                    new_line = cls.to_hocon(item, compact, indent, level + 1)
                    lines += new_line
                    if '\n' in new_line or len(lines) - base_len > 80:
                        if i < len(config) - 1:
                            lines += ',\n{indent}'.format(indent=''.rjust(level * indent, ' '))
                            base_len = len(lines)
                            skip_comma = True

                lines += ']'
        elif isinstance(config, basestring):
            if '\n' in config and len(config) > 1:
                lines = '"""{value}"""'.format(value=config)  # multiline
            else:
                lines = '"{value}"'.format(value=cls.__escape_string(config))
        elif isinstance(config, ConfigValues):
            lines = ''.join(cls.to_hocon(o, compact, indent, level) for o in config.tokens)
        elif isinstance(config, ConfigSubstitution):
            lines = '${'
            if config.optional:
                lines += '?'
            lines += config.variable + '}' + config.ws
        elif isinstance(config, ConfigQuotedString):
            if '\n' in config.value and len(config.value) > 1:
                lines = '"""{value}"""'.format(value=config.value)  # multiline
            else:
                lines = '"{value}"'.format(value=cls.__escape_string(config.value))
        elif config is None or isinstance(config, NoneValue):
            lines = 'null'
        elif config is True:
            lines = 'true'
        elif config is False:
            lines = 'false'
        else:
            lines = str(config)
        return lines

    @classmethod
    def to_yaml(cls, config, compact=False, indent=2, level=0):
        """Convert HOCON input into a YAML output

        :return: YAML string representation
        :type return: basestring
        """
        lines = ""
        if isinstance(config, ConfigTree):
            if len(config) > 0:
                if level > 0:
                    lines += '\n'
                bet_lines = []
                for key, item in config.items():
                    bet_lines.append('{indent}{key}: {value}'.format(
                        indent=''.rjust(level * indent, ' '),
                        key=key.strip('"'),  # so dotted keys enclosed with "" are not interpreted as nested keys
                        value=cls.to_yaml(item, compact, indent, level + 1))
                    )
                lines += '\n'.join(bet_lines)
        elif isinstance(config, list):
            config_list = [line for line in config if line is not None]
            if len(config_list) == 0:
                lines += '[]'
            else:
                lines += '\n'
                bet_lines = []
                for item in config_list:
                    bet_lines.append('{indent}- {value}'.format(indent=''.rjust(level * indent, ' '),
                                                                value=cls.to_yaml(item, compact, indent, level + 1)))
                lines += '\n'.join(bet_lines)
        elif isinstance(config, basestring):
            # if it contains a \n then it's multiline
            lines = config.split('\n')
            if len(lines) == 1:
                lines = config
            else:
                lines = '|\n' + '\n'.join([line.rjust(level * indent, ' ') for line in lines])
        elif config is None or isinstance(config, NoneValue):
            lines = 'null'
        elif config is True:
            lines = 'true'
        elif config is False:
            lines = 'false'
        else:
            lines = str(config)
        return lines

    @classmethod
    def to_properties(cls, config, compact=False, indent=2, key_stack=[]):
        """Convert HOCON input into a .properties output

        :return: .properties string representation
        :type return: basestring
        """

        def escape_value(value):
            return value.replace('=', '\\=').replace('!', '\\!').replace('#', '\\#').replace('\n', '\\\n')

        stripped_key_stack = [key.strip('"') for key in key_stack]
        lines = []
        if isinstance(config, ConfigTree):
            for key, item in config.items():
                if item is not None:
                    lines.append(cls.to_properties(item, compact, indent, stripped_key_stack + [key]))
        elif isinstance(config, list):
            for index, item in enumerate(config):
                if item is not None:
                    lines.append(cls.to_properties(item, compact, indent, stripped_key_stack + [str(index)]))
        elif isinstance(config, basestring):
            lines.append('.'.join(stripped_key_stack) + ' = ' + escape_value(config))
        elif config is True:
            lines.append('.'.join(stripped_key_stack) + ' = true')
        elif config is False:
            lines.append('.'.join(stripped_key_stack) + ' = false')
        elif config is None or isinstance(config, NoneValue):
            pass
        else:
            lines.append('.'.join(stripped_key_stack) + ' = ' + str(config))
        return '\n'.join([line for line in lines if len(line) > 0])

    @classmethod
    def convert(cls, config, output_format='json', indent=2, compact=False):
        converters = {
            'json': cls.to_json,
            'properties': cls.to_properties,
            'yaml': cls.to_yaml,
            'hocon': cls.to_hocon,
        }

        if output_format in converters:
            return converters[output_format](config, compact, indent)
        else:
            raise Exception("Invalid format '{format}'. Format must be 'json', 'properties', 'yaml' or 'hocon'".format(
                format=output_format))

    @classmethod
    def convert_from_file(cls, input_file=None, output_file=None, output_format='json', indent=2, compact=False):
        """Convert to json, properties or yaml

        :param input_file: input file; if not specified, stdin
        :param output_file: output file; if not specified, stdout
        :param output_format: json, properties or yaml
        :return: json, properties or yaml string representation
        """

        if input_file is None:
            content = sys.stdin.read()
            config = ConfigFactory.parse_string(content)
        else:
            config = ConfigFactory.parse_file(input_file)

        res = cls.convert(config, output_format, indent, compact)
        if output_file is None:
            print(res)
        else:
            with open(output_file, "w") as fd:
                fd.write(res)

    @classmethod
    def __escape_match(cls, match):
        char = match.group(0)
        return {
            '\b': r'\b',
            '\t': r'\t',
            '\n': r'\n',
            '\f': r'\f',
            '\r': r'\r',
            '"': r'\"',
            '\\': r'\\',
        }.get(char) or (r'\u%04x' % ord(char))

    @classmethod
    def __escape_string(cls, string):
        return re.sub(r'[\x00-\x1F"\\]', cls.__escape_match, string)
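As a quick illustration of the converter entry points (a minimal sketch; the output shown is approximate):

    from clearml_agent.external.pyhocon import ConfigFactory
    from clearml_agent.external.pyhocon.converter import HOCONConverter

    config = ConfigFactory.parse_string('agent { worker_id = "w1", cpu_only = true }')
    print(HOCONConverter.convert(config, output_format='yaml'))
    # agent:
    #   worker_id: w1
    #   cpu_only: true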
clearml_agent/external/pyhocon/exceptions.py (vendored, new file, 17 lines)
@@ -0,0 +1,17 @@
class ConfigException(Exception):

    def __init__(self, message, ex=None):
        super(ConfigException, self).__init__(message)
        self._exception = ex


class ConfigMissingException(ConfigException, KeyError):
    pass


class ConfigSubstitutionException(ConfigException):
    pass


class ConfigWrongTypeException(ConfigException):
    pass
@@ -1,22 +1,26 @@
 import os
+import re
 import warnings
 
+from clearml_agent.definitions import PIP_EXTRA_INDICES
+
 from .requirement import Requirement
 
 
-def parse(reqstr):
+def parse(reqstr, cwd=None):
     """
     Parse a requirements file into a list of Requirements
 
     See: pip/req.py:parse_requirements()
 
     :param reqstr: a string or file like object containing requirements
+    :param cwd: Optional current working dir for -r file.txt loading
     :returns: a *generator* of Requirement objects
     """
     filename = getattr(reqstr, 'name', None)
     try:
         # Python 2.x compatibility
-        if not isinstance(reqstr, basestring):
+        if not isinstance(reqstr, basestring):  # noqa
             reqstr = reqstr.read()
     except NameError:
         # Python 3.x only
@@ -30,18 +34,25 @@ def parse(reqstr):
         elif not line or line.startswith('#'):
             # comments are lines that start with # only
             continue
-        elif line.startswith('-r') or line.startswith('--requirement'):
+        elif line.startswith('-r ') or line.startswith('--requirement '):
             _, new_filename = line.split()
-            new_file_path = os.path.join(os.path.dirname(filename or '.'),
-                                         new_filename)
+            new_file_path = os.path.join(
+                os.path.dirname(filename or '.') if filename or not cwd else cwd, new_filename)
+            if not os.path.exists(new_file_path):
+                continue
             with open(new_file_path) as f:
                 for requirement in parse(f):
                     yield requirement
         elif line.startswith('-f') or line.startswith('--find-links') or \
                 line.startswith('-i') or line.startswith('--index-url') or \
-                line.startswith('--extra-index-url') or \
                 line.startswith('--no-index'):
             warnings.warn('Private repos not supported. Skipping.')
+        elif line.startswith('--extra-index-url'):
+            extra_index = line[len('--extra-index-url'):].strip()
+            extra_index = re.sub(r"\s+#.*$", "", extra_index)  # strip comments
+            if extra_index and extra_index not in PIP_EXTRA_INDICES:
+                PIP_EXTRA_INDICES.append(extra_index)
+                print(f"appended {extra_index} to list of extra pip indices")
+            continue
         elif line.startswith('-Z') or line.startswith('--always-unzip'):
             warnings.warn('Unused option --always-unzip. Skipping.')
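A minimal sketch of how the updated parser is driven (the `line` attribute on the yielded Requirement objects is how requirements-parser exposes the raw requirement text):

    from clearml_agent.external.requirements_parser.parse import parse

    reqs = list(parse("six==1.16.0\n-r extra.txt\n", cwd="/tmp"))
    # '-r extra.txt' is resolved relative to cwd and silently skipped if the file is missing
    print([r.line for r in reqs])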
@@ -20,6 +20,15 @@ VCS_REGEX = re.compile(
     r'(#(?P<fragment>\S+))?'
 )
 
+VCS_EXT_REGEX = re.compile(
+    r'^(?P<scheme>{0})(@)'.format(r'|'.join(
+        [scheme.replace('+', r'\+') for scheme in ['git+git']])) +
+    r'((?P<login>[^/@]+)@)?'
+    r'(?P<path>[^#@]+)'
+    r'(@(?P<revision>[^#]+))?'
+    r'(#(?P<fragment>\S+))?'
+)
+
 # This matches just about everything
 LOCAL_REGEX = re.compile(
     r'^((?P<scheme>file)://)?'
@@ -30,7 +39,7 @@ LOCAL_REGEX = re.compile(
 
 class Requirement(object):
     """
-    Represents a single requirementfrom clearml_agent.external.requirements_parser.requirement import Requirement
+    Represents a single requirement from clearml_agent.external.requirements_parser.requirement import Requirement
 
     Typically instances of this class are created with ``Requirement.parse``.
     For local file requirements, there's no verification that the file
@@ -100,7 +109,7 @@ class Requirement(object):
 
         req = cls('-e {0}'.format(line))
         req.editable = True
-        vcs_match = VCS_REGEX.match(line)
+        vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
         local_match = LOCAL_REGEX.match(line)
 
         if vcs_match is not None:
@@ -147,7 +156,7 @@ class Requirement(object):
 
         req = cls(line)
 
-        vcs_match = VCS_REGEX.match(line)
+        vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
         uri_match = URI_REGEX.match(line)
         local_match = LOCAL_REGEX.match(line)
 
@@ -205,6 +214,7 @@ class Requirement(object):
     def parse(cls, line):
        """
         Parses a Requirement from a line of a requirement file.
+        This is the main entry point for parsing a single requirements line (not parse_line!)
 
         :param line: a line of a requirement file
         :returns: a Requirement instance for the given line
@@ -217,7 +227,7 @@ class Requirement(object):
             return cls.parse_editable(
                 re.sub(r'^(-e|--editable=?)\s*', '', line))
         elif '@' in line and ('#' not in line or line.index('#') > line.index('@')):
-            # Allegro bug fix: support 'name @ git+' entries
+            # ClearML bug fix: support 'name @ git+' entries
             name, uri = line.split('@', 1)
             name = name.strip()
             uri = uri.strip()
@@ -226,7 +236,7 @@ class Requirement(object):
             # check if the name is valid & parsed
             Req.parse(name)
             # if we are here, name is a valid package name, check if the vcs part is valid
-            if VCS_REGEX.match(uri):
+            if VCS_REGEX.match(uri) or VCS_EXT_REGEX.match(uri):
                 req = cls.parse_line(uri)
                 req.name = name
                 return req
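The net effect of VCS_EXT_REGEX plus the 'name @ git+' branch is that scp-like git URLs are recognized alongside pip-style direct references. An illustrative sketch (treat the attribute access and exact output as indicative, not normative):

    from clearml_agent.external.requirements_parser.requirement import (
        Requirement, VCS_EXT_REGEX,
    )

    req = Requirement.parse('clearml @ git+https://github.com/clearml/clearml.git@master')
    print(req.name)  # expected: clearml

    m = VCS_EXT_REGEX.match('git+git@github.com:user/repo.git@v1.0')
    print(m.group('path'), m.group('revision'))  # github.com:user/repo.git v1.0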
clearml_agent/glue/daemon.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from threading import Thread

from clearml_agent.session import Session


class K8sDaemon(Thread):

    def __init__(self, agent):
        super(K8sDaemon, self).__init__(target=self.target)
        self.daemon = True
        self._agent = agent
        self.log = agent.log
        self._session: Session = agent._session

    def target(self):
        pass
clearml_agent/glue/definitions.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from clearml_agent.helper.environment import EnvEntry

ENV_START_AGENT_SCRIPT_PATH = EnvEntry("CLEARML_K8S_GLUE_START_AGENT_SCRIPT_PATH", default="~/__start_agent__.sh")
"""
Script path to use when creating the bash script to run the agent inside the scheduled pod's docker container.
The script will be appended to the specified file.
"""

ENV_DEFAULT_EXECUTION_AGENT_ARGS = EnvEntry("K8S_GLUE_DEF_EXEC_AGENT_ARGS", default="--full-monitoring --require-queue")
ENV_POD_AGENT_INSTALL_ARGS = EnvEntry("K8S_GLUE_POD_AGENT_INSTALL_ARGS", default="", lstrip=False)
ENV_POD_MONITOR_LOG_BATCH_SIZE = EnvEntry("K8S_GLUE_POD_MONITOR_LOG_BATCH_SIZE", default=5, converter=int)
ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION = EnvEntry(
    "K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", default=False, converter=bool
)

ENV_POD_USE_IMAGE_ENTRYPOINT = EnvEntry("K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT", default=False, converter=bool)
"""
Do not inject a cmd and args into the container's image when building the k8s template (depend on the built-in
image entrypoint).
"""
clearml_agent/glue/errors.py (new file, 12 lines)
@@ -0,0 +1,12 @@

class GetPodsError(Exception):
    pass


class GetJobsError(Exception):
    pass


class GetPodCountError(Exception):
    pass
(File diff suppressed because it is too large)

clearml_agent/glue/pending_pods_daemon.py (new file, 249 lines)
@@ -0,0 +1,249 @@
from time import sleep
from typing import Dict, Tuple, Optional, List

from clearml_agent.backend_api.session import Request
from clearml_agent.glue.utilities import get_bash_output

from clearml_agent.helper.process import stringify_bash_output

from .daemon import K8sDaemon
from .utilities import get_path
from .errors import GetPodsError
from .definitions import ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION


class PendingPodsDaemon(K8sDaemon):
    def __init__(self, polling_interval: float, agent):
        super(PendingPodsDaemon, self).__init__(agent=agent)
        self._polling_interval = polling_interval
        self._last_tasks_msgs = {}  # last msg updated for every task

    def get_pods(self, pod_name=None, debug_msg="Detecting pending pods: {cmd}"):
        filters = ["status.phase=Pending"]
        if pod_name:
            filters.append(f"metadata.name={pod_name}")

        if self._agent.using_jobs:
            return self._agent.get_pods_for_jobs(
                job_condition="status.active=1", pod_filters=filters, debug_msg=debug_msg
            )
        return self._agent.get_pods(filters=filters, debug_msg=debug_msg)

    def _get_pod_name(self, pod: dict):
        return get_path(pod, "metadata", "name")

    def _get_k8s_resource_name(self, pod: dict):
        if self._agent.using_jobs:
            return get_path(pod, "metadata", "labels", "job-name")
        return get_path(pod, "metadata", "name")

    def _get_task_id(self, pod: dict):
        return self._get_k8s_resource_name(pod).rpartition('-')[-1]

    @staticmethod
    def _get_k8s_resource_namespace(pod: dict):
        return pod.get('metadata', {}).get('namespace', None)

    def target(self):
        """
        Handle pending objects (pods or jobs, depending on the agent mode).
        - Delete any pending objects that are not expected to recover
        - Delete any pending objects for which the associated task was aborted
        """
        while True:
            # noinspection PyBroadException
            try:
                # Get pods (standalone pods if we're in pods mode, or pods associated to jobs if we're in jobs mode)
                pods = self.get_pods()
                if pods is None:
                    raise GetPodsError()

                task_id_to_pod = dict()

                for pod in pods:
                    pod_name = self._get_pod_name(pod)
                    if not pod_name:
                        continue

                    task_id = self._get_task_id(pod)
                    if not task_id:
                        continue

                    namespace = self._get_k8s_resource_namespace(pod)
                    if not namespace:
                        continue

                    updated_pod = self.get_pods(pod_name=pod_name, debug_msg="Refreshing pod information: {cmd}")
                    if not updated_pod:
                        continue
                    pod = updated_pod[0]

                    task_id_to_pod[task_id] = pod

                    msg = None
                    tags = []

                    waiting = get_path(pod, 'status', 'containerStatuses', 0, 'state', 'waiting')
                    if not waiting:
                        condition = get_path(pod, 'status', 'conditions', 0)
                        if condition:
                            reason = condition.get('reason')
                            if reason == 'Unschedulable':
                                message = condition.get('message')
                                msg = reason + (" ({})".format(message) if message else "")
                    else:
                        reason = waiting.get("reason", None)
                        message = waiting.get("message", None)

                        msg = reason + (" ({})".format(message) if message else "")

                        if reason == 'ImagePullBackOff':
                            self.delete_k8s_resource(k8s_resource=pod, msg=reason)
                            try:
                                self._session.api_client.tasks.failed(
                                    task=task_id,
                                    status_reason="K8S glue error: {}".format(msg),
                                    status_message="Changed by K8S glue",
                                    force=True
                                )
                                self._agent.send_logs(
                                    task_id, ["K8S Error: {}".format(msg)],
                                    session=self._session
                                )
                            except Exception as ex:
                                self.log.warning(
                                    'K8S Glue pending monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
                                )

                            # clean up any msg for this task
                            self._last_tasks_msgs.pop(task_id, None)
                            continue

                    self._update_pending_task_msg(task_id, msg, tags)

                if task_id_to_pod:
                    self._process_tasks_for_pending_pods(task_id_to_pod)

                # clean up any last message for a task that wasn't seen as a pod
                self._last_tasks_msgs = {k: v for k, v in self._last_tasks_msgs.items() if k in task_id_to_pod}
            except GetPodsError:
                pass
            except Exception:
                self.log.exception("Hanging pods daemon loop")

            sleep(self._polling_interval)

    def delete_k8s_resource(self, k8s_resource: dict, msg: str = None):
        delete_cmd = "kubectl delete {kind} {name} -n {namespace} --output name".format(
            kind=self._agent.kind,
            name=self._get_k8s_resource_name(k8s_resource),
            namespace=self._get_k8s_resource_namespace(k8s_resource)
        ).strip()
        self.log.debug(" - deleting {} {}: {}".format(self._agent.kind, (" " + msg) if msg else "", delete_cmd))
        return get_bash_output(delete_cmd).strip()

    def _process_tasks_for_pending_pods(self, task_id_to_details: Dict[str, dict]):
        self._handle_aborted_tasks(task_id_to_details)

    def _handle_aborted_tasks(self, pending_tasks_details: Dict[str, dict]):
        try:
            result = self._session.get(
                service='tasks',
                action='get_all',
                json={
                    "id": list(pending_tasks_details),
                    "status": ["stopped"],
                    "only_fields": ["id"]
                }
            )
            aborted_task_ids = list(filter(None, (task.get("id") for task in result["tasks"])))

            for task_id in aborted_task_ids:
                pod = pending_tasks_details.get(task_id)
                if not pod:
                    self.log.error("Failed locating aborted task {} in pending pods list".format(task_id))
                    continue

                pod_name = self._get_pod_name(pod)
                if not self.get_pods(pod_name=pod_name):
                    self.log.debug("K8S Glue pending monitor: pod {} is no longer pending, skipping".format(pod_name))
                    continue

                resource_name = self._get_k8s_resource_name(pod)
                self.log.info(
                    "K8S Glue pending monitor: task {} was aborted but the k8s resource {} is still pending, "
                    "deleting pod".format(task_id, resource_name)
                )

                result = self._session.get(
                    service='tasks',
                    action='get_all',
                    json={"id": [task_id], "status": ["stopped"], "only_fields": ["id"]},
                )
                if not result["tasks"]:
                    self.log.debug("K8S Glue pending monitor: task {} is no longer aborted, skipping".format(task_id))
                    continue

                output = self.delete_k8s_resource(k8s_resource=pod, msg="Pending resource of an aborted task")
                if not output:
                    self.log.warning("K8S Glue pending monitor: failed deleting resource {}".format(resource_name))
        except Exception as ex:
            self.log.warning(
                'K8S Glue pending monitor: failed checking aborted tasks for pending resources: {}'.format(ex)
            )

    def _update_pending_task_msg(self, task_id: str, msg: str, tags: List[str] = None):
        if not msg or self._last_tasks_msgs.get(task_id, None) == (msg, tags):
            return
        try:
            if ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION.get():
                # This disables the option to enqueue the task, which is supposed to sync the ClearML task status
                # in case the pod was preempted. In some cases this does not happen due to preemption but due to
                # cluster communication lag issues that cause us not to discover the pod is no longer pending and
                # to enqueue the task when it's actually already running, thus essentially killing the task
                pass
            else:
                # Make sure the task is queued
                result = self._session.send_request(
                    service='tasks',
                    action='get_all',
                    json={"id": task_id, "only_fields": ["status"]},
                    method=Request.def_method,
                    async_enable=False,
                )
                if result.ok:
                    status = get_path(result.json(), 'data', 'tasks', 0, 'status')
                    # if the task is in progress, change its status back to enqueued
                    if status == "in_progress":
                        result = self._session.send_request(
                            service='tasks', action='enqueue',
                            json={
                                "task": task_id, "force": True, "queue": self._agent.k8s_pending_queue_id
                            },
                            method=Request.def_method,
                            async_enable=False,
                        )
                        if not result.ok:
                            result_msg = get_path(result.json(), 'meta', 'result_msg')
                            self.log.debug(
                                "K8S Glue pods monitor: failed forcing task status change"
                                " for pending task {}: {}".format(task_id, result_msg)
                            )

            # Update task status message
            payload = {"task": task_id, "status_message": "K8S glue status: {}".format(msg)}
            if tags:
                payload["tags"] = tags
            result = self._session.send_request('tasks', 'update', json=payload, method=Request.def_method)
            if not result.ok:
                result_msg = get_path(result.json(), 'meta', 'result_msg')
                raise Exception(result_msg or result.text)

            # update the last msg for this task
            self._last_tasks_msgs[task_id] = msg
        except Exception as ex:
            self.log.warning(
                'K8S Glue pods monitor: Failed setting status message for task "{}"\nMSG: {}\nEX: {}'.format(
                    task_id, msg, ex
                )
            )
clearml_agent/glue/utilities.py (new file, 18 lines)
@@ -0,0 +1,18 @@
import functools

from subprocess import DEVNULL

from clearml_agent.helper.process import get_bash_output as _get_bash_output


def get_path(d, *path, default=None):
    try:
        return functools.reduce(
            lambda a, b: a[b], path, d
        )
    except (IndexError, KeyError):
        return default


def get_bash_output(cmd, stderr=DEVNULL, raise_error=False):
    return _get_bash_output(cmd, stderr=stderr, raise_error=raise_error)
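get_path is the nested-lookup helper used throughout the glue daemons above; a quick illustration of its behavior:

    from clearml_agent.glue.utilities import get_path

    pod = {"status": {"conditions": [{"reason": "Unschedulable"}]}}
    print(get_path(pod, "status", "conditions", 0, "reason"))   # Unschedulable
    print(get_path(pod, "status", "phase", default="Unknown"))  # Unknown (missing key)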
@@ -14,29 +14,30 @@ import sys
|
||||
import tempfile
|
||||
from abc import ABCMeta
|
||||
from collections import OrderedDict
|
||||
from distutils.spawn import find_executable
|
||||
from functools import total_ordering
|
||||
from typing import Text, Dict, Any, Optional, AnyStr, IO, Union
|
||||
|
||||
import attr
|
||||
import furl
|
||||
import pyhocon
|
||||
import six
|
||||
import yaml
|
||||
from attr import fields_dict
|
||||
from pathlib2 import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
import six
|
||||
from six.moves import reduce
|
||||
|
||||
from clearml_agent.errors import CommandFailedError
|
||||
from clearml_agent.external import pyhocon
|
||||
from clearml_agent.helper.dicts import filter_keys
|
||||
|
||||
pretty_lines = False
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
use_powershell = os.getenv("CLEARML_AGENT_USE_POWERSHELL", None)
|
||||
|
||||
|
||||
def which(cmd, path=None):
|
||||
from clearml_agent.helper.process import find_executable
|
||||
result = find_executable(cmd, path)
|
||||
if not result:
|
||||
raise ValueError('command "{}" not found'.format(cmd))
|
||||
@@ -53,7 +54,7 @@ def select_for_platform(linux, windows):
|
||||
|
||||
|
||||
def bash_c():
|
||||
return 'bash -c' if not is_windows_platform() else 'cmd /c'
|
||||
return 'bash -c' if not is_windows_platform() else ('powershell -Command' if use_powershell else 'cmd /c')
|
||||
|
||||
|
||||
def return_list(arg):
|
||||
@@ -205,10 +206,13 @@ def get_python_path(script_dir, entry_point, package_api, is_conda_env=False):
|
||||
["-c", "import sys; print('{}'.join(sys.path))".format(python_path_sep)])
|
||||
org_python_path = python_path_cmd.get_output(cwd=script_dir)
|
||||
# Add path of the script directory and executable directory
|
||||
python_path = '{}{python_path_sep}{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(),
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
python_path = '{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(), python_path_sep=python_path_sep)
|
||||
if entry_point:
|
||||
python_path += '{}{python_path_sep}'.format(
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
|
||||
if is_windows_platform():
|
||||
python_path = python_path.replace('/', '\\')
|
||||
|
||||
@@ -399,12 +403,6 @@ class TqdmStream(object):
|
||||
self.buffer.write('\n')
|
||||
|
||||
|
||||
class TqdmLog(tqdm):
|
||||
|
||||
def __init__(self, iterable=None, file=None, **kwargs):
|
||||
super(TqdmLog, self).__init__(iterable, file=TqdmStream(file or sys.stderr), **kwargs)
|
||||
|
||||
|
||||
def url_join(first, *rest):
|
||||
"""
|
||||
Join url parts similarly to Path.join
|
||||
@@ -422,6 +420,7 @@ def mkstemp(
|
||||
open_kwargs=None, # type: Optional[Dict[Text, Any]]
|
||||
text=True, # type: bool
|
||||
name_only=False, # type: bool
|
||||
mode=None, # type: str
|
||||
*args,
|
||||
**kwargs):
|
||||
# type: (...) -> Union[(IO[AnyStr], Text), Text]
|
||||
@@ -431,12 +430,14 @@ def mkstemp(
|
||||
:param open_kwargs: keyword arguments for ``io.open``
|
||||
:param text: open in text mode
|
||||
:param name_only: close the file and return its name
|
||||
:param mode: open file mode
|
||||
:param args: tempfile.mkstemp args
|
||||
:param kwargs: tempfile.mkstemp kwargs
|
||||
"""
|
||||
fd, name = tempfile.mkstemp(text=text, *args, **kwargs)
|
||||
mode = 'w+'
|
||||
if not text:
|
||||
if not mode:
|
||||
mode = 'w+'
|
||||
if not text and 'b' not in mode:
|
||||
mode += 'b'
|
||||
if name_only:
|
||||
os.close(fd)
|
||||
@@ -510,6 +511,68 @@ def is_conda(config):
    return config['agent.package_manager.type'].lower() == 'conda'


+def convert_cuda_version_to_float_single_digit_str(cuda_version):
+   """
+   Convert a cuda_version (string/float/int) into a float representation, e.g. 11.4
+   Note: the returned string keeps a single decimal digit only!
+   :return str:
+   """
+   cuda_version = str(cuda_version or 0)
+   # if we have a patch version we parse it here
+   cuda_version_parts = [int(v) for v in cuda_version.split('.')]
+   if len(cuda_version_parts) > 1 or cuda_version_parts[0] < 60:
+       cuda_version = 10 * cuda_version_parts[0]
+       if len(cuda_version_parts) > 1:
+           cuda_version += float(".{:d}".format(cuda_version_parts[1]))*10
+
+       cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
+   else:
+       cuda_version = cuda_version_parts[0]
+       cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
+
+   return cuda_version_full
+
+
+def convert_cuda_version_to_int_10_base_str(cuda_version):
+   """
+   Convert a cuda_version (string/float/int) into an integer version, e.g. 112 for cuda 11.2
+   Returns a string
+   :return str:
+   """
+   cuda_version = convert_cuda_version_to_float_single_digit_str(cuda_version)
+   return str(int(float(cuda_version)*10))
+
+
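(Editorial aside, not part of the diff: illustrative return values for the two converters above.)

convert_cuda_version_to_float_single_digit_str("11.2.152")  # -> "11.2" (patch level dropped)
convert_cuda_version_to_float_single_digit_str(112)         # -> "11.2"
convert_cuda_version_to_int_10_base_str("11.2")             # -> "112"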
+def get_python_version(python_executable, log=None):
+   from clearml_agent.helper.process import Argv
+   try:
+       output = Argv(python_executable, "--version").get_output(
+           stderr=subprocess.STDOUT
+       )
+   except subprocess.CalledProcessError as ex:
+       # Windows returns code 9009 and suggests installing Python from the Windows Store
+       if is_windows_platform() and ex.returncode == 9009:
+           if log:
+               log.debug("version not found: {}".format(ex))
+       else:
+           if log:
+               log.warning("error getting %s version: %s", python_executable, ex)
+       return None
+   except FileNotFoundError as ex:
+       if log:
+           log.debug("version not found: {}".format(ex))
+       return None
+
+   match = re.search(r"Python ({}(?:\.\d+)*)".format(r"\d+"), output)
+   if match:
+       if log:
+           log.debug("Found: {}".format(python_executable))
+       # only return major.minor version
+       return ".".join(str(match.group(1)).split(".")[:2])
+
+   return None
+
+
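(Editorial aside: a hypothetical call, assuming the interpreter prints "Python X.Y.Z".)

get_python_version("python3.8")  # -> "3.8" (patch level is dropped), or None on failure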
class NonStrictAttrs(object):

    @classmethod
@@ -2,7 +2,7 @@ from __future__ import unicode_literals, print_function

import csv
import sys
-from collections import Iterable
+from collections.abc import Iterable
from typing import List, Dict, Text, Any

from attr import attrs, attrib
@@ -1,17 +1,23 @@
-from typing import Callable, Dict, Any
+from typing import Callable, Dict, Any, Optional
+
+_not_set = object()


def filter_keys(filter_, dct):  # type: (Callable[[Any], bool], Dict) -> Dict
    return {key: value for key, value in dct.items() if filter_(key)}


-def merge_dicts(dict1, dict2):
+def merge_dicts(dict1, dict2, custom_merge_func=None):
+   # type: (Any, Any, Optional[Callable[[str, Any, Any, Any], Any]]) -> Any
    """ Recursively merges dict2 into dict1 """
    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
        return dict2
    for k in dict2:
        if k in dict1:
-           dict1[k] = merge_dicts(dict1[k], dict2[k])
+           res = None
+           if custom_merge_func:
+               res = custom_merge_func(k, dict1[k], dict2[k], _not_set)
+           dict1[k] = merge_dicts(dict1[k], dict2[k], custom_merge_func) if res is _not_set else res
        else:
            dict1[k] = dict2[k]
    return dict1
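(Editorial aside, hypothetical values: the custom_merge_func hook returns the sentinel to fall back to the default recursive merge.)

# keep the larger value for "priority", default-merge everything else
def prefer_larger(key, v1, v2, not_set):
    return max(v1, v2) if key == "priority" else not_set

d1 = {"priority": 1, "nested": {"a": 1}}
d2 = {"priority": 5, "nested": {"b": 2}}
merge_dicts(d1, d2, custom_merge_func=prefer_larger)
# -> {"priority": 5, "nested": {"a": 1, "b": 2}}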
clearml_agent/helper/docker_args.py (new file, 169 lines)
@@ -0,0 +1,169 @@
import re
import shlex
from typing import Tuple, List, TYPE_CHECKING
from urllib.parse import urlunparse, urlparse

from clearml_agent.definitions import (
    ENV_AGENT_GIT_PASS,
    ENV_AGENT_SECRET_KEY,
    ENV_AWS_SECRET_KEY,
    ENV_AZURE_ACCOUNT_KEY,
    ENV_AGENT_AUTH_TOKEN,
    ENV_DOCKER_IMAGE,
    ENV_DOCKER_ARGS_HIDE_ENV,
)

if TYPE_CHECKING:
    from clearml_agent.session import Session


def sanitize_urls(s: str) -> Tuple[str, bool]:
    """
    Replaces passwords in URLs with asterisks.
    Returns the sanitized string and a boolean indicating whether sanitization was performed.
    """
    regex = re.compile("^([^:]*:)[^@]+(.*)$")
    tokens = re.split(r"\s", s)
    changed = False
    for k in range(len(tokens)):
        if "@" in tokens[k]:
            res = urlparse(tokens[k])
            if regex.match(res.netloc):
                changed = True
                tokens[k] = urlunparse((
                    res.scheme,
                    regex.sub("\\1********\\2", res.netloc),
                    res.path,
                    res.params,
                    res.query,
                    res.fragment
                ))
    return " ".join(tokens) if changed else s, changed
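(Editorial aside, illustrative only: a password embedded in a git URL is masked and the change is flagged.)

sanitize_urls("pip install git+https://user:secret@github.com/org/repo.git")
# -> ("pip install git+https://user:********@github.com/org/repo.git", True)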
class DockerArgsSanitizer:
    @classmethod
    def sanitize_docker_command(cls, session, docker_command):
        # type: (Session, List[str]) -> List[str]
        if not docker_command:
            return docker_command

        enabled = (
            session.config.get('agent.hide_docker_command_env_vars.enabled', False) or ENV_DOCKER_ARGS_HIDE_ENV.get()
        )
        if not enabled:
            return docker_command

        keys = set(session.config.get('agent.hide_docker_command_env_vars.extra_keys', []))
        if ENV_DOCKER_ARGS_HIDE_ENV.get():
            keys.update(shlex.split(ENV_DOCKER_ARGS_HIDE_ENV.get().strip()))
        keys.update(
            ENV_AGENT_GIT_PASS.vars,
            ENV_AGENT_SECRET_KEY.vars,
            ENV_AWS_SECRET_KEY.vars,
            ENV_AZURE_ACCOUNT_KEY.vars,
            ENV_AGENT_AUTH_TOKEN.vars,
        )

        parse_embedded_urls = bool(session.config.get(
            'agent.hide_docker_command_env_vars.parse_embedded_urls', True
        ))

        skip_next = False
        result = docker_command[:]
        for i, item in enumerate(docker_command):
            if skip_next:
                skip_next = False
                continue
            try:
                if item in ("-e", "--env"):
                    key, sep, val = result[i + 1].partition("=")
                    if not sep:
                        continue
                    if key in ENV_DOCKER_IMAGE.vars:
                        # special case - this contains a complete docker command
                        val = " ".join(cls.sanitize_docker_command(session, re.split(r"\s", val)))
                    elif key in keys:
                        val = "********"
                    elif parse_embedded_urls:
                        val = sanitize_urls(val)[0]
                    result[i + 1] = "{}={}".format(key, val)
                    skip_next = True
                elif parse_embedded_urls and not item.startswith("-"):
                    item, changed = sanitize_urls(item)
                    if changed:
                        result[i] = item
            except (KeyError, TypeError):
                pass

        return result

    @staticmethod
    def get_list_of_switches(docker_args: List[str]) -> List[str]:
        args = []
        for token in docker_args:
            if token.strip().startswith("-"):
                args += [token.strip().split("=")[0].lstrip("-")]

        return args

    @staticmethod
    def filter_switches(docker_args: List[str], exclude_switches: List[str]) -> List[str]:
        # shortcut if we are sure we have no matches
        if (not exclude_switches or
                not any("-{}".format(s) in " ".join(docker_args) for s in exclude_switches)):
            return docker_args

        args = []
        in_switch_args = True
        for token in docker_args:
            if token.strip().startswith("-"):
                if "=" in token:
                    switch = token.strip().split("=")[0]
                    in_switch_args = False
                else:
                    switch = token
                    in_switch_args = True

                if switch.lstrip("-") in exclude_switches:
                    # if in excluded, skip the switch and the arguments that follow it
                    in_switch_args = False
                else:
                    args += [token]

            elif in_switch_args:
                args += [token]
            else:
                # this is a switch argument we need to skip
                pass

        return args

    @staticmethod
    def merge_docker_args(config, task_docker_arguments: List[str], extra_docker_arguments: List[str]) -> List[str]:
        base_cmd = []
        # currently only resolving --privileged, --security-opt, --network, --ipc
        override_switches = config.get(
            "agent.protected_docker_extra_args",
            ["privileged", "security-opt", "network", "ipc"]
        )

        if config.get("agent.docker_args_extra_precedes_task", True):
            switches = []
            if extra_docker_arguments:
                switches = DockerArgsSanitizer.get_list_of_switches(extra_docker_arguments)
                switches = list(set(switches) & set(override_switches))
            base_cmd += [str(a) for a in extra_docker_arguments if a]
            if task_docker_arguments:
                docker_arguments = DockerArgsSanitizer.filter_switches(task_docker_arguments, switches)
                base_cmd += [a for a in docker_arguments if a]
        else:
            switches = []
            if task_docker_arguments:
                switches = DockerArgsSanitizer.get_list_of_switches(task_docker_arguments)
                switches = list(set(switches) & set(override_switches))
            base_cmd += [a for a in task_docker_arguments if a]
            if extra_docker_arguments:
                extra_docker_arguments = DockerArgsSanitizer.filter_switches(extra_docker_arguments, switches)
                base_cmd += [a for a in extra_docker_arguments if a]
        return base_cmd
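(Editorial aside, hypothetical call; `config` stands for any agent configuration object exposing `get()` as used above.)

merged = DockerArgsSanitizer.merge_docker_args(
    config,
    task_docker_arguments=["--network", "host", "-e", "FOO=bar"],
    extra_docker_arguments=["--network", "bridge"],
)
# with the default docker_args_extra_precedes_task=True, the task's protected
# --network switch (and its argument) is dropped in favor of the agent's:
# -> ["--network", "bridge", "-e", "FOO=bar"]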
clearml_agent/helper/environment/__init__.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from .entry import Entry, NotSet
from .environment import EnvEntry

__all__ = [
    'Entry',
    'NotSet',
    'EnvEntry',
]
clearml_agent/helper/environment/converters.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import base64
from typing import Union, Optional, Any, TypeVar, Callable, Tuple

import six

try:
    from typing import Text
except ImportError:
    # windows conda-less hack
    Text = Any


ConverterType = TypeVar("ConverterType", bound=Callable[[Any], Any])


def base64_to_text(value):
    # type: (Any) -> Text
    return base64.b64decode(value).decode("utf-8")


def text_to_int(value, default=0):
    # type: (Any, int) -> int
    try:
        return int(value)
    except (ValueError, TypeError):
        return default


def text_to_bool(value):
    # type: (Text) -> bool
    return bool(strtobool(value))


def safe_text_to_bool(value):
    # type: (Text) -> bool
    try:
        return text_to_bool(value)
    except ValueError:
        return bool(value)


def any_to_bool(value):
    # type: (Optional[Union[int, float, Text]]) -> bool
    if isinstance(value, six.text_type):
        return text_to_bool(value)
    return bool(value)


# noinspection PyIncorrectDocstring
def or_(*converters, **kwargs):
    # type: (ConverterType, Tuple[Exception, ...]) -> ConverterType
    """
    Wrapper that implements an "optional converter" pattern. Allows specifying a chain of converters
    for which a set of exceptions is ignored (and the original value is returned)
    :param converters: One or more converter callables
    :param exceptions: A tuple of exception types to ignore
    """
    # noinspection PyUnresolvedReferences
    exceptions = kwargs.get("exceptions", (ValueError, TypeError))

    def wrapper(value):
        for converter in converters:
            try:
                return converter(value)
            except exceptions:
                pass
        return value

    return wrapper


def strtobool(val):
    """Convert a string representation of truth to true (1) or false (0).

    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
    are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
    'val' is anything else.
    """
    val = val.lower()
    if val in ('y', 'yes', 't', 'true', 'on', '1'):
        return 1
    elif val in ('n', 'no', 'f', 'false', 'off', '0'):
        return 0
    else:
        raise ValueError("invalid truth value %r" % (val,))
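(Editorial aside, illustrative only: chaining converters with or_.)

parse = or_(int, text_to_bool)  # ValueError/TypeError are ignored by default
parse("42")     # -> 42
parse("yes")    # -> True   (int() fails, text_to_bool() succeeds)
parse("maybe")  # -> "maybe" (both fail, the original value is returned)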
clearml_agent/helper/environment/entry.py (new file, 134 lines)
@@ -0,0 +1,134 @@
import abc
from typing import Optional, Any, Tuple, Callable, Dict

import six

from .converters import any_to_bool

try:
    from typing import Text
except ImportError:
    # windows conda-less hack
    Text = Any


NotSet = object()

Converter = Callable[[Any], Any]


@six.add_metaclass(abc.ABCMeta)
class Entry(object):
    """
    Configuration entry definition
    """

    def default_conversions(self):
        # type: () -> Dict[Any, Converter]

        if self.lstrip and self.rstrip:

            def str_convert(s):
                return six.text_type(s).strip()

        elif self.lstrip:

            def str_convert(s):
                return six.text_type(s).lstrip()

        elif self.rstrip:

            def str_convert(s):
                return six.text_type(s).rstrip()

        else:

            def str_convert(s):
                return six.text_type(s)

        return {
            bool: lambda x: any_to_bool(x.strip()),
            six.text_type: str_convert,
        }

    def __init__(self, key, *more_keys, **kwargs):
        # type: (Text, Text, Any) -> None
        """
        :param key: Entry's key (at least one).
        :param more_keys: More alternate keys for this entry.
        :param type: Value type. If provided, will be used when choosing a default conversion or
            (if none exists) for casting the environment value.
        :param converter: Value converter. If provided, will be used to convert the environment value.
        :param default: Default value. If provided, will be used as the default value on calls to get() and get_pair()
            in case no value is found for any key and no specific default value was provided in the call.
            Default value is None.
        :param help: Help text describing this entry
        """
        self.keys = (key,) + more_keys
        self.type = kwargs.pop("type", six.text_type)
        self.converter = kwargs.pop("converter", None)
        self.default = kwargs.pop("default", None)
        self.help = kwargs.pop("help", None)
        self.lstrip = kwargs.pop("lstrip", True)
        self.rstrip = kwargs.pop("rstrip", True)

    def __str__(self):
        return str(self.key)

    @property
    def key(self):
        return self.keys[0]

    def convert(self, value, converter=None):
        # type: (Any, Converter) -> Optional[Any]
        converter = converter or self.converter
        if not converter:
            converter = self.default_conversions().get(self.type, self.type)
        return converter(value)

    def get_pair(self, default=NotSet, converter=None, value_cb=None):
        # type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]]
        for key in self.keys:
            value = self._get(key)
            if value is NotSet:
                continue
            try:
                value = self.convert(value, converter)
            except Exception as ex:
                self.error("invalid value {key}={value}: {ex}".format(**locals()))
                break
            # noinspection PyBroadException
            try:
                if value_cb:
                    value_cb(key, value)
            except Exception:
                pass
            return key, value

        result = self.default if default is NotSet else default
        return self.key, result

    def get(self, default=NotSet, converter=None, value_cb=None):
        # type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any]
        return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1]

    def set(self, value):
        # type: (Any) -> None
        # key, _ = self.get_pair(default=None, converter=None)
        for k in self.keys:
            self._set(k, str(value))

    def _set(self, key, value):
        # type: (Text, Text) -> None
        pass

    @abc.abstractmethod
    def _get(self, key):
        # type: (Text) -> Any
        pass

    @abc.abstractmethod
    def error(self, message):
        # type: (Text) -> None
        pass
clearml_agent/helper/environment/environment.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from os import getenv, environ

from .converters import text_to_bool
from .entry import Entry, NotSet


class EnvEntry(Entry):
    def default_conversions(self):
        conversions = super(EnvEntry, self).default_conversions().copy()
        conversions[bool] = lambda x: text_to_bool(x.strip())
        return conversions

    def pop(self):
        for k in self.keys:
            environ.pop(k, None)

    def _get(self, key):
        value = getenv(key, "")
        return value or NotSet

    def _set(self, key, value):
        environ[key] = value

    def __str__(self):
        return "env:{}".format(super(EnvEntry, self).__str__())

    def error(self, message):
        print("Environment configuration: {}".format(message))
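(Editorial aside: a hypothetical entry showing the lookup/fallback behavior; the variable names are examples only.)

worker_id = EnvEntry("CLEARML_WORKER_ID", "TRAINS_WORKER_ID")
worker_id.get(default="unknown")  # value of the first key that is set, else "unknown"
worker_id.set("agent-01")         # writes the value to all keys in os.environ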
@@ -15,11 +15,10 @@ from __future__ import print_function
from __future__ import unicode_literals

import json
import os.path
import platform
import sys
import time
from datetime import datetime
from typing import Optional

import psutil
from ..gpu import pynvml as N
@@ -58,6 +57,21 @@ class GPUStat(object):
        """
        return self.entry['uuid']

+   @property
+   def mig_index(self):
+       """
+       Returns the index of the MIG partition (as in nvidia-smi).
+       """
+       return self.entry.get("mig_index")
+
+   @property
+   def mig_uuid(self):
+       """
+       Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode,
+       e.g. MIG-12345678-abcd-abcd-uuid-123456abcdef
+       """
+       return self.entry.get("mig_uuid")
+
    @property
    def name(self):
        """
@@ -162,14 +176,16 @@ class GPUStatCollection(object):
    _initialized = False
    _device_count = None
    _gpu_device_info = {}
+   _mig_device_info = {}

-   def __init__(self, gpu_list, driver_version=None):
+   def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None):
        self.gpus = gpu_list

        # attach additional system information
        self.hostname = platform.node()
        self.query_time = datetime.now()
        self.driver_version = driver_version
+       self.driver_cuda_version = driver_cuda_version

    @staticmethod
    def clean_processes():
@@ -180,17 +196,18 @@ class GPUStatCollection(object):
    @staticmethod
    def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
        """Query the information of all the GPUs on local machine"""

+       initialized = False
        if not GPUStatCollection._initialized:
            N.nvmlInit()
            GPUStatCollection._initialized = True
+           initialized = True

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

-       def get_gpu_info(index, handle):
+       def get_gpu_info(index, handle, is_mig=False):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
@@ -199,10 +216,10 @@ class GPUStatCollection(object):
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
-               ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['pid'] = nv_process.pid
                # noinspection PyBroadException
                try:
+                   # ps_process = GPUStatCollection.global_processes[nv_process.pid]
                    # we do not actually use these, so no point in collecting them
                    # process['username'] = ps_process.username()
                    # # cmdline returns full path;
@@ -226,12 +243,14 @@ class GPUStatCollection(object):
                    pass
                return process

-           if not GPUStatCollection._gpu_device_info.get(index):
+           device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info
+
+           if not device_info.get(index):
                name = _decode(N.nvmlDeviceGetName(handle))
                uuid = _decode(N.nvmlDeviceGetUUID(handle))
-               GPUStatCollection._gpu_device_info[index] = (name, uuid)
+               device_info[index] = (name, uuid)

-           name, uuid = GPUStatCollection._gpu_device_info[index]
+           name, uuid = device_info[index]

            try:
                temperature = N.nvmlDeviceGetTemperature(
@@ -285,11 +304,11 @@ class GPUStatCollection(object):
            for nv_process in nv_comp_processes + nv_graphics_processes:
                try:
                    process = get_process_info(nv_process)
-                   processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset or reboot the system
-                   pass
+                   process = None
+               processes.append(process)

            # we do not actually use these, so no point in collecting them
            # # TODO: Do not block if full process info is not requested
@@ -313,7 +332,7 @@ class GPUStatCollection(object):
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
-               'processes': processes,
+               'processes': None if (processes and all(p is None for p in processes)) else processes
            }
            if per_process_stats:
                GPUStatCollection.clean_processes()
@@ -327,8 +346,36 @@ class GPUStatCollection(object):
        for index in range(GPUStatCollection._device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(index, handle)
-           gpu_stat = GPUStat(gpu_info)
-           gpu_list.append(gpu_stat)
+           mig_cnt = 0
+           # noinspection PyBroadException
+           try:
+               mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle)
+           except Exception:
+               pass
+
+           if mig_cnt <= 0:
+               gpu_list.append(GPUStat(gpu_info))
+               continue
+
+           got_mig_info = False
+           for mig_index in range(mig_cnt):
+               try:
+                   mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
+                   mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True)
+                   mig_info["mig_name"] = mig_info["name"]
+                   mig_info["name"] = gpu_info["name"]
+                   mig_info["mig_index"] = mig_info["index"]
+                   mig_info["mig_uuid"] = mig_info["uuid"]
+                   mig_info["index"] = gpu_info["index"]
+                   mig_info["uuid"] = gpu_info["uuid"]
+                   mig_info["temperature.gpu"] = gpu_info["temperature.gpu"]
+                   mig_info["fan.speed"] = gpu_info["fan.speed"]
+                   gpu_list.append(GPUStat(mig_info))
+                   got_mig_info = True
+               except Exception:
+                   pass
+           if not got_mig_info:
+               gpu_list.append(GPUStat(gpu_info))

        # 2. additional info (driver version, etc).
        if get_driver_info:
@@ -336,15 +383,32 @@ class GPUStatCollection(object):
                driver_version = _decode(N.nvmlSystemGetDriverVersion())
            except N.NVMLError:
                driver_version = None  # N/A

+           # noinspection PyBroadException
+           try:
+               cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion())
+           except BaseException:
+               # noinspection PyBroadException
+               try:
+                   cuda_driver_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
+               except BaseException:
+                   cuda_driver_version = None
+           if cuda_driver_version:
+               try:
+                   cuda_driver_version = '{}.{}'.format(
+                       int(cuda_driver_version)//1000, (int(cuda_driver_version) % 1000)//10)
+               except (ValueError, TypeError):
+                   pass
        else:
            driver_version = None
+           cuda_driver_version = None

        # no need to shutdown:
-       if shutdown:
+       if shutdown and initialized:
            N.nvmlShutdown()
            GPUStatCollection._initialized = False

-       return GPUStatCollection(gpu_list, driver_version=driver_version)
+       return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)

    def __len__(self):
        return len(self.gpus)
@@ -390,3 +454,38 @@ def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
    '''
    return GPUStatCollection.new_query(shutdown=shutdown, per_process_stats=per_process_stats,
                                       get_driver_info=get_driver_info)
+
+
+def get_driver_cuda_version():
+   # type: () -> Optional[str]
+   """
+   :return: Return the CUDA version detected from the driver. On failure the return value is None.
+       Example: `110` is cuda version 11.0
+   """
+   # noinspection PyBroadException
+   try:
+       N.nvmlInit()
+   except BaseException:
+       return None
+
+   # noinspection PyBroadException
+   try:
+       cuda_version = str(N.nvmlSystemGetCudaDriverVersion())
+   except BaseException:
+       # noinspection PyBroadException
+       try:
+           cuda_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
+       except BaseException:
+           cuda_version = ''
+
+   # noinspection PyBroadException
+   try:
+       N.nvmlShutdown()
+   except BaseException:
+       return None
+
+   # for some reason we get CUDA version 11020 instead of 11200, so this is the fix
+   if cuda_version and len(cuda_version) >= 4 and cuda_version[2] == '0' and cuda_version[3] != '0':
+       return cuda_version[:2]+cuda_version[3]
+
+   return cuda_version[:3] if cuda_version else None
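(Editorial aside, illustrative values: NVML reports the driver CUDA version as an integer such as 11020 for CUDA 11.2.)

get_driver_cuda_version()  # -> e.g. "112" for CUDA 11.2, or None when NVML is unavailable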
(File diff suppressed because it is too large)
clearml_agent/helper/os/folder_cache.py (new file, 237 lines)
@@ -0,0 +1,237 @@
import os
import shutil
from logging import warning
from random import random
from time import time
from typing import List, Optional, Sequence

import psutil
from pathlib2 import Path

from .locks import FileLock


class FolderCache(object):
    _lock_filename = '.clearml.lock'
    _def_lock_timeout_seconds = 30
    _temp_entry_prefix = '_temp.'

    def __init__(self, cache_folder, max_cache_entries=5, min_free_space_gb=None, lock_timeout_seconds=None):
        self._cache_folder = Path(os.path.expandvars(cache_folder)).expanduser().absolute()
        self._cache_folder.mkdir(parents=True, exist_ok=True)
        self._max_cache_entries = max_cache_entries
        self._last_copied_entry_folder = None
        self._min_free_space_gb = min_free_space_gb if min_free_space_gb and min_free_space_gb > 0 else None
        self._lock = FileLock((self._cache_folder / self._lock_filename).as_posix())
        self._lock_timeout_seconds = float(lock_timeout_seconds or self._def_lock_timeout_seconds)

    def get_cache_folder(self):
        # type: () -> Path
        """
        :return: Return the base cache folder
        """
        return self._cache_folder

    def copy_cached_entry(self, keys, destination):
        # type: (List[str], Path) -> Optional[Path]
        """
        Copy a cached entry into a destination directory, if the cached entry does not exist return None
        :param keys:
        :param destination:
        :return: Target path, None if cached entry does not exist
        """
        self._last_copied_entry_folder = None
        if not keys:
            return None

        # lock so we make sure no one deletes it before we copy it
        # noinspection PyBroadException
        try:
            self._lock.acquire(timeout=self._lock_timeout_seconds, readonly=True)
        except BaseException as ex:
            warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
            import traceback
            warning('DEBUG: Exception {}: {}'.format(ex, traceback.format_exc()))
            return None

        src = None
        try:
            src = self.get_entry(keys)
            if src:
                destination = Path(destination).absolute()
                destination.mkdir(parents=True, exist_ok=True)
                shutil.rmtree(destination.as_posix())
                shutil.copytree(src.as_posix(), dst=destination.as_posix(), symlinks=True)
        except BaseException as ex:
            warning('Could not copy cache folder {} to {}: {}'.format(src, destination, ex))
            self._lock.release()
            return None

        # release Lock
        self._lock.release()

        self._last_copied_entry_folder = src
        return destination if src else None

    def get_entry(self, keys):
        # type: (List[str]) -> Optional[Path]
        """
        Return a folder (a sub-folder inside the cache_folder) matching one of the keys
        :param keys: List of keys, return the first match to one of the keys, notice keys cannot contain '.'
        :return: Path to the sub-folder or None if none was found
        """
        if not keys:
            return None
        # conform keys
        keys = [keys] if isinstance(keys, str) else keys
        keys = sorted([k.replace('.', '_') for k in keys])
        for cache_folder in self._cache_folder.glob('*'):
            if cache_folder.is_dir() and any(True for k in cache_folder.name.split('.') if k in keys):
                cache_folder.touch()
                return cache_folder
        return None

    def add_entry(self, keys, source_folder, exclude_sub_folders=None):
        # type: (List[str], Path, Optional[Sequence[str]]) -> bool
        """
        Add a local folder into the cache, copy all sub-folders inside `source_folder`
        excluding folders matching the `exclude_sub_folders` list
        :param keys: Cache entry keys list (str)
        :param source_folder: Folder to copy into the cache
        :param exclude_sub_folders: List of sub-folders to exclude from the copy operation
        :return: return True if a new entry was added to the cache
        """
        if not keys:
            return False

        keys = [keys] if isinstance(keys, str) else keys
        keys = sorted([k.replace('.', '_') for k in keys])

        # If entry already exists skip it
        cached_entry = self.get_entry(keys)
        if cached_entry:
            # make sure the entry contains all keys
            cached_keys = cached_entry.name.split('.')
            if set(keys) - set(cached_keys):
                # noinspection PyBroadException
                try:
                    self._lock.acquire(timeout=self._lock_timeout_seconds)
                except BaseException as ex:
                    warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
                    import traceback
                    warning('DEBUG: Exception {}: {}'.format(ex, traceback.format_exc()))
                    # failed locking, do nothing
                    return True
                keys = sorted(list(set(keys) | set(cached_keys)))
                dst = cached_entry.parent / '.'.join(keys)
                # rename
                try:
                    shutil.move(src=cached_entry.as_posix(), dst=dst.as_posix())
                except BaseException as ex:
                    warning('Could not rename cache entry {} to {}: {}'.format(
                        cached_entry.as_posix(), dst.as_posix(), ex))
                # release lock
                self._lock.release()
            return True

        # make sure we remove old entries
        self._remove_old_entries()

        # if we do not have enough free space, do nothing.
        if not self._check_min_free_space():
            warning('Could not add cache entry, not enough free space on drive, '
                    'free space threshold {} GB. Clearing all cache entries!'.format(self._min_free_space_gb))
            self._remove_old_entries(max_cache_entries=0)
            return False

        # create the new entry for us
        exclude_sub_folders = exclude_sub_folders or []
        source_folder = Path(source_folder).absolute()
        # create temp folder
        temp_folder = \
            self._temp_entry_prefix + \
            '{}.{}'.format(str(time()).replace('.', '_'), str(random()).replace('.', '_'))
        temp_folder = self._cache_folder / temp_folder
        temp_folder.mkdir(parents=True, exist_ok=False)

        for f in source_folder.glob('*'):
            if f.name in exclude_sub_folders:
                continue
            if f.is_dir():
                shutil.copytree(
                    src=f.as_posix(), dst=(temp_folder / f.name).as_posix(),
                    symlinks=True, ignore_dangling_symlinks=True)
            else:
                shutil.copy(
                    src=f.as_posix(), dst=(temp_folder / f.name).as_posix(),
                    follow_symlinks=False)

        # rename the target folder
        target_cache_folder = self._cache_folder / '.'.join(keys)
        # if we failed moving it means someone else created the cached entry before us, we can just leave
        # noinspection PyBroadException
        try:
            shutil.move(src=temp_folder.as_posix(), dst=target_cache_folder.as_posix())
        except BaseException:
            # noinspection PyBroadException
            try:
                shutil.rmtree(path=temp_folder.as_posix())
            except BaseException:
                return False

        return True

    def get_last_copied_entry(self):
        # type: () -> Optional[Path]
        """
        :return: the last copied cached entry folder inside the cache
        """
        return self._last_copied_entry_folder

    def _remove_old_entries(self, max_cache_entries=None):
        # type: (Optional[int]) -> ()
        """
        Notice we only keep self._max_cache_entries-1, assuming we will be adding a new entry soon
        :param int max_cache_entries: if not None, use instead of self._max_cache_entries
        """
        folder_entries = [(cache_folder, cache_folder.stat().st_mtime)
                          for cache_folder in self._cache_folder.glob('*')
                          if cache_folder.is_dir() and not cache_folder.name.startswith(self._temp_entry_prefix)]
        folder_entries = sorted(folder_entries, key=lambda x: x[1], reverse=True)

        number_of_entries_to_keep = self._max_cache_entries - 1 \
            if max_cache_entries is None else max(0, int(max_cache_entries))

        # if nothing to do, leave
        if not folder_entries[number_of_entries_to_keep:]:
            return

        # lock so we make sure no one deletes it before we copy it
        # noinspection PyBroadException
        try:
            self._lock.acquire(timeout=self._lock_timeout_seconds)
        except BaseException as ex:
            warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
            import traceback
            warning('DEBUG: Exception {}: {}'.format(ex, traceback.format_exc()))
            return

        for folder, ts in folder_entries[number_of_entries_to_keep:]:
            try:
                shutil.rmtree(folder.as_posix(), ignore_errors=True)
            except BaseException as ex:
                warning('Could not delete cache entry {}: {}'.format(folder.as_posix(), ex))

        self._lock.release()

    def _check_min_free_space(self):
        # type: () -> bool
        """
        :return: return False if we hit the free space limit.
        If no free space limit was provided, always return True
        """
        if not self._min_free_space_gb or not self._cache_folder:
            return True
        free_space = float(psutil.disk_usage(self._cache_folder.as_posix()).free)
        free_space /= 2**30
        return free_space > self._min_free_space_gb
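(Editorial aside: a hypothetical end-to-end flow; paths and keys are examples only.)

cache = FolderCache('~/.clearml/venvs-cache', max_cache_entries=5, min_free_space_gb=2.0)
cache.add_entry(keys=['py3_8', 'cuda112'], source_folder='/tmp/venv_build')
# later, possibly from another process:
cache.copy_cached_entry(keys=['py3_8'], destination='/home/user/work/venv')
# -> Path('/home/user/work/venv') if an entry matched one of the keys, else None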
clearml_agent/helper/os/locks.py (new file, 236 lines)
@@ -0,0 +1,236 @@
import os
import time
import tempfile
import contextlib

from .portalocker import constants, exceptions, lock, unlock


current_time = getattr(time, "monotonic", time.time)

DEFAULT_TIMEOUT = 10 ** 8
DEFAULT_CHECK_INTERVAL = 0.25
LOCK_METHOD = constants.LOCK_EX | constants.LOCK_NB

__all__ = [
    'FileLock',
    'open_atomic',
]


@contextlib.contextmanager
def open_atomic(filename, binary=True):
    """Open a file for atomic writing. Instead of locking, this method allows
    you to write the entire file and move it to the actual location. Note that
    this makes the assumption that a rename is atomic on your platform, which
    is generally the case but not a guarantee.

    http://docs.python.org/library/os.html#os.rename

    >>> filename = 'test_file.txt'
    >>> if os.path.exists(filename):
    ...     os.remove(filename)

    >>> with open_atomic(filename) as fh:
    ...     written = fh.write(b"test")
    >>> assert os.path.exists(filename)
    >>> os.remove(filename)

    """
    assert not os.path.exists(filename), '%r exists' % filename
    path, name = os.path.split(filename)

    # Create the parent directory if it doesn't exist
    if path and not os.path.isdir(path):  # pragma: no cover
        os.makedirs(path)

    temp_fh = tempfile.NamedTemporaryFile(
        mode=binary and 'wb' or 'w',
        dir=path,
        delete=False,
    )
    yield temp_fh
    temp_fh.flush()
    os.fsync(temp_fh.fileno())
    temp_fh.close()
    try:
        os.rename(temp_fh.name, filename)
    finally:
        try:
            os.remove(temp_fh.name)
        except Exception:  # noqa
            pass


class FileLock(object):

    def __init__(
            self, filename, mode='a', timeout=DEFAULT_TIMEOUT,
            check_interval=DEFAULT_CHECK_INTERVAL, fail_when_locked=False,
            **file_open_kwargs):
        """Lock manager with built-in timeout

        filename -- filename
        mode -- the open mode, 'a' or 'ab' should be used for writing
        truncate -- use truncate to emulate 'w' mode, None is disabled, 0 is
            truncate to 0 bytes
        timeout -- timeout when trying to acquire a lock
        check_interval -- check interval while waiting
        fail_when_locked -- after the initial lock failed, return an error
            or lock the file
        **file_open_kwargs -- The kwargs for the `open(...)` call

        fail_when_locked is useful when multiple threads/processes can race
        when creating a file. If set to true then the system will wait till
        the lock was acquired and then return an AlreadyLocked exception.

        Note that the file is opened first and locked later. So using 'w' as
        mode will result in truncate _BEFORE_ the lock is checked.
        """

        if 'w' in mode:
            truncate = True
            mode = mode.replace('w', 'a')
        else:
            truncate = False

        self.fh = None
        self.filename = filename
        self.mode = mode
        self.truncate = truncate
        self.timeout = timeout
        self.check_interval = check_interval
        self.fail_when_locked = fail_when_locked
        self.flags_read = constants.LOCK_SH | constants.LOCK_NB
        self.flags_write = constants.LOCK_EX | constants.LOCK_NB
        self.file_open_kwargs = file_open_kwargs

    def acquire(
            self, timeout=None, check_interval=None, fail_when_locked=None, readonly=False):
        """Acquire the locked filehandle"""
        if timeout is None:
            timeout = self.timeout
        if timeout is None:
            timeout = 0

        if check_interval is None:
            check_interval = self.check_interval

        if fail_when_locked is None:
            fail_when_locked = self.fail_when_locked

        # If we already have a filehandle, return it
        fh = self.fh
        if fh:
            return fh

        _fh = None
        try:
            # Get a new filehandler
            _fh = self._get_fh()
            # Try to lock
            fh = self._get_lock(_fh, readonly=readonly)
        except (exceptions.LockException, IOError) as exception:
            # Try till the timeout has passed
            timeoutend = current_time() + timeout
            while timeoutend > current_time():
                # Wait a bit
                time.sleep(check_interval)

                # Try again
                try:

                    # We already tried to get the lock
                    # If fail_when_locked is true, then stop trying
                    if fail_when_locked:
                        raise exceptions.AlreadyLocked(exception)

                    else:  # pragma: no cover
                        if not _fh:
                            _fh = self._get_fh()
                        # We've got the lock
                        fh = self._get_lock(_fh, readonly=readonly)
                        break

                except (exceptions.LockException, IOError):
                    pass

            else:
                # We got a timeout... reraising
                raise exceptions.LockTimeout(exception)

        # Prepare the filehandle (truncate if needed)
        fh = self._prepare_fh(fh)

        self.fh = fh
        return fh

    def release(self):
        """Releases the currently locked file handle"""
        if self.fh:
            # noinspection PyBroadException
            try:
                unlock(self.fh)
            except Exception:
                pass
            # noinspection PyBroadException
            try:
                self.fh.close()
            except Exception:
                pass
            self.fh = None

    def delete_lock_file(self):
        # type: () -> bool
        """
        Remove the local file used for locking (fail if file is locked)

        :return: True if successful
        """
        if self.fh:
            return False
        # noinspection PyBroadException
        try:
            os.unlink(path=self.filename)
        except BaseException:
            return False
        return True

    def _get_fh(self):
        """Get a new filehandle"""
        # Create the parent directory if it doesn't exist
        path, name = os.path.split(self.filename)
        if path and not os.path.isdir(path):  # pragma: no cover
            os.makedirs(path, exist_ok=True)

        return open(self.filename, self.mode, **self.file_open_kwargs)

    def _get_lock(self, fh, readonly=False):
        """
        Try to lock the given filehandle

        returns LockException if it fails"""
        lock(fh, self.flags_read if readonly else self.flags_write)
        return fh

    def _prepare_fh(self, fh):
        """
        Prepare the filehandle for usage

        If truncate is a number, the file will be truncated to that amount of
        bytes
        """
        if self.truncate:
            fh.seek(0)
            fh.truncate(0)

        return fh

    def __enter__(self):
        return self.acquire()

    def __exit__(self, type_, value, tb):
        self.release()

    def __delete__(self, instance):  # pragma: no cover
        instance.release()
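(Editorial aside: a typical guarded section using the class above; the path is illustrative.)

lock = FileLock('/tmp/shared/.clearml.lock')
try:
    lock.acquire(timeout=30, readonly=False)
    # ... critical section: mutate the shared folder ...
finally:
    lock.release()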
clearml_agent/helper/os/portalocker.py (new file, 200 lines)
@@ -0,0 +1,200 @@
import os
import sys


class exceptions:
    class BaseLockException(Exception):
        # Error codes:
        LOCK_FAILED = 1

        def __init__(self, *args, **kwargs):
            self.fh = kwargs.pop('fh', None)
            Exception.__init__(self, *args, **kwargs)

    class LockException(BaseLockException):
        pass

    class AlreadyLocked(BaseLockException):
        pass

    class FileToLarge(BaseLockException):
        pass

    class LockTimeout(BaseLockException):
        pass


class constants:
    # The actual tests will execute the code anyhow so the following code can
    # safely be ignored from the coverage tests
    if os.name == 'nt':  # pragma: no cover
        import msvcrt

        LOCK_EX = 0x1  #: exclusive lock
        LOCK_SH = 0x2  #: shared lock
        LOCK_NB = 0x4  #: non-blocking
        LOCK_UN = msvcrt.LK_UNLCK  #: unlock

        LOCKFILE_FAIL_IMMEDIATELY = 1
        LOCKFILE_EXCLUSIVE_LOCK = 2

    elif os.name == 'posix':  # pragma: no cover
        import fcntl

        LOCK_EX = fcntl.LOCK_EX  #: exclusive lock
        LOCK_SH = fcntl.LOCK_SH  #: shared lock
        LOCK_NB = fcntl.LOCK_NB  #: non-blocking
        LOCK_UN = fcntl.LOCK_UN  #: unlock

    else:  # pragma: no cover
        raise RuntimeError('PortaLocker only defined for nt and posix platforms')


if os.name == 'nt':  # pragma: no cover
    import msvcrt

    if sys.version_info.major == 2:
        lock_length = -1
    else:
        lock_length = int(2**31 - 1)

    def lock(file_, flags):
        if flags & constants.LOCK_SH:
            import win32file
            import pywintypes
            import winerror
            __overlapped = pywintypes.OVERLAPPED()
            if sys.version_info.major == 2:
                if flags & constants.LOCK_NB:
                    mode = constants.LOCKFILE_FAIL_IMMEDIATELY
                else:
                    mode = 0

            else:
                if flags & constants.LOCK_NB:
                    mode = msvcrt.LK_NBRLCK
                else:
                    mode = msvcrt.LK_RLCK

            # is there any reason not to reuse the following structure?
            hfile = win32file._get_osfhandle(file_.fileno())
            try:
                win32file.LockFileEx(hfile, mode, 0, -0x10000, __overlapped)
            except pywintypes.error as exc_value:
                # error: (33, 'LockFileEx', 'The process cannot access the file
                # because another process has locked a portion of the file.')
                if exc_value.winerror == winerror.ERROR_LOCK_VIOLATION:
                    raise exceptions.LockException(
                        exceptions.LockException.LOCK_FAILED,
                        exc_value.strerror,
                        fh=file_)
                else:
                    # Q: Are there exceptions/codes we should be dealing with
                    # here?
                    raise
        else:
            mode = constants.LOCKFILE_EXCLUSIVE_LOCK
            if flags & constants.LOCK_NB:
                mode |= constants.LOCKFILE_FAIL_IMMEDIATELY

            if flags & constants.LOCK_NB:
                mode = msvcrt.LK_NBLCK
            else:
                mode = msvcrt.LK_LOCK

            # windows locks byte ranges, so make sure to lock from file start
            try:
                savepos = file_.tell()
                if savepos:
                    # [ ] test exclusive lock fails on seek here
                    # [ ] test if shared lock passes this point
                    file_.seek(0)
                    # [x] check if 0 param locks entire file (not documented in
                    #     Python)
                    # [x] fails with "IOError: [Errno 13] Permission denied",
                    #     but -1 seems to do the trick

                try:
                    msvcrt.locking(file_.fileno(), mode, lock_length)
                except IOError as exc_value:
                    # [ ] be more specific here
                    raise exceptions.LockException(
                        exceptions.LockException.LOCK_FAILED,
                        exc_value.strerror,
                        fh=file_)
                finally:
                    if savepos:
                        file_.seek(savepos)
            except IOError as exc_value:
                raise exceptions.LockException(
                    exceptions.LockException.LOCK_FAILED, exc_value.strerror,
                    fh=file_)

    def unlock(file_):
        try:
            savepos = file_.tell()
            if savepos:
                file_.seek(0)

            try:
                msvcrt.locking(file_.fileno(), constants.LOCK_UN, lock_length)
            except IOError as exc_value:
                if exc_value.strerror == 'Permission denied':
                    import pywintypes
                    import win32file
                    import winerror
                    __overlapped = pywintypes.OVERLAPPED()
                    hfile = win32file._get_osfhandle(file_.fileno())
                    try:
                        win32file.UnlockFileEx(
                            hfile, 0, -0x10000, __overlapped)
                    except pywintypes.error as exc_value:
                        if exc_value.winerror == winerror.ERROR_NOT_LOCKED:
                            # error: (158, 'UnlockFileEx',
                            #         'The segment is already unlocked.')
                            # To match the 'posix' implementation, silently
                            # ignore this error
                            pass
                        else:
                            # Q: Are there exceptions/codes we should be
                            #    dealing with here?
                            raise
                else:
                    raise exceptions.LockException(
                        exceptions.LockException.LOCK_FAILED,
                        exc_value.strerror,
                        fh=file_)
            finally:
                if savepos:
                    file_.seek(savepos)
        except IOError as exc_value:
            raise exceptions.LockException(
                exceptions.LockException.LOCK_FAILED, exc_value.strerror,
                fh=file_)

elif os.name == 'posix':  # pragma: no cover
    import fcntl

    def lock(file_, flags):
        locking_exceptions = IOError,
        try:  # pragma: no cover
            locking_exceptions += BlockingIOError,
        except NameError:  # pragma: no cover
            pass

        try:
            fcntl.flock(file_.fileno(), flags)
        except locking_exceptions as exc_value:
            # The exception code varies on different systems so we'll catch
            # every IO error
            raise exceptions.LockException(exc_value, fh=file_)
        except BaseException as ex:
            # DEBUG
            print("Uncaught [{}] Exception [{}] in portalock: {}".format(locking_exceptions, type(ex), ex))
            raise

    def unlock(file_):
        fcntl.flock(file_.fileno(), constants.LOCK_UN)

else:  # pragma: no cover
    raise RuntimeError('PortaLocker only defined for nt and posix platforms')
@@ -1,11 +1,18 @@
from __future__ import unicode_literals

import abc
+from collections import OrderedDict
from contextlib import contextmanager
-from typing import Text, Iterable, Union
+from hashlib import md5
+from typing import Text, Iterable, Union, Optional, Dict, List

import six
+from pathlib2 import Path

+from clearml_agent.definitions import ENV_VENV_CACHE_PATH
from clearml_agent.helper.base import mkstemp, safe_remove_file, join_lines, select_for_platform
+from clearml_agent.helper.console import ensure_binary
+from clearml_agent.helper.os.folder_cache import FolderCache
from clearml_agent.helper.process import Executable, Argv, PathLike
@@ -18,6 +25,16 @@ class PackageManager(object):
    _selected_manager = None
    _cwd = None
    _pip_version = None
+   _config_cache_folder = 'agent.venvs_cache.path'
+   _config_cache_max_entries = 'agent.venvs_cache.max_entries'
+   _config_cache_free_space_threshold = 'agent.venvs_cache.free_space_threshold_gb'
+   _config_cache_lock_timeout = 'agent.venvs_cache.lock_timeout'
+   _config_pip_legacy_resolver = 'agent.package_manager.pip_legacy_resolver'
+
+   def __init__(self):
+       self._cache_manager = None
+       self._existing_packages = []
+       self._base_install_flags = []

    @abc.abstractproperty
    def bin(self):
@@ -37,7 +54,7 @@ class PackageManager(object):
        pass

    @abc.abstractmethod
-   def freeze(self):
+   def freeze(self, freeze_full_environment=False):
        pass

    @abc.abstractmethod
@@ -65,22 +82,83 @@ class PackageManager(object):
        # type: (Iterable[Text]) -> None
        pass

+   def add_extra_install_flags(self, extra_flags):  # type: (List[str]) -> None
+       if extra_flags:
+           extra_flags = [
+               e for e in extra_flags if e not in list(self._base_install_flags)
+           ]
+           self._base_install_flags = list(self._base_install_flags) + list(extra_flags)
+
+   def remove_extra_install_flags(self, extra_flags):  # type: (List[str]) -> bool
+       if extra_flags:
+           _base_install_flags = [
+               e for e in self._base_install_flags if e not in list(extra_flags)
+           ]
+           if self._base_install_flags != _base_install_flags:
+               self._base_install_flags = _base_install_flags
+               return True
+       return False
+
    def upgrade_pip(self):
        result = self._install(
-           select_for_platform(windows='"pip{}"', linux='pip{}').format(self.get_pip_version()), "--upgrade")
-       packages = self.run_with_env(('list',), output=True).splitlines()
-       # p.split is ('pip', 'x.y.z')
-       pip = [p.split() for p in packages if len(p.split()) == 2 and p.split()[0] == 'pip']
-       if pip:
-           # noinspection PyBroadException
+           *select_for_platform(
+               windows=self.get_pip_versions(),
+               linux=self.get_pip_versions()
+           ),
+           "--upgrade"
+       )
+
+       packages = (self.freeze(freeze_full_environment=True) or dict()).get("pip")
+       if packages:
+           from clearml_agent.helper.package.requirements import RequirementsManager
+           from .requirements import MarkerRequirement, SimpleVersion
+
+           # store existing packages so that we can check if we can skip preinstalled packages
+           # we will only check "@ file" "@ vcs" for exact match
+           self._existing_packages = RequirementsManager.parse_requirements_section_to_marker_requirements(
+               packages, skip_local_file_validation=True)
+
            try:
-               from .requirements import MarkerRequirement
-               pip = pip[0][1].split('.')
-               MarkerRequirement.pip_new_version = bool(int(pip[0]) >= 20)
-           except Exception:
-               pass
+               pip_pkg = next(p for p in self._existing_packages if p.name == "pip")
+           except StopIteration:
+               pip_pkg = None
+
+           # check if we need to list the pip version as well
+           if pip_pkg:
+               MarkerRequirement.pip_new_version = SimpleVersion.compare_versions(pip_pkg.version, ">=", "20")
+
+               # add --use-deprecated=legacy-resolver to pip install to avoid mismatched packages issues
+               self._add_legacy_resolver_flag(pip_pkg.version)

        return result

+   def _add_legacy_resolver_flag(self, pip_pkg_version):
+       if not self.session.config.get(self._config_pip_legacy_resolver, None):
+           return
+
+       from .requirements import SimpleVersion
+
+       match_versions = self.session.config.get(self._config_pip_legacy_resolver)
+       matched = False
+       for rule in match_versions:
+           matched = False
+           # make sure we match all the parts of the rule
+           for a_version in rule.split(","):
+               o, v = SimpleVersion.split_op_version(a_version.strip())
+               matched = SimpleVersion.compare_versions(pip_pkg_version, o, v)
+               if not matched:
+                   break
+           # if the rule is fully matched we have a match
+           if matched:
+               break
+
+       legacy_resolver_flags = ["--use-deprecated=legacy-resolver"]
+       if matched:
+           print("INFO: Using legacy resolver for PIP to avoid inconsistency with package versions!")
+           self.add_extra_install_flags(legacy_resolver_flags)
+       elif self.remove_extra_install_flags(legacy_resolver_flags):
+           print("INFO: removing pip legacy resolver!")
+
    def get_python_command(self, extra=()):
        # type: (...) -> Executable
        return Argv(self.bin, *extra)
@@ -123,19 +201,33 @@ class PackageManager(object):
    @classmethod
    def out_of_scope_install_package(cls, package_name, *args):
        if PackageManager._selected_manager is not None:
            # noinspection PyBroadException
            try:
-               result = PackageManager._selected_manager._install(package_name, *args)
+               result = PackageManager._selected_manager.install_packages(package_name, *args)
                if result not in (0, None, True):
                    return False
            except Exception:
                return False
+
+       try:
+           from .requirements import Requirement, MarkerRequirement
+           req = MarkerRequirement(Requirement.parse(package_name))
+
+           # if pip was part of the requirements, make sure we update the flags
+           # add --use-deprecated=legacy-resolver to pip install to avoid mismatched packages issues
+           if req.name == "pip" and req.version:
+               PackageManager._selected_manager._add_legacy_resolver_flag(req.version)
+       except Exception as e:
+           print("WARNING: Error while parsing the pip version for the legacy resolver [{}]".format(e))
+
        return True

    @classmethod
-   def out_of_scope_freeze(cls):
+   def out_of_scope_freeze(cls, freeze_full_environment=False):
        if PackageManager._selected_manager is not None:
            # noinspection PyBroadException
            try:
-               return PackageManager._selected_manager.freeze()
+               return PackageManager._selected_manager.freeze(freeze_full_environment)
            except Exception:
                pass
        return []
@@ -144,12 +236,152 @@ class PackageManager(object):
    def set_pip_version(cls, version):
        if not version:
            return
-       version = version.replace(' ', '')
-       if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
-           cls._pip_version = version
-       else:
-           cls._pip_version = "=="+version
+       if isinstance(version, (list, tuple)):
+           versions = version
+       else:
+           versions = [version]
+
+       cls._pip_version = []
+       for version in versions:
+           version = version.strip()
+           if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
+               cls._pip_version.append(version)
+           else:
+               cls._pip_version.append("==" + version)

    @classmethod
-   def get_pip_version(cls):
-       return cls._pip_version or ''
+   def get_pip_versions(cls, pip="pip", wrap=''):
+       return [
+           (wrap + pip + version + wrap)
+           for version in cls._pip_version or [""]
+       ]
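(Editorial aside, illustrative values for the new list-based behavior.)

PackageManager.set_pip_version(["<20.2", ">=10"])
PackageManager.get_pip_versions()          # -> ['pip<20.2', 'pip>=10']
PackageManager.get_pip_versions(wrap='"')  # -> ['"pip<20.2"', '"pip>=10"']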
def get_cached_venv(self, requirements, docker_cmd, python_version, cuda_version, destination_folder):
|
||||
# type: (Dict, Optional[Union[dict, str]], Optional[str], Optional[str], Path) -> Optional[Path]
|
||||
"""
|
||||
Copy a cached copy of the venv (based on the requirements) into destination_folder.
|
||||
Return None if failed or cached entry does not exist
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return None
|
||||
|
||||
try:
|
||||
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
|
||||
return self._get_cache_manager().copy_cached_entry(keys, destination_folder)
|
||||
except Exception as ex:
|
||||
print("WARNING: Failed accessing venvs cache at {}: {}".format(destination_folder, ex))
|
||||
print("WARNING: Skipping venv cache - folder not accessible!")
|
||||
return None
|
||||
|
||||
def add_cached_venv(
|
||||
self,
|
||||
requirements, # type: Union[Dict, List[Dict]]
|
||||
docker_cmd, # type: Optional[Union[dict, str]]
|
||||
python_version, # type: Optional[str]
|
||||
cuda_version, # type: Optional[str]
|
||||
source_folder, # type: Path
|
||||
exclude_sub_folders=None # type: Optional[List[str]]
|
||||
):
|
||||
# type: (...) -> ()
|
||||
"""
|
||||
Copy the local venv folder into the venv cache (keys are based on the requirements+python+docker).
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
|
||||
print('Adding venv into cache: {}'.format(source_folder))
|
||||
|
||||
try:
|
||||
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
|
||||
return self._get_cache_manager().add_entry(
|
||||
keys=keys, source_folder=source_folder, exclude_sub_folders=exclude_sub_folders)
|
||||
except Exception as ex:
|
||||
print("WARNING: Failed accessing venvs cache at {}: {}".format(source_folder, ex))
|
||||
print("WARNING: Skipping venv cache - folder not accessible!")
|
||||
return None
|
||||
|
||||
def get_cache_folder(self):
|
||||
# type: () -> Optional[Path]
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
return self._get_cache_manager().get_cache_folder()
|
||||
|
||||
def get_last_used_entry_cache(self):
|
||||
# type: () -> Optional[Path]
|
||||
"""
|
||||
:return: the last used cached folder entry
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
return self._get_cache_manager().get_last_copied_entry()
|
||||
|
||||
def is_cached_enabled(self):
|
||||
if not self._cache_manager:
|
||||
cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
|
||||
if not cache_folder:
|
||||
return False
|
||||
return True
    @classmethod
    def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
        # type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
        requirements_list = requirements_list or dict()
        if not isinstance(requirements_list, (list, tuple)):
            requirements_list = [requirements_list]
        docker_cmd = dict(docker_cmd=docker_cmd) if isinstance(docker_cmd, str) else docker_cmd or dict()
        docker_cmd = OrderedDict(sorted(docker_cmd.items(), key=lambda t: t[0]))
        if 'docker_cmd' in docker_cmd:
            # we only take the first part of the docker_cmd which is the docker image name
            docker_cmd['docker_cmd'] = docker_cmd['docker_cmd'].strip('\r\n\t ').split(' ')[0]

        keys = []
        strip_chars = '\n\r\t '
        for requirements in requirements_list:
            pip, conda = ('pip', 'conda')
            pip_reqs = requirements.get(pip, '')
            conda_reqs = requirements.get(conda, '')
            if isinstance(pip_reqs, str):
                pip_reqs = pip_reqs.split('\n')
            if isinstance(conda_reqs, str):
                conda_reqs = conda_reqs.split('\n')
            pip_reqs = sorted([p.strip(strip_chars) for p in pip_reqs
                               if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
            conda_reqs = sorted([p.strip(strip_chars) for p in conda_reqs
                                 if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
            if not pip_reqs and not conda_reqs:
                continue
            # do not process "-r" or "--requirement" because we cannot know what we have in the git repo.
            if any(r.strip().startswith('-r ') or r.strip().startswith('--requirement ') for r in pip_reqs):
                continue
            hash_text = '{class_type}\n{docker_cmd}\n{cuda_ver}\n{python_version}\n{pip_reqs}\n{conda_reqs}'.format(
                class_type=str(cls),
                docker_cmd=str(docker_cmd or ''),
                cuda_ver=str(cuda_version or ''),
                python_version=str(python_version or ''),
                pip_reqs=str(pip_reqs or ''),
                conda_reqs=str(conda_reqs or ''),
            )
            keys.append(md5(ensure_binary(hash_text)).hexdigest())
        return sorted(list(set(keys)))
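An equivalent standalone computation of a single cache key, following the exact hash_text layout above; all field values here are illustrative, not taken from a real run:

    from hashlib import md5

    hash_text = '\n'.join([
        "<class 'VirtualenvPip'>",  # class_type (str(cls) of the manager)
        "OrderedDict([('docker_cmd', 'nvidia/cuda:11.2.2-runtime-ubuntu20.04')])",  # docker_cmd
        "112",   # cuda_ver
        "3.8",   # python_version
        "['numpy==1.23.0', 'requests==2.28.1']",  # sorted, comment-free pip_reqs
        "",      # conda_reqs (empty list stringifies to '')
    ])
    key = md5(hash_text.encode('utf-8')).hexdigest()  # one entry of the keys list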
    def _get_cache_manager(self):
        if not self._cache_manager:
            cache_folder = None
            try:
                cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
                if not cache_folder:
                    return None

                max_entries = int(self.session.config.get(self._config_cache_max_entries, 10))
                free_space_threshold = float(self.session.config.get(self._config_cache_free_space_threshold, 0))
                self._cache_manager = FolderCache(
                    cache_folder, max_cache_entries=max_entries,
                    min_free_space_gb=free_space_threshold,
                    lock_timeout_seconds=self.session.config.get(self._config_cache_lock_timeout, None))
            except Exception as ex:
                print("WARNING: Failed accessing venvs cache at {}: {}".format(cache_folder, ex))
                print("WARNING: Skipping venv cache - folder not accessible!")
                return None

        return self._cache_manager
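A round-trip sketch using only the FolderCache calls that appear above (copy_cached_entry / add_entry / get_last_copied_entry); the cache path and venv path are hypothetical, and FolderCache is assumed imported as in this module:

    cache = FolderCache('/opt/clearml/venvs-cache', max_cache_entries=10,
                        min_free_space_gb=2.0, lock_timeout_seconds=None)
    if not cache.copy_cached_entry(keys, Path('/tmp/task_venv')):
        # cache miss: build the venv normally, then store it for the next task
        cache.add_entry(keys=keys, source_folder=Path('/tmp/task_venv'),
                        exclude_sub_folders=None)
    last_used = cache.get_last_copied_entry()  # Optional[Path] of the hit, if any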
@@ -5,7 +5,6 @@ import re
import os
import subprocess
from collections import OrderedDict
from distutils.spawn import find_executable
from functools import partial
from itertools import chain
from typing import Text, Iterable, Union, Dict, Set, Sequence, Any
@@ -19,14 +18,16 @@ from clearml_agent.external.requirements_parser import parse
from clearml_agent.external.requirements_parser.requirement import Requirement

from clearml_agent.errors import CommandFailedError
from clearml_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo
from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
from clearml_agent.helper.base import (
    rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo,
    convert_cuda_version_to_float_single_digit_str, convert_cuda_version_to_int_10_base_str, )
from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike, find_executable
from clearml_agent.helper.package.requirements import SimpleVersion
from clearml_agent.session import Session
from .base import PackageManager
from .pip_api.venv import VirtualenvPip
from .requirements import RequirementsManager, MarkerRequirement
from ...backend_api.session.defs import ENV_CONDA_ENV_PACKAGE
from ...backend_api.session.defs import ENV_CONDA_ENV_PACKAGE, ENV_USE_CONDA_BASE_ENV

package_normalize = partial(re.compile(r"""\[version=['"](.*)['"]\]""").sub, r"\1")
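The substitution above unwraps conda's bracketed version selectors into plain pip-style constraints; a quick demonstration of the regex as written:

    print(package_normalize("cudatoolkit[version='>=11.2,<12']"))
    # -> cudatoolkit>=11.2,<12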
@@ -69,6 +70,7 @@ class CondaAPI(PackageManager):
        :param python: base python version to use (e.g python3.6)
        :param path: path of env
        """
        super(CondaAPI, self).__init__()
        self.session = session
        self.python = python
        self.source = None
@@ -76,6 +78,11 @@ class CondaAPI(PackageManager):
        self.path = path
        self.env_read_only = False
        self.extra_channels = self.session.config.get('agent.package_manager.conda_channels', [])
        # install into base conda environment (should only be used if running in docker mode)
        self.use_conda_base_env = ENV_USE_CONDA_BASE_ENV.get(
            default=self.session.config.get('agent.package_manager.use_conda_base_env', None)
        )
        # notice this will not install any additional packages into the selected environment
        self.conda_env_as_base_docker = \
            self.session.config.get('agent.package_manager.conda_env_as_base_docker', None) or \
            bool(ENV_CONDA_ENV_PACKAGE.get())
@@ -126,13 +133,40 @@ class CondaAPI(PackageManager):
    def bin(self):
        return self.pip.bin

    def _parse_package_marker_match_python_ver(self, line=None, marker_req=None):
        if line:
            marker_req = MarkerRequirement(Requirement.parse(line))

        try:
            mock_req = MarkerRequirement(Requirement.parse(marker_req.marker.replace("'", "").replace("\"", "")))
        except Exception as ex:
            print("WARNING: failed parsing, assuming package is okay {}".format(ex))
            return marker_req

        if not mock_req.compare_version(requested_version=self.python):
            print("SKIPPING package `{}` not required python version {}".format(marker_req.tostr(), self.python))
            return None
        return marker_req

    # noinspection SpellCheckingInspection
    def upgrade_pip(self):
        # do not change pip version if a pre-built environment is used
        if self.env_read_only:
            print('Conda environment in read-only mode, skipping pip upgrade.')
            return ''
        return self._install(select_for_platform(windows='"pip{}"', linux='pip{}').format(self.pip.get_pip_version()))

        pip_versions = []
        for req_pip_line in self.pip.get_pip_versions():
            req = self._parse_package_marker_match_python_ver(line=req_pip_line)
            if req:
                pip_versions.append(req.tostr(markers=False))

        return self._install(
            *select_for_platform(
                windows=pip_versions,
                linux=pip_versions
            )
        )
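A sketch of how the marker filtering above plays out, assuming the environment python is "3.8" and pinned pip constraints like the ones produced by PackageManager.get_pip_versions() (values hypothetical):

    # "pip<20.2 ; python_version < '3.10'"   -> marker holds for 3.8, kept
    # "pip<22.3 ; python_version >= '3.10'"  -> marker fails, printed as SKIPPING
    # so the final _install() call receives only: pip<20.2  (markers stripped)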
    def create(self):
        """
@@ -140,19 +174,7 @@ class CondaAPI(PackageManager):
        """
        if self.conda_env_as_base_docker and self.conda_pre_build_env_path:
            if Path(self.conda_pre_build_env_path).is_dir():
                print("Using pre-existing Conda environment from {}".format(self.conda_pre_build_env_path))
                self.path = Path(self.conda_pre_build_env_path)
                self.source = ("conda", "activate", self.path.as_posix())
                self.pip = CondaPip(
                    session=self.session,
                    source=self.source,
                    python=self.python,
                    requirements_manager=self.requirements_manager,
                    path=self.path,
                )
                conda_env = self._get_conda_sh()
                self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
                self.env_read_only = True
                self._init_existing_environment(self.conda_pre_build_env_path)
                return self
            elif Path(self.conda_pre_build_env_path).is_file():
                print("Restoring Conda environment from {}".format(self.conda_pre_build_env_path))
@@ -177,8 +199,16 @@ class CondaAPI(PackageManager):
            else:
                raise ValueError("Could not restore Conda environment, cannot find {}".format(
                    self.conda_pre_build_env_path))
        elif self.use_conda_base_env:
            try:
                base_path = Path(self.conda).parent.parent.as_posix()
                print("Using base conda environment at {}".format(base_path))
                self._init_existing_environment(base_path, is_readonly=False)
                return self
            except Exception as ex:
                print("WARNING: Failed using base conda environment, reverting to new environment: {}".format(ex))

        output = Argv(
        command = Argv(
            self.conda,
            "create",
            "--yes",
@@ -186,7 +216,9 @@ class CondaAPI(PackageManager):
            "--prefix",
            self.path,
            "python={}".format(self.python),
        ).get_output(stderr=DEVNULL)
        )
        print('Executing Conda: {}'.format(command.serialize()))
        output = command.get_output(stderr=DEVNULL)
        match = re.search(
            r"\W*(.*activate) ({})".format(re.escape(str(self.path))), output
        )
@@ -200,16 +232,38 @@ class CondaAPI(PackageManager):
        if conda_env.is_file() and not is_windows_platform():
            self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)

        # install cuda toolkit
        # noinspection PyBroadException
        try:
            cuda_version = float(int(self.session.config['agent.cuda_version'])) / 10.0
            if cuda_version > 0:
                self._install('cudatoolkit={:.1f}'.format(cuda_version))
        except Exception:
            pass
        return self

    def _init_existing_environment(self, conda_pre_build_env_path, is_readonly=True):
        print("Using pre-existing Conda environment from {}".format(conda_pre_build_env_path))
        self.path = Path(conda_pre_build_env_path)
        self.source = ("conda", "activate", self.path.as_posix())
        conda_env = self._get_conda_sh()
        self.source = CommandSequence(('source', conda_env.as_posix()), self.source)

        conda_packages_json = json.loads(
            self._run_command((self.conda, "list", "--json", "-p", self.path), raw=True))

        try:
            for package in conda_packages_json:
                if package.get("name") == "python" and package.get("version"):
                    self.python = ".".join(package.get("version").split(".")[:2])
                    print("Existing conda environment, found python version {}".format(self.python))
                    break
        except Exception as ex:
            print("WARNING: failed detecting existing conda python version: {}".format(ex))

        self.pip = CondaPip(
            session=self.session,
            source=self.source,
            python=self.python,
            requirements_manager=self.requirements_manager,
            path=self.path,
        )
        self.pip.source = self.source

        self.env_read_only = is_readonly
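The `conda list --json` call above returns a list of package records; a trimmed, illustrative payload and the extraction it feeds (field set is an assumption about conda's JSON output):

    conda_packages_json = [
        {"name": "python", "version": "3.8.13", "channel": "conda-forge"},
        {"name": "pip", "version": "22.0.4", "channel": "pypi"},
    ]
    python_ver = next(
        ".".join(p["version"].split(".")[:2])        # keep only major.minor
        for p in conda_packages_json
        if p.get("name") == "python" and p.get("version")
    )  # -> "3.8"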
    def remove(self):
        """
        Delete a conda environment.
@@ -218,7 +272,7 @@ class CondaAPI(PackageManager):
        Conda seems to load "vcruntime140.dll" from all its environments on startup.
        This means environments have to be deleted using 'conda env remove'.
        If necessary, conda can be fooled into deleting a partially-deleted environment by creating an empty file
        in '<ENV>\conda-meta\history' (value found in 'conda.gateways.disk.test.PREFIX_MAGIC_FILE').
        in '<ENV>\\conda-meta\\history' (value found in 'conda.gateways.disk.test.PREFIX_MAGIC_FILE').
        Otherwise, it complains that said directory is not a conda environment.

        See: https://github.com/conda/conda/issues/7682
@@ -284,17 +338,11 @@ class CondaAPI(PackageManager):
        """
        Try to install packages from conda. Install packages which are not available from conda with pip.
        """
        try:
            self._install_from_file(path)
            return
        except PackageNotFoundError as e:
            pip_packages = [e.pkg]
        except PackagesNotFoundError as e:
            pip_packages = package_set(e.packages)
        with self.temp_file("conda_reqs", _package_diff(path, pip_packages)) as reqs:
            self.install_from_file(reqs)
        with self.temp_file("pip_reqs", pip_packages) as reqs:
            self.pip.install_from_file(reqs)
        requirements = {}
        # assume requirements.txt
        with open(path, 'rt') as f:
            requirements['pip'] = f.read()
        self.load_requirements(requirements)

    def freeze(self, freeze_full_environment=False):
        requirements = self.pip.freeze()
@@ -430,7 +478,7 @@ class CondaAPI(PackageManager):
        finally:
            PackageManager._selected_manager = self

        self.requirements_manager.post_install(self.session)
        self.requirements_manager.post_install(self.session, package_manager=self)

    def load_requirements(self, requirements):
        # if we are in read only mode, do not uninstall anything
@@ -458,9 +506,18 @@ class CondaAPI(PackageManager):
            requirements['conda'] = requirements['conda'].split('\n')
        has_torch = False
        has_matplotlib = False
        has_cudatoolkit = False
        cuda_version_full = 0
        # noinspection PyBroadException
        try:
            cuda_version = int(self.session.config.get('agent.cuda_version', 0))
        except:
            # notice this is an integer version: 112 (means 11.2)
            cuda_version = str(self.session.config.get('agent.cuda_version', "")).strip()
            if not cuda_version:
                cuda_version = 0
            else:
                cuda_version_full = convert_cuda_version_to_float_single_digit_str(cuda_version)
                cuda_version = int(convert_cuda_version_to_int_10_base_str(cuda_version))
        except Exception:
            cuda_version = 0

        # notice 'conda' entry with empty string is a valid conda requirements list, it means pip only
@@ -477,6 +534,7 @@ class CondaAPI(PackageManager):
                continue

            m = MarkerRequirement(marker[0])
            m.validate_local_file_ref()
            # conda does not support version control links
            if m.vcs:
                pip_requirements.append(m)
@@ -490,6 +548,19 @@ class CondaAPI(PackageManager):
                if '.' not in m.specs[0][1]:
                    continue

            if m.name.lower() in ('cudatoolkit', 'cuda-toolkit'):
                # skip cuda if we are running on CPU
                if not cuda_version:
                    continue

                has_cudatoolkit = True
                # cuda version, only major.minor
                requested_cuda_version = '.'.join(m.specs[0][1].split('.')[:2])
                # make sure that the cuda_version we support can install the requested cuda (major version)
                if int(float(requested_cuda_version)) > int(float(cuda_version)/10.0):
                    continue
                m.specs = [(m.specs[0][0], str(requested_cuda_version)), ]

            conda_supported_req_names.append(m.name.lower())
            if m.req.name.lower() == 'matplotlib':
                has_matplotlib = True
@@ -504,9 +575,28 @@ class CondaAPI(PackageManager):
                has_torch = True
                m.req.name = 'tensorflow-gpu' if cuda_version > 0 else 'tensorflow'

            # push the clearml packages into the pip_requirements
            if "clearml" in m.req.name and "clearml" not in self.extra_channels:
                if self.session.debug_mode:
                    print("info: moving `{}` packages to `pip` section".format(m.req))
                pip_requirements.append(m)
                continue

            reqs.append(m)

        if not has_cudatoolkit and cuda_version:
            # nvidia channel is using `cuda-toolkit` and has newer versions of cuda,
            # older cuda can be picked from conda-forge (<12)
            if "nvidia" in self.extra_channels:
                m = MarkerRequirement(Requirement.parse("cuda-toolkit == {}".format(cuda_version_full)))
            else:
                m = MarkerRequirement(Requirement.parse("cudatoolkit == {}".format(cuda_version_full)))
            has_cudatoolkit = True
            reqs.append(m)
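A worked example of the major-version gate above, with agent.cuda_version detected as 112 (i.e. CUDA 11.2):

    # requested spec: cudatoolkit==11.6.0 -> requested_cuda_version = "11.6"
    # gate: int(float("11.6")) > int(float(112) / 10.0)  ->  11 > 11  ->  False
    # so the requirement is kept and rewritten to major.minor: cudatoolkit==11.6
    # a request for 12.x against the same agent would be dropped (12 > 11)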

        # if we have a conda list, the rest should be installed with pip,
        # this means any experiment that was executed with pip environment,
        # will be installed using pip
        if requirements.get('conda', None) is not None:
            for r in requirements['pip']:
                try:
@@ -517,10 +607,10 @@ class CondaAPI(PackageManager):
                    continue

                m = MarkerRequirement(marker[0])
                # skip over local files (we cannot change the version to a local file)
                if m.local_file:
                    continue
                m_name = m.name.lower()
                # remove local files reference if it does not exist (leave the package name)
                m.validate_local_file_ref()

                m_name = (m.name or '').lower()
                if m_name in conda_supported_req_names:
                    # this package is in the conda list,
                    # make sure that if we changed version and we match it in conda
@@ -557,22 +647,37 @@ class CondaAPI(PackageManager):
        # conform conda packages (version/name)
        for r in reqs:
            # change _ to - in name but not the prefix _ (as this is conda prefix)
            if not r.name.startswith('_') and not requirements.get('conda', None):
            if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
                r.name = r.name.replace('_', '-')
            # remove .post from version numbers, it fails ~= version, and change == to ~=
            if r.specs and r.specs[0]:
                r.specs = [(r.specs[0][0].replace('==', '~='), r.specs[0][1].split('.post')[0])]

            if has_cudatoolkit and r.specs and len(r.specs[0]) > 1 and r.name in ('cudatoolkit', 'cuda-toolkit'):
                # select specific cuda version if it came from the requirements
                r.specs = [(r.specs[0][0].replace('==', '='), r.specs[0][1].split('.post')[0])]
            elif r.specs and r.specs[0] and len(r.specs[0]) > 1:
                # remove .post from version numbers it fails with ~= version, and change == to ~=
                r.specs = [(s[0].replace('==', '~='), s[1].split('.post')[0]) for s in r.specs]

        while reqs:
            # notice, we give conda more freedom in version selection, to help it choose best combination
            def clean_ver(ar):
                if not ar.specs:
                    return ar.tostr()
                ar.specs = [(ar.specs[0][0], ar.specs[0][1] + '.0' if '.' not in ar.specs[0][1] else ar.specs[0][1])]
                return ar.tostr()
            conda_env['dependencies'] = [clean_ver(r) for r in reqs]
                markers = None
                if ar.marker:
                    # check if we really need it based on python version
                    ar = self._parse_package_marker_match_python_ver(marker_req=ar)
                    if not ar:
                        # empty lines should be skipped
                        return ""
                    # if we do, make sure we note that we ignored the markers
                    print("WARNING: ignoring marker in `{}`".format(ar.tostr()))
                    markers = False
                if ar.specs:
                    ar.specs = [(s[0], s[1] + '.0' if '.' not in s[1] else s[1]) for s in ar.specs]
                return ar.tostr(markers=markers)
            conda_env['dependencies'] = [clean_ver(r) for r in reqs if clean_ver(r)]
            with self.temp_file("conda_env", yaml.dump(conda_env), suffix=".yml") as name:
                print('Conda: Trying to install requirements:\n{}'.format(conda_env['dependencies']))
                if self.session.debug_mode:
                    print('{}:\n{}'.format(name, yaml.dump(conda_env)))
                result = self._run_command(
                    ("env", "update", "-p", self.path, "--file", name)
                )
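For orientation, an illustrative shape of the temporary file handed to `conda env update -p <prefix> --file <name>` above; the channel list and pinned versions are hypothetical, only the dependencies layout is implied by the code:

    # conda_env as YAML (yaml.dump(conda_env)):
    #   channels:
    #   - defaults
    #   - conda-forge
    #   dependencies:
    #   - pip~=22.0
    #   - numpy~=1.23.0
    #   - cudatoolkit=11.6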
@@ -603,6 +708,8 @@ class CondaAPI(PackageManager):
            pip_req_str = [r.tostr() for r in pip_requirements if r.name not in ('pip', 'virtualenv', )]
            print('Conda: Installing requirements: step 2 - using pip:\n{}'.format(pip_req_str))
            PackageManager._selected_manager = self.pip
            if self.session.debug_mode:
                print('pip requirements.txt:\n{}'.format('\n'.join(pip_req_str)))
            self.pip.load_requirements({'pip': '\n'.join(pip_req_str)})
        except Exception as e:
            print(e)
@@ -610,7 +717,7 @@ class CondaAPI(PackageManager):
        finally:
            PackageManager._selected_manager = self

        self.requirements_manager.post_install(self.session)
        self.requirements_manager.post_install(self.session, package_manager=self)
        return True

    def _parse_conda_result_bad_packges(self, result_dict):
@@ -646,12 +753,16 @@ class CondaAPI(PackageManager):
            ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
            return ansi_escape.sub('', line)

        # make sure we are not running it with our own PYTHONPATH
        env = dict(**os.environ)
        env.pop('PYTHONPATH', None)

        command = Argv(*command)  # type: Executable
        if not raw:
            command = (self.conda,) + command + ("--quiet", "--json")
        try:
            print('Executing Conda: {}'.format(command.serialize()))
            result = command.get_output(stdin=DEVNULL, **kwargs)
            result = command.get_output(stdin=DEVNULL, env=env, **kwargs)
            if self.session.debug_mode:
                print(result)
        except Exception as e:
@@ -671,6 +782,8 @@ class CondaAPI(PackageManager):
        return result

    def get_python_command(self, extra=()):
        if not self.source:
            self._init_existing_environment(self.path)
        return CommandSequence(self.source, self.pip.get_python_command(extra=extra))

    def _get_conda_sh(self):
@@ -687,6 +800,25 @@ class CondaAPI(PackageManager):
                return conda_env
        return base_conda_env

    def add_cached_venv(self, *args, **kwargs):
        """
        Copy the local venv folder into the venv cache (keys are based on the requirements+python+docker).
        """
        # do not cache if this is a base conda environment
        if self.conda_env_as_base_docker or self.use_conda_base_env:
            return
        return super().add_cached_venv(*args, **kwargs)

    def get_cached_venv(self, *args, **kwargs):
        """
        Copy a cached copy of the venv (based on the requirements) into destination_folder.
        Return None if failed or cached entry does not exist
        """
        # do not cache if this is a base conda environment
        if self.conda_env_as_base_docker or self.use_conda_base_env:
            return
        return super().get_cached_venv(*args, **kwargs)


# enable hashing with cmp=False because pdb fails on un-hashable exceptions
exception = attrs(str=True, cmp=False)
@@ -2,6 +2,8 @@ import re
from collections import OrderedDict
from typing import Text

from pathlib2 import Path

from .base import PackageManager
from .requirements import SimpleSubstitution
from ..base import safe_furl as furl
@@ -10,13 +12,27 @@ from ..base import safe_furl as furl
class ExternalRequirements(SimpleSubstitution):

    name = "external_link"
    cwd = None

    def __init__(self, *args, **kwargs):
        super(ExternalRequirements, self).__init__(*args, **kwargs)
        self.post_install_req = []
        self.post_install_req_lookup = OrderedDict()
        self.post_install_local_req_lookup = OrderedDict()

    def match(self, req):
        # match local folder building:
        if self.is_local_folder_package(req):
            # noinspection PyBroadException
            try:
                folder_path = req.req.line.strip().split('#')[0].strip()
                if self.cwd and not Path(folder_path).is_absolute():
                    folder_path = (Path(self.cwd) / Path(folder_path)).absolute().as_posix()
                self.post_install_local_req_lookup['file://{}'.format(folder_path)] = req.req.line
            except Exception:
                pass
            return True

        # match both editable or code or unparsed
        if not (not req.name or req.req and (req.req.editable or req.req.vcs)):
            return False
@@ -30,36 +46,19 @@ class ExternalRequirements(SimpleSubstitution):
        post_install_req = self.post_install_req
        self.post_install_req = []
        for req in post_install_req:
            try:
                freeze_base = PackageManager.out_of_scope_freeze() or ''
            except:
                freeze_base = ''
            if self.is_already_installed(req):
                print("No need to reinstall \'{}\' from VCS, "
                      "the exact same version is already installed".format(req.name))
                continue

            req_line = req.tostr(markers=False)
            if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
                req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)

            if req.req.vcs and req_line.startswith('git+'):
                if not req.pip_new_version:
                    # noinspection PyBroadException
                    try:
                        url_no_frag = furl(req_line)
                        url_no_frag.set(fragment=None)
                        # reverse replace
                        fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
                        vcs_url = req_line[4:]
                        # reverse replace
                        vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
                        from ..repo import Git
                        vcs = Git(session=session, url=vcs_url, location=None, revision=None)
                        vcs._set_ssh_url()
                        new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)
                        if new_req_line != req_line:
                            furl_line = furl(new_req_line)
                            print('Replacing original pip vcs \'{}\' with \'{}\''.format(
                                req_line,
                                furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
                            req_line = new_req_line
                freeze_base = PackageManager.out_of_scope_freeze() or dict(pip=[])
            except Exception:
                print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
                freeze_base = dict(pip=[])

            req_line = self._add_vcs_credentials(req, session)

            # if we have older pip version we have to make sure we replace back the package name with the
            # git repository link. In new versions this is supported and we get "package @ git+https://..."
@@ -67,18 +66,50 @@ class ExternalRequirements(SimpleSubstitution):
                PackageManager.out_of_scope_install_package(req_line, "--no-deps")
                # noinspection PyBroadException
                try:
                    freeze_post = PackageManager.out_of_scope_freeze() or ''
                    freeze_post = PackageManager.out_of_scope_freeze() or dict(pip=[])
                    package_name = list(set(freeze_post['pip']) - set(freeze_base['pip']))
                    if package_name and package_name[0] not in self.post_install_req_lookup:
                        self.post_install_req_lookup[package_name[0]] = req.req.line
                except Exception:
                    pass

            # no need to force reinstall, pip will always rebuilt if the package comes from git
            # no need to force reinstall, pip will always rebuild if the package comes from git
            # and make sure the required packages are installed (if they are not it will install them)
            if not PackageManager.out_of_scope_install_package(req_line):
                raise ValueError("Failed installing GIT/HTTPs package \'{}\'".format(req_line))
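The freeze-diff trick above recovers the canonical package name that a VCS line installed; a minimal sketch with hypothetical values:

    freeze_base = {'pip': ['requests==2.28.1']}
    freeze_post = {'pip': ['requests==2.28.1',
                           'mypkg @ git+https://github.com/user/mypkg@abc123']}
    package_name = list(set(freeze_post['pip']) - set(freeze_base['pip']))
    # -> ['mypkg @ git+https://github.com/user/mypkg@abc123']
    # which is then mapped back to the original requirement line
    # via post_install_req_lookup when requirements are frozen again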

    @staticmethod
    def _add_vcs_credentials(req, session):
        req_line = req.tostr(markers=False)
        if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
            req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
        if req.req.vcs and req_line.startswith('git+'):
            try:
                url_no_frag = furl(req_line)
                url_no_frag.set(fragment=None)
                # reverse replace
                fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
                vcs_url = req_line[4:]
                # reverse replace
                vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
                # notice git:// is actually ssh://
                if vcs_url and vcs_url.startswith('git://'):
                    vcs_url = vcs_url.replace('git://', 'ssh://', 1)

                from ..repo import Git
                vcs = Git(session=session, url=vcs_url, location=None, revision=None)
                vcs._set_ssh_url()
                new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)
                if new_req_line != req_line:
                    furl_line = furl(new_req_line)
                    print('Replacing original pip vcs \'{}\' with \'{}\''.format(
                        req_line,
                        furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
                    req_line = new_req_line
            except Exception:
                print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
        return req_line
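An end-to-end sketch of the rewrite above; the repository URL and the resolved credentials are hypothetical:

    # original : git+git://github.com/user/repo.git@v1.0#egg=repo
    # git:// is treated as ssh:// -> ssh://github.com/user/repo.git@v1.0
    # after Git._set_ssh_url() applies the configured credentials, the line
    # may become: git+https://user:xxxxxx@github.com/user/repo.git@v1.0#egg=repo
    # the fragment (#egg=repo) is carved out via the reverse-replace so only
    # the URL part is rewritten, then re-appended unchanged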

    def replace(self, req):
        """
        Replace a requirement
@@ -103,4 +134,54 @@ class ExternalRequirements(SimpleSubstitution):
                                       if r not in self.post_install_req_lookup]
            list_of_requirements[k] += [self.post_install_req_lookup.get(r, '')
                                        for r in self.post_install_req_lookup.keys() if r in original_requirements]

        if self.post_install_local_req_lookup:
            original_requirements = list_of_requirements[k]
            list_of_requirements[k] = [
                r for r in original_requirements
                if len(r.split('@', 1)) != 2 or r.split('@', 1)[1].strip() not in self.post_install_local_req_lookup]

            list_of_requirements[k] += [
                self.post_install_local_req_lookup.get(r.split('@', 1)[1].strip(), '')
                for r in original_requirements
                if len(r.split('@', 1)) == 2 and r.split('@', 1)[1].strip() in self.post_install_local_req_lookup]

        return list_of_requirements

    @classmethod
    def is_local_folder_package(cls, req):
        # noinspection PyBroadException
        try:
            if not req.name and req.req and not req.req.editable and not req.req.vcs and \
                    req.req.line and req.req.line.strip().split('#')[0] and \
                    not req.req.line.strip().split('#')[0].lower().endswith('.whl') and \
                    not (req.req.line.strip().startswith('-r ') or req.req.line.strip().startswith('--requirement ')):
                return True
        except Exception:
            pass
        return False


class OnlyExternalRequirements(ExternalRequirements):
    def __init__(self, *args, **kwargs):
        super(OnlyExternalRequirements, self).__init__(*args, **kwargs)

    def match(self, req):
        return True

    def replace(self, req):
        """
        Replace a requirement
        :raises: ValueError if version is pre-release
        """
        # Do not store the skipped requirements
        # mark skip package
        if super(OnlyExternalRequirements, self).match(req):
            if self.is_already_installed(req):
                print("No need to reinstall \'{}\' from VCS, "
                      "the exact same version is already installed".format(req.name))
                return Text('')

            return self._add_vcs_credentials(req, self._session)

        return Text('')
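A sketch of the local-package replace-back above; all paths are hypothetical:

    # pip freeze reports:            'mypkg @ file:///home/user/src/mypkg'
    # post_install_local_req_lookup: {'file:///home/user/src/mypkg': './src/mypkg'}
    # splitting the frozen line on '@' once yields the file:// URL, so the
    # entry is swapped back to the original './src/mypkg' requirement line,
    # keeping the recorded requirements relative to the repository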

@@ -1,8 +1,10 @@
import os
import sys
from itertools import chain
from pathlib import Path
from typing import Text, Optional

from clearml_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
from clearml_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME, ENV_PIP_EXTRA_INSTALL_FLAGS
from clearml_agent.helper.package.base import PackageManager
from clearml_agent.helper.process import Argv, DEVNULL
from clearml_agent.session import Session
@@ -10,14 +12,13 @@ from clearml_agent.session import Session

class SystemPip(PackageManager):

    indices_args = None

    def __init__(self, interpreter=None, session=None):
        # type: (Optional[Text], Optional[Session]) -> ()
        """
        Program interface to the system pip.
        """
        self._bin = interpreter or sys.executable
        super(SystemPip, self).__init__()
        self._bin = Path(interpreter or sys.executable)
        self.session = session

    @property
@@ -49,7 +50,7 @@ class SystemPip(PackageManager):
                package,
                '--dest', cache_dir,
                '--no-deps',
            ) + self.install_flags()
            ) + self.download_flags()
        )

    def load_requirements(self, requirements):
@@ -62,13 +63,14 @@ class SystemPip(PackageManager):
    def uninstall(self, package):
        self.run_with_env(('uninstall', '-y', package))

    def freeze(self):
    def freeze(self, freeze_full_environment=False):
        """
        pip freeze of all installed packages except the running program
        :return: Dict contains pip as key and pip's packages to install
        :rtype: Dict[str: List[str]]
        """
        packages = self.run_with_env(('freeze',), output=True).splitlines()
        packages = self.run_with_env(
            ('freeze',) if not freeze_full_environment else ('freeze', '--all'), output=True).splitlines()
        packages_without_program = [package for package in packages if PROGRAM_NAME not in package]
        return {'pip': packages_without_program}
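With the new flag, callers can capture pip/setuptools/wheel as well, since `pip freeze --all` includes them; a minimal usage sketch (the session object is assumed to exist):

    packages = SystemPip(session=session).freeze(freeze_full_environment=True)
    # packages['pip'] now also contains entries such as 'pip==22.0.4'
    # and 'setuptools==65.5.0' that plain `pip freeze` omits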
@@ -81,14 +83,33 @@ class SystemPip(PackageManager):
        :param kwargs: kwargs for get_output/check_output command
        """
        command = self._make_command(command)
        return (command.get_output if output else command.check_call)(stdin=DEVNULL, **kwargs)
        # make sure we are not running it with our own PYTHONPATH
        env = dict(**os.environ)
        env.pop('PYTHONPATH', None)

        # Debug print
        if self.session.debug_mode:
            print(command)

        return (command.get_output if output else command.check_call)(stdin=DEVNULL, env=env, **kwargs)

    def _make_command(self, command):
        return Argv(self.bin, '-m', 'pip', '--disable-pip-version-check', *command)

    def install_flags(self):
        if self.indices_args is None:
            self.indices_args = tuple(
                chain.from_iterable(('--extra-index-url', x) for x in PIP_EXTRA_INDICES)
            )
        return self.indices_args
        base_args = tuple(self._base_install_flags or []) + tuple(
            chain.from_iterable(('--extra-index-url', x) for x in PIP_EXTRA_INDICES)
        )

        extra_pip_flags = \
            ENV_PIP_EXTRA_INSTALL_FLAGS.get() or \
            self.session.config.get("agent.package_manager.extra_pip_install_flags", None)

        return (base_args + tuple(extra_pip_flags)) if extra_pip_flags else base_args

    def download_flags(self):
        indices_args = tuple(
            chain.from_iterable(('--extra-index-url', x) for x in PIP_EXTRA_INDICES)
        )

        return indices_args
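A sketch of where the extra flags land, using the configuration key that appears above (the flag value and index URL are hypothetical):

    # agent.package_manager.extra_pip_install_flags: ["--no-build-isolation"]
    # install_flags() would then yield, e.g.:
    #   ('--extra-index-url', 'https://my.index/simple', '--no-build-isolation')
    # and every `pip install` issued through this manager carries those flags,
    # while download_flags() (pip download) keeps only the index arguments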
@@ -12,7 +12,7 @@ from ..requirements import RequirementsManager

class VirtualenvPip(SystemPip, PackageManager):
    def __init__(self, session, python, requirements_manager, path, interpreter=None, execution_info=None, **kwargs):
        # type: (Session, float, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
        # type: (Session, str, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
        """
        Program interface to virtualenv pip.
        Must be given either path to virtualenv or source command.
@@ -37,9 +37,11 @@ class VirtualenvPip(SystemPip, PackageManager):

    def load_requirements(self, requirements):
        if isinstance(requirements, dict) and requirements.get("pip"):
            requirements["pip"] = self.requirements_manager.replace(requirements["pip"])
            requirements["pip"] = self.requirements_manager.replace(
                requirements["pip"], existing_packages=self._existing_packages
            )
        super(VirtualenvPip, self).load_requirements(requirements)
        self.requirements_manager.post_install(self.session)
        self.requirements_manager.post_install(self.session, package_manager=self)

    def create_flags(self):
        """
@@ -64,9 +66,18 @@ class VirtualenvPip(SystemPip, PackageManager):
        Only valid if instantiated with path.
        Use self.python as self.bin does not exist.
        """
        self.session.command(
            self.python, "-m", "virtualenv", self.path, *self.create_flags()
        ).check_call()
        # noinspection PyBroadException
        try:
            self.session.command(
                self.python, "-m", "virtualenv", self.path, *self.create_flags()
            ).check_call()
        except Exception as ex:
            # let's try with std library instead
            print("WARNING: virtualenv call failed: {}\n INFO: Creating virtual environment with venv".format(ex))
            self.session.command(
                self.python, "-m", "venv", self.path, *self.create_flags()
            ).check_call()

        return self

    def remove(self):

@@ -5,6 +5,8 @@ import attr
import sys
import os
from pathlib2 import Path

from clearml_agent.definitions import ENV_AGENT_FORCE_POETRY
from clearml_agent.helper.process import Argv, DEVNULL, check_if_command_exists
from clearml_agent.session import Session, POETRY

@@ -38,11 +40,11 @@ def prop_guard(prop, log_prop=None):

class PoetryConfig:

    def __init__(self, session, interpreter=None):
        # type: (Session, str) -> ()
    def __init__(self, session):
        # type: (Session, str) -> None
        self.session = session
        self._log = session.get_logger(__name__)
        self._python = interpreter or sys.executable
        self._python = sys.executable  # default, overwritten from session config in initialize()
        self._initialized = False

    @property
@@ -51,7 +53,7 @@ class PoetryConfig:

    @property
    def enabled(self):
        return self.session.config["agent.package_manager.type"] == POETRY
        return ENV_AGENT_FORCE_POETRY.get() or self.session.config["agent.package_manager.type"] == POETRY

    _guard_enabled = prop_guard(enabled, log)

@@ -68,6 +70,11 @@ class PoetryConfig:
            path = path.replace(':'+sys.base_prefix, ':'+sys.real_prefix, 1)
            kwargs['env']['PATH'] = path

        if self.session and self.session.config and args and args[0] == "install":
            extra_args = self.session.config.get("agent.package_manager.poetry_install_extra_args", None)
            if extra_args:
                args = args + tuple(extra_args)

        if check_if_command_exists("poetry"):
            argv = Argv("poetry", *args)
        else:
@@ -81,6 +88,53 @@ class PoetryConfig:
    @_guard_enabled
    def initialize(self, cwd=None):
        if not self._initialized:
            # use correct python version -- detected in Worker.install_virtualenv() and written to
            # session
            if self.session.config.get("agent.python_binary", None):
                self._python = self.session.config.get("agent.python_binary")

            if self.session.config.get("agent.package_manager.poetry_version", None) is not None:
                version = str(self.session.config.get("agent.package_manager.poetry_version"))

                # get poetry version
                version = version.replace(' ', '')
                if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
                    version = version
                elif version:
                    version = "==" + version
                # (we are not running it yet)
                argv = Argv(self._python, "-m", "pip", "install", "poetry{}".format(version),
                            "--upgrade", "--disable-pip-version-check")
                # this is just for beauty and checks, we already set the version in the Argv
                if not version:
                    version = "latest"
            else:
                # mark to install poetry if not already installed (we are not running it yet)
                argv = Argv(self._python, "-m", "pip", "install", "poetry", "--disable-pip-version-check")
                version = ""

            # first upgrade pip if we need to
            try:
                from clearml_agent.helper.package.pip_api.venv import VirtualenvPip
                pip = VirtualenvPip(
                    session=self.session, python=self._python,
                    requirements_manager=None, path=None, interpreter=self._python)
                pip.upgrade_pip()
            except Exception as ex:
                self.log.warning("failed upgrading pip: {}".format(ex))

            # check if we do not have a specific version and poetry is found skip installation
            if not version and check_if_command_exists("poetry"):
                print("Notice: Poetry was found, no specific version required, skipping poetry installation")
            else:
                print('Installing / Upgrading Poetry package to {}'.format(version))
                # now install poetry
                try:
                    print(argv.get_output())
                except Exception as ex:
                    self.log.warning("failed installing poetry: {}".format(ex))

            # now setup poetry
            self._initialized = True
            try:
                self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)
@@ -115,7 +169,7 @@ class PoetryAPI(object):
            any((self.path / indicator).exists() for indicator in self.INDICATOR_FILES)
        )

    def freeze(self):
    def freeze(self, freeze_full_environment=False):
        lines = self.config.run("show", cwd=str(self.path)).splitlines()
        lines = [[p for p in line.split(' ') if p] for line in lines]
        return {"pip": [parts[0]+'=='+parts[1]+' # '+' '.join(parts[2:]) for parts in lines]}

@@ -1,3 +1,4 @@
import re
from typing import Text

from .base import PackageManager
@@ -6,18 +7,19 @@ from .requirements import SimpleSubstitution

class PriorityPackageRequirement(SimpleSubstitution):

    name = ("cython", "numpy", "setuptools", )
    name = ("cython", "numpy", "setuptools", "pip", )
    optional_package_names = tuple()

    def __init__(self, *args, **kwargs):
        super(PriorityPackageRequirement, self).__init__(*args, **kwargs)
        self._replaced_packages = {}
        # check if we need to replace the packages:
        priority_packages = self.config.get('agent.package_manager.priority_packages', None)
        if priority_packages:
            self.__class__.name = priority_packages
            self.__class__.name = [p.lower() for p in priority_packages]
        priority_optional_packages = self.config.get('agent.package_manager.priority_optional_packages', None)
        if priority_optional_packages:
            self.__class__.optional_package_names = priority_optional_packages
            self.__class__.optional_package_names = [p.lower() for p in priority_optional_packages]

    def match(self, req):
        # match both Cython & cython
@@ -28,7 +30,9 @@ class PriorityPackageRequirement(SimpleSubstitution):
        Replace a requirement
        :raises: ValueError if version is pre-release
        """
        if req.name in self.optional_package_names:
        self._replaced_packages[req.name] = req.line

        if req.name.lower() in self.optional_package_names:
            # noinspection PyBroadException
            try:
                if PackageManager.out_of_scope_install_package(str(req)):
@@ -39,6 +43,69 @@ class PriorityPackageRequirement(SimpleSubstitution):
        PackageManager.out_of_scope_install_package(str(req))
        return Text(req)

    def replace_back(self, list_of_requirements):
        """
        :param list_of_requirements: {'pip': ['a==1.0', ]}
        :return: {'pip': ['a==1.0', ]}
        """
        # if we replaced setuptools, it means someone requested it, and since freeze will not contain it,
        # we need to add it manually
        if not self._replaced_packages:
            return list_of_requirements

        # we assume that both pip & setuptools are not in list_of_requirements, and we need to add them

        if "pip" in self._replaced_packages:
            full_freeze = PackageManager.out_of_scope_freeze(freeze_full_environment=True)
            if not full_freeze:
                if "pip" in list_of_requirements:
                    list_of_requirements["pip"] = [self._replaced_packages["pip"]] + list_of_requirements["pip"]
            else:
                # now let's look for pip
                pips = [line for line in full_freeze.get("pip", []) if str(line.split("==")[0]).strip() == "pip"]
                if pips and "pip" in list_of_requirements:
                    list_of_requirements["pip"] = [pips[0]] + list_of_requirements["pip"]

        if "setuptools" in self._replaced_packages:
            try:
                for k, lines in list_of_requirements.items():
                    # k is either pip/conda
                    if k not in ('pip', 'conda'):
                        continue
                    for i, line in enumerate(lines):
                        if not line or line.lstrip().startswith('#'):
                            continue
                        parts = [p for p in re.split(r'\s|=|\.|<|>|~|!|@|#', line) if p]
                        if not parts:
                            continue
                        # if we found setuptools, do nothing
                        if parts[0] == "setuptools":
                            return list_of_requirements

                # if we are here it means we have not found setuptools
                # we should add it:
                if "pip" in list_of_requirements:
                    list_of_requirements["pip"] = [self._replaced_packages["setuptools"]] + list_of_requirements["pip"]

            except Exception as ex:  # noqa
                return list_of_requirements

        return list_of_requirements


class CachedPackageRequirement(PriorityPackageRequirement):

    name = ("setuptools", "pip", )
    optional_package_names = tuple()

    def replace(self, req):
        """
        Put the requirement in the list for later conversion
        :raises: ValueError if version is pre-release
        """
        self._replaced_packages[req.name] = req.line
        return Text(req)


class PackageCollectorRequirement(SimpleSubstitution):
    """

@@ -2,17 +2,22 @@ from __future__ import unicode_literals

import re
import sys
import platform
from furl import furl
import urllib.parse
from operator import itemgetter
from html.parser import HTMLParser
from typing import Text
from typing import Text, Optional, Dict

import attr
import requests

import six
from .requirements import SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion
from .requirements import (
    SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion, MarkerRequirement,
    compare_version_rules, )
from ...definitions import ENV_PACKAGE_PYTORCH_RESOLVE
from ...external.requirements_parser.requirement import Requirement

OS_TO_WHEEL_NAME = {"linux": "linux_x86_64", "windows": "win_amd64"}

@@ -51,17 +56,16 @@ class PytorchWheel(object):
    python = attr.ib(type=str, converter=lambda x: str(x).replace(".", ""))
    torch_version = attr.ib(type=str, converter=fix_version)

    url_template = (
        "http://download.pytorch.org/whl/"
        "{0.cuda_version}/torch-{0.torch_version}-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
    )
    url_template_prefix = "http://download.pytorch.org/whl/"
    url_template = "{0.cuda_version}/torch-{0.torch_version}" \
                   "-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"

    def __attrs_post_init__(self):
        self.unicode = "u" if self.python.startswith("2") else ""

    def make_url(self):
        # type: () -> Text
        return self.url_template.format(self)
        return (self.url_template_prefix + self.url_template).format(self)
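An example URL assembled by make_url() above; the field values are hypothetical:

    # with cuda_version='cu102', torch_version='1.5.0', python='37',
    # unicode='' and os_name='linux_x86_64', prefix + template yields:
    # http://download.pytorch.org/whl/cu102/torch-1.5.0-cp37-cp37m-linux_x86_64.whl
    # splitting the template lets the prefix be overridden independently
    # via agent.package_manager.torch_url_template_prefix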


class PytorchResolutionError(FatalSpecsResolutionError):
@@ -168,41 +172,80 @@ class PytorchRequirement(SimpleSubstitution):
    name = "torch"
    packages = ("torch", "torchvision", "torchaudio", "torchcsprng", "torchtext")

    extra_index_url_template = 'https://download.pytorch.org/whl/cu{}/'
    nightly_extra_index_url_template = 'https://download.pytorch.org/whl/nightly/cu{}/'
    torch_index_url_lookup = {}
    resolver_types = ("pip", "direct", "none")

    def __init__(self, *args, **kwargs):
        os_name = kwargs.pop("os_override", None)
        super(PytorchRequirement, self).__init__(*args, **kwargs)
        self.log = self._session.get_logger(__name__)
        self.package_manager = self.config["agent.package_manager.type"].lower()
        self.os = os_name or self.get_platform()
        self.cuda = "cuda{}".format(self.cuda_version).lower()
        self.python_version_string = str(self.config["agent.default_python"])
        self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
        if '.' not in self.python_major_minor_str:
            raise PytorchResolutionError(
                "invalid python version {!r} defined in configuration file, key 'agent.default_python': "
                "must have both major and minor parts of the version (for example: '3.7')".format(
                    self.python_version_string
                )
            )
        self.python = "python{}".format(self.python_major_minor_str)

        self.exceptions = [
            PytorchResolutionError(message)
            for message in (
                None,
                'cuda version "{}" is not supported'.format(self.cuda),
                'python version "{}" is not supported'.format(
                    self.python_version_string
                ),
            )
        ]

        try:
            self.validate_python_version()
        except PytorchResolutionError as e:
            self.log.warn("will not be able to install pytorch wheels: %s", e.args[0])

        self.cuda = None
        self.python_version_string = None
        self.python_major_minor_str = None
        self.python = None
        self._fix_setuptools = None
        self.exceptions = []
        self._original_req = []
        # allow override pytorch lookup pages
        if self.config.get("agent.package_manager.extra_index_url_template", None):
            self.extra_index_url_template = \
                self.config.get("agent.package_manager.extra_index_url_template", None)
        if self.config.get("agent.package_manager.nightly_extra_index_url_template", None):
            self.nightly_extra_index_url_template = \
                self.config.get("agent.package_manager.nightly_extra_index_url_template", None)
        # allow override pytorch lookup pages
        if self.config.get("agent.package_manager.torch_page", None):
            SimplePytorchRequirement.page_lookup_template = \
                self.config.get("agent.package_manager.torch_page", None)
        if self.config.get("agent.package_manager.torch_nightly_page", None):
            SimplePytorchRequirement.nightly_page_lookup_template = \
                self.config.get("agent.package_manager.torch_nightly_page", None)
        if self.config.get("agent.package_manager.torch_url_template_prefix", None):
            PytorchWheel.url_template_prefix = \
                self.config.get("agent.package_manager.torch_url_template_prefix", None)
        if self.config.get("agent.package_manager.torch_url_template", None):
            PytorchWheel.url_template = \
                self.config.get("agent.package_manager.torch_url_template", None)
        self.resolve_algorithm = str(
            ENV_PACKAGE_PYTORCH_RESOLVE.get() or
            self.config.get("agent.package_manager.pytorch_resolve", "pip")).lower()
        if self.resolve_algorithm not in self.resolver_types:
            print("WARNING: agent.package_manager.pytorch_resolve=={} not in {} reverting to '{}'".format(
                self.resolve_algorithm, self.resolver_types, self.resolver_types[0]))
            self.resolve_algorithm = self.resolver_types[0]

    def _init_python_ver_cuda_ver(self):
        if self.cuda is None:
            self.cuda = "cuda{}".format(self.cuda_version).lower()
        if self.python_version_string is None:
            self.python_version_string = str(self.config["agent.default_python"])
        if self.python_major_minor_str is None:
            self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
            if '.' not in self.python_major_minor_str:
                raise PytorchResolutionError(
                    "invalid python version {!r} defined in configuration file, key 'agent.default_python': "
                    "must have both major and minor parts of the version (for example: '3.7')".format(
                        self.python_version_string
                    )
                )
        if self.python is None:
            self.python = "python{}".format(self.python_major_minor_str)

        if not self.exceptions:
            self.exceptions = [
                PytorchResolutionError(message)
                for message in (
                    None,
                    'cuda version "{}" is not supported'.format(self.cuda),
                    'python version "{}" is not supported'.format(
                        self.python_version_string
                    ),
                )
            ]

    @property
    def is_conda(self):
@@ -216,6 +259,8 @@ class PytorchRequirement(SimpleSubstitution):
        """
        Make sure python version has both major and minor versions as required for choosing pytorch wheel
        """
        self._init_python_ver_cuda_ver()

        if self.is_pip and not self.python_major_minor_str:
            raise PytorchResolutionError(
                "invalid python version {!r} defined in configuration file, key 'agent.default_python': "
@@ -225,6 +270,10 @@ class PytorchRequirement(SimpleSubstitution):
            )

    def match(self, req):
        if self.resolve_algorithm == "none":
            # skipping resolver
            return False

        return req.name in self.packages

    @staticmethod
@@ -237,10 +286,15 @@ class PytorchRequirement(SimpleSubstitution):
            return "macos"
        raise RuntimeError("unrecognized OS")

    @staticmethod
    def get_arch():
        return str(platform.machine()).lower()

    def _get_link_from_torch_page(self, req, torch_url):
        links_parser = LinksHTMLParser()
        links_parser.feed(requests.get(torch_url, timeout=10).text)
        platform_wheel = "win" if self.get_platform() == "windows" else self.get_platform()
        arch_wheel = self.get_arch()
        py_ver = self.python_major_minor_str.replace('.', '')
        url = None
        last_v = None
@@ -261,8 +315,20 @@ class PytorchRequirement(SimpleSubstitution):
                continue
            if len(parts) < 3 or not parts[2].endswith(py_ver):
                continue
            if len(parts) < 5 or platform_wheel not in parts[4]:
            if len(parts) < 5 or platform_wheel not in parts[4].lower():
                continue
            if len(parts) < 5 or arch_wheel not in parts[4].lower():
                continue

            # yes this is for linux python 2.7 support, this is the only python 2.7 we support...
            if py_ver and py_ver[0] == '2' and len(parts) > 3 and not parts[3].endswith('u'):
                continue

            # check if this is an actual match
            if not req.compare_version(v) or \
                    (last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
                continue

            # update the closest matched version (from above)
            if not closest_v:
                closest_v = v
@@ -271,10 +337,6 @@ class PytorchRequirement(SimpleSubstitution):
                    SimpleVersion.compare_versions(
                        version_a=v, op='>=', version_b=req.specs[0][1], num_parts=3):
                closest_v = v
            # check if this is an actual match
            if not req.compare_version(v) or \
                    (last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
                continue

            url = '/'.join(torch_url.split('/')[:-1] + l.split('/'))
            last_v = v
|
||||
@@ -291,19 +353,25 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
|
||||
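A hedged illustration (not from the patch) of the `parts` indexing used in `_get_link_from_torch_page`, with a typical wheel filename from the PyTorch download page:

    name = "torch-1.13.1%2Bcu117-cp39-cp39-linux_x86_64.whl"
    parts = name.split('-')
    # parts[1] == "1.13.1%2Bcu117"    the version ('%2B' is a URL-encoded '+')
    # parts[2] == "cp39"              must end with py_ver, e.g. "39"
    # parts[4] == "linux_x86_64.whl"  must contain both the platform and the machine arch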
    def get_url_for_platform(self, req):
        # check if package is already installed with system packages
        self.validate_python_version()
        # noinspection PyBroadException
        try:
            if self.config.get("agent.package_manager.system_site_packages", None):
                from pip._internal.commands.show import search_packages_info
                installed_torch = list(search_packages_info([req.name]))
                # notice the comparison order, the first part will make sure we have a valid installed package
                if installed_torch and installed_torch[0]['version'] and \
                        req.compare_version(installed_torch[0]['version']):
                installed_torch_version = \
                    (getattr(installed_torch[0], 'version', None) or
                     installed_torch[0]['version']) if installed_torch else None

                if installed_torch and installed_torch_version and \
                        req.compare_version(installed_torch_version):
                    print('PyTorch: requested "{}" version {}, using pre-installed version {}'.format(
                        req.name, req.specs[0] if req.specs else 'unspecified', installed_torch[0]['version']))
                        req.name, req.specs[0] if req.specs else 'unspecified', installed_torch_version))
                    # package already installed, do nothing
                    req.specs = [('==', str(installed_torch[0]['version']))]
                    req.specs = [('==', str(installed_torch_version))]
                    return '{} {} {}'.format(req.name, req.specs[0][0], req.specs[0][1]), True

        except Exception:
            pass

@@ -343,6 +411,11 @@ class PytorchRequirement(SimpleSubstitution):
        else:
            print('Trying PyTorch CUDA version {} support'.format(torch_url_key))

        # fix broken pytorch/setuptools incompatibility
        if req.name == "torch" and closest_matched_version and \
                SimpleVersion.compare_versions(closest_matched_version, "<", "1.11.0"):
            self._fix_setuptools = "setuptools < 59"

        if not url:
            url = PytorchWheel(
                torch_version=fix_version(version),
@@ -420,6 +493,66 @@ class PytorchRequirement(SimpleSubstitution):
        return self.match_version(req, base).replace(" ", "\n")

    def replace(self, req):
        # we first try to resolve things ourselves because pytorch pip is not always picking the correct
        # versions from their pip repository

        resolve_algorithm = self.resolve_algorithm
        if resolve_algorithm == "none":
            # skipping resolver
            return None
        elif resolve_algorithm == "direct":
            # noinspection PyBroadException
            try:
                new_req = self._replace(req)
                if new_req:
                    self._original_req.append((req, new_req))
                return new_req
            except Exception:
                print("Warning: Failed resolving using `pytorch_resolve=direct` reverting to `pytorch_resolve=pip`")
        elif resolve_algorithm not in self.resolver_types:
            print("Warning: `agent.package_manager.pytorch_resolve={}` "
                  "unrecognized, default to `pip`".format(resolve_algorithm))

        # check if package is already installed with system packages
        self.validate_python_version()

        # try to check if we can just use the new index URL, if we do not we will revert to the old method
        try:
            extra_index_url = self.get_torch_index_url(self.cuda_version)
            if extra_index_url:
                # if the requested torch version cannot be >= 1.11, we need to fix setuptools
                try:
                    if req.name == "torch" and not compare_version_rules(req.specs, [(">=", "1.11.0")]):
                        self._fix_setuptools = "setuptools < 59"
                except Exception:  # noqa
                    pass
                # now we just need to add the correct extra index url for the cuda version
                self.set_add_install_extra_index(extra_index_url[0])

                if req.specs and len(req.specs) == 1 and req.specs[0][0] == "==":
                    # remove any +cu extension and let pip resolve that,
                    # and add .* if we have a 3-part version, to deal with the nvidia container 'a' version,
                    # i.e. "1.13.0" -> "1.13.0.*" so it should match a preinstalled "1.13.0a0+936e930"
                    spec_3_parts = req.format_specs(num_parts=3)
                    spec_max3_parts = req.format_specs(max_num_parts=3)
                    if spec_3_parts == spec_max3_parts and not spec_max3_parts.endswith("*"):
                        line = "{} {}.*".format(req.name, spec_max3_parts)
                    else:
                        line = "{} {}".format(req.name, spec_max3_parts)

                    if req.marker:
                        line += " ; {}".format(req.marker)
                else:
                    # return the original line
                    line = req.line

                print("PyTorch: Adding index `{}` and installing `{}`".format(extra_index_url[0], line))

                return line

        except Exception:  # noqa
            pass

        try:
            new_req = self._replace(req)
            if new_req:
@@ -473,6 +606,19 @@ class PytorchRequirement(SimpleSubstitution):
        :param list_of_requirements: {'pip': ['a==1.0', ]}
        :return: {'pip': ['a==1.0', ]}
        """
        def build_specific_version_req(a_line, a_name, a_new_req):
            try:
                r = Requirement.parse(a_line)
                wheel_parts = r.uri.split("/")[-1].split('-')
                version = str(wheel_parts[1].split('%')[0].split('+')[0])
                new_r = Requirement.parse("{} == {} # {}".format(a_name, version, str(a_new_req)))
                if new_r.line:
                    # great, it worked!
                    return new_r.line
            except:  # noqa
                pass
            return None

        if not self._original_req:
            return list_of_requirements
        try:
@@ -483,7 +629,7 @@ class PytorchRequirement(SimpleSubstitution):
            for i, line in enumerate(lines):
                if not line or line.lstrip().startswith('#'):
                    continue
                parts = [p for p in re.split('\s|=|\.|<|>|~|!|@|#', line) if p]
                parts = [p for p in re.split(r'\s|=|\.|<|>|~|!|@|#', line) if p]
                if not parts:
                    continue
                for req, new_req in self._original_req:
@@ -496,15 +642,78 @@ class PytorchRequirement(SimpleSubstitution):
                        if req.local_file:
                            lines[i] = '{}'.format(str(new_req))
                        else:
                            lines[i] = '{} # {}'.format(str(req), str(new_req))
                            # try to rebuild requirements with specific version:
                            new_line = build_specific_version_req(line, req.req.name, new_req)
                            if new_line:
                                lines[i] = new_line
                            else:
                                lines[i] = '{} # {}'.format(str(req), str(new_req))
                    else:
                        lines[i] = '{} # {}'.format(line, str(new_req))
                        new_line = build_specific_version_req(line, req.req.name, new_req)
                        if new_line:
                            lines[i] = new_line
                        else:
                            lines[i] = '{} # {}'.format(line, str(new_req))
                    break
        except:
            pass

        return list_of_requirements

    def post_scan_add_req(self):  # type: () -> Optional[MarkerRequirement]
        """
        Allows the RequirementSubstitution to add an extra line/requirements after
        the initial requirements scan is completed.
        Called only once per requirements.txt object
        """
        if self._fix_setuptools:
            return MarkerRequirement(Requirement.parse(self._fix_setuptools))
        return None

    def get_torch_index_url(self, cuda_version, nightly=False):
        # noinspection PyBroadException
        try:
            cuda = int(cuda_version)
        except Exception:
            cuda = 0

        if nightly:
            for c in range(cuda, max(-1, cuda-15), -1):
                # then try the nightly builds, it might be there...
                torch_url = self.nightly_extra_index_url_template.format(c)
                # noinspection PyBroadException
                try:
                    if requests.get(torch_url, timeout=10).ok:
                        print('Torch nightly CUDA {} index page found'.format(c))
                        self.torch_index_url_lookup[c] = torch_url
                        return self.torch_index_url_lookup[c], c
                except Exception:
                    pass
            return

        # first check if the key is valid
        if cuda in self.torch_index_url_lookup:
            return self.torch_index_url_lookup[cuda], cuda

        # then try a new cuda version page
        for c in range(cuda, max(-1, cuda-15), -1):
            torch_url = self.extra_index_url_template.format(c)
            # noinspection PyBroadException
            try:
                if requests.get(torch_url, timeout=10).ok:
                    print('Torch CUDA {} index page found, adding `{}`'.format(c, torch_url))
                    self.torch_index_url_lookup[c] = torch_url
                    return self.torch_index_url_lookup[c], c
            except Exception:
                pass

        keys = sorted(self.torch_index_url_lookup.keys(), reverse=True)
        for k in keys:
            if k <= cuda:
                return self.torch_index_url_lookup[k], k
        # return default - zero
        return self.torch_index_url_lookup[0], 0

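Illustrative only: `get_torch_index_url` walks the CUDA version downward until an index page responds, so a driver reporting CUDA 11.8 can fall back to the closest published torch index. The URL shown is an assumption based on the `extra_index_url_template` naming, not a value taken from the patch:

    url, cuda = self.get_torch_index_url(118)
    # tries cu118, cu117, ... and caches the first page that answers, e.g.
    # ("https://download.pytorch.org/whl/cu118/", 118)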
    MAP = {
        "windows": {
            "cuda100": {

@@ -11,12 +11,17 @@ from os import path
from typing import Text, List, Type, Optional, Tuple, Dict

from pathlib2 import Path
from pyhocon import ConfigTree
from clearml_agent.external.pyhocon import ConfigTree

import six
from six.moves.urllib.parse import unquote
import logging
from clearml_agent.definitions import PIP_EXTRA_INDICES
from clearml_agent.helper.base import warning, is_conda, which, join_lines, is_windows_platform
from clearml_agent.helper.base import (
    warning, is_conda, which, join_lines, is_windows_platform,
    convert_cuda_version_to_int_10_base_str, dump_yaml, )
from clearml_agent.helper.process import Argv, PathLike
from clearml_agent.helper.gpu.gpustat import get_driver_cuda_version
from clearml_agent.session import Session, normalize_cuda_version
from clearml_agent.external.requirements_parser import parse
from clearml_agent.external.requirements_parser.requirement import Requirement
@@ -89,13 +94,20 @@ class MarkerRequirement(object):
    def __repr__(self):
        return '{self.__class__.__name__}[{self}]'.format(self=self)

    def __eq__(self, other):
        return isinstance(other, MarkerRequirement) and str(self) == str(other)

    def __hash__(self):
        return str(self).__hash__()

    def format_specs(self, num_parts=None, max_num_parts=None):
        max_num_parts = max_num_parts or num_parts
        if max_num_parts is None or not self.specs:
            return ','.join(starmap(operator.add, self.specs))

        op, version = self.specs[0]
        for v in self._sub_versions_pep440:
        # noinspection PyProtectedMember
        for v in SimpleVersion._sub_versions_pep440:
            version = version.replace(v, '.')
        if num_parts:
            version = (version.strip('.').split('.') + ['0'] * num_parts)[:max_num_parts]
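A small usage sketch (not part of the diff) of `format_specs` and the `.*` rewrite it enables in the `replace()` method shown earlier:

    r = MarkerRequirement(Requirement.parse("torch==1.13.0"))
    r.format_specs(num_parts=3)  # "==1.13.0" - first spec, padded to three version parts
    # replace() then emits the pip line "torch ==1.13.0.*" so it also matches
    # NVIDIA-container builds such as "1.13.0a0+936e930"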
@@ -110,6 +122,10 @@ class MarkerRequirement(object):
    def specs(self):  # type: () -> List[Tuple[Text, Text]]
        return self.req.specs

    @property
    def version(self):  # type: () -> Text
        return self.specs[0][1] if self.specs else ""

    @specs.setter
    def specs(self, value):  # type: (List[Tuple[Text, Text]]) -> None
        self.req.specs = value
@@ -137,6 +153,8 @@ class MarkerRequirement(object):
        If the requested version is 1.2 the self.spec should be 1.2*
        etc.

        usage: it returns the value of the following comparison: requested_version "op" self.version

        :param str requested_version:
        :param str op: '==', '>', '>=', '<=', '<', '~='
        :param int num_parts: number of parts to compare
@@ -146,12 +164,49 @@ class MarkerRequirement(object):
        if not self.specs:
            return True

        version = self.specs[0][1]
        version = self.version
        op = (op or self.specs[0][0]).strip()

        return SimpleVersion.compare_versions(
            version_a=requested_version, op=op, version_b=version, num_parts=num_parts)

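Illustrative only - per the docstring, the comparison reads requested_version <op> self.version:

    r = MarkerRequirement(Requirement.parse("torch>=1.11"))
    r.compare_version("1.13.1")  # True:  1.13.1 >= 1.11
    r.compare_version("1.8.0")   # False: 1.8.0 is not >= 1.11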
    def remove_local_file_ref(self):
        if not self.local_file or self.vcs or self.editable or self.path:
            return False
        parts = re.split(r"@\s*{}".format(self.req.uri), self.req.line)
        # if we did not find anything, do nothing
        if len(parts) < 2:
            return False
        self.req.line = ''.join(parts).strip()
        self.req.uri = None
        self.req.local_file = False
        return True

    def is_local_package_ref(self):
        # if the local file does not exist, remove the reference to it
        if self.vcs or self.editable or self.path or not self.local_file or not self.name or \
                not self.uri or not self.uri.startswith("file://"):
            return False
        return True

    def is_vcs_ref(self):
        return bool(self.vcs)

    def validate_local_file_ref(self):
        # if the local file does not exist, remove the reference to it
        if not self.is_local_package_ref():
            return

        local_path = Path(self.uri[len("file://"):])
        if not local_path.exists():
            local_path = Path(unquote(self.uri)[len("file://"):])
            if not local_path.exists():
                line = self.line
                if self.remove_local_file_ref():
                    # print warning
                    logging.getLogger(__name__).warning(
                        'Local file not found [{}], references removed'.format(line))


class SimpleVersion:
    _sub_versions_pep440 = ['a', 'b', 'rc', '.post', '.dev', '+', ]
@@ -188,6 +243,19 @@ class SimpleVersion:
    _local_version_separators = re.compile(r"[\._-]")
    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)

    @classmethod
    def split_op_version(cls, line):
        """
        Split a string in the form of ">=1.2.3" into (op, version), i.e. (">=", "1.2.3")
        Notice: when called with only a version string (e.g. "1.2.3") the default operator is "==",
        which means you get ("==", "1.2.3")
        :param line: string, example: "<=0.1.2"
        :return: tuple of (op, version), example: ("<=", "0.1.2")
        """
        match = r"\s*([>=<~!]*)\s*(\S*)\s*"
        groups = re.match(match, line).groups()
        return groups[0] or "==", groups[1]

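Usage follows directly from the docstring (illustrative):

    SimpleVersion.split_op_version(">=1.2.3")  # (">=", "1.2.3")
    SimpleVersion.split_op_version("1.2.3")    # ("==", "1.2.3") - default operator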
    @classmethod
    def compare_versions(cls, version_a, op, version_b, ignore_sub_versions=True, num_parts=3):
        """
@@ -207,7 +275,28 @@ class SimpleVersion:
        if not version_b:
            return True

        # remove trailing "*" in both
        if "*" in version_a:
            ignore_sub_versions = True
            while version_a.endswith(".*"):
                version_a = version_a[:-2]
            if version_a == "*":
                version_a = ""
            num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )

        if "*" in version_b:
            ignore_sub_versions = True
            while version_b.endswith(".*"):
                version_b = version_b[:-2]
            if version_b == "*":
                version_b = ""
            num_parts = min(len(version_a.split('.')), len(version_b.split('.')), )

        if not num_parts:
            num_parts = max(len(version_a.split('.')), len(version_b.split('.')), )

        if op == '~=':
            num_parts = len(version_b.split('.')) - 1
            num_parts = max(num_parts, 2)
            op = '=='
            ignore_sub_versions = True
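Illustrative examples of the wildcard and '~=' normalization above:

    SimpleVersion.compare_versions("1.13.*", "==", "1.13.2")  # True - "*" limits the compared parts
    SimpleVersion.compare_versions("1.13.1", "~=", "1.13.0")  # True - rewritten as == on "1.13"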
@@ -242,8 +331,20 @@ class SimpleVersion:
            return version_a_key > version_b_key
        if op == '<':
            return version_a_key < version_b_key
        if op == '!=':
            return version_a_key != version_b_key
        raise ValueError('Unrecognized comparison operator [{}]'.format(op))

    @classmethod
    def max_version(cls, version_a, version_b):
        return version_a if cls.compare_versions(
            version_a=version_a, op='>=', version_b=version_b, num_parts=None) else version_b

    @classmethod
    def min_version(cls, version_a, version_b):
        return version_a if cls.compare_versions(
            version_a=version_a, op='<=', version_b=version_b, num_parts=None) else version_b

    @staticmethod
    def _parse_letter_version(
            letter,  # type: str
@@ -312,17 +413,94 @@ class SimpleVersion:
        return ()


def compare_version_rules(specs_a, specs_b):
    # specs_a/b are a list of tuples: [('==', '1.2.3'), ] or [('>=', '1.2'), ('<', '1.3')]
    # section definition:
    class Section(object):
        def __init__(self, left="-999999999", left_eq=False, right="999999999", right_eq=False):
            self.left, self.left_eq, self.right, self.right_eq = left, left_eq, right, right_eq

    # first create an in/out section for each spec:
    # >, >= are a left rule
    # <, <= are a right rule
    # ~= x.y.z is converted to: >= x.y and < x.y+1
    # ==/=== are converted to: >= and <=
    # != x.y.z would split a section into: left < x.y.z and right > x.y.z
    def create_section(specs):
        section = Section()
        for op, v in specs:
            a = section
            if op == '>':
                a.left = v
                a.left_eq = False
            elif op == '>=':
                a.left = v
                a.left_eq = True
            elif op == '<':
                a.right = v
                a.right_eq = False
            elif op == '<=':
                a.right = v
                a.right_eq = True
            elif op == '==':
                a.left = v
                a.left_eq = True
                a.right = v
                a.right_eq = True
            elif op == '~=':
                new_v = v.split('.')
                a_left = '.'.join(new_v[:-1])
                a.left = a_left if not a.left else SimpleVersion.max_version(a_left, a.left)
                a.left_eq = True
                a_right = '.'.join(new_v[:-2] + [str(int(new_v[-2])+1)])
                a.right = a_right if not a.right else SimpleVersion.min_version(a_right, a.right)
                a.right_eq = False if a.right == a_right else a.right_eq

        return section

    section_a = create_section(specs_a)
    section_b = create_section(specs_b)
    i = Section()
    # then we have a section for spec A and a section for spec B
    if section_a.left == section_b.left:
        i.left = section_a.left
        i.left_eq = section_a.left_eq and section_b.left_eq
    else:
        i.left = SimpleVersion.max_version(section_a.left, section_b.left)
        i.left_eq = section_a.left_eq if i.left == section_a.left else section_b.left_eq
    if section_a.right == section_b.right:
        i.right = section_a.right
        i.right_eq = section_a.right_eq and section_b.right_eq
    else:
        i.right = SimpleVersion.min_version(section_a.right, section_b.right)
        i.right_eq = section_a.right_eq if i.right == section_a.right else section_b.right_eq

    # return true if the section from A intersects the section from B
    valid = True
    valid &= SimpleVersion.compare_versions(
        version_a=i.left, op='<=' if i.left_eq else '<', version_b=i.right, num_parts=None)
    valid &= SimpleVersion.compare_versions(
        version_a=i.right, op='>=' if i.left_eq else '>', version_b=i.left, num_parts=None)

    return valid


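Illustrative only: `compare_version_rules` answers "do the two spec ranges intersect?":

    compare_version_rules([(">=", "1.10"), ("<", "2.0")], [(">=", "1.11.0")])  # True - ranges overlap
    compare_version_rules([("<", "1.10")], [(">=", "1.11.0")])                 # False - disjoint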
@six.add_metaclass(ABCMeta)
class RequirementSubstitution(object):

    _pip_extra_index_url = PIP_EXTRA_INDICES

    @classmethod
    def set_add_install_extra_index(cls, extra_index_url):
        if extra_index_url not in cls._pip_extra_index_url:
            cls._pip_extra_index_url.append(extra_index_url)

    def __init__(self, session):
        # type: (Session) -> ()
        self._session = session
        self.config = session.config  # type: ConfigTree
        self.suffix = '.post{config[agent.cuda_version]}.dev{config[agent.cudnn_version]}'.format(config=self.config)
        self.package_manager = self.config['agent.package_manager.type']
        self._is_already_installed_cb = None

    @abstractmethod
    def match(self, req):  # type: (MarkerRequirement) -> bool
@@ -338,6 +516,20 @@ class RequirementSubstitution(object):
        """
        pass

    def set_is_already_installed_cb(self, cb):
        self._is_already_installed_cb = cb

    def is_already_installed(self, req):
        if not self._is_already_installed_cb:
            return False
        # noinspection PyBroadException
        try:
            return self._is_already_installed_cb(req)
        except BaseException as ex:
            # debug: could not resolve something
            print("Warning: Requirements post install callback exception (check if package installed): {}".format(ex))
        return False

    def post_scan_add_req(self):  # type: () -> Optional[MarkerRequirement]
        """
        Allows the RequirementSubstitution to add an extra line/requirements after
@@ -362,7 +554,7 @@ class RequirementSubstitution(object):

    @property
    def cuda_version(self):
        return self.config['agent.cuda_version']
        return convert_cuda_version_to_int_10_base_str(self.config['agent.cuda_version'])

    @property
    def cudnn_version(self):
@@ -446,10 +638,16 @@ class RequirementsManager(object):
            'cu'+agent['cuda_version'] if self.found_cuda else 'cpu')
        self.translator = RequirementsTranslator(session, interpreter=base_interpreter,
                                                 cache_dir=pip_cache_dir.as_posix())
        self._base_interpreter = base_interpreter
        self._cwd = None
        self._installed_parsed_packages = set()

    def register(self, cls):  # type: (Type[RequirementSubstitution]) -> None
        self.handlers.append(cls(self._session))

    def set_cwd(self, cwd):
        self._cwd = str(cwd) if cwd else None

    def _replace_one(self, req):  # type: (MarkerRequirement) -> Optional[Text]
        match = re.search(r';\s*(.*)', Text(req))
        if match:
@@ -461,24 +659,54 @@ class RequirementsManager(object):
                return handler.replace(req)
        return None

    def replace(self, requirements):  # type: (Text) -> Text
        def safe_parse(req_str):
            try:
                return next(parse(req_str))
            except Exception as ex:
                return Requirement(req_str)
    def replace(
            self,
            requirements,  # type: Text
            existing_packages=None,  # type: List[MarkerRequirement]
            pkg_skip_existing_local=True,  # type: bool
            pkg_skip_existing_vcs=True,  # type: bool
            pkg_skip_existing=True,  # type: bool
    ):  # type: (...) -> Text
        parsed_requirements = self.parse_requirements_section_to_marker_requirements(
            requirements=requirements, cwd=self._cwd, skip_local_file_validation=True)

        parsed_requirements = tuple(
            map(
                MarkerRequirement,
                [safe_parse(line) for line in (requirements.splitlines()
                                               if isinstance(requirements, six.text_type) else requirements)]
            )
        )
        if parsed_requirements and existing_packages:
            skipped_packages = None
            if pkg_skip_existing:
                skipped_packages = set(parsed_requirements) & set(existing_packages)
            elif pkg_skip_existing_local or pkg_skip_existing_vcs:
                existing_packages = [
                    p for p in existing_packages if (
                        (pkg_skip_existing_local and p.is_local_package_ref()) or
                        (pkg_skip_existing_vcs and p.is_vcs_ref())
                    )
                ]
                skipped_packages = set(parsed_requirements) & set(existing_packages)

            if skipped_packages:
                # maintain order
                num_skipped_packages = len(parsed_requirements)
                parsed_requirements = [p for p in parsed_requirements if p not in skipped_packages]
                num_skipped_packages -= len(parsed_requirements)
                print("Skipping {} pre-installed packages:\n{}Remaining {} additional packages to install".format(
                    num_skipped_packages,
                    dump_yaml(sorted([str(p) for p in skipped_packages])),
                    len(parsed_requirements)
                ))

                # nothing to install!
                if not parsed_requirements:
                    return ""

        # sanity check
        if not parsed_requirements:
            # return the original requirements just in case
            return requirements

        # remove local file references that do not exist
        for p in parsed_requirements:
            p.validate_local_file_ref()

        def replace_one(i, req):
            # type: (int, MarkerRequirement) -> Optional[Text]
            try:
@@ -503,14 +731,29 @@ class RequirementsManager(object):

        result = list(result)
        # add post-scan requirements callback
        double_req_set = None
        for h in self.handlers:
            req = h.post_scan_add_req()
            if req:
                result.append(req.tostr())
            reqs = h.post_scan_add_req()
            if reqs:
                if double_req_set is None:
                    def safe_parse_name(line):
                        try:
                            return Requirement.parse(line).name
                        except:  # noqa
                            return None
                    double_req_set = set([safe_parse_name(r) for r in result if r])

                for r in (reqs if isinstance(reqs, (tuple, list)) else [reqs]):
                    if r and (not r.name or r.name not in double_req_set):
                        result.append(r.tostr())
                    elif r:
                        print("SKIPPING additional auto installed package: \"{}\"".format(r))

        return join_lines(result)

    def post_install(self, session):
    def post_install(self, session, package_manager=None):
        if package_manager:
            self.update_installed_packages_state(package_manager.freeze())
        for h in self.handlers:
            try:
                h.post_install(session)
@@ -529,6 +772,37 @@ class RequirementsManager(object):
                pass
        return requirements

    def get_interpreter(self):
        return self._base_interpreter

    def update_installed_packages_state(self, requirements):
        """
        Updates the internal Installed Packages objects, so that later we can detect
        if we already have a pre-installed package
        :param requirements: the output of a freeze() call, i.e. dict {'pip': "package==version"}
        """
        requirements = requirements if not isinstance(requirements, dict) else requirements.get("pip")
        self._installed_parsed_packages = self.parse_requirements_section_to_marker_requirements(
            requirements=requirements, cwd=self._cwd)
        for h in self.handlers:
            h.set_is_already_installed_cb(self._callback_is_already_installed)

    def _callback_is_already_installed(self, req):
        for p in (self._installed_parsed_packages or []):
            if p.name != req.name:
                continue
            # if this is a version control package, only return true if both installed and requested specify a commit ID
            if req.vcs:
                return p.vcs and req.revision and req.revision == p.revision

            if not req.specs and not p.specs:
                return True

            # return true if this is the same version
            return req.specs and p.specs and req.compare_version(p, op="==")

        return False

    @staticmethod
    def get_cuda_version(config):  # type: (ConfigTree) -> (Text, Text)
        # we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
@@ -537,6 +811,9 @@ class RequirementsManager(object):
        if cuda_version and cudnn_version:
            return normalize_cuda_version(cuda_version), normalize_cuda_version(cudnn_version)

        if not cuda_version:
            cuda_version = get_driver_cuda_version()

        if not cuda_version and is_windows_platform():
            try:
                cuda_vers = [int(k.replace('CUDA_PATH_V', '').replace('_', '')) for k in os.environ.keys()
@@ -602,3 +879,29 @@ class RequirementsManager(object):
        return (normalize_cuda_version(cuda_version or 0),
                normalize_cuda_version(cudnn_version or 0))

    @staticmethod
    def parse_requirements_section_to_marker_requirements(requirements, cwd=None, skip_local_file_validation=False):
        def safe_parse(req_str):
            # noinspection PyBroadException
            try:
                return list(parse(req_str, cwd=cwd))
            except Exception as ex:
                return [Requirement(req_str)]

        def create_req(x):
            r = MarkerRequirement(x)
            if not skip_local_file_validation:
                r.validate_local_file_ref()
            return r

        if not requirements:
            return tuple()

        parsed_requirements = tuple(
            map(
                create_req,
                [r for line in (requirements.splitlines() if isinstance(requirements, str) else requirements)
                 for r in safe_parse(line)]
            )
        )
        return parsed_requirements

@@ -1,3 +1,4 @@
from tempfile import mkdtemp
from typing import Text

from furl import furl
@@ -20,7 +21,16 @@ class RequirementsTranslator(object):
        config = session.config
        self.cache_dir = cache_dir or Path(config["agent.pip_download_cache.path"]).expanduser().as_posix()
        self.enabled = config["agent.pip_download_cache.enabled"]
        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
        # noinspection PyBroadException
        try:
            Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
        except Exception:
            temp_cache_folder = mkdtemp(prefix='pip_download_cache.')
            print("Failed creating pip download cache folder at `{}`, reverting to `{}`".format(
                self.cache_dir, temp_cache_folder))
            self.cache_dir = temp_cache_folder
            Path(self.cache_dir).mkdir(parents=True, exist_ok=True)

        self.config = Config()
        self.pip = SystemPip(interpreter=interpreter, session=self._session)
        self._translate_back = {}

@@ -7,8 +7,7 @@ import re
import subprocess
import sys
from contextlib import contextmanager
from copy import deepcopy
from distutils.spawn import find_executable
from copy import copy
from itertools import chain, repeat, islice
from os.path import devnull
from time import sleep
@@ -16,7 +15,6 @@ from typing import Union, Text, Sequence, Any, TypeVar, Callable

import psutil
from furl import furl
from future.builtins import super
from pathlib2 import Path

import six
@@ -26,7 +24,7 @@ from clearml_agent.helper.base import bash_c, is_windows_platform, select_for_pl

PathLike = Union[Text, Path]


def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False, raise_error=False):
    try:
        output = (
            subprocess.check_output(
@@ -38,24 +36,41 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
            .strip()
        )
    except subprocess.CalledProcessError:
        if raise_error:
            raise
        output = None
    return output if not strip or not output else output.strip()


def terminate_process(pid, timeout=10.):
def stringify_bash_output(value):
    return '' if not value else (value if isinstance(value, str) else value.decode('utf-8'))


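A minimal usage sketch (illustrative): `get_bash_output` returns None on failure unless `raise_error=True`, and `stringify_bash_output` normalizes bytes/None into a str:

    out = stringify_bash_output(get_bash_output("uname -a"))  # "" when the command failed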
def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
    # noinspection PyBroadException
    try:
        proc = psutil.Process(pid)
        children = proc.children(recursive=True) if include_children else []
        proc.terminate()
        cnt = 0
        while proc.is_running() and cnt < timeout:
        while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
            sleep(1.)
            cnt += 1
        proc.terminate()

        # terminate children
        for c in children:
            c.terminate()

        cnt = 0
        while proc.is_running() and cnt < timeout:
        while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
            sleep(1.)
            cnt += 1

        # kill children
        for c in children:
            c.kill()

        proc.kill()
    except Exception:
        pass
@@ -66,9 +81,8 @@ def terminate_process(pid, timeout=10.):
    return True


def kill_all_child_processes(pid=None):
def kill_all_child_processes(pid=None, include_parent=True):
    # get the current process if pid not provided
    include_parent = True
    if not pid:
        pid = os.getpid()
        include_parent = False
@@ -84,11 +98,29 @@ def kill_all_child_processes(pid=None):
    parent.kill()


def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
    # get the current process if pid not provided
    if not pid:
        pid = os.getpid()
        include_parent = False
    try:
        parent = psutil.Process(pid)
    except psutil.Error:
        # could not find parent process id
        return
    for child in parent.children(recursive=False):
        print('Terminating child process {}'.format(child.pid))
        terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
    if include_parent:
        terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)


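Illustrative only, `worker_pid` is hypothetical: a graceful shutdown of a process subtree - each child is terminated first and killed only after the timeout, with zombies not ignored so an already-exited child does not stall the wait loop:

    terminate_all_child_processes(pid=worker_pid, timeout=10., include_parent=True)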
def get_docker_id(docker_cmd_contains):
    # noinspection PyBroadException
    try:
        containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
        for docker_line in containers_running.split('\n'):
            parts = docker_line.split(':')
            parts = docker_line.split(':', 1)
            if docker_cmd_contains in parts[-1]:
                # we found our docker, return it
                return parts[0]
@@ -103,9 +135,10 @@ def shutdown_docker_process(docker_cmd_contains=None, docker_id=None):
            docker_id = get_docker_id(docker_cmd_contains=docker_cmd_contains)
        if docker_id:
            # we found our docker, stop it
            get_bash_output(cmd='docker stop -t 1 {}'.format(docker_id))
            return get_bash_output(cmd='docker stop -t 1 {}'.format(docker_id))
    except Exception:
        pass
    return None


def commit_docker(container_name, docker_cmd_contains=None, docker_id=None, apply_change=None):
@@ -193,6 +226,7 @@ class Argv(Executable):
        """
        self.argv = argv
        self._log = kwargs.pop("log", None)
        self._display_argv = kwargs.pop("display_argv", argv)
        if not self._log:
            self._log = logging.getLogger(__name__)
            self._log.propagate = False
@@ -217,10 +251,10 @@ class Argv(Executable):
        return self.argv

    def __repr__(self):
        return "<Argv{}>".format(self.argv)
        return "<Argv{}>".format(self._display_argv)

    def __str__(self):
        return "Executing: {}".format(self.argv)
        return "Executing: {}".format(self._display_argv)

    def __iter__(self):
        if is_windows_platform():
@@ -276,9 +310,9 @@ class CommandSequence(Executable):
        self.commands = []
        for c in commands:
            if isinstance(c, CommandSequence):
                self.commands.extend(deepcopy(c.commands))
                self.commands.extend([copy(p) for p in c.commands])
            elif isinstance(c, Argv):
                self.commands.append(deepcopy(c))
                self.commands.append(copy(c))
            else:
                self.commands.append(Argv(*c, log=self._log))

@@ -420,7 +454,7 @@ SOURCE_COMMAND = select_for_platform(linux="source", windows="call")
class ExitStatus(object):
    success = 0
    failure = 1
    interrupted = 2
    interrupted = -2


COMMAND_SUCCESS = 0
@@ -457,3 +491,40 @@ def double_quote(s):
    # use single quotes, and put single quotes into double quotes
    # the string $"b is then quoted as "$"""b"
    return '"' + s.replace('"', '"\'\"\'"') + '"'


def find_executable(executable, path=None):
    """Tries to find 'executable' in the directories listed in 'path'.

    'path' is a string listing directories separated by 'os.pathsep'; it defaults to
    os.environ['PATH']. Returns the complete filename or None if not found.
    """
    _, ext = os.path.splitext(executable)
    if (sys.platform == 'win32') and (ext != '.exe'):
        executable = executable + '.exe'

    if os.path.isfile(executable):
        return executable

    if path is None:
        path = os.environ.get('PATH', None)
        if path is None:
            try:
                path = os.confstr("CS_PATH")
            except (AttributeError, ValueError):
                # os.confstr() or CS_PATH is not available
                path = os.defpath
        # bpo-35755: Don't use os.defpath if the PATH environment variable is
        # set to an empty string

    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
    if not path:
        return None

    paths = path.split(os.pathsep)
    for p in paths:
        f = os.path.join(p, executable)
        if os.path.isfile(f):
            # the file exists, we have a shot at spawn working
            return f
    return None

@@ -1,10 +1,15 @@
import abc
import os
import re
import shutil
import stat
import subprocess
from distutils.spawn import find_executable
import sys
import tempfile
from hashlib import md5
from os import environ, getenv
from os import environ
from random import random
from threading import Lock
from typing import Text, Sequence, Mapping, Iterable, TypeVar, Callable, Tuple, Optional

import attr
@@ -13,7 +18,7 @@ from pathlib2 import Path

import six

from clearml_agent.definitions import ENV_AGENT_GIT_USER, ENV_AGENT_GIT_PASS, ENV_AGENT_GIT_HOST
from clearml_agent.definitions import ENV_AGENT_GIT_USER, ENV_AGENT_GIT_PASS, ENV_AGENT_GIT_HOST, ENV_GIT_CLONE_VERBOSE
from clearml_agent.helper.console import ensure_text, ensure_binary
from clearml_agent.errors import CommandFailedError
from clearml_agent.helper.base import (
@@ -21,9 +26,10 @@ from clearml_agent.helper.base import (
    rm_tree,
    ExecutionInfo,
    normalize_path,
    create_file_if_not_exists,
    create_file_if_not_exists, safe_remove_file,
)
from clearml_agent.helper.process import DEVNULL, Argv, PathLike, COMMAND_SUCCESS
from clearml_agent.helper.os.locks import FileLock
from clearml_agent.helper.process import DEVNULL, Argv, PathLike, COMMAND_SUCCESS, find_executable
from clearml_agent.session import Session


@@ -88,7 +94,7 @@ class VCS(object):
    # additional environment variables for VCS commands
    COMMAND_ENV = {}

    PATCH_ADDED_FILE_RE = re.compile(r"^\+\+\+ b/(?P<path>.*)")
    PATCH_ADDED_FILE_RE = re.compile(r"^--- a/(?P<path>.*)")

    def __init__(self, session, url, location, revision):
        # type: (Session, Text, PathLike, Text) -> ()
@@ -105,7 +111,7 @@ class VCS(object):
        )
        self.url = url
        self.location = Text(location)
        self.revision = revision
        self._revision = revision
        self.log = self.session.get_logger(__name__)

    @property
@@ -115,21 +121,28 @@ class VCS(object):
        """
        return self.add_auth(self.session.config, self.url)

    @abc.abstractproperty
    @property
    def url_without_auth(self):
        """
        Return the URL without the configured user/password
        """
        return self.add_auth(self.session.config, self.url, reset_auth=True)

    @abc.abstractmethod
    def executable_name(self):
        """
        Name of the command executable
        """
        pass

    @abc.abstractproperty
    @abc.abstractmethod
    def main_branch(self):
        """
        Name of the default/main branch
        """
        pass

    @abc.abstractproperty
    @abc.abstractmethod
    def checkout_flags(self):
        # type: () -> Sequence[Text]
        """
@@ -137,7 +150,7 @@ class VCS(object):
        """
        pass

    @abc.abstractproperty
    @abc.abstractmethod
    def patch_base(self):
        # type: () -> Sequence[Text]
        """
@@ -183,8 +196,9 @@ class VCS(object):
        self.log.info("successfully applied uncommitted changes")
        return True

    # Command-line flags for clone command
    clone_flags = ()
    def clone_flags(self):
        """Command-line flags for the clone command"""
        return tuple()

    @abc.abstractmethod
    def executable_not_found_error_help(self):
@@ -254,15 +268,15 @@ class VCS(object):
        return url

    @classmethod
    def replace_http_url(cls, url, port=None):
        # type: (Text, Optional[int]) -> Text
    def replace_http_url(cls, url, port=None, username=None):
        # type: (Text, Optional[int], Optional[str]) -> Text
        """
        Replace an HTTPS URL with an SSH URL when applicable
        """
        parsed_url = furl(url)
        if parsed_url.scheme == "https":
            parsed_url.scheme = "ssh"
            parsed_url.username = "git"
            parsed_url.username = username or "git"
            parsed_url.password = None
            # make sure there is no port in the final url (safe_furl support)
            # the original port was an https port, and we do not know if there is a different ssh port,
@@ -271,6 +285,18 @@ class VCS(object):
            url = parsed_url.url
        return url

    @classmethod
    def rewrite_ssh_url(cls, url, port=None, username=None):
        # type: (Text, Optional[int], Optional[str]) -> Text
        """
        Rewrite an SSH URL with a custom port and username
        """
        parsed_url = furl(url)
        if parsed_url.scheme == "ssh":
            parsed_url.username = username or "git"
            parsed_url.port = port or None
        return parsed_url.url

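Illustrative only:

    VCS.replace_http_url("https://github.com/user/project.git")
    # -> "ssh://git@github.com/user/project.git"
    VCS.rewrite_ssh_url("ssh://git@github.com/user/project.git", port=2222, username="builder")
    # -> "ssh://builder@github.com:2222/user/project.git"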
    def _set_ssh_url(self):
        """
        Replace the instance URL with the SSH substitution result and report to log.
@@ -285,24 +311,54 @@ class VCS(object):
            return
        if parsed_url.scheme == "https":
            new_url = self.replace_http_url(
                self.url, port=self.session.config.get('agent.force_git_ssh_port', None))
                self.url,
                port=self.session.config.get('agent.force_git_ssh_port', None),
                username=self.session.config.get('agent.force_git_ssh_user', None)
            )
            if new_url != self.url:
                print("Using SSH credentials - replacing https url '{}' with ssh url '{}'".format(
                    self.url, new_url))
                self.url = new_url
            return

        # rewrite ssh URLs only if either the ssh port or the ssh user is forced in the config
        # TODO: fix, when the url is in the form of `git@domain.com:user/project.git` we will fail to get the scheme;
        # need to add ssh:// and replace the first ":" with /, unless a port is specified
        if parsed_url.scheme == "ssh" and (
                self.session.config.get('agent.force_git_ssh_port', None) or
                self.session.config.get('agent.force_git_ssh_user', None)
        ):
            new_url = self.rewrite_ssh_url(
                self.url,
                port=self.session.config.get('agent.force_git_ssh_port', None),
                username=self.session.config.get('agent.force_git_ssh_user', None)
            )
            if new_url != self.url:
                print("Using SSH credentials - replacing ssh url '{}' with ssh url '{}'".format(
                    self.url, new_url))
                self.url = new_url
            return
        elif parsed_url.scheme == "ssh":
            return

        if not self.session.config.agent.translate_ssh:
            return

        # if we have git_user / git_pass, replace ssh credentials with https authentication
        if (ENV_AGENT_GIT_USER.get() or self.session.config.get('agent.git_user', None)) and \
                (ENV_AGENT_GIT_PASS.get() or self.session.config.get('agent.git_pass', None)):

            # only apply to a specific domain (if requested)
            config_domain = \
                ENV_AGENT_GIT_HOST.get() or self.session.config.get("git_host", None)
            if config_domain and config_domain != furl(self.url).host:
                return
                ENV_AGENT_GIT_HOST.get() or self.session.config.get("agent.git_host", None)
            if config_domain:
                if config_domain != furl(self.url).host:
                    # bail out here if we have a git_host configured and it's different from the URL host;
                    # however, we should make sure this is not an ssh@ URL that furl failed to parse
                    ssh_git_url_match = self.SSH_URL_GIT_SYNTAX.match(self.url)
                    if not ssh_git_url_match or config_domain != ssh_git_url_match.groupdict().get("host"):
                        # do not replace with an ssh url
                        return

            new_url = self.replace_ssh_url(self.url)
            if new_url != self.url:
@@ -317,7 +373,9 @@ class VCS(object):
        If not in debug mode, filter the VCS password from the output.
        """
        self._set_ssh_url()
        clone_command = ("clone", self.url_with_auth, self.location) + self.clone_flags
        # if we are on linux there is no need for the full auth url, because we use GIT_ASKPASS
        url = self.url_without_auth if self._use_ask_pass else self.url_with_auth
        clone_command = ("clone", url, self.location) + self.clone_flags()
        # clone all branches regardless of which we want to later checkout
        # if branch:
        #     clone_command += ("-b", branch)
@@ -325,40 +383,41 @@ class VCS(object):
            self.call(*clone_command)
            return

        def normalize_output(result):
            """
            Returns the result string without the user's password.
            NOTE: ``self.get_stderr``'s result might or might not have the same type as ``e.output`` in case of error.
            """
            string_type = (
                ensure_text
                if isinstance(result, six.text_type)
                else ensure_binary
            )
            return result.replace(
                string_type(self.url),
                string_type(furl(self.url).remove(password=True).tostr()),
            )

        def print_output(output):
            print(ensure_text(output))

        try:
            print_output(normalize_output(self.get_stderr(*clone_command)))
            self._print_output(self._normalize_output(self.get_stderr(*clone_command)))
        except subprocess.CalledProcessError as e:
            # In Python 3, subprocess.CalledProcessError has a `stderr` attribute,
            # but since stderr is redirected to `subprocess.PIPE` it will appear in the usual `output` attribute
            if e.output:
                e.output = normalize_output(e.output)
                print_output(e.output)
                e.output = self._normalize_output(e.output)
                self._print_output(e.output)
            raise

    def _normalize_output(self, result):
        """
        Returns the result string without the user's password.
        NOTE: ``self.get_stderr``'s result might or might not have the same type as ``e.output`` in case of error.
        """
        string_type = (
            ensure_text
            if isinstance(result, six.text_type)
            else ensure_binary
        )
        return result.replace(
            string_type(self.url),
            string_type(furl(self.url).remove(password=True).tostr()),
        )

    @staticmethod
    def _print_output(output):
        print(ensure_text(output))

    def checkout(self):
        # type: () -> None
        """
        Checkout repository at the specified revision
        """
        self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
        self.call("checkout", self._revision, *self.checkout_flags, cwd=self.location)

    @abc.abstractmethod
    def pull(self):
@@ -441,16 +500,18 @@ class VCS(object):
        return Argv(self.executable_name, *argv)

    @classmethod
    def add_auth(cls, config, url):
    def add_auth(cls, config, url, reset_auth=False):
        """
        Add username and password to the URL if they are missing from the URL and present in the config.
        Does not modify ssh URLs.

        :param reset_auth: If true, remove the user/pass from the URL (default False)
        """
        try:
            parsed_url = furl(url)
        except ValueError:
            return url
        if parsed_url.scheme in ["", "ssh"] or parsed_url.scheme.startswith("git"):
        if parsed_url.scheme in ["", "ssh"] or (parsed_url.scheme or '').startswith("git"):
            return parsed_url.url
        config_user = ENV_AGENT_GIT_USER.get() or config.get("agent.{}_user".format(cls.executable_name), None)
        config_pass = ENV_AGENT_GIT_PASS.get() or config.get("agent.{}_pass".format(cls.executable_name), None)
@@ -461,10 +522,13 @@ class VCS(object):
            and config_pass
            and (not config_domain or config_domain.lower() == parsed_url.host)
        ):
            parsed_url.set(username=config_user, password=config_pass)
            if reset_auth:
                parsed_url.set(username=None, password=None)
            else:
                parsed_url.set(username=config_user, password=config_pass)
        return parsed_url.url

    @abc.abstractproperty
    @abc.abstractmethod
    def info_commands(self):
        # type: () -> Mapping[Text, Argv]
        """
@@ -487,8 +551,7 @@ class VCS(object):

class Git(VCS):
    executable_name = "git"
    main_branch = "master"
    clone_flags = ("--quiet", "--recursive")
    main_branch = ("master", "main")
    checkout_flags = ("--force",)
    COMMAND_ENV = {
        # do not prompt for password
@@ -497,9 +560,28 @@ class Git(VCS):
        "GIT_SSH_COMMAND": "ssh -oBatchMode=yes",
    }

    def __init__(self, *args, **kwargs):
        super(Git, self).__init__(*args, **kwargs)

        self._use_ask_pass = False if not self.session.config.get('agent.enable_git_ask_pass', True) \
            else sys.platform == "linux"

        try:
            self.call("config", "--global", "--replace-all", "safe.directory", "*", cwd=self.location)
        except:  # noqa
            pass

    @staticmethod
    def remote_branch_name(branch):
        return "origin/{}".format(branch)
        return [
            "origin/{}".format(b) for b in ([branch] if isinstance(branch, str) else branch)
        ]

    def clone_flags(self):
        return (
            "--recursive",
            "--verbose" if ENV_GIT_CLONE_VERBOSE.get() else "--quiet"
        )

    def executable_not_found_error_help(self):
        return 'Cannot find "{}" executable. {}'.format(
@@ -515,16 +597,85 @@ class Git(VCS):
        )

    def pull(self):
        self.call("fetch", "--all", "--recurse-submodules", cwd=self.location)
        self._set_ssh_url()
        self.call("fetch", "--all", "--tags", "--recurse-submodules", cwd=self.location)

    def _git_pass_auth_wrapper(self, func, *args, **kwargs):
        try:
            url_with_auth = furl(self.url_with_auth)
            password = url_with_auth.password if url_with_auth else None
            username = url_with_auth.username if url_with_auth else None
        except:  # noqa
            password = None
            username = None

        # if this is not linux, or we do not have a password, just run as is
        if not self._use_ask_pass or not password or not username:
            return func(*args, **kwargs)

        # create the password file
        fp, pass_file = tempfile.mkstemp(prefix='clearml_git_', suffix='.sh')
        os.close(fp)
        with open(pass_file, 'wt') as f:
            # get the first letter only (username / password are the argument options)
            # then echo the correct information
            f.writelines([
                '#!/bin/bash\n',
                'c="$1"\n',
                'c="${c%"${c#?}"}"\n',
                'if [ "$c" == "u" ] || [ "$c" == "U" ]; then echo "{}"; else echo "{}"; fi\n'.format(
                    username.replace('"', '\\"'), password.replace('"', '\\"')
                )
            ])
        # mark executable
        st = os.stat(pass_file)
        os.chmod(pass_file, st.st_mode | stat.S_IEXEC)
        # let GIT use it
        self.COMMAND_ENV["GIT_ASKPASS"] = pass_file
        # call the git command
        try:
            ret = func(*args, **kwargs)
        finally:
            # delete the temp password file
            self.COMMAND_ENV.pop("GIT_ASKPASS", None)
            safe_remove_file(pass_file)

        return ret

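For a hypothetical user "bob" with password "s3cret", the GIT_ASKPASS helper generated by the writelines above would render as (illustrative):

    #!/bin/bash
    c="$1"
    c="${c%"${c#?}"}"
    if [ "$c" == "u" ] || [ "$c" == "U" ]; then echo "bob"; else echo "s3cret"; fi

git invokes the script with "Username ..." or "Password ..." as its first argument; the parameter expansion keeps only the first character, so the test on "u"/"U" decides which credential to echo.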
    def get_stderr(self, *argv, **kwargs):
        """
        Wrapper with git password authentication
        """
        return self._git_pass_auth_wrapper(super(Git, self).get_stderr, *argv, **kwargs)

    def call_with_stdin(self, *argv, **kwargs):
        """
        Wrapper with git password authentication
        """
        return self._git_pass_auth_wrapper(super(Git, self).call_with_stdin, *argv, **kwargs)

    def call(self, *argv, **kwargs):
        """
        Wrapper with git password authentication
        """
        return self._git_pass_auth_wrapper(super(Git, self).call, *argv, **kwargs)

    def checkout(self):  # type: () -> None
        """
        Checkout repository at the specified revision
        """
        self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
        revisions = [self._revision] if isinstance(self._revision, str) else self._revision
        for i, revision in enumerate(revisions):
            try:
                self.call("checkout", revision, *self.checkout_flags, cwd=self.location)
                break
            except subprocess.CalledProcessError:
                if i == len(revisions) - 1:
                    raise

        try:
            self.call("submodule", "update", "--recursive", cwd=self.location)
        except:
        except:  # noqa
            pass

    info_commands = dict(
@@ -561,7 +712,7 @@ class Hg(VCS):
            "pull",
            self.url_with_auth,
            cwd=self.location,
            *(("-r", self.revision) if self.revision else ())
            *(("-r", self._revision) if self._revision else ())
        )

    info_commands = dict(
@@ -582,7 +733,10 @@ def clone_repository_cached(session, execution, destination):
    :return: repository information
    :raises: CommandFailedError if git/hg is not installed
    """
    repo_url = execution.repository  # type: str
    # mock lock
    repo_lock = Lock()
    repo_lock_timeout_sec = 300
    repo_url = execution.repository or ''  # type: str
    parsed_url = furl(repo_url)
    no_password_url = parsed_url.copy().remove(password=True).url

@@ -593,41 +747,69 @@ def clone_repository_cached(session, execution, destination):
if standalone_mode:
|
||||
cached_repo_path = clone_folder
|
||||
else:
|
||||
cached_repo_path = (
|
||||
Path(session.config["agent.vcs_cache.path"]).expanduser()
|
||||
/ "{}.{}".format(clone_folder_name, md5(ensure_binary(repo_url)).hexdigest())
|
||||
/ clone_folder_name
|
||||
) # type: Path
|
||||
vcs_cache_path = Path(session.config["agent.vcs_cache.path"]).expanduser()
|
||||
repo_hash = md5(ensure_binary(repo_url)).hexdigest()
|
||||
# create lock
|
||||
repo_lock = FileLock(filename=(vcs_cache_path / '{}.lock'.format(repo_hash)).as_posix())
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
repo_lock.acquire(timeout=repo_lock_timeout_sec)
|
||||
except BaseException:
|
||||
print('Could not lock cache folder "{}" (timeout {} sec), using temp vcs cache.'.format(
|
||||
clone_folder_name, repo_lock_timeout_sec))
|
||||
repo_hash = '{}_{}'.format(repo_hash, str(random()).replace('.', ''))
|
||||
# use mock lock for the context
|
||||
repo_lock = Lock()
|
||||
# select vcs cache folder
|
||||
cached_repo_path = vcs_cache_path / "{}.{}".format(clone_folder_name, repo_hash) / clone_folder_name
|
||||
|
||||
vcs = VcsFactory.create(
|
||||
session, execution_info=execution, location=cached_repo_path
|
||||
)
|
||||
if not find_executable(vcs.executable_name):
|
||||
raise CommandFailedError(vcs.executable_not_found_error_help())
|
||||
with repo_lock:
|
||||
vcs = VcsFactory.create(
|
||||
session, execution_info=execution, location=cached_repo_path
|
||||
)
|
||||
if not find_executable(vcs.executable_name):
|
||||
raise CommandFailedError(vcs.executable_not_found_error_help())
|
||||
|
||||
if not standalone_mode:
|
||||
if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
|
||||
print('Using cached repository in "{}"'.format(cached_repo_path))
|
||||
if not standalone_mode:
|
||||
if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
|
||||
print('Using cached repository in "{}"'.format(cached_repo_path))
|
||||
|
||||
else:
|
||||
print("cloning: {}".format(no_password_url))
|
||||
rm_tree(cached_repo_path)
|
||||
# We clone the entire repository, not a specific branch
|
||||
vcs.clone() # branch=execution.branch)
|
||||
else:
|
||||
print("cloning: {}".format(no_password_url))
|
||||
rm_tree(cached_repo_path)
|
||||
# We clone the entire repository, not a specific branch
|
||||
vcs.clone() # branch=execution.branch)
|
||||
|
||||
vcs.pull()
|
||||
rm_tree(destination)
|
||||
shutil.copytree(Text(cached_repo_path), Text(clone_folder))
|
||||
if not clone_folder.is_dir():
|
||||
raise CommandFailedError(
|
||||
"copying of repository failed: from {} to {}".format(
|
||||
cached_repo_path, clone_folder
|
||||
print("pulling git")
|
||||
try:
|
||||
vcs.pull()
|
||||
except Exception as ex:
|
||||
print("git pull failed: {}".format(ex))
|
||||
if (
|
||||
session.config.get("agent.vcs_cache.enabled", False) and
|
||||
session.config.get("agent.vcs_cache.clone_on_pull_fail", False)
|
||||
):
|
||||
print("pulling git failed, re-cloning: {}".format(no_password_url))
|
||||
rm_tree(cached_repo_path)
|
||||
vcs.clone()
|
||||
else:
|
||||
raise ex
|
||||
print("pulling git completed")
|
||||
|
||||
rm_tree(destination)
|
||||
shutil.copytree(Text(cached_repo_path), Text(clone_folder),
|
||||
symlinks=select_for_platform(linux=True, windows=False),
|
||||
ignore_dangling_symlinks=True)
|
||||
if not clone_folder.is_dir():
|
||||
raise CommandFailedError(
|
||||
"copying of repository failed: from {} to {}".format(
|
||||
cached_repo_path, clone_folder
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
    # checkout in the newly copied destination
    vcs.location = Text(clone_folder)
    vcs.checkout()
        # checkout in the newly copied destination
        vcs.location = Text(clone_folder)
        vcs.checkout()

    repo_info = vcs.get_repository_copy_info(clone_folder)

@@ -635,3 +817,157 @@ def clone_repository_cached(session, execution, destination):
    repo_info = attr.evolve(repo_info, url=no_password_url)

    return vcs, repo_info


def fix_package_import_diff_patch(entry_script_file):
    # noinspection PyBroadException
    try:
        with open(entry_script_file, 'rt') as f:
            lines = f.readlines()
    except Exception:
        return
    # make sure we are the first import (i.e. we patched the source code)
    if len(lines or []) < 2 or not lines[0].strip().startswith('from clearml ') or 'Task.init' not in lines[1]:
        return

    original_lines = lines
    # skip over the first two lines, they are ours
    # then skip over empty or comment lines
    lines = [(i, line.split('#', 1)[0].rstrip()) for i, line in enumerate(lines)
             if i >= 2 and line.strip('\r\n\t ') and not line.strip().startswith('#')]

    # remove triple quotes ' """ '
    nested_c = -1
    skip_lines = []
    for i, line_pair in enumerate(lines):
        for _ in line_pair[1].split('"""')[1:]:
            if nested_c >= 0:
                skip_lines.extend(list(range(nested_c, i+1)))
                nested_c = -1
            else:
                nested_c = i
    # now select all the lines that are not inside triple-quoted strings
    lines = [pair for i, pair in enumerate(lines) if i not in skip_lines]

    from_future = re.compile(r"^from[\s]*__future__[\s]*")
    import_future = re.compile(r"^import[\s]*__future__[\s]*")
    # test if we have __future__ import
    found_index = -1
    for a_i, (_, a_line) in enumerate(lines):
        if found_index >= a_i:
            continue
        if from_future.match(a_line) or import_future.match(a_line):
            found_index = a_i
            # check the last import block
            i, line = lines[found_index]
            # whether we have a \\ character at the end of the line or the line continues inside parentheses
            parenthesized_lines = '(' in line and ')' not in line
            while line.endswith('\\') or parenthesized_lines:
                found_index += 1
                i, line = lines[found_index]
                if ')' in line:
                    break

        else:
            break

    # no imports found
    if found_index < 0:
        return

    # now we need to move back the patched two lines
    entry_line = lines[found_index][0]
    new_lines = original_lines[2:entry_line + 1] + original_lines[0:2] + original_lines[entry_line + 1:]
    # noinspection PyBroadException
    try:
        with open(entry_script_file, 'wt') as f:
            f.writelines(new_lines)
    except Exception:
        return


def _locate_future_import(lines):
    # type: (list[str]) -> int
    """
    :param lines: string lines of a python file
    :return: line index of the last __future__ import. return -1 if no __future__ was found
    """
    # skip over empty or comment lines
    lines = [(i, line.split('#', 1)[0].rstrip()) for i, line in enumerate(lines)
             if line.strip('\r\n\t ') and not line.strip().startswith('#')]

    # remove triple quotes ' """ '
    nested_c = -1
    skip_lines = []
    for i, line_pair in enumerate(lines):
        for _ in line_pair[1].split('"""')[1:]:
            if nested_c >= 0:
                skip_lines.extend(list(range(nested_c, i + 1)))
                nested_c = -1
            else:
                nested_c = i
    # now select all the lines that are not inside triple-quoted strings
    lines = [pair for i, pair in enumerate(lines) if i not in skip_lines]

    from_future = re.compile(r"^from[\s]*__future__[\s]*")
    import_future = re.compile(r"^import[\s]*__future__[\s]*")
    # test if we have __future__ import
    found_index = -1
    for a_i, (_, a_line) in enumerate(lines):
        if found_index >= a_i:
            continue
        if from_future.match(a_line) or import_future.match(a_line):
            found_index = a_i
            # check the last import block
            i, line = lines[found_index]
            # whether we have a \\ character at the end of the line or the line continues inside parentheses
            parenthesized_lines = '(' in line and ')' not in line
            while line.endswith('\\') or parenthesized_lines:
                found_index += 1
                i, line = lines[found_index]
                if ')' in line:
                    break

        else:
            break

    return found_index if found_index < 0 else lines[found_index][0]


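For context, a minimal sketch of how `_locate_future_import` behaves on a typical entry script (the sample lines below are illustrative, not taken from the repository):

```python
# Illustrative input for _locate_future_import (defined above).
sample = [
    '"""module docstring"""\n',          # skipped: triple-quoted span
    'from __future__ import (unicode_literals,\n',
    '                        division)\n',
    'import os\n',
]
# The parenthesized __future__ import ends on the third line (index 2),
# so _locate_future_import(sample) returns 2, and any patched code should
# be inserted at index 3, right after the __future__ block.
```
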
def patch_add_task_init_call(local_filename):
    if not local_filename or not Path(local_filename).is_file() or not str(local_filename).lower().endswith(".py"):
        return

    idx_a = 0
    # find the right entry for the patch if we have a local file (basically after the __future__ imports)
    try:
        with open(local_filename, 'rt') as f:
            lines = f.readlines()
    except Exception as ex:
        print("Failed patching entry point file {}: {}".format(local_filename, ex))
        return

    future_found = _locate_future_import(lines)
    if future_found >= 0:
        idx_a = future_found + 1

    # check if we have not already patched it, no need to add another one
    if len(lines or []) >= idx_a+2 and lines[idx_a].strip().startswith('from clearml ') and 'Task.init' in lines[idx_a+1]:
        print("File {} already patched with Task.init()".format(local_filename))
        return

    patch = [
        "from clearml import Task\n",
        "(__name__ != \"__main__\") or Task.init()\n",
    ]
    lines = lines[:idx_a] + patch + lines[idx_a:]
    # noinspection PyBroadException
    try:
        with open(local_filename, 'wt') as f:
            f.writelines(lines)
    except Exception as ex:
        print("Failed patching entry point file {}: {}".format(local_filename, ex))
        return

    print("Force clearml Task.init patch adding to entry point script: {}".format(local_filename))

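To make the effect concrete, here is a hypothetical entry script before and after patching (sample content only; the two inserted lines are exactly the `patch` list above, placed right after any `__future__` imports):

```python
# Before patch_add_task_init_call("train.py") -- hypothetical user script:
#     from __future__ import print_function
#     import argparse
#
# After patching (the pair lands at idx_a, just past the __future__ import):
#     from __future__ import print_function
#     from clearml import Task
#     (__name__ != "__main__") or Task.init()
#     import argparse
```
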
@@ -1,16 +1,19 @@
from __future__ import unicode_literals, division

import logging
import os
import re
import shlex
from collections import deque
from itertools import starmap
from threading import Thread, Event
from time import time
from typing import Text, Sequence
from typing import Sequence, List, Union, Dict, Optional

import attr
import psutil
from pathlib2 import Path

from clearml_agent.definitions import ENV_WORKER_TAGS, ENV_GPU_FRACTIONS
from clearml_agent.session import Session

try:
@@ -52,6 +55,14 @@ class ResourceMonitor(object):
            if value is not None
        }

    @attr.s
    class ClusterReport:
        cluster_key = attr.ib(type=str)
        max_gpus = attr.ib(type=int, default=None)
        max_workers = attr.ib(type=int, default=None)
        max_cpus = attr.ib(type=int, default=None)
        resource_groups = attr.ib(type=Sequence[str], factory=list)

    def __init__(
        self,
        session,  # type: Session
@@ -59,6 +70,7 @@ class ResourceMonitor(object):
        sample_frequency_per_sec=2.0,
        report_frequency_sec=30.0,
        first_report_sec=None,
        worker_tags=None
    ):
        self.session = session
        self.queue = deque(maxlen=1)
@@ -76,7 +88,17 @@ class ResourceMonitor(object):
        self._gpustat_fail = 0
        self._gpustat = gpustat
        self._active_gpus = None
        if os.environ.get('NVIDIA_VISIBLE_DEVICES') == 'none':
        self._default_gpu_utilization = session.config.get("agent.resource_monitoring.default_gpu_utilization", 100)
        # allow default_gpu_utilization as null in the config, in which case we don't log anything
        if self._default_gpu_utilization is not None:
            self._default_gpu_utilization = int(self._default_gpu_utilization)
        self._gpu_utilization_warning_sent = False
        self._disk_use_path = str(session.config.get("agent.resource_monitoring.disk_use_path", None) or Path.home())
        self._fractions_handler = GpuFractionsHandler() if session.feature_set != "basic" else None
        if not worker_tags and ENV_WORKER_TAGS.get():
            worker_tags = shlex.split(ENV_WORKER_TAGS.get())
        self._worker_tags = worker_tags
        if Session.get_nvidia_visible_env() == 'none':
            # NVIDIA_VISIBLE_DEVICES set to none, marks cpu_only flag
            # active_gpus == False means no GPU reporting
            self._active_gpus = False
@@ -85,13 +107,18 @@ class ResourceMonitor(object):
        else:
            # None means no filtering, report all gpus
            self._active_gpus = None
            # noinspection PyBroadException
            try:
                active_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES', '') or \
                              os.environ.get('CUDA_VISIBLE_DEVICES', '')
                if active_gpus:
                    self._active_gpus = [int(g.strip()) for g in active_gpus.split(',')]
                active_gpus = Session.get_nvidia_visible_env()
                # None means no filtering, report all gpus
                if active_gpus and active_gpus != "all":
                    self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
            except Exception:
                pass
        self._cluster_report_interval_sec = int(session.config.get(
            "agent.resource_monitoring.cluster_report_interval_sec", 60
        ))
        self._cluster_report = None

    def set_report(self, report):
        # type: (ResourceMonitor.StatusReport) -> ()
@@ -118,10 +145,12 @@ class ResourceMonitor(object):
            machine_stats=stats,
            timestamp=(int(time()) * 1000),
            worker=self._worker_id,
            tags=self._worker_tags,
            **self.get_report().to_dict()
        )
        log.debug("sending report: %s", report)

        # noinspection PyBroadException
        try:
            self.session.get(service="workers", action="status_report", **report)
        except Exception:
@@ -129,45 +158,126 @@ class ResourceMonitor(object):
            return False
        return True

    def send_cluster_report(self) -> bool:
        if not self.session.feature_set == "basic":
            return False

        # noinspection PyBroadException
        try:
            properties = {
                "max_cpus": self._cluster_report.max_cpus,
                "max_gpus": self._cluster_report.max_gpus,
                "max_workers": self._cluster_report.max_workers,
            }
            payload = {
                "key": self._cluster_report.cluster_key,
                "timestamp": int(time() * 1000),
                "timeout": int(self._cluster_report_interval_sec * 2),
                # "resource_groups": self._cluster_report.resource_groups,  # yet to be supported
                "properties": {k: v for k, v in properties.items() if v is not None},
            }
            self.session.post(service="workers", action="cluster_report", **payload)
        except Exception as ex:
            log.warning("Failed sending cluster report: %s", ex)
            return False
        return True

    def setup_cluster_report(self, available_gpus, gpu_queues, worker_id=None, cluster_key=None, resource_groups=None):
        # type: (List[int], Dict[str, int], Optional[str], Optional[str], Optional[List[str]]) -> ()
        """
        Set up a cluster report for the enterprise server dashboard feature.
        If a worker_id is provided, cluster_key and resource_groups are inferred from it.
        """
        if self.session.feature_set == "basic":
            return

        if not worker_id and not cluster_key:
            print("Error: cannot set up dashboard reporting - worker_id or cluster_key is required")
            return

        # noinspection PyBroadException
        try:
            if not cluster_key:
                worker_id_parts = worker_id.split(":")
                if len(worker_id_parts) < 3:
                    cluster_key = self.session.config.get("agent.resource_dashboard.default_cluster_name", "onprem")
                    resource_group = ":".join((cluster_key, worker_id_parts[0]))
                    print(
                        'WARNING: your worker ID "{}" is not suitable for proper resource dashboard reporting, please '
                        'set up agent.worker_name to be at least two colon-separated parts (i.e. "<category>:<name>"). '
                        'Using "{}" as the resource dashboard category and "{}" as the resource group.'.format(
                            worker_id, cluster_key, resource_group
                        )
                    )
                else:
                    cluster_key = worker_id_parts[0]
                    resource_group = ":".join((worker_id_parts[:2]))

                resource_groups = [resource_group]

            self._cluster_report = ResourceMonitor.ClusterReport(
                cluster_key=cluster_key,
                max_gpus=len(available_gpus),
                max_workers=len(available_gpus) // min(x for x, _ in gpu_queues.values()),
                resource_groups=resource_groups
            )

            self.send_cluster_report()
        except Exception as ex:
            print("Error: failed setting cluster report: {}".format(ex))

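As a worked example of the inference above (worker IDs are illustrative), the parsing reduces to:

```python
def infer_cluster(worker_id, default_cluster_name="onprem"):
    # mirrors the worker_id handling in setup_cluster_report() above
    parts = worker_id.split(":")
    if len(parts) < 3:
        # short worker id: fall back to the configured default cluster name
        cluster_key = default_cluster_name
        resource_group = ":".join((cluster_key, parts[0]))
    else:
        cluster_key = parts[0]
        resource_group = ":".join(parts[:2])
    return cluster_key, resource_group

assert infer_cluster("gpu-cluster:node-1:0") == ("gpu-cluster", "gpu-cluster:node-1")
assert infer_cluster("node-1:0") == ("onprem", "onprem:node-1")
```
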
    def _daemon(self):
        last_cluster_report = 0
        seconds_since_started = 0
        reported = 0
        while True:
            last_report = time()
            current_report_frequency = (
                self._report_frequency if reported != 0 else self._first_report_sec
            )
            while (time() - last_report) < current_report_frequency:
                # wait for self._sample_frequency seconds, if event set quit
                if self._exit_event.wait(1 / self._sample_frequency):
                    return
                # noinspection PyBroadException
                try:
                    self._update_readouts()
                except Exception as ex:
                    log.warning("failed getting machine stats: %s", report_error(ex))
                    self._failure()
        try:
            while True:
                last_report = time()
                current_report_frequency = (
                    self._report_frequency if reported != 0 else self._first_report_sec
                )
                while (time() - last_report) < current_report_frequency:
                    # wait for self._sample_frequency seconds, if event set quit
                    if self._exit_event.wait(1 / self._sample_frequency):
                        return
                    # noinspection PyBroadException
                    try:
                        self._update_readouts()
                    except Exception as ex:
                        log.error("failed getting machine stats: %s", report_error(ex))
                        self._failure()

            seconds_since_started += int(round(time() - last_report))
            # check if we do not report any metric (so it means the last iteration will not be changed)
                seconds_since_started += int(round(time() - last_report))
                # check if we do not report any metric (so it means the last iteration will not be changed)

            # if we do not have last_iteration, we just use seconds as iteration
                # if we do not have last_iteration, we just use seconds as iteration

            # start reporting only once we figured out whether this is seconds-based or iterations-based
            average_readouts = self._get_average_readouts()
            stats = {
                # 3 points after the dot
                key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
                for key, value in average_readouts.items()
            }
                # start reporting only once we figured out whether this is seconds-based or iterations-based
                average_readouts = self._get_average_readouts()
                stats = {
                    # 3 points after the dot
                    key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
                    for key, value in average_readouts.items()
                }

            # send actual report
            if self.send_report(stats):
                # clear readouts if this update was sent
                self._clear_readouts()
                # send actual report
                if self.send_report(stats):
                    # clear readouts if this update was sent
                    self._clear_readouts()

            # count reported iterations
            reported += 1
                # count reported iterations
                reported += 1

                if (
                    self._cluster_report and
                    self._cluster_report_interval_sec
                    and time() - last_cluster_report > self._cluster_report_interval_sec
                ):
                    if self.send_cluster_report():
                        last_cluster_report = time()

        except Exception as ex:
            log.exception("Error reporting monitoring info: %s", str(ex))

    def _update_readouts(self):
        readouts = self._machine_stats()
@@ -232,7 +342,7 @@ class ResourceMonitor(object):
        virtual_memory = psutil.virtual_memory()
        stats["memory_used"] = BytesSizes.megabytes(virtual_memory.used)
        stats["memory_free"] = BytesSizes.megabytes(virtual_memory.available)
        disk_use_percentage = psutil.disk_usage(Text(Path.home())).percent
        disk_use_percentage = psutil.disk_usage(self._disk_use_path).percent
        stats["disk_free_percent"] = 100 - disk_use_percentage
        sensor_stat = (
            psutil.sensors_temperatures()
@@ -254,23 +364,48 @@ class ResourceMonitor(object):
        if self._active_gpus is not False and self._gpustat:
            try:
                gpu_stat = self._gpustat.new_query()
                report_index = 0
                for i, g in enumerate(gpu_stat.gpus):
                    # only monitor the active GPUs; if none were selected, monitor everything
                    if self._active_gpus and i not in self._active_gpus:
                        continue
                    stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"]
                    stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"]
                    stats["gpu_mem_usage_{:d}".format(i)] = (
                    if self._active_gpus:
                        uuid = getattr(g, "uuid", None)
                        mig_uuid = getattr(g, "mig_uuid", None)
                        if (
                            str(g.index) not in self._active_gpus
                            and (not uuid or uuid not in self._active_gpus)
                            and (not mig_uuid or mig_uuid not in self._active_gpus)
                        ):
                            continue
                    stats["gpu_temperature_{}".format(report_index)] = g["temperature.gpu"]

                    if g["utilization.gpu"] is not None:
                        stats["gpu_utilization_{}".format(report_index)] = g["utilization.gpu"]
                    elif self._default_gpu_utilization is not None:
                        stats["gpu_utilization_{}".format(report_index)] = self._default_gpu_utilization
                        if getattr(g, "mig_index", None) is None and not self._gpu_utilization_warning_sent:
                            # this shouldn't happen for non-MIGs, warn the user about it
                            log.error("Failed fetching GPU utilization")
                            self._gpu_utilization_warning_sent = True

                    stats["gpu_mem_usage_{}".format(report_index)] = (
                        100.0 * g["memory.used"] / g["memory.total"]
                    )
                    # already in MBs
                    stats["gpu_mem_free_{:d}".format(i)] = (
                    stats["gpu_mem_free_{}".format(report_index)] = (
                        g["memory.total"] - g["memory.used"]
                    )
                    stats["gpu_mem_used_%d" % i] = g["memory.used"]

                    stats["gpu_mem_used_{}".format(report_index)] = g["memory.used"] or 0

                    if self._fractions_handler:
                        fractions = self._fractions_handler.fractions
                        stats["gpu_fraction_{}".format(report_index)] = \
                            (fractions[i] if i < len(fractions) else fractions[-1]) if fractions else 1.0
                    report_index += 1

            except Exception as ex:
                # something happened and we can't use gpu stats
                log.warning("failed getting machine stats: %s", report_error(ex))
                log.error("failed getting machine stats: %s", report_error(ex))
                self._failure()

        return stats
@@ -283,19 +418,142 @@ class ResourceMonitor(object):
        )
        self._gpustat = None

BACKEND_STAT_MAP = {"cpu_usage_*": "cpu_usage",
                    "cpu_temperature_*": "cpu_temperature",
                    "disk_free_percent": "disk_free_home",
                    "io_read_mbs": "disk_read",
                    "io_write_mbs": "disk_write",
                    "network_tx_mbs": "network_tx",
                    "network_rx_mbs": "network_rx",
                    "memory_free": "memory_free",
                    "memory_used": "memory_used",
                    "gpu_temperature_*": "gpu_temperature",
                    "gpu_mem_used_*": "gpu_memory_used",
                    "gpu_mem_free_*": "gpu_memory_free",
                    "gpu_utilization_*": "gpu_usage"}
BACKEND_STAT_MAP = {
    "cpu_usage_*": "cpu_usage",
    "cpu_temperature_*": "cpu_temperature",
    "disk_free_percent": "disk_free_home",
    "io_read_mbs": "disk_read",
    "io_write_mbs": "disk_write",
    "network_tx_mbs": "network_tx",
    "network_rx_mbs": "network_rx",
    "memory_free": "memory_free",
    "memory_used": "memory_used",
    "gpu_temperature_*": "gpu_temperature",
    "gpu_mem_used_*": "gpu_memory_used",
    "gpu_mem_free_*": "gpu_memory_free",
    "gpu_utilization_*": "gpu_usage",
    "gpu_fraction_*": "gpu_fraction"
}

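The trailing `*` in these keys stands for the per-device suffix appended by `_machine_stats()` (e.g. `gpu_utilization_0`, `gpu_utilization_1`). A minimal sketch of how such a map can be applied, assuming simple glob-style matching (`to_backend_key` is an illustrative helper, not the agent's actual implementation):

```python
from fnmatch import fnmatch

def to_backend_key(stat_key, stat_map):
    # return the backend metric name for a reported stat key, or None if unmapped
    for pattern, backend_name in stat_map.items():
        if fnmatch(stat_key, pattern):
            return backend_name
    return None

stat_map = {"gpu_utilization_*": "gpu_usage", "memory_used": "memory_used"}
assert to_backend_key("gpu_utilization_0", stat_map) == "gpu_usage"
assert to_backend_key("memory_used", stat_map) == "memory_used"
```
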
class GpuFractionsHandler:
    _number_re = re.compile(r"^clear\.ml/fraction(-\d+)?$")
    _mig_re = re.compile(r"^nvidia\.com/mig-(?P<compute>[0-9]+)g\.(?P<memory>[0-9]+)gb$")
    _frac_gpu_injector_re = re.compile(r"^clearml-injector/fraction$")

    _gpu_name_to_memory_gb = {
        "A30": 24,
        "NVIDIA A30": 24,
        "A100-SXM4-40GB": 40,
        "NVIDIA-A100-40GB-PCIe": 40,
        "NVIDIA A100-40GB-PCIe": 40,
        "NVIDIA-A100-SXM4-40GB": 40,
        "NVIDIA A100-SXM4-40GB": 40,
        "NVIDIA-A100-SXM4-80GB": 79,
        "NVIDIA A100-SXM4-80GB": 79,
        "NVIDIA-A100-80GB-PCIe": 79,
        "NVIDIA A100-80GB-PCIe": 79,
    }

    def __init__(self):
        self._total_memory_gb = [
            self._gpu_name_to_memory_gb.get(name, 0)
            for name in (self._get_gpu_names() or [])
        ]
        self._fractions = self._get_fractions()

    @property
    def fractions(self) -> List[float]:
        return self._fractions

    def _get_fractions(self) -> List[float]:
        if not self._total_memory_gb:
            # Can't compute
            return [1.0]

        fractions = (ENV_GPU_FRACTIONS.get() or "").strip()
        if not fractions:
            # No fractions
            return [1.0]

        decoded_fractions = self.decode_fractions(fractions)

        if isinstance(decoded_fractions, list):
            return decoded_fractions

        totals = []
        for i, (fraction, count) in enumerate(decoded_fractions.items()):
            m = self._mig_re.match(fraction)
            if not m:
                continue
            try:
                total_gb = self._total_memory_gb[i] if i < len(self._total_memory_gb) else self._total_memory_gb[-1]
                if not total_gb:
                    continue
                totals.append((int(m.group("memory")) * count) / total_gb)
            except ValueError:
                pass

        if not totals:
            log.warning("Fractions count is empty for {}".format(fractions))
            return [1.0]

        return totals
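A worked example of the MIG computation above, assuming two `nvidia.com/mig-1g.5gb` slices on a 40 GB card (the 40 GB total comes from the `_gpu_name_to_memory_gb` table, e.g. "A100-SXM4-40GB"):

```python
# decode_fractions("nvidia.com/mig-1g.5gb:2") -> {"nvidia.com/mig-1g.5gb": 2}
memory_gb_per_slice = 5   # the "memory" group captured by _mig_re
slice_count = 2
total_gb = 40             # "A100-SXM4-40GB" in _gpu_name_to_memory_gb

fraction = (memory_gb_per_slice * slice_count) / total_gb
assert fraction == 0.25   # this GPU reports a 0.25 fraction
```
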
    @classmethod
    def extract_custom_limits(cls, limits: dict):
        for k, v in list((limits or {}).items()):
            if cls._number_re.match(k):
                limits.pop(k, None)

    @classmethod
    def get_simple_fractions_total(cls, limits: dict) -> float:
        try:
            if any(cls._number_re.match(x) for x in limits):
                return sum(float(v) for k, v in limits.items() if cls._number_re.match(k))
        except Exception as ex:
            log.error("Failed summing up fractions from {}: {}".format(limits, ex))
        return 0

    @classmethod
    def encode_fractions(cls, limits: dict, annotations: dict) -> str:
        if limits:
            if any(cls._number_re.match(x) for x in (limits or {})):
                return ",".join(str(v) for k, v in sorted(limits.items()) if cls._number_re.match(k))
            return ",".join(("{}:{}".format(k, v) for k, v in (limits or {}).items() if cls._mig_re.match(k)))
        elif annotations:
            if any(cls._frac_gpu_injector_re.match(x) for x in (annotations or {})):
                return ",".join(str(v) for k, v in sorted(annotations.items()) if cls._frac_gpu_injector_re.match(k))

    @staticmethod
    def decode_fractions(fractions: str) -> Union[List[float], Dict[str, int]]:
        try:
            items = [f.strip() for f in fractions.strip().split(",")]
            tuples = [(k.strip(), v.strip()) for k, v in (f.partition(":")[::2] for f in items)]
            if all(not v for _, v in tuples):
                # comma-separated float fractions
                return [float(k) for k, _ in tuples]
            # comma-separated slice:count items
            return {
                k.strip(): int(v.strip())
                for k, v in tuples
            }
        except Exception as ex:
            log.error("Failed decoding GPU fractions '{}': {}".format(fractions, ex))
        return {}

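Usage sketch for the two encodings accepted above (values are illustrative): plain comma-separated floats decode to a list, while `slice:count` items decode to a dict:

```python
# comma-separated float fractions -> List[float]
assert GpuFractionsHandler.decode_fractions("0.25,0.5") == [0.25, 0.5]

# comma-separated "<mig-slice>:<count>" items -> Dict[str, int]
assert GpuFractionsHandler.decode_fractions(
    "nvidia.com/mig-1g.5gb:2,nvidia.com/mig-2g.10gb:1"
) == {"nvidia.com/mig-1g.5gb": 2, "nvidia.com/mig-2g.10gb": 1}
```
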
    @staticmethod
    def _get_gpu_names():
        # noinspection PyBroadException
        try:
            gpus = gpustat.new_query().gpus
            names = [g["name"] for g in gpus]

            print("GPU names: {}".format(names))

            return names
        except Exception as ex:
            log.error("Failed getting GPU names: {}".format(ex))


def report_error(ex):

@@ -129,8 +129,9 @@ def get_uptime_string(entry):


def get_runtime_properties_string(runtime_properties):
    # type: (List[dict]) -> Tuple[Optional[str], str]
    # type: (Optional[List[dict]]) -> Tuple[Optional[str], str]
    server_string = []
    runtime_properties = runtime_properties or []
    force_flag = next(
        (prop for prop in runtime_properties if prop["key"] == UptimeConf.worker_key),
        None,
@@ -7,7 +7,7 @@ from tempfile import gettempdir, NamedTemporaryFile
from typing import List, Tuple, Optional

from clearml_agent.definitions import ENV_DOCKER_HOST_MOUNT
from clearml_agent.helper.base import warning
from clearml_agent.helper.base import warning, is_windows_platform, safe_remove_file


class Singleton(object):
@@ -22,6 +22,13 @@ class Singleton(object):
    _lock_timeout = 10
    _pid = None

    @classmethod
    def close_pid_file(cls):
        if cls._pid_file:
            cls._pid_file.close()
            safe_remove_file(cls._pid_file.name)
            cls._pid_file = None

    @classmethod
    def update_pid_file(cls):
        new_pid = str(os.getpid())
@@ -115,7 +122,7 @@ class Singleton(object):

    @classmethod
    def _register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None, allow_double=False):
        if cls.worker_id:
        if cls.worker_id and cls.instance_slot is not None:
            return cls.worker_id, cls.instance_slot
        # make sure we have a unique name
        instance_num = 0
@@ -167,7 +174,9 @@ class Singleton(object):
        # create lock
        cls._pid = str(os.getpid())
        cls._pid_file = NamedTemporaryFile(
            dir=cls._get_temp_folder(), prefix=cls.prefix + cls.sep + cls._pid + cls.sep, suffix=cls.ext)
            dir=cls._get_temp_folder(), prefix=cls.prefix + cls.sep + cls._pid + cls.sep, suffix=cls.ext,
            delete=False if is_windows_platform() else True
        )
        cls._pid_file.write(('{}\n{}'.format(unique_worker_id, cls.instance_slot)).encode())
        cls._pid_file.flush()
        cls.worker_id = unique_worker_id

@@ -22,7 +22,7 @@ WORKER_ARGS = {
        'help': 'git username for repository access',
    },
    '--git-pass': {
        'help': 'git password for repository access',
        'help': 'git password (personal access tokens) for repository access',
    },
    '--log-level': {
        'help': 'SDK log level',
@@ -44,13 +44,18 @@ WORKER_ARGS = {
}

DAEMON_ARGS = dict({
    '--polling-interval': {
        'help': 'Polling interval in seconds. Minimum is 5 (default 5)',
        'type': int,
        'default': 5,
    },
    '--foreground': {
        'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
        'action': 'store_true',
    },
    '--docker': {
        'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
                'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
        'nargs': '*',
        'default': False,
@@ -62,7 +67,10 @@ DAEMON_ARGS = dict({
        'group': 'Docker support',
    },
    '--queue': {
        'help': 'Queue ID(s)/Name(s) to pull tasks from (\'default\' queue)',
        'help': 'Queue ID(s)/Name(s) to pull tasks from (\'default\' queue).'
                ' Note that the queue list order determines priority, with the first listed queue having the'
                ' highest priority. To change this behavior, use --order-fairness to pull from each queue in a'
                ' round-robin order',
        'nargs': '+',
        'default': tuple(),
        'dest': 'queues',
@@ -78,7 +86,16 @@ DAEMON_ARGS = dict({
    },
    '--services-mode': {
        'help': 'Launch multiple long-term docker services. Implies docker & cpu-only flags.',
        'action': 'store_true',
        'nargs': '?',
        'const': -1,
        'type': int,
        'default': None,
    },
    '--child-report-tags': {
        'help': 'List of tags to send with the status reports from the worker that runs a task',
        'nargs': '+',
        'type': str,
        'default': None,
    },
    '--create-queue': {
        'help': 'Create requested queue if it does not exist already.',
@@ -90,7 +107,19 @@ DAEMON_ARGS = dict({
        'aliases': ['-d'],
    },
    '--stop': {
        'help': 'Stop the running agent (based on the same set of arguments)',
        'help': 'Stop the running agent (based on the same set of arguments). '
                'Optional: provide a list of specific local worker IDs to stop',
        'nargs': '*',
        'default': False,
    },
    '--dynamic-gpus': {
        'help': 'Allow to dynamically allocate gpus based on queue properties, '
                'configure with \'--queue <queue_name>=<num_gpus>\'.'
                ' Example: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 single_gpu=1\'.'
                ' Example Opportunistic: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 max_quad_gpus=1-4\'.'
                ' Note that the queue list order determines priority, with the first listed queue having the'
                ' highest priority. To change this behavior, use --order-fairness to pull from each queue in a'
                ' round-robin order',
        'action': 'store_true',
    },
    '--uptime': {
@@ -101,7 +130,7 @@ DAEMON_ARGS = dict({
        'default': None,
    },
    '--downtime': {
        'help': 'Specify uptime for clearml-agent in "<hours> <days>" format. For example, use "09-13 TUE" to set '
        'help': 'Specify downtime for clearml-agent in "<hours> <days>" format. For example, use "09-13 TUE" to set '
                'Tuesday\'s downtime to 09-13'
                'Note: Make sure to have only one of uptime/downtime configuration and not both.',
        'nargs': '*',
@@ -111,6 +140,10 @@ DAEMON_ARGS = dict({
        'help': 'Print the worker\'s schedule (uptime properties, server\'s runtime properties and listening queues)',
        'action': 'store_true',
    },
    '--use-owner-token': {
        'help': 'Generate and use task owner token for the execution of the task',
        'action': 'store_true',
    }
}, **WORKER_ARGS)

COMMANDS = {
@@ -145,7 +178,7 @@ COMMANDS = {
    },
    '--docker': {
        'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
                'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
        'nargs': '*',
        'default': False,
@@ -179,11 +212,18 @@ COMMANDS = {
    },
    '--docker': {
        'help': 'Build the experiment inside a docker (v19.03 and above). Optional args <image> <arguments> or '
                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
                'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
        'nargs': '*',
        'default': False,
    },
    '--force-docker': {
        'help': 'Force using the agent-specified docker image (either explicitly in the --docker argument or '
                'using the agent\'s default docker image). If provided, the agent will not use any docker '
                'container information stored on the task itself (default False)',
        'default': False,
        'action': 'store_true',
    },
    '--python-version': {
        'help': 'Virtual environment python version to use',
    },

@@ -4,14 +4,15 @@ import json
import logging
import os
import platform
import re
import sys
from copy import deepcopy
from typing import Any, Callable

import attr
from pathlib2 import Path
from pyhocon import ConfigFactory, HOCONConverter, ConfigTree

from clearml_agent.external.pyhocon import ConfigFactory, HOCONConverter, ConfigTree
from clearml_agent.backend_api.session import Session as _Session, Request
from clearml_agent.backend_api.session.client import APIClient
from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILE_OVERRIDE_VAR, LOCAL_CONFIG_FILES
@@ -19,6 +20,7 @@ from clearml_agent.definitions import ENVIRONMENT_CONFIG, ENV_TASK_EXECUTE_AS_US
from clearml_agent.errors import APIError
from clearml_agent.helper.base import HOCONEncoder
from clearml_agent.helper.process import Argv
from clearml_agent.helper.docker_args import DockerArgsSanitizer, sanitize_urls
from .version import __version__

POETRY = "poetry"
@@ -76,7 +78,7 @@ class Session(_Session):

        cpu_only = kwargs.get('cpu_only')
        if cpu_only:
            os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
            Session.set_nvidia_visible_env('none')

        if kwargs.get('gpus') and not os.environ.get('KUBERNETES_SERVICE_HOST') \
                and not os.environ.get('KUBERNETES_PORT'):
@@ -85,7 +87,7 @@ class Session(_Session):
                os.environ.pop('CUDA_VISIBLE_DEVICES', None)
                os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
            else:
                os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
            Session.set_nvidia_visible_env(kwargs.get('gpus'))

        if kwargs.get('only_load_config'):
            from clearml_agent.backend_api.config import load
@@ -105,7 +107,7 @@ class Session(_Session):
            if os.path.exists(os.path.expanduser(os.path.expandvars(f))):
                self._config_file = f
                break
        self.api_client = APIClient(session=self, api_version="2.5")
        self._api_client = None
        # HACK make sure we have python version to execute,
        # if nothing was specified, use the one that runs us
        def_python = ConfigValue(self.config, "agent.default_python")
@@ -132,7 +134,7 @@ class Session(_Session):
        # override with environment variables
        # cuda_version & cudnn_version are overridden with os.environ here, and normalized in the next section
        for config_key, env_config in ENVIRONMENT_CONFIG.items():
            # check if the propery is of a list:
            # check if the property is of a list:
            if config_key.endswith('.0'):
                if all(not i.get() for i in env_config.values()):
                    continue
@@ -166,6 +168,16 @@ class Session(_Session):
        if not kwargs.get('only_load_config'):
            self.create_cache_folders()

    @property
    def api_client(self):
        if self._api_client is None:
            self._api_client = APIClient(session=self, api_version="2.5")
        return self._api_client

    @api_client.setter
    def api_client(self, value):
        self._api_client = value

    @staticmethod
    def get_logger(name):
        logger = logging.getLogger(name)
@@ -204,7 +216,7 @@ class Session(_Session):
        folder_keys = ('agent.venvs_dir', 'agent.vcs_cache.path',
                       'agent.pip_download_cache.path',
                       'agent.docker_pip_cache', 'agent.docker_apt_cache')
        singleton_folders = ('agent.venvs_dir', 'agent.vcs_cache.path', 'agent.docker_apt_cache')
        singleton_folders = ('agent.venvs_dir', 'agent.docker_apt_cache')

        if ENV_TASK_EXECUTE_AS_USER.get():
            folder_keys = tuple(list(folder_keys) + ['sdk.storage.cache.default_base_dir'])
@@ -229,26 +241,49 @@ class Session(_Session):
        except:
            pass

    def print_configuration(self, remove_secret_keys=("secret", "pass", "token", "account_key")):
    def print_configuration(self):
        def load_config(key, default):
            return [re.compile(x) for x in self.config.get(f"agent.sanitize_config_printout.{key}", default=default)]

        dont_hide_secret_keys = load_config("dont_hide_secrets", ("^enable_git_ask_pass$",))
        hide_secret_keys = load_config("hide_secrets", ("secret", "pass", "token", "account_key", "contents"))
        hide_secret_section_keys = load_config("hide_secrets_recursive", ("^environment$",))
        docker_cmd_keys = load_config("docker_commands", ("^extra_docker_arguments$",))
        urls_keys = load_config("urls", ("^extra_index_url$",))

        # remove all the secrets from the print
        def recursive_remove_secrets(dictionary, secret_keys=()):
        def recursive_remove_secrets(dictionary):
            for k in list(dictionary):
                for s in secret_keys:
                    if s in k:
                        dictionary.pop(k)
                        break
                if not any(r.search(k) for r in dont_hide_secret_keys):
                    if any(r.search(k) for r in hide_secret_keys):
                        dictionary[k] = '****'
                        continue
                    if any(r.search(k) for r in hide_secret_section_keys):
                        dictionary[k] = {key: '****' for key in dictionary[k]} \
                            if isinstance(dictionary[k], dict) else '****'
                        continue
                    if any(r.search(k) for r in urls_keys):
                        value = dictionary.get(k, None)
                        if isinstance(value, str):
                            dictionary[k] = sanitize_urls(value)[0]
                        elif isinstance(value, (list, tuple)):
                            dictionary[k] = [sanitize_urls(v)[0] for v in value]
                        elif isinstance(value, dict):
                            dictionary[k] = {k_: sanitize_urls(v)[0] for k_, v in value.items()}
                if isinstance(dictionary.get(k, None), dict):
                    recursive_remove_secrets(dictionary[k], secret_keys=secret_keys)
                    recursive_remove_secrets(dictionary[k])
                elif isinstance(dictionary.get(k, None), (list, tuple)):
                    if any(r.match(k) for r in docker_cmd_keys):
                        dictionary[k] = DockerArgsSanitizer.sanitize_docker_command(self, dictionary[k])
                    for item in dictionary[k]:
                        if isinstance(item, dict):
                            recursive_remove_secrets(item, secret_keys=secret_keys)
                            recursive_remove_secrets(item)

        config = deepcopy(self.config.to_dict())
        # remove the env variable, it's not important
        config.pop('env', None)
        if remove_secret_keys:
            recursive_remove_secrets(config, secret_keys=remove_secret_keys)
        if hide_secret_keys or hide_secret_section_keys or docker_cmd_keys or urls_keys:
            recursive_remove_secrets(config)
        # remove logging.loggers.urllib3.level from the print
        try:
            config['logging']['loggers']['urllib3'].pop('level', None)
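A self-contained sketch of the masking behavior implied by the defaults above (the regexes are the `load_config` defaults; the sample config dict is illustrative):

```python
import re

hide_secret_keys = [re.compile(x) for x in ("secret", "pass", "token", "account_key", "contents")]
dont_hide_secret_keys = [re.compile(x) for x in ("^enable_git_ask_pass$",)]

config = {"git_pass": "hunter2", "enable_git_ask_pass": True, "worker_id": "w:0"}
for k in list(config):
    if any(r.search(k) for r in dont_hide_secret_keys):
        continue  # explicitly whitelisted keys keep their value
    if any(r.search(k) for r in hide_secret_keys):
        config[k] = '****'

assert config == {"git_pass": "****", "enable_git_ask_pass": True, "worker_id": "w:0"}
```
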
@@ -279,7 +314,7 @@ class Session(_Session):
    def get(self, service, action, version=None, headers=None,
            data=None, json=None, async_enable=False, **kwargs):
        return self._manual_request(service=service, action=action,
                                    version=version, method="get", headers=headers,
                                    version=version, method=Request.def_method, headers=headers,
                                    data=data, async_enable=async_enable,
                                    json=json or kwargs)

@@ -290,7 +325,7 @@ class Session(_Session):
                                    data=data, async_enable=async_enable,
                                    json=json or kwargs)

    def _manual_request(self, service, action, version=None, method="get", headers=None,
    def _manual_request(self, service, action, version=None, method=Request.def_method, headers=None,
                        data=None, json=None, async_enable=False, **kwargs):

        res = self.send_request(service=service, action=action,
@@ -318,6 +353,23 @@ class Session(_Session):
    def command(self, *args):
        return Argv(*args, log=self.get_logger(Argv.__module__))

    @staticmethod
    def set_nvidia_visible_env(gpus):
        if not gpus:
            gpus = ""
        visible_env = gpus.replace(".", ":") if isinstance(gpus, str) else \
            ','.join(str(g).replace(".", ":") for g in gpus)

        os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = visible_env

    @staticmethod
    def get_nvidia_visible_env():
        visible_env = os.environ.get('NVIDIA_VISIBLE_DEVICES') or os.environ.get('CUDA_VISIBLE_DEVICES')
        if visible_env is None:
            return None
        visible_env = str(visible_env).replace(":", ".")
        return visible_env

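Round-trip sketch for the two helpers above: MIG-style device IDs use `.` in the agent's internal form and `:` inside the environment variables (the "0.1" value is illustrative):

```python
import os

# set_nvidia_visible_env("0.1") stores "0:1" in both env vars:
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = "0.1".replace(".", ":")
assert os.environ['NVIDIA_VISIBLE_DEVICES'] == "0:1"

# get_nvidia_visible_env() converts back to the dotted form:
assert os.environ['NVIDIA_VISIBLE_DEVICES'].replace(":", ".") == "0.1"
```
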
@attr.s
class TrainsAgentLogger(object):

@@ -1 +1 @@
__version__ = '0.17.0'
__version__ = '1.9.2'

@@ -10,9 +10,9 @@ RUN apt-get dist-upgrade -y
RUN apt-get install -y curl python3-pip git
RUN curl -sSL https://get.docker.com/ | sh
RUN python3 -m pip install -U pip
RUN python3 -m pip install trains-agent
RUN python3 -m pip install clearml-agent
RUN python3 -m pip install -U "cryptography>=2.9"

ENV TRAINS_DOCKER_SKIP_GPUS_FLAG=1
ENV CLEARML_DOCKER_SKIP_GPUS_FLAG=1

ENTRYPOINT ["/usr/agent/entrypoint.sh"]

@@ -1,7 +1,7 @@
#!/bin/sh

LOWER_PIP_UPDATE_VERSION="$(echo "$PIP_UPDATE_VERSION" | tr '[:upper:]' '[:lower:]')"
LOWER_TRAINS_AGENT_UPDATE_VERSION="$(echo "$TRAINS_AGENT_UPDATE_VERSION" | tr '[:upper:]' '[:lower:]')"
LOWER_CLEARML_AGENT_UPDATE_VERSION="$(echo "${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}" | tr '[:upper:]' '[:lower:]')"

if [ "$LOWER_PIP_UPDATE_VERSION" = "yes" ] || [ "$LOWER_PIP_UPDATE_VERSION" = "true" ] ; then
    python3 -m pip install -U pip
@@ -9,11 +9,11 @@ elif [ ! -z "$LOWER_PIP_UPDATE_VERSION" ] ; then
    python3 -m pip install pip$LOWER_PIP_UPDATE_VERSION ;
fi

echo "TRAINS_AGENT_UPDATE_VERSION = $LOWER_TRAINS_AGENT_UPDATE_VERSION"
if [ "$LOWER_TRAINS_AGENT_UPDATE_VERSION" = "yes" ] || [ "$LOWER_TRAINS_AGENT_UPDATE_VERSION" = "true" ] ; then
    python3 -m pip install trains-agent -U
elif [ ! -z "$LOWER_TRAINS_AGENT_UPDATE_VERSION" ] ; then
    python3 -m pip install trains-agent$LOWER_TRAINS_AGENT_UPDATE_VERSION ;
echo "CLEARML_AGENT_UPDATE_VERSION = $LOWER_CLEARML_AGENT_UPDATE_VERSION"
if [ "$LOWER_CLEARML_AGENT_UPDATE_VERSION" = "yes" ] || [ "$LOWER_CLEARML_AGENT_UPDATE_VERSION" = "true" ] ; then
    python3 -m pip install clearml-agent -U
elif [ ! -z "$LOWER_CLEARML_AGENT_UPDATE_VERSION" ] ; then
    python3 -m pip install clearml-agent$LOWER_CLEARML_AGENT_UPDATE_VERSION ;
fi

python3 -m trains_agent daemon --docker "$TRAINS_AGENT_DEFAULT_BASE_DOCKER" --force-current-version $TRAINS_AGENT_EXTRA_ARGS
python3 -m clearml_agent daemon --docker "${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}" --force-current-version ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
14 docker/k8s-glue/README.md Normal file
@@ -0,0 +1,14 @@
This folder contains an example docker and templates for running the k8s glue as a pod in a k8s cluster.

Please note that ClearML credentials and server addresses should either be filled in the clearml.conf file before
building the glue docker or provided in the k8s-glue.yml template.

To run, you'll need to:
* Create a secret from pod_template.yml:
```bash
kubectl -n clearml create secret generic k8s-glue-pod-template --from-file=pod_template.yml
```
* Apply the k8s glue template:
```bash
kubectl -n clearml apply -f k8s-glue.yml
```
37 docker/k8s-glue/build-image-helper.sh Normal file
@@ -0,0 +1,37 @@
#!/bin/bash

# Check if image name and Dockerfile path are provided
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "Usage: $0 <image_name> <dockerfile_path> <build_context>"
    exit 1
fi

# Build the Docker image
image_name=$1
dockerfile_path=$2
build_context=$3

if [ $build_context == "glue-build-aws" ] || [ $build_context == "glue-build-gcp" ]; then
    if [ ! -f $build_context/clearml.conf ]; then
        cp build-resources/clearml.conf $build_context
    fi
    if [ ! -f $build_context/entrypoint.sh ]; then
        cp build-resources/entrypoint.sh $build_context
        chmod +x $build_context/entrypoint.sh
    fi
    if [ ! -f $build_context/setup.sh ]; then
        cp build-resources/setup.sh $build_context
        chmod +x $build_context/setup.sh
    fi
fi
cp ../../examples/k8s_glue_example.py $build_context

docker build -f $dockerfile_path -t $image_name $build_context

# cleanup
if [ $build_context == "glue-build-aws" ] || [ $build_context == "glue-build-gcp" ]; then
    rm $build_context/clearml.conf
    rm $build_context/entrypoint.sh
    rm $build_context/setup.sh
fi
rm $build_context/k8s_glue_example.py
402 docker/k8s-glue/build-resources/clearml.conf Normal file
@@ -0,0 +1,402 @@
# CLEARML-AGENT configuration file
api {
    # Notice: 'host' is the api server (default port 8008), not the web server.
    api_server: ""
    web_server: ""
    files_server: ""
    # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
    credentials {"access_key": "", "secret_key": ""}
}

# Set GIT user/pass credentials
# leave blank for GIT SSH credentials
agent.git_user=""
agent.git_pass=""

# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
agent.package_manager.extra_index_url= [

]

agent {
    # unique name of this worker, if None, created based on hostname:process_id
    # Override with os environment: CLEARML_WORKER_ID
    # worker_id: "clearml-agent-machine1:gpu0"
    worker_id: ""

    # worker name, replaces the hostname when creating a unique name for this worker
    # Override with os environment: CLEARML_WORKER_NAME
    # worker_name: "clearml-agent-machine1"
    worker_name: ""

    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
    # git_user: ""
    # git_pass: ""
    # git_host: ""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
    force_git_ssh_protocol: false
    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
    # force_git_ssh_port: 0
    # Force a specific SSH username when converting http to ssh links (the default username is 'git')
    # force_git_ssh_user: git

    # Set the python version to use when creating the virtual environment and launching the experiment
    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
    # The default is the python executing the clearml_agent
    python_binary: ""
    # ignore any requested python version (Default: False, if a Task was using a
    # specific python version and the system supports multiple python versions, the agent will use the requested python version)
    # ignore_requested_python_version: true

    # select python package manager:
    # currently supported: pip and conda
    # poetry is used if pip is selected and the repository contains a poetry.lock file
    package_manager: {
        # supported options: pip, conda, poetry
        type: pip,

        # specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
        pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10'"],

        # virtual environment inherits packages from system
        system_site_packages: false,

        # install with --upgrade
        force_upgrade: false,

        # additional artifact repositories to use when installing python packages
        # extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]

        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", "defaults", ]

        # If set to true, Task's "installed packages" are ignored,
        # and the repository's "requirements.txt" is used instead
        # force_repo_requirements_txt: false

        # set the priority packages to be installed before the rest of the required packages
        # priority_packages: ["cython", "numpy", "setuptools", ]

        # set the optional priority packages to be installed before the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # priority_optional_packages: ["pygobject", ]

        # set the post packages to be installed after all the rest of the required packages
        # post_packages: ["horovod", ]

        # set the optional post packages to be installed after all the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # post_optional_packages: []

        # set to True to support torch nightly build installation,
        # notice: torch nightly builds are ephemeral and are deleted from time to time
        torch_nightly: false,
    },

    # target folder for virtual environments builds, created when executing experiment
    venvs_dir = ~/.clearml/venvs-builds

    # cached virtual environment folder
    venvs_cache: {
        # maximum number of cached venvs
        max_entries: 10
        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
        free_space_threshold_gb: 2.0
        # uncomment to enable virtual environment caching
        # path: ~/.clearml/venvs-cache
    },

    # cached git clone folder
    vcs_cache: {
        enabled: true,
        path: ~/.clearml/vcs-cache
    },

    # use venv-update in order to accelerate python virtual environment building
    # Still in beta, turned off by default
    venv_update: {
        enabled: false,
    },

    # cached folder for specific python package download (used for pytorch package caching)
    pip_download_cache {
        enabled: true,
        path: ~/.clearml/pip-download-cache
    },

translate_ssh: true,
|
||||
# reload configuration file every daemon execution
|
||||
reload_config: false,
|
||||
|
||||
# pip cache folder mapped into docker, used for python package caching
|
||||
docker_pip_cache = ~/.clearml/pip-cache
|
||||
# apt cache folder mapped into docker, used for ubuntu package caching
|
||||
docker_apt_cache = ~/.clearml/apt-cache

    # optional arguments to pass to the docker image
    # these are local to this agent and will not be updated in the experiment's docker_cmd section
    # extra_docker_arguments: ["--ipc=host", ]

    # optional shell script to run inside the docker container at startup, before the experiment is started
    # extra_docker_shell_script: ["apt-get install -y bindfs", ]

    # Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
    # true by default for backwards compatibility reasons,
    # change to false to skip installation and decrease docker spin-up time
    # docker_install_opencv_libs: true

    # optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
    # If uptime is specified, the agent will actively poll (and execute) tasks in the time-spans defined here.
    # Outside of the specified time-spans, the agent will be idle.
    # Defined using a list of items of the format: "<hours> <days>".
    # hours - use values 0-23; a single value counts as the start hour, ending at midnight.
    # days - use days in abbreviated format (SUN-SAT)
    # use '-' for ranges and ',' to separate singular values.
    # for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
    # uptime: ["17-20 SUN,TUE"]

    # optional downtime configuration, can be used only when uptime is not used.
    # If downtime is specified, the agent will be idle in the time-spans defined here.
    # Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
    # Use the same format as described above for uptime
    # downtime: []

    # set to true in order to force "docker pull" before running an experiment using a docker image.
    # This makes sure the docker image is up to date.
    docker_force_pull: false

    default_docker: {
        # default docker image to use when running in docker mode
        image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"

        # optional arguments to pass to the docker image
        # arguments: ["--ipc=host", ]
    }

    # set the OS environment based on the Task's Environment section before launching the Task process.
    enable_task_env: false

    # set the initial bash script to execute at the startup of any docker.
    # all lines will be executed regardless of their exit code.
    # {python_single_digit} is translated to 'python3' or 'python2' according to the requested python version
    # docker_init_bash_script = [
    #     "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
    #     "chown -R root /root/.cache/pip",
    #     "apt-get update",
    #     "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
    #     "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
    # ]

    # set the preprocessing bash script to execute at the startup of any docker.
    # all lines will be executed regardless of their exit code.
    # docker_preprocess_bash_script = [
    #     "echo \"starting docker\"",
    # ]

    # If false, replace \r with \n and display the full console output.
    # Default is true: report a single \r line out of a sequence of consecutive lines, once per 5 seconds.
    # suppress_carriage_return: true

    # cuda versions used for resolving pytorch wheel packages
    # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # cuda_version: 10.1
    # cudnn_version: 7.6

    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
    # values with "********". Turning this feature on will hide the values of the following environment variables:
    # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
    # docker command, set:
    # extra_keys: ["MY_SPECIAL_PASSWORD"]
    hide_docker_command_env_vars {
        enabled: true
        extra_keys: []
    }
}

sdk {
    # ClearML - default SDK configuration

    storage {
        cache {
            # Defaults to <system_temp_folder>/clearml_cache
            default_base_dir: "~/.clearml/cache"
            size {
                # max_used_bytes = -1
                min_free_bytes = 10GB
                # cleanup_margin_percent = 5%
            }
        }

        direct_access: [
            # Matching objects are considered to be available for direct access, i.e. they will not be downloaded
            # or cached, and any download request will return a direct reference.
            # Objects are specified in glob format, available for url and content_type.
            { url: "file://*" }  # file-urls are always directly referenced
        ]
    }

    metrics {
        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
        # X files are stored in the upload destination for each metric/variant combination.
        file_history_size: 100

        # Max history size for matplotlib imshow files per plot title.
        # File names for the uploaded images will be recycled in such a way that no more than
        # X images are stored in the upload destination for each matplotlib plot title.
        matplotlib_untitled_history_size: 100

        # Limit the number of digits after the dot in plot reporting (reducing plot report size)
        # plot_max_num_digits: 5

        # Settings for generated debug images
        images {
            format: JPEG
            quality: 87
            subsampling: 0
        }

        # Support plot-per-graph, fully matching Tensorboard behavior (i.e. if this is set to true, each series will have its own graph)
        tensorboard_single_series_per_graph: false
    }

    network {
        metrics {
            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
            # a specific iteration
            file_upload_threads: 4

            # Warn about upload starvation if no uploads were made in the specified period while file-bearing events keep
            # being sent for upload
            file_upload_starvation_warning_sec: 120
        }

        iteration {
            # Max number of retries when getting frames if the server returned an error (http code 500)
            max_retries_on_server_error: 5
            # Backoff factor for consecutive retry attempts.
            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
            retry_backoff_factor_sec: 10
        }
    }
    aws {
        s3 {
            # S3 credentials, used for read/write access by various SDK elements

            # default, used for any bucket not specified below
            key: ""
            secret: ""
            region: ""

            credentials: [
                # specifies key/secret credentials to use when handling s3 urls (read or write)
                # {
                #     bucket: "my-bucket-name"
                #     key: "my-access-key"
                #     secret: "my-secret-key"
                # },
                # {
                #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
                #     host: "my-minio-host:9000"
                #     key: "12345678"
                #     secret: "12345678"
                #     multipart: false
                #     secure: false
                # }
            ]
        }
        boto3 {
            pool_connections: 512
            max_multipart_concurrency: 16
        }
    }
    google.storage {
        # # Default project and credentials file
        # # Will be used when no bucket configuration is found
        # project: "clearml"
        # credentials_json: "/path/to/credentials.json"

        # # Specific credentials per bucket and sub directory
        # credentials = [
        #     {
        #         bucket: "my-bucket"
        #         subdir: "path/in/bucket"  # Not required
        #         project: "clearml"
        #         credentials_json: "/path/to/credentials.json"
        #     },
        # ]
    }
    azure.storage {
        # containers: [
        #     {
        #         account_name: "clearml"
        #         account_key: "secret"
        #         # container_name:
        #     }
        # ]
    }

    log {
        # debugging feature: set this to true to make the null log propagate messages to the root logger (so they appear in stdout)
        null_log_propagate: false
        task_log_buffer_capacity: 66

        # disable urllib info and lower log levels
        disable_urllib3_info: true
    }

    development {
        # Development-mode options

        # dev task reuse window
        task_reuse_time_window_in_hours: 72.0

        # Run VCS repository detection asynchronously
        vcs_repo_detect_async: true

        # Store uncommitted git/hg source code diff in the experiment manifest when training in development mode
        # This stores the "git diff" output in the experiment's "script.requirements.diff" section
        store_uncommitted_code_diff: true

        # Support stopping an experiment in case it was externally stopped, its status was changed, or the task was reset
        support_stopping: true

        # Default Task output_uri. If output_uri is not provided to Task.init, default_output_uri will be used instead.
        default_output_uri: ""

        # Default auto-generated requirements optimize for smaller requirements
        # If true, analyze the entire repository regardless of the entry point.
        # If false, first analyze the entry point script; if it does not contain other references to local files,
        # do not analyze the entire repository.
        force_analyze_entire_repo: false

        # If set to true, the *clearml* update message will not be printed to the console
        # this value can be overwritten with the os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
        suppress_update_message: false

        # If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
        detect_with_pip_freeze: false

        # Development mode worker
        worker {
            # Status report period in seconds
            report_period_sec: 2

            # ping to the server - check connectivity
            ping_period_sec: 30

            # Log all stdout & stderr
            log_stdout: true

            # compatibility feature, report memory usage for the entire machine
            # default (false): report only on the running process and its sub-processes
            report_global_mem_used: false
        }
    }
}
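
Most of the values above can also be overridden per machine through the OS environment variables called out in the comments. A minimal sketch of such overrides before starting a worker (the queue name and values are illustrative, not part of the shipped configuration):

export CLEARML_WORKER_NAME="gpu-machine-1"    # replaces agent.worker_name
export CUDA_VERSION="11.0"                    # overrides the auto-detected cuda_version
export CLEARML_SUPPRESS_UPDATE_MESSAGE=1      # sdk.development.suppress_update_message
clearml-agent daemon --queue default --docker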

41  docker/k8s-glue/build-resources/entrypoint.sh  Normal file
@@ -0,0 +1,41 @@
#!/bin/bash -x

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}

if [ -z "$CLEARML_FILES_HOST" ]; then
  CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
fi

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}

echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2

if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
  if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
    python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
  else
    python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
  fi
fi

QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
MAX_PODS=${K8S_GLUE_MAX_PODS:-2}
EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}

# shellcheck disable=SC2129
echo "api.credentials.access_key: ${CLEARML_API_ACCESS_KEY}" >> ~/clearml.conf
echo "api.credentials.secret_key: ${CLEARML_API_SECRET_KEY}" >> ~/clearml.conf
echo "api.api_server: ${CLEARML_API_HOST}" >> ~/clearml.conf
echo "api.web_server: ${CLEARML_WEB_HOST}" >> ~/clearml.conf
echo "api.files_server: ${CLEARML_FILES_HOST}" >> ~/clearml.conf

./provider_entrypoint.sh

if [[ -z "${K8S_GLUE_MAX_PODS}" ]]
then
  python3 k8s_glue_example.py --queue ${QUEUE} ${EXTRA_ARGS}
else
  python3 k8s_glue_example.py --queue ${QUEUE} --max-pods ${MAX_PODS} ${EXTRA_ARGS}
fi
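
For reference, a minimal sketch of running this entrypoint by hand with the environment variables it consumes; the image tag is borrowed from the k8s-glue-aws.yml example further below, and all values are illustrative:

docker run --rm \
  -e CLEARML_API_HOST="http://<server-ip>:8008" \
  -e CLEARML_WEB_HOST="http://<server-ip>:8080" \
  -e CLEARML_FILES_HOST="http://<server-ip>:8081" \
  -e CLEARML_API_ACCESS_KEY="<access_key>" \
  -e CLEARML_API_SECRET_KEY="<secret_key>" \
  -e K8S_GLUE_QUEUE="k8s_glue" \
  -e K8S_GLUE_MAX_PODS="2" \
  allegroai/clearml-agent-k8s:aws-latest-1.21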

18  docker/k8s-glue/build-resources/setup.sh  Normal file
@@ -0,0 +1,18 @@
#!/bin/bash

chmod +x /root/entrypoint.sh

apt-get update -qqy
apt-get dist-upgrade -qqy
apt-get install -qqy curl unzip less locales

locale-gen en_US.UTF-8

apt-get update -qqy
apt-get install -qqy curl gcc python3-dev python3-pip apt-transport-https lsb-release openssh-client git gnupg
rm -rf /var/lib/apt/lists/*
apt clean

python3 -m pip install -U pip
python3 -m pip install --no-cache-dir clearml-agent
python3 -m pip install -U --no-cache-dir "cryptography>=2.9"

23  docker/k8s-glue/glue-build-aws/Dockerfile  Normal file
@@ -0,0 +1,23 @@
FROM ubuntu:22.04

USER root
WORKDIR /root

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8

COPY ./setup.sh /root/setup.sh
RUN /root/setup.sh

COPY ./setup_aws.sh /root/setup_aws.sh
RUN chmod +x /root/setup_aws.sh && /root/setup_aws.sh

COPY ./entrypoint.sh /root/entrypoint.sh
COPY ./provider_entrypoint.sh /root/provider_entrypoint.sh
RUN chmod +x /root/provider_entrypoint.sh
COPY ./k8s_glue_example.py /root/k8s_glue_example.py
COPY ./clearml.conf /root/clearml.conf

ENTRYPOINT ["/root/entrypoint.sh"]
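
A sketch of building this image (the tag is illustrative; it assumes entrypoint.sh and setup.sh from build-resources, plus k8s_glue_example.py and clearml.conf, were first copied into the build context, since the Dockerfile COPYs them from there):

docker build -t clearml-agent-k8s:aws ./docker/k8s-glue/glue-build-aws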

4  docker/k8s-glue/glue-build-aws/provider_entrypoint.sh  Normal file
@@ -0,0 +1,4 @@
#!/bin/bash -x

source /root/.bashrc
export PATH=$PATH:$HOME/bin

15  docker/k8s-glue/glue-build-aws/setup_aws.sh  Normal file
@@ -0,0 +1,15 @@
#!/bin/bash

curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
./aws/install

curl -o kubectl https://s3.us-west-2.amazonaws.com/amazon-eks/1.29.3/2024-04-19/bin/linux/amd64/kubectl
#curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/kubectl
#curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin

curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator
#curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/aws-iam-authenticator
chmod +x ./aws-iam-authenticator && mkdir -p $HOME/bin && cp ./aws-iam-authenticator $HOME/bin/aws-iam-authenticator && export PATH=$PATH:$HOME/bin
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc
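
Note that the export PATH statements above only affect the shell running this setup script; what survives into the image is the line appended to ~/.bashrc, which is why provider_entrypoint.sh starts by sourcing it. A quick sanity check inside a built image (the image tag is illustrative):

docker run --rm --entrypoint /bin/bash clearml-agent-k8s:aws \
  -c 'source /root/.bashrc && kubectl version --client && aws --version'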

22  docker/k8s-glue/glue-build-gcp/Dockerfile  Normal file
@@ -0,0 +1,22 @@
FROM ubuntu:22.04

USER root
WORKDIR /root

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8

COPY ./setup.sh /root/setup.sh
RUN /root/setup.sh

COPY ./setup_gcp.sh /root/setup_gcp.sh
RUN chmod +x /root/setup_gcp.sh && /root/setup_gcp.sh

COPY ./entrypoint.sh /root/entrypoint.sh
COPY ./provider_entrypoint.sh /root/provider_entrypoint.sh
COPY ./k8s_glue_example.py /root/k8s_glue_example.py
COPY ./clearml.conf /root/clearml.conf

ENTRYPOINT ["/root/entrypoint.sh"]

4  docker/k8s-glue/glue-build-gcp/provider_entrypoint.sh  Normal file
@@ -0,0 +1,4 @@
#!/bin/bash -x

gcloud auth activate-service-account ${CLEARML_SERVICE_ACC} --key-file=/root/keys/${SERVICE_ACC_KEY_JSON}
gcloud container clusters get-credentials ${CLUSTER_CRED}

14  docker/k8s-glue/glue-build-gcp/setup_gcp.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

curl -LO https://dl.k8s.io/release/v1.29.3/bin/linux/amd64/kubectl

install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# the Dockerfile runs this as root, so no sudo is needed (or available)
apt-get install -y apt-transport-https ca-certificates gnupg

echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list

curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

apt-get update -y
apt-get install -y google-cloud-sdk

75  docker/k8s-glue/glue-build/Dockerfile.alpine  Normal file
@@ -0,0 +1,75 @@
ARG TAG=3.7.17-alpine3.18

FROM python:${TAG} as build

RUN apk add --no-cache \
    gcc \
    musl-dev \
    libffi-dev

RUN python3 \
    -m pip \
    install \
    --prefix=/install \
    --no-cache-dir \
    -U \
    clearml-agent \
    "cryptography>=2.9"

FROM python:${TAG} as target

WORKDIR /app

ARG KUBECTL_VERSION=1.29.3

# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8
# ENV LANG=en_US.UTF-8
# ENV LANGUAGE=en_US.UTF-8
# ENV PYTHONIOENCODING=UTF-8

COPY --from=build /install /usr/local

ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/

RUN chmod +x /usr/bin/kubectl

RUN apk add --no-cache \
    bash

COPY k8s_glue_example.py .

# AWS CLI
# https://github.com/kyleknap/aws-cli/blob/source-proposal/proposals/source-install.md#alpine-linux
# https://github.com/aws/aws-cli/issues/4685
# https://github.com/aws/aws-cli/pull/6352

# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/alpine/Dockerfile

FROM target as gcp

ARG CLOUD_SDK_VERSION=371.0.0
ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION
ENV PATH /google-cloud-sdk/bin:$PATH

WORKDIR /

RUN apk --no-cache add \
    curl \
    python3 \
    py3-crcmod \
    py3-openssl \
    bash \
    libc6-compat \
    openssh-client \
    git \
    gnupg \
    && curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
    tar xzf google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
    rm google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
    gcloud config set core/disable_usage_reporting true && \
    gcloud config set component_manager/disable_update_check true && \
    gcloud config set metrics/environment github_docker_image && \
    gcloud --version

WORKDIR /app

82  docker/k8s-glue/glue-build/Dockerfile.bullseye  Normal file
@@ -0,0 +1,82 @@
ARG TAG=3.7.17-slim-bullseye

FROM python:${TAG} as target

ARG KUBECTL_VERSION=1.29.3

WORKDIR /app

RUN python3 \
    -m pip \
    install \
    --no-cache-dir \
    -U \
    clearml-agent \
    "cryptography>=2.9"

# Not sure about these ENV vars
# ENV LC_ALL=en_US.UTF-8
# ENV LANG=en_US.UTF-8
# ENV LANGUAGE=en_US.UTF-8
# ENV PYTHONIOENCODING=UTF-8

ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/

RUN chmod +x /usr/bin/kubectl

COPY k8s_glue_example.py .

CMD ["python3", "k8s_glue_example.py"]

FROM target as aws

# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
# https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html

RUN apt-get update -qqy && \
    apt-get install -qqy \
    unzip && \
    rm -rf /var/lib/apt/lists/*

ADD https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip awscliv2.zip
ADD https://amazon-eks.s3.us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator

RUN unzip awscliv2.zip && \
    ./aws/install && \
    rm -r awscliv2.zip aws/ && \
    chmod +x /usr/local/bin/aws-iam-authenticator && \
    aws --version && \
    aws-iam-authenticator version

# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/debian_slim/Dockerfile

FROM target as gcp

ARG CLOUD_SDK_VERSION=371.0.0
ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION

ENV PATH "$PATH:/opt/google-cloud-sdk/bin/"

ARG INSTALL_COMPONENTS
RUN mkdir -p /usr/share/man/man1/
RUN apt-get update -qqy && \
    apt-get install -qqy \
    curl \
    gcc \
    python3-dev \
    python3-pip \
    apt-transport-https \
    lsb-release \
    openssh-client \
    git \
    gnupg && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install -U crcmod && \
    export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \
    echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \
    apt-get update && apt-get install -y google-cloud-sdk=${CLOUD_SDK_VERSION}-0 $INSTALL_COMPONENTS && \
    gcloud config set core/disable_usage_reporting true && \
    gcloud config set component_manager/disable_update_check true && \
    gcloud config set metrics/environment github_docker_image && \
    gcloud --version
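
Both multi-stage Dockerfiles expose aws and gcp build targets, so each cloud flavor is selected with --target; a sketch with illustrative tags:

docker build -f Dockerfile.bullseye --target aws -t clearml-agent-k8s:aws .
docker build -f Dockerfile.bullseye --target gcp -t clearml-agent-k8s:gcp .
docker build -f Dockerfile.alpine --target gcp -t clearml-agent-k8s:gcp-alpine .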

47  docker/k8s-glue/k8s-glue-aws.yml  Normal file
@@ -0,0 +1,47 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  serviceAccountName: ""
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:aws-latest-1.21
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "source /root/.bashrc && /root/entrypoint.sh"
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
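
A sketch of deploying this pod: the secret name matches the secretName referenced above, and the clearml namespace is an assumption borrowed from pod_template.yml below:

kubectl create namespace clearml
kubectl -n clearml create secret generic k8s-glue-pod-template \
  --from-file=pod_template.yml=docker/k8s-glue/pod_template.yml
kubectl -n clearml apply -f docker/k8s-glue/k8s-glue-aws.yml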

58  docker/k8s-glue/k8s-glue-gcp.yml  Normal file
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  serviceAccountName: ""
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:gcp-latest-1.21
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "source /root/.bashrc && /root/entrypoint.sh"
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    - name: service-acc-key
      mountPath: /root/keys
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
    - name: CLEARML_SERVICE_ACC
      value: ""
    - name: SERVICE_ACC_KEY_JSON
      value: service-account-key.json
    - name: CLUSTER_CRED
      value: ""
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
  - name: service-acc-key
    secret:
      secretName: k8s-glue-service-acc-key
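
The GCP variant additionally mounts a service-account key; the file name inside the secret must match the SERVICE_ACC_KEY_JSON value above. A sketch:

kubectl -n clearml create secret generic k8s-glue-service-acc-key \
  --from-file=service-account-key.json=./service-account-key.json
kubectl -n clearml apply -f docker/k8s-glue/k8s-glue-gcp.yml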

13  docker/k8s-glue/pod_template.yml  Normal file
@@ -0,0 +1,13 @@
apiVersion: v1
metadata:
  namespace: clearml
spec:
  containers:
  - resources:
      limits:
        cpu: 1000m
        memory: 4G
      requests:
        cpu: 1000m
        memory: 4G
  restartPolicy: Never
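
This template is handed to the glue via --template-yaml; with the K8S_GLUE_QUEUE and K8S_GLUE_EXTRA_ARGS values from the pod specs above, the container effectively ends up running:

python3 k8s_glue_example.py --queue k8s-glue --template-yaml /root/template/pod_template.yml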

7  docker/k8s-glue/task-pod-build/Dockerfile  Normal file
@@ -0,0 +1,7 @@
FROM ubuntu:22.04

USER root
WORKDIR /root
COPY ./setup.sh /root/setup.sh

RUN /root/setup.sh

10  docker/k8s-glue/task-pod-build/setup.sh  Normal file
@@ -0,0 +1,10 @@
echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/docker-clean
chown -R root /root/.cache/pip

apt-get update -y
apt-get dist-upgrade -y
apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0 curl python3-pip

python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"
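
A sketch of building the task-pod base image these two files define (tag illustrative):

docker build -t clearml-task-pod-base ./docker/k8s-glue/task-pod-build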

@@ -19,7 +19,7 @@ RUN locale-gen en_US.UTF-8
RUN apt-get install -y curl python3-pip git
RUN curl -sSL https://get.docker.com/ | sh
RUN python3 -m pip install -U pip
RUN python3 -m pip install trains-agent
RUN python3 -m pip install clearml-agent
RUN python3 -m pip install -U "cryptography>=2.9"

ENTRYPOINT ["/usr/agent/entrypoint.sh"]

@@ -1,14 +1,42 @@
#!/bin/sh
#!/bin/bash +x

if [ -z "$TRAINS_FILES_HOST" ]; then
TRAINS_HOST_IP=${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}
if [ -n "$SHUTDOWN_IF_NO_ACCESS_KEY" ] && [ -z "$CLEARML_API_ACCESS_KEY" ] && [ -z "$TRAINS_API_ACCESS_KEY" ]; then
  echo "CLEARML_API_ACCESS_KEY was not provided, service will not be started"
  exit 0
fi

TRAINS_FILES_HOST=${TRAINS_FILES_HOST:-"http://$TRAINS_HOST_IP:8081"}
TRAINS_WEB_HOST=${TRAINS_WEB_HOST:-"http://$TRAINS_HOST_IP:8080"}
TRAINS_API_HOST=${TRAINS_API_HOST:-"http://$TRAINS_HOST_IP:8008"}
export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}

echo $TRAINS_FILES_HOST $TRAINS_WEB_HOST $TRAINS_API_HOST 1>&2
if [ -z "$CLEARML_FILES_HOST" ]; then
  CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
fi

python3 -m pip install -q -U "trains-agent${TRAINS_AGENT_UPDATE_VERSION}"
trains-agent daemon --services-mode --queue services --create-queue --docker "$TRAINS_AGENT_DEFAULT_BASE_DOCKER" --cpu-only $TRAINS_AGENT_EXTRA_ARGS
export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}

echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2

if [[ "$CLEARML_AGENT_UPDATE_VERSION" =~ ^[0-9]{1,3}\.[0-9]{1,3}(\.[0-9]{1,3}([a-zA-Z]{1,3}[0-9]{1,3})?)?$ ]]
then
  CLEARML_AGENT_UPDATE_VERSION="==$CLEARML_AGENT_UPDATE_VERSION"
fi

DAEMON_OPTIONS=${CLEARML_AGENT_DAEMON_OPTIONS:---services-mode --create-queue}
QUEUES=${CLEARML_AGENT_QUEUES:-services}

if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
  if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
    python3 -m pip install -q -U $CLEARML_AGENT_UPDATE_REPO
  else
    python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
  fi
fi

DOCKER_ARGS="--docker \"${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}\""

if [ -n "$CLEARML_AGENT_NO_DOCKER" ]; then
  DOCKER_ARGS=""
fi

clearml-agent daemon $DAEMON_OPTIONS --queue $QUEUES $DOCKER_ARGS --cpu-only ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
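
For reference, a sketch of how this rewritten services entrypoint is typically parameterized; each variable below is one the script (or clearml-agent itself) reads, the values are illustrative, and the image name is a placeholder:

docker run --rm \
  -e CLEARML_API_ACCESS_KEY="<access_key>" \
  -e CLEARML_API_SECRET_KEY="<secret_key>" \
  -e CLEARML_AGENT_QUEUES="services" \
  -e CLEARML_AGENT_DAEMON_OPTIONS="--services-mode --create-queue" \
  -e CLEARML_AGENT_DEFAULT_BASE_DOCKER="ubuntu:18.04" \
  <services-agent-image>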

@@ -4,7 +4,7 @@ api {
    web_server: https://demoapp.demo.clear.ml
    files_server: https://demofiles.demo.clear.ml

    # Credentials are generated in the webapp, https://demoapp.demo.clear.ml/profile
    # Credentials are generated in the webapp, https://app.clear.ml/settings/workspace-configuration
    # Overridden with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
    credentials {"access_key": "EGRTCO8JMSIGI6S39GTP43NFWXDQOW", "secret_key": "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"}

@@ -13,43 +13,76 @@ api {
}

agent {
    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
    git_user=""
    git_pass=""
    # Limit credentials to a single domain, for example: github.com,
    # all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
    git_host=""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
    force_git_ssh_protocol: false
    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
    # force_git_ssh_port: ""

    # unique name of this worker, if None, created based on hostname:process_id
    # Overridden with os environment: CLEARML_WORKER_NAME
    # Override with os environment: CLEARML_WORKER_ID
    # worker_id: "clearml-agent-machine1:gpu0"
    worker_id: ""

    # worker name, replaces the hostname when creating a unique name for this worker
    # Overridden with os environment: CLEARML_WORKER_ID
    # Override with os environment: CLEARML_WORKER_NAME
    # worker_name: "clearml-agent-machine1"
    worker_name: ""
    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
    # **Notice**: a GitHub personal token is equivalent to a password, you can put it directly into `git_pass`
    # To learn how to generate a git token on GitHub/Bitbucket/GitLab:
    # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
    # https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
    # https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
    # git_user: ""
    # git_pass: ""
    # Limit credentials to a single domain, for example: github.com,
    # all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
    # git_host: ""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
    force_git_ssh_protocol: false
    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
    # force_git_ssh_port: 0
    # Force a specific SSH username when converting http to ssh links (the default username is 'git')
    # force_git_ssh_user: git

    # Set the python version to use when creating the virtual environment and launching the experiment
    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
    # The default is the python executing the clearml_agent
    python_binary: ""
    # ignore any requested python version (Default: False, if a Task was using a
    # specific python version and the system supports multiple pythons, the agent will use the requested python version)
    # ignore_requested_python_version: true

    # Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
    # default false, only the working directory will be added to the PYTHONPATH
    # force_git_root_python_path: false

    # if set, use GIT_ASKPASS to pass user/pass when cloning / fetching repositories
    # it solves passing user/token to git submodules.
    # this is a safer way to ensure multiple users using the same repository will
    # not accidentally leak credentials
    # Note: this is only supported on Linux systems
    # enable_git_ask_pass: true

    # in docker mode, if the container's entrypoint automatically activated a virtual environment
    # use the activated virtual environment and install everything there
    # set to False to disable, and always create a new venv inheriting from the system_site_packages
    # docker_use_activated_venv: true

    # select python package manager:
    # currently supported pip and conda
    # poetry is used if pip selected and repository contains poetry.lock file
    # currently supported: pip, conda and poetry
    # if "pip" or "conda" are used, the agent installs the required packages
    # based on the "installed packages" section of the Task. If the "installed packages" is empty,
    # it will revert to using `requirements.txt` from the repository's root directory.
    # If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
    # the "installed packages" section is ignored, and poetry is used.
    # If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
    package_manager: {
        # supported options: pip, conda, poetry
        type: pip,

        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
        # pip_version: "<20"
        # specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
        # pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10'"]
        # specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
        # poetry_version: "<2",
        # poetry_install_extra_args: ["-v"]

        # virtual environment inherits packages from system
        system_site_packages: false,
@@ -60,25 +93,50 @@ agent {
        # extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
        extra_index_url: []

        # additional flags to use when calling pip install, example: ["--use-deprecated=legacy-resolver", ]
        # extra_pip_install_flags: []

        # control the pytorch wheel resolving algorithm, options are: "pip", "direct", "none"
        # Override with environment variable CLEARML_AGENT_PACKAGE_PYTORCH_RESOLVE
        # "pip" (default): would automatically detect the cuda version, and supply pip with the correct
        #     extra-index-url, based on pytorch.org tables
        # "direct": would resolve a direct link to the pytorch wheel by parsing the pytorch.org pip repository
        #     and matching the automatically detected cuda version with the required pytorch wheel.
        #     if the exact cuda version is not found for the required pytorch wheel, it will try
        #     a lower cuda version until a match is found
        # "none": No resolver used, install pytorch like any other package
        # pytorch_resolve: "pip"

        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", ]
        conda_channels: ["pytorch", "conda-forge", "nvidia", "defaults", ]
        # conda_full_env_update: false

        # notice this will not install any additional packages into the selected environment, should be used in
        # conjunction with CLEARML_CONDA_ENV_PACKAGE which points to an existing conda environment directory
        # conda_env_as_base_docker: false

        # install into base conda environment
        # (should only be used if running in docker mode, because it will change the conda base environment)
        # use_conda_base_env: false

        # set the priority packages to be installed before the rest of the required packages
        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # priority_packages: ["cython", "numpy", "setuptools", ]

        # set the optional priority packages to be installed before the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # priority_optional_packages: ["pygobject", ]

        # set the post packages to be installed after all the rest of the required packages
        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # post_packages: ["horovod", ]

        # set the optional post packages to be installed after all the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # Note: this only controls the installation order of existing requirement packages (and does not add additional packages)
        # post_optional_packages: []

        # set to True to support torch nightly build installation,
@@ -89,17 +147,31 @@ agent {
    # target folder for virtual environments builds, created when executing experiment
    venvs_dir = ~/.clearml/venvs-builds

    # cached virtual environment folder
    venvs_cache: {
        # maximum number of cached venvs
        max_entries: 10
        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
        free_space_threshold_gb: 2.0
        # unmark to enable virtual environment caching
        path: ~/.clearml/venvs-cache
    },

    # cached git clone folder
    vcs_cache: {
        enabled: true,
        path: ~/.clearml/vcs-cache

        # if git pull failed, always revert to re-cloning the repo, it protects against old user name changes
        # clone_on_pull_fail: false
    },

    # DEPRECATED: please use `venvs_cache` and set `venvs_cache.path`
    # use venv-update in order to accelerate python virtual environment building
    # Still in beta, turned off by default
    venv_update: {
        enabled: false,
    },
    # venv_update: {
    #     enabled: false,
    # },

    # cached folder for specific python package download (mostly pytorch versions)
    pip_download_cache {
@@ -108,6 +180,12 @@ agent {
    },

    translate_ssh: true,

    # set "disable_ssh_mount: true" to disable the automatic mount of the ~/.ssh folder into the docker containers
    # default is false, automatically mounts ~/.ssh
    # Must be set to True if using "clearml-session" with this agent!
    # disable_ssh_mount: false

    # reload configuration file every daemon execution
    reload_config: false,

@@ -118,27 +196,128 @@ agent {

    # optional arguments to pass to docker image
    # these are local for this agent and will not be updated in the experiment's docker_cmd section
    # extra_docker_arguments: ["--ipc=host", ]
    # You can also pass host environments into the container with ["-e", "HOST_NAME=$HOST_NAME"]
    # extra_docker_arguments: ["--ipc=host", "-v", "/mnt/host/data:/mnt/data"]

    # Allow the extra docker arg to override task level docker arg (if the same argument is passed on both),
    # if set to False, a task docker arg will override the docker extra arg
    # docker_args_extra_precedes_task: true

    # prevent a task's docker args from being used if already specified in the extra_docker_arguments
    # protected_docker_extra_args: ["privileged", "security-opt", "network", "ipc"]

    # optional shell script to run in docker when started before the experiment is started
    # extra_docker_shell_script: ["apt-get install -y bindfs", ]

    # Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
    # for backwards compatibility reasons, true as default,
    # change to false to skip installation and decrease docker spin up time
    # docker_install_opencv_libs: true

    # Allow passing host environments into docker container with Task's docker container args
    # Example "-e HOST_NAME=$HOST_NAME"
    # NOTICE this might introduce a security risk allowing access to keys/secrets on the host machine!
    # Use with care!
    # docker_allow_host_environ: false

    # set to true in order to force "docker pull" before running an experiment using a docker image.
    # This makes sure the docker image is updated.
    docker_force_pull: false

    default_docker: {
        # default docker image to use when running in docker mode
        image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
        image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"

        # optional arguments to pass to docker image
        # arguments: ["--ipc=host"]

        # lookup table rules for default container
        # first matched rule will be picked, according to rule order
        # enterprise version only
        # match_rules: [
        #     {
        #         image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"
        #         arguments: "-e define=value"
        #         match: {
        #             script {
        #                 # Optional: must match all requirements (not partial)
        #                 requirements: {
        #                     # version selection matching PEP-440
        #                     pip: {
        #                         tensorflow: "~=2.6"
        #                     },
        #                 }
        #                 # Optional: matching based on regular expression, example: "^exact_match$"
        #                 repository: "/my_repository/"
        #                 branch: "main"
        #                 binary: "python3.6"
        #             }
        #             # Optional: matching based on regular expression, example: "^exact_match$"
        #             project: "project/sub_project"
        #         }
        #     },
        #     {
        #         image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"
        #         arguments: "-e define=value"
        #         match: {
        #             # must match all requirements (not partial)
        #             script {
        #                 requirements: {
        #                     conda: {
        #                         torch: ">=2.6,<2.8"
        #                     }
        #                 }
        #                 # no repository matching required
        #                 repository: ""
        #             }
        #             # no repository matching required
        #             project: ""
        #         }
        #     },
        # ]
    }

    # set the OS environments based on the Task's Environment section before launching the Task process.
    enable_task_env: false

    # CUDA versions used for Conda setup & solving PyTorch wheel packages
    # it Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # cuda_version: 10.1
    # cudnn_version: 7.6

    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
    # values with "********". Turning this feature on will hide the following environment variables values:
    # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
    # docker command, set:
    # extra_keys: ["MY_SPECIAL_PASSWORD"]
    hide_docker_command_env_vars {
        enabled: true
        extra_keys: []
        parse_embedded_urls: true
    }

    # allow to set internal mount points inside the docker,
    # especially useful for non-root docker container images.
    # docker_internal_mounts {
    #     sdk_cache: "/clearml_agent_cache"
    #     apt_cache: "/var/cache/apt/archives"
    #     ssh_folder: "/root/.ssh"
    #     ssh_ro_folder: "/.ssh"
    #     pip_cache: "/root/.cache/pip"
    #     poetry_cache: "/root/.cache/pypoetry"
    #     vcs_cache: "/root/.clearml/vcs-cache"
    #     venvs_cache: "/root/.clearml/venvs-cache"
    #     venv_build: "~/.clearml/venvs-builds"
    #     pip_download: "/root/.clearml/pip-download-cache"
    # }

    # Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
    # Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
    # Note: the resulting name must start with an alphanumeric character and
    # continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
    # docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
}

sdk {
@@ -146,7 +325,7 @@ sdk {

    storage {
        cache {
            # Defaults to system temp folder / cache
            # Defaults to <system_temp_folder>/clearml_cache
            default_base_dir: "~/.clearml/cache"
        }

@@ -210,6 +389,11 @@ sdk {
            key: ""
            secret: ""
            region: ""
            # Or enable credentials chain to let Boto3 pick the right credentials.
            # This includes picking credentials from environment variables,
            # credential file and IAM role using metadata service.
            # Refer to the latest Boto3 docs
            use_credentials_chain: false

            credentials: [
                # specifies key/secret credentials to use when handling s3 urls (read or write)
@@ -225,6 +409,7 @@ sdk {
                #     secret: "12345678"
                #     multipart: false
                #     secure: false
                #     verify: /path/to/ca/bundle.crt OR false to not verify
                # }
            ]
        }
@@ -278,7 +463,7 @@ sdk {
        vcs_repo_detect_async: True

        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
        # This stores "git diff" or into the experiment's "script.requirements.diff" section
        store_uncommitted_code_diff_on_train: True

        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
@@ -299,5 +484,50 @@ sdk {
            log_stdout: True
        }
    }

    # Apply top-level environment section from configuration into os.environ
    apply_environment: true
    # Apply top-level files section from configuration into local file system
    apply_files: true
}

# Environment section (top-level) is applied to the OS environment as `key=value` for each key/value pair
# * enable/disable with `agent.apply_environment` OR `sdk.apply_environment`
# Example:
#
# environment {
#     key_a: value_a
#     key_b: value_b
# }

# Files section (top-level) allows auto-generating files at designated paths with
# predefined content and target format.
# * enable/disable with `agent.apply_files` OR `sdk.apply_files`
# Files content options include:
#   contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
#   format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
#       base64-encoded contents string, otherwise ignored
#   path: the target file's path, may include ~ and inplace env vars
#   target_format: format used to encode contents before writing into the target file. Supported values are json,
#       yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
#   overwrite: overwrite the target file in case it exists. Default is true.
#   mode: file-system mode to be applied to the file after its creation. The mode string will be parsed into an
#       integer (e.g. "0o777" for -rwxrwxrwx)
# Example:
#   files {
#     myfile1 {
#       contents: "The quick brown fox jumped over the lazy dog"
#       path: "/tmp/fox.txt"
#     }
#     myjsonfile {
#       contents: {
#         some {
#           nested {
#             value: [1, 2, 3, 4]
#           }
#         }
#       }
#       path: "/tmp/test.json"
#       target_format: json
#     }
#   }

BIN  docs/clearml_architecture.png  Normal file
Binary file not shown. (After: 123 KiB)
Binary file not shown. (Before: 2.0 MiB | After: 1018 KiB)
@@ -146,7 +146,7 @@ sdk {

    storage {
        cache {
            # Defaults to system temp folder / cache
            # Defaults to <system_temp_folder>/clearml_cache
            default_base_dir: "~/.clearml/cache"
        }

@@ -5,27 +5,30 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Auto-Magically Spin AWS EC2 Instances On Demand \n",
|
||||
"# and Create a Dynamic Cluster Running *Trains-Agent*\n",
|
||||
"# and Create a Dynamic Cluster Running *ClearML-Agent*\n",
|
||||
"\n",
|
||||
"### Define your budget and execute the notebook, that's it\n",
|
||||
"### You now have a fully managed cluster on AWS 🎉 🎊 "
|
||||
"## Define your budget and execute the notebook, that's it\n",
|
||||
"## You now have a fully managed cluster on AWS 🎉 🎊"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**trains-agent**'s main goal is to quickly pull a job from an execution queue, setup the environment (as defined in the experiment, including git cloning, python packages etc.) then execute the experiment and monitor it.\n",
|
||||
"**clearml-agent**'s main goal is to quickly pull a job from an execution queue, set up the environment (as defined in the experiment, including git cloning, python packages etc.), then execute the experiment and monitor it.\n",
|
||||
"\n",
|
||||
"This notebook defines a cloud budget (currently only AWS is supported, but feel free to expand with PRs), and spins an instance the minute a job is waiting for execution. It will also spin down idle machines, saving you some $$$ :)\n",
|
||||
"\n",
|
||||
"Configuration steps\n",
|
||||
"> **Note:**\n",
|
||||
"> This is just an example of how you can use ClearML Agent to implement custom autoscaling. For a more structured autoscaler script, see [here](https://github.com/allegroai/clearml/blob/master/clearml/automation/auto_scaler.py).\n",
|
||||
"\n",
|
||||
"Configuration steps:\n",
|
||||
"- Define maximum budget to be used (instance type / number of instances).\n",
|
||||
"- Create new execution *queues* in the **trains-server**.\n",
|
||||
"- Define mapping between the created the *queues* and an instance budget.\n",
|
||||
"- Create new execution *queues* in the **clearml-server**.\n",
|
||||
"- Define mapping between the created *queues* and an instance budget.\n",
|
||||
"\n",
|
||||
"**TL;DR - This notebook:**\n",
|
||||
"- Will spin instances if there are jobs in the execution *queues*, until it will hit the budget limit. \n",
|
||||
"- Will spin instances if there are jobs in the execution *queues* until it will hit the budget limit.\n",
|
||||
"- If machines are idle, it will spin them down.\n",
|
||||
"\n",
|
||||
"The controller implementation itself is stateless, meaning you can always re-execute the notebook, if for some reason it stopped.\n",
|
||||
@@ -39,7 +42,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Install & import required packages"
|
||||
"### Install & import required packages"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -48,7 +51,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install trains-agent\n",
|
||||
"!pip install clearml-agent\n",
|
||||
"!pip install boto3"
|
||||
]
|
||||
},
|
||||
@@ -56,7 +59,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Define AWS instance types and configuration (Instance Type, EBS, AMI etc.)"
|
||||
"### Define AWS instance types and configuration (Instance Type, EBS, AMI etc.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -92,17 +95,17 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Define machine budget per execution queue\n",
|
||||
"### Define machine budget per execution queue\n",
|
||||
"\n",
|
||||
"Now that we defined our budget, we need to connect it with the **Trains** cluster.\n",
|
||||
"Now that we defined our budget, we need to connect it with the **ClearML** cluster.\n",
|
||||
"\n",
|
||||
"We map each queue to a resource type (instance type).\n",
|
||||
"\n",
|
||||
"Create two queues in the WebUI:\n",
|
||||
"- Browse to http://your_trains_server_ip:8080/workers-and-queues/queues\n",
|
||||
"Create two queues in the Web UI:\n",
|
||||
"- Browse to http://your_clearml_server_ip:8080/workers-and-queues/queues\n",
|
||||
"- Then click on the \"New Queue\" button and name your queues \"aws_normal\" and \"aws_high\" respectively\n",
|
||||
"\n",
|
||||
"The QUEUES dictionary hold the mapping between the queue name and the type/number of instances to spin connected to the specific queue.\n",
|
||||
"The QUEUES dictionary holds the mapping between the queue name and the type/number of instances to spin connected to the specific queue.\n",
|
||||
"```\n",
|
||||
"QUEUES = {\n",
|
||||
" 'queue_name': [(\"instance-type-as-defined-in-RESOURCE_CONFIGURATIONS\", max_number_of_instances), ]\n",
|
||||
@@ -116,7 +119,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Trains-Agent Queues - Machines budget per Queue\n",
|
||||
"# ClearML Agent Queues - Machines budget per Queue\n",
|
||||
"# Per queue: list of (machine type as defined in RESOURCE_CONFIGURATIONS,\n",
|
||||
"# max instances for the specific queue). Order machines from most preferred to least.\n",
|
||||
"QUEUES = {\n",
|
||||
@@ -129,7 +132,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Credentials for your AWS account, as well as for your **Trains-Server**"
|
||||
"### Credentials for your AWS account, as well as for your **ClearML Server**"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -143,24 +146,25 @@
"CLOUD_CREDENTIALS_SECRET = \"\"\n",
"CLOUD_CREDENTIALS_REGION = \"us-east-1\"\n",
"\n",
"# TRAINS configuration\n",
"TRAINS_SERVER_WEB_SERVER = \"http://localhost:8080\"\n",
"TRAINS_SERVER_API_SERVER = \"http://localhost:8008\"\n",
"TRAINS_SERVER_FILES_SERVER = \"http://localhost:8081\"\n",
"# TRAINS credentials\n",
"TRAINS_ACCESS_KEY = \"\"\n",
"TRAINS_SECRET_KEY = \"\"\n",
"# Git User/Pass to be used by trains-agent,\n",
"# CLEARML configuration\n",
"CLEARML_WEB_SERVER = \"http://localhost:8080\"\n",
"CLEARML_API_SERVER = \"http://localhost:8008\"\n",
"CLEARML_FILES_SERVER = \"http://localhost:8081\"\n",
"# CLEARML credentials\n",
"CLEARML_API_ACCESS_KEY = \"\"\n",
"CLEARML_API_SECRET_KEY = \"\"\n",
"# Git User/Pass to be used by clearml-agent,\n",
"# leave empty if image already contains git ssh-key\n",
"TRAINS_GIT_USER = \"\"\n",
"TRAINS_GIT_PASS = \"\"\n",
"CLEARML_AGENT_GIT_USER = \"\"\n",
"CLEARML_AGENT_GIT_PASS = \"\"\n",
"\n",
"# Additional fields for trains.conf file created on the remote instance\n",
"# for example: 'agent.default_docker.image: \"nvidia/cuda:10.0-cudnn7-runtime\"'\n",
"EXTRA_TRAINS_CONF = \"\"\"\n",
"# Additional fields for clearml.conf file created on the remote instance\n",
"# for example: 'agent.default_docker.image: \"nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04\"'\n",
"\n",
"EXTRA_CLEARML_CONF = \"\"\"\n",
"\"\"\"\n",
"\n",
"# Bash script to run on instances before running trains-agent\n",
"# Bash script to run on instances before running clearml-agent\n",
"# Example: \"\"\"\n",
"# echo \"This is the first line\"\n",
"# echo \"This is the second line\"\n",
@@ -168,9 +172,9 @@
"EXTRA_BASH_SCRIPT = \"\"\"\n",
"\"\"\"\n",
"\n",
"# Default docker for trains-agent when running in docker mode (requires docker v19.03 and above). \n",
"# Leave empty to run trains-agent in non-docker mode.\n",
"DEFAULT_DOCKER_IMAGE = \"nvidia/cuda\""
"# Default docker for clearml-agent when running in docker mode (requires docker v19.03 and above).\n",
"# Leave empty to run clearml-agent in non-docker mode.\n",
"CLEARML_AGENT_DOCKER_IMAGE = \"nvidia/cuda\""
]
},
{
@@ -192,7 +196,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Import Packages and Budget Definition Sanity Check"
"### Import Packages and Budget Definition Sanity Check"
]
},
{
@@ -209,7 +213,7 @@
"from time import sleep, time\n",
"\n",
"import boto3\n",
"from trains_agent.backend_api.session.client import APIClient"
"from clearml_agent.backend_api.session.client import APIClient"
]
},
{
@@ -227,36 +231,36 @@
" \"A resource name can only appear in a single queue definition.\"\n",
" )\n",
"\n",
"# Encode EXTRA_TRAINS_CONF for later bash script usage\n",
"EXTRA_TRAINS_CONF_ENCODED = \"\\\\\\\"\".join(EXTRA_TRAINS_CONF.split(\"\\\"\"))"
"# Encode EXTRA_CLEARML_CONF for later bash script usage\n",
"EXTRA_CLEARML_CONF_ENCODED = \"\\\\\\\"\".join(EXTRA_CLEARML_CONF.split(\"\\\"\"))"
]
},
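To see what this quote-escaping produces, here is a standalone sketch (the configuration line is illustrative only):

```python
# Standalone sketch of the quote-escaping performed above
extra_conf = 'agent.default_docker.image: "nvidia/cuda"'  # illustrative value
encoded = "\\\"".join(extra_conf.split("\""))
print(encoded)  # agent.default_docker.image: \"nvidia/cuda\"
# The escaped quotes survive the double-quoted echo "{clearml_conf}" line
# in the instance's user-data bash script.
```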
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Cloud specific implementation of spin up/down - currently supports AWS only"
"### Cloud specific implementation of spin up/down - currently supports AWS only"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Cloud-specific implementation (currently, only AWS EC2 is supported)\n",
"def spin_up_worker(resource, worker_id_prefix, queue_name):\n",
"    \"\"\"\n",
"    Creates a new worker for trains.\n",
"    Creates a new worker for clearml.\n",
"    First, create an instance in the cloud and install some required packages.\n",
"    Then, define trains-agent environment variables and run \n",
"    trains-agent for the specified queue.\n",
"    Then, define clearml-agent environment variables and run\n",
"    clearml-agent for the specified queue.\n",
"    NOTE: - Will wait until instance is running\n",
"          - This implementation assumes the instance image already has docker installed\n",
"\n",
"    :param str resource: resource name, as defined in BUDGET and QUEUES.\n",
"    :param str worker_id_prefix: worker name prefix\n",
"    :param str queue_name: trains queue to listen to\n",
"    :param str queue_name: clearml queue to listen to\n",
"    \"\"\"\n",
"    resource_conf = RESOURCE_CONFIGURATIONS[resource]\n",
"    # Add worker type and AWS instance type to the worker name.\n",
@@ -267,8 +271,8 @@
"    )\n",
"\n",
"    # user_data script will automatically run when the instance is started. \n",
"    # It will install the required packages for trains-agent configure it using \n",
"    # environment variables and run trains-agent on the required queue\n",
"    # It will install the required packages for clearml-agent, configure it using\n",
"    # environment variables, and run clearml-agent on the required queue\n",
"    user_data = \"\"\"#!/bin/bash\n",
"    sudo apt-get update\n",
"    sudo apt-get install -y python3-dev\n",
@@ -278,36 +282,36 @@
"    sudo apt-get install -y build-essential\n",
"    python3 -m pip install -U pip\n",
"    python3 -m pip install virtualenv\n",
"    python3 -m virtualenv trains_agent_venv\n",
"    source trains_agent_venv/bin/activate\n",
"    python -m pip install trains-agent\n",
"    echo 'agent.git_user=\\\"{git_user}\\\"' >> /root/trains.conf\n",
"    echo 'agent.git_pass=\\\"{git_pass}\\\"' >> /root/trains.conf\n",
"    echo \"{trains_conf}\" >> /root/trains.conf\n",
"    export TRAINS_API_HOST={api_server}\n",
"    export TRAINS_WEB_HOST={web_server}\n",
"    export TRAINS_FILES_HOST={files_server}\n",
"    python3 -m virtualenv clearml_agent_venv\n",
"    source clearml_agent_venv/bin/activate\n",
"    python -m pip install clearml-agent\n",
"    echo 'agent.git_user=\\\"{git_user}\\\"' >> /root/clearml.conf\n",
"    echo 'agent.git_pass=\\\"{git_pass}\\\"' >> /root/clearml.conf\n",
"    echo \"{clearml_conf}\" >> /root/clearml.conf\n",
"    export CLEARML_API_HOST={api_server}\n",
"    export CLEARML_WEB_HOST={web_server}\n",
"    export CLEARML_FILES_HOST={files_server}\n",
"    export DYNAMIC_INSTANCE_ID=`curl http://169.254.169.254/latest/meta-data/instance-id`\n",
"    export TRAINS_WORKER_ID={worker_id}:$DYNAMIC_INSTANCE_ID\n",
"    export TRAINS_API_ACCESS_KEY='{access_key}'\n",
"    export TRAINS_API_SECRET_KEY='{secret_key}'\n",
"    export CLEARML_WORKER_ID={worker_id}:$DYNAMIC_INSTANCE_ID\n",
"    export CLEARML_API_ACCESS_KEY='{access_key}'\n",
"    export CLEARML_API_SECRET_KEY='{secret_key}'\n",
"    {bash_script}\n",
"    source ~/.bashrc\n",
"    python -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' {docker}\n",
"    python -m clearml_agent --config-file '/root/clearml.conf' daemon --queue '{queue}' {docker}\n",
"    shutdown\n",
"    \"\"\".format(\n",
"        api_server=TRAINS_SERVER_API_SERVER,\n",
"        web_server=TRAINS_SERVER_WEB_SERVER,\n",
"        files_server=TRAINS_SERVER_FILES_SERVER,\n",
"        api_server=CLEARML_API_SERVER,\n",
"        web_server=CLEARML_WEB_SERVER,\n",
"        files_server=CLEARML_FILES_SERVER,\n",
"        worker_id=worker_id,\n",
"        access_key=TRAINS_ACCESS_KEY,\n",
"        secret_key=TRAINS_SECRET_KEY,\n",
"        access_key=CLEARML_API_ACCESS_KEY,\n",
"        secret_key=CLEARML_API_SECRET_KEY,\n",
"        queue=queue_name,\n",
"        git_user=TRAINS_GIT_USER,\n",
"        git_pass=TRAINS_GIT_PASS,\n",
"        trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n",
"        git_user=CLEARML_AGENT_GIT_USER,\n",
"        git_pass=CLEARML_AGENT_GIT_PASS,\n",
"        clearml_conf=EXTRA_CLEARML_CONF_ENCODED,\n",
"        bash_script=EXTRA_BASH_SCRIPT,\n",
"        docker=\"--docker '{}'\".format(DEFAULT_DOCKER_IMAGE) if DEFAULT_DOCKER_IMAGE else \"\"\n",
"        docker=\"--docker '{}'\".format(CLEARML_AGENT_DOCKER_IMAGE) if CLEARML_AGENT_DOCKER_IMAGE else \"\"\n",
"    )\n",
"\n",
"    ec2 = boto3.client(\n",
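The launch call that follows is elided by this diff; for context, a minimal sketch of what such a boto3 launch typically looks like is below. The AMI id and instance type are placeholder assumptions; in the notebook these values come from resource_conf:

```python
# Hypothetical sketch -- values are placeholders, not this notebook's configuration
ec2 = boto3.client(
    "ec2",
    region_name=CLOUD_CREDENTIALS_REGION,
    aws_access_key_id=CLOUD_CREDENTIALS_KEY,
    aws_secret_access_key=CLOUD_CREDENTIALS_SECRET,
)
reservation = ec2.run_instances(
    ImageId="ami-0123456789abcdef0",  # placeholder AMI with docker preinstalled
    InstanceType="g4dn.xlarge",       # placeholder; the notebook reads resource_conf
    MinCount=1,
    MaxCount=1,
    UserData=user_data,               # the bash script assembled above
)
instance_id = reservation["Instances"][0]["InstanceId"]
```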
@@ -405,7 +409,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"###### Controller Implementation and Logic"
"#### Controller Implementation and Logic"
]
},
{
@@ -430,18 +434,18 @@
"\n",
"    # Internal definitions\n",
"    workers_prefix = \"dynamic_aws\"\n",
"    # Worker's id in trains would be composed from:\n",
"    # Worker's id in clearml would be composed from:\n",
"    # prefix, name, instance_type and cloud_id separated by ':'\n",
"    workers_pattern = re.compile(\n",
"        r\"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)\"\n",
"    )\n",
"\n",
"    # Set up the environment variables for trains\n",
"    os.environ[\"TRAINS_API_HOST\"] = TRAINS_SERVER_API_SERVER\n",
"    os.environ[\"TRAINS_WEB_HOST\"] = TRAINS_SERVER_WEB_SERVER\n",
"    os.environ[\"TRAINS_FILES_HOST\"] = TRAINS_SERVER_FILES_SERVER\n",
"    os.environ[\"TRAINS_API_ACCESS_KEY\"] = TRAINS_ACCESS_KEY\n",
"    os.environ[\"TRAINS_API_SECRET_KEY\"] = TRAINS_SECRET_KEY\n",
"    # Set up the environment variables for clearml\n",
"    os.environ[\"CLEARML_API_HOST\"] = CLEARML_API_SERVER\n",
"    os.environ[\"CLEARML_WEB_HOST\"] = CLEARML_WEB_SERVER\n",
"    os.environ[\"CLEARML_FILES_HOST\"] = CLEARML_FILES_SERVER\n",
"    os.environ[\"CLEARML_API_ACCESS_KEY\"] = CLEARML_API_ACCESS_KEY\n",
"    os.environ[\"CLEARML_API_SECRET_KEY\"] = CLEARML_API_SECRET_KEY\n",
"    api_client = APIClient()\n",
"\n",
"    # Verify the requested queues exist and create those that don't exist\n",
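A standalone sketch of how this worker-id pattern decomposes a worker name (the worker id below is hypothetical, following the prefix:name:instance_type:cloud_id layout):

```python
import re

workers_pattern = re.compile(
    r"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)"
)
# Hypothetical worker id registered by a dynamically spun-up instance
match = workers_pattern.match("dynamic_aws:aws_high:g4dn.xlarge:i-0abc1234def567890")
assert match["prefix"] == "dynamic_aws"
assert match["cloud_id"] == "i-0abc1234def567890"  # used later to terminate the instance
```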
@@ -520,7 +524,7 @@
"            # skip resource types that might be needed\n",
"            if resources in required_idle_resources:\n",
"                continue\n",
"            # Remove from both aws and trains all instances that are \n",
"            # Remove from both aws and clearml all instances that are\n",
"            # idle for longer than MAX_IDLE_TIME_MIN\n",
"            if time() - timestamp > MAX_IDLE_TIME_MIN * 60.0:\n",
"                cloud_id = workers_pattern.match(worker.id)[\"cloud_id\"]\n",
@@ -535,7 +539,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Execute Forever* (the controller is stateless, so you can always re-execute the notebook)"
"### Execute Forever* (the controller is stateless, so you can always re-execute the notebook)"
]
},
{
@@ -584,4 +588,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

@@ -10,42 +10,89 @@ from clearml_agent.glue.k8s import K8sIntegration

def parse_args():
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()

    parser.add_argument(
        "--queue", type=str, help="Queue to pull tasks from"
        "--queue",
        type=str,
        help="Queues to pull tasks from. If multiple queues, use comma separated list, e.g. 'queue1,queue2'",
    )
    parser.add_argument(
        "--ports-mode", action='store_true', default=False,
    group.add_argument(
        "--ports-mode",
        action="store_true",
        default=False,
        help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports. "
        "Should not be used with max-pods"
    )
    parser.add_argument(
        "--num-of-services", type=int, default=20,
        help="Specify the number of k8s services to be used. Use only with ports-mode."
        "--num-of-services",
        type=int,
        default=20,
        help="Specify the number of k8s services to be used. Use only with ports-mode.",
    )
    parser.add_argument(
        "--base-port", type=int,
        "--base-port",
        type=int,
        help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
        "For pod #X, the port will be <base-port>+X"
        "For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num, "
        "e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
    )
    parser.add_argument(
        "--gateway-address", type=str, default=None,
        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
        "--base-pod-num",
        type=int,
        default=1,
        help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
        "service (default: %(default)s)"
    )
    parser.add_argument(
        "--pod-clearml-conf", type=str,
        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
        "--gateway-address",
        type=str,
        default=None,
        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB",
    )
    parser.add_argument(
        "--overrides-yaml", type=str,
        help="YAML file containing pod overrides to be used when launching a new pod"
        "--pod-clearml-conf",
        type=str,
        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)",
    )
    parser.add_argument(
        "--template-yaml", type=str,
        "--overrides-yaml", type=str, help="YAML file containing pod overrides to be used when launching a new pod"
    )
    parser.add_argument(
        "--template-yaml",
        type=str,
        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
        "and overrides are ignored, otherwise it will be scheduled with kubectl run"
    )
    parser.add_argument(
        "--ssh-server-port", type=int, default=0,
        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
        "--ssh-server-port",
        type=int,
        default=0,
        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        help="Specify the namespace in which pods will be created (default: %(default)s)",
        default="clearml",
    )
    group.add_argument(
        "--max-pods",
        type=int,
        help="Limit the maximum number of pods that this service can run at the same time. "
        "Should not be used with ports-mode"
    )
    parser.add_argument(
        "--use-owner-token",
        action="store_true",
        default=False,
        help="Generate and use task owner token for the execution of each task",
    )
    parser.add_argument(
        "--create-queue",
        action="store_true",
        default=False,
        help="Create the queue if it does not exist (default: %(default)s)",
    )
    return parser.parse_args()

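To make the base-port/base-pod-num arithmetic concrete, here is a small sketch; the helper below is illustrative, not part of the glue code:

```python
def pod_port(base_port: int, base_pod_num: int, pod_index: int) -> int:
    # Pod numbering starts at base_pod_num, so the first pod
    # (pod_index=0) is exposed on base_port + base_pod_num.
    return base_port + base_pod_num + pod_index

assert pod_port(20000, 3, 0) == 20003  # matches the --base-port help text
```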
@@ -55,20 +102,32 @@ def main():

    user_props_cb = None
    if args.ports_mode and args.base_port:

        def k8s_user_props_cb(pod_number=0):
            user_prop = {"k8s-pod-port": args.base_port + pod_number}
            if args.gateway_address:
                user_prop["k8s-gateway-address"] = args.gateway_address
            return user_prop

        user_props_cb = k8s_user_props_cb

    k8s = K8sIntegration(
        ports_mode=args.ports_mode, num_of_services=args.num_of_services, user_props_cb=user_props_cb,
        overrides_yaml=args.overrides_yaml, trains_conf_file=args.pod_trains_conf, template_yaml=args.template_yaml,
        extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
            ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None
        ports_mode=args.ports_mode,
        num_of_services=args.num_of_services,
        base_pod_num=args.base_pod_num,
        user_props_cb=user_props_cb,
        overrides_yaml=args.overrides_yaml,
        clearml_conf_file=args.pod_clearml_conf,
        template_yaml=args.template_yaml,
        extra_bash_init_script=K8sIntegration.get_ssh_server_bash(ssh_port_number=args.ssh_server_port)
        if args.ssh_server_port
        else None,
        namespace=args.namespace,
        max_pods_limit=args.max_pods or None,
    )
    k8s.k8s_daemon(args.queue)
    queue = [q.strip() for q in args.queue.split(",") if q.strip()] if args.queue else None

    k8s.k8s_daemon(queue, use_owner_token=args.use_owner_token, create_queue=args.create_queue)


if __name__ == "__main__":
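The comma-separated --queue handling above can be exercised in isolation; a minimal sketch with placeholder queue names:

```python
# Standalone sketch of the --queue normalization performed in main()
args_queue = "queue1, queue2,"  # hypothetical CLI value
queue = [q.strip() for q in args_queue.split(",") if q.strip()] if args_queue else None
assert queue == ["queue1", "queue2"]
```

A typical invocation would then be something like `python k8s_glue_example.py --queue queue1,queue2 --create-queue` (script name assumed).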
@@ -1,20 +1,15 @@
attrs>=18.0,<20.4.0
attrs>=18.0,<24.0.0
enum34>=0.9,<1.2.0 ; python_version < '3.6'
furl>=2.0.0,<2.2.0
future>=0.16.0,<0.19.0
humanfriendly>=2.1,<9.2
jsonschema>=2.6.0,<3.3.0
jsonschema>=2.6.0,<5.0.0
pathlib2>=2.3.0,<2.4.0
psutil>=3.4.2,<5.9.0
pyhocon>=0.3.38,<0.4.0
pyparsing>=2.0.3,<2.5.0
psutil>=3.4.2,<5.10.0
pyparsing>=2.0.3,<3.2.0
python-dateutil>=2.4.2,<2.9.0
pyjwt>=1.6.4,<1.8.0
PyYAML>=3.12,<5.4.0
requests-file>=1.4.2,<1.6.0
requests>=2.20.0,<2.26.0
six>=1.11.0,<1.16.0
tqdm>=4.19.5,<4.55.0
typing>=3.6.4,<3.8.0
urllib3>=1.21.1,<1.27.0
virtualenv>=16,<20
pyjwt>=2.4.0,<2.9.0
PyYAML>=3.12,<6.1
requests>=2.20.0,<=2.31.0
six>=1.13.0,<1.17.0
typing>=3.6.4,<3.8.0 ; python_version < '3.5'
urllib3>=1.21.1,<2
virtualenv>=16,<21

6 setup.py
@@ -44,7 +44,7 @@ setup(
    author_email='clearml@allegro.ai',
    license='Apache License 2.0',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Development Status :: 5 - Production/Stable',

        'Intended Audience :: Developers',
        'Intended Audience :: System Administrators',
@@ -60,6 +60,10 @@ setup(
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
        'License :: OSI Approved :: Apache Software License',
    ],
