Mirror of https://github.com/clearml/clearml-agent, synced 2025-06-26 18:16:15 +00:00
Compare commits
313 Commits
README.md (293 lines changed)
@@ -1,80 +1,109 @@
|
||||
# TRAINS Agent
|
||||
## Deep Learning DevOps For Everyone - Now supporting all platforms (Linux, macOS, and Windows)
|
||||
<div align="center">
|
||||
|
||||
"All the Deep-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
|
||||
<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_agent_logo.png?raw=true" width="250px">
|
||||
|
||||
[](https://img.shields.io/github/license/allegroai/trains-agent.svg)
|
||||
[](https://img.shields.io/pypi/pyversions/trains-agent.svg)
|
||||
[](https://img.shields.io/pypi/v/trains-agent.svg)
|
||||
[](https://pypi.python.org/pypi/trains-agent/)
|
||||
**ClearML Agent - ML-Ops made easy
|
||||
ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
|
||||
|
||||
### Help improve Trains by filling our 2-min [user survey](https://allegro.ai/lp/trains-user-survey/)
|
||||
[](https://img.shields.io/github/license/allegroai/clearml-agent.svg)
|
||||
[](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
|
||||
[](https://img.shields.io/pypi/v/clearml-agent.svg)
|
||||
[](https://pypi.org/project/clearml-agent/)
|
||||
[](https://artifacthub.io/packages/search?repo=allegroai)
|
||||
</div>
|
||||
|
||||
**TRAINS Agent is an AI experiment cluster solution.**
|
||||
---
|
||||
|
||||
It is a zero configuration fire-and-forget execution agent, which combined with trains-server provides a full AI cluster solution.
|
||||
### ClearML-Agent
|
||||
#### *Formerly known as Trains Agent*
|
||||
|
||||
**Full AutoML in 5 steps**
|
||||
1. Install the [TRAINS server](https://github.com/allegroai/trains-agent) (or use our [open server](https://demoapp.trains.allegro.ai))
|
||||
2. `pip install trains-agent` ([install](#installing-the-trains-agent) the TRAINS agent on any GPU machine: on-premises / cloud / ...)
|
||||
3. Add [TRAINS](https://github.com/allegroai/trains) to your code with just 2 lines & run it once (on your machine / laptop)
|
||||
4. Change the [parameters](#using-the-trains-agent) in the UI & schedule for [execution](#using-the-trains-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
|
||||
|
||||
* Run jobs (experiments) on any local or cloud based resource
|
||||
* Implement optimized resource utilization policies
|
||||
* Deploy execution environments with either virtualenv or fully docker containerized with zero effort
|
||||
* Launch-and-Forget service containers
|
||||
* [Cloud autoscaling](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler)
|
||||
* [Customizable cleanup](https://clear.ml/docs/latest/docs/guides/services/cleanup_service)
|
||||
* Advanced [pipeline building and execution](https://clear.ml/docs/latest/docs/guides/frameworks/pytorch/notebooks/table/tabular_training_pipeline)
|
||||
|
||||
It is a zero configuration fire-and-forget execution agent, providing a full ML/DL cluster solution.
|
||||
|
||||
**Full Automation in 5 steps**
|
||||
1. ClearML Server [self-hosted](https://github.com/allegroai/clearml-server) or [free tier hosting](https://app.clear.ml)
|
||||
2. `pip install clearml-agent` ([install](#installing-the-clearml-agent) the ClearML Agent on any GPU machine: on-premises / cloud / ...)
|
||||
3. Create a [job](https://github.com/allegroai/clearml/docs/clearml-task.md) or Add [ClearML](https://github.com/allegroai/clearml) to your code with just 2 lines
|
||||
4. Change the [parameters](#using-the-clearml-agent) in the UI & schedule for [execution](#using-the-clearml-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
|
||||
5. :chart_with_downwards_trend: :chart_with_upwards_trend: :eyes: :beer:
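Step 3 above is really just two lines of Python. As a minimal sketch (the project and task names below are placeholders; everything else is the standard `clearml` package API):

```python
from clearml import Task

# Registers this run as an experiment on the ClearML Server; names are placeholders
task = Task.init(project_name="examples", task_name="my first experiment")

# ... the rest of the training code runs unchanged; ClearML records the git commit,
# uncommitted diff, installed packages and console output for the run.
```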
|
||||
|
||||
"All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
|
||||
|
||||
**Using the TRAINS agent, you can now set up a dynamic cluster with \*epsilon DevOps**
|
||||
**Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server) or [Free tier Hosting](https://app.clear.ml)
|
||||
<a href="https://app.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>
|
||||
|
||||
*epsilon - Because we are scientists :triangular_ruler: and nothing is really zero work
|
||||
|
||||
(Experience TRAINS live at [https://demoapp.trains.allegro.ai](https://demoapp.trains.allegro.ai))
|
||||
<a href="https://demoapp.trains.allegro.ai"><img src="https://raw.githubusercontent.com/allegroai/trains-agent/9f1e86c1ca45c984ee13edc9353c7b10c55d7257/docs/screenshots.gif" width="100%"></a>
|
||||
|
||||
## Simple, Flexible Experiment Orchestration
|
||||
**The TRAINS Agent was built to address the DL/ML R&D DevOps needs:**
|
||||
### Simple, Flexible Experiment Orchestration
|
||||
**The ClearML Agent was built to address the DL/ML R&D DevOps needs:**
|
||||
|
||||
* Easily add & remove machines from the cluster
|
||||
* Reuse machines without the need for any dedicated containers or images
|
||||
* **Combine GPU resources across any cloud and on-prem**
|
||||
* **No need for yaml/json/template configuration of any kind**
|
||||
* **No need for yaml / json / template configuration of any kind**
|
||||
* **User friendly UI**
|
||||
* Manageable resource allocation that can be used by researchers and engineers
|
||||
* Flexible and controllable scheduler with priority support
|
||||
* Automatic instance spinning in the cloud **(coming soon)**
|
||||
* Automatic instance spinning in the cloud
|
||||
|
||||
**Using the ClearML Agent, you can now set up a dynamic cluster with \*epsilon DevOps**
|
||||
|
||||
*epsilon - Because we are scientists :triangular_ruler: and nothing is really zero work
|
||||
|
||||
|
||||
## But ... K8S?
|
||||
We think Kubernetes is awesome.
|
||||
Combined with KubeFlow it is a robust solution for production-grade DevOps.
|
||||
We've observed, however, that it can be a bit of an overkill as an R&D DL/ML solution.
|
||||
If you are considering K8S for your research, also consider that you will soon be managing **hundreds** of containers...
|
||||
### Kubernetes Integration (Optional)
|
||||
We think Kubernetes is awesome, but it should be a choice.
|
||||
We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
|
||||
|
||||
In our experience, handling and building the environments, having to package every experiment in a docker, managing those hundreds (or more) containers and building pipelines on top of it all, is very complicated (also, it’s usually out of scope for the research team, and overwhelming even for the DevOps team).
|
||||
Find Dockerfiles in the [docker](./docker) directory and a Helm chart at https://github.com/allegroai/clearml-helm-charts
|
||||
#### Benefits of integrating existing K8s with ClearML-Agent
|
||||
- ClearML-Agent adds the missing scheduling capabilities to K8s
|
||||
- Allowing for more flexible automation from code
|
||||
- A programmatic interface for easier learning curve (and debugging)
|
||||
- Seamless integration with ML/DL experiment manager
|
||||
- Web UI for customization, scheduling & prioritization of jobs
|
||||
|
||||
We feel there has to be a better way, that can be just as powerful for R&D and at the same time allow integration with K8S **when the need arises**.
|
||||
(If you already have a K8S cluster for AI, detailed instructions on how to integrate TRAINS into your K8S cluster are *coming soon*.)
|
||||
**Two K8s integration flavours**
|
||||
- Spin ClearML-Agent as a long-lasting service pod
|
||||
- use [clearml-agent](https://hub.docker.com/r/allegroai/clearml-agent) docker image
|
||||
- map docker socket into the pod (soon replaced by [podman](https://github.com/containers/podman))
|
||||
- allow the clearml-agent to manage sibling dockers
|
||||
- benefits: full use of the ClearML scheduling, no need to worry about wrong container images / lost pods etc.
|
||||
- downside: Sibling containers
|
||||
- Kubernetes Glue, map ClearML jobs directly to K8s jobs
|
||||
- Run the [clearml-k8s glue](https://github.com/allegroai/clearml-agent/blob/master/examples/k8s_glue_example.py) on a K8s cpu node
|
||||
- The clearml-k8s glue pulls jobs from the ClearML job execution queue and prepares a K8s job (based on provided yaml template)
|
||||
- Inside the pod itself, the clearml-agent will install the job (experiment) environment, then spin up and monitor the experiment's process
|
||||
- benefits: Kubernetes full view of all running jobs in the system
|
||||
- downside: No real scheduling (k8s scheduler), no docker image verification (post-mortem only)
|
||||
|
||||
|
||||
## Using the TRAINS Agent
|
||||
### Using the ClearML Agent
|
||||
**Full scale HPC with a click of a button**
|
||||
|
||||
TRAINS Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the job and monitors its progress.
|
||||
The ClearML Agent is a job scheduler that listens on job queue(s), pulls jobs, sets the job environments, executes the job and monitors its progress.
|
||||
|
||||
Any 'Draft' experiment can be scheduled for execution by a TRAINS agent.
|
||||
Any 'Draft' experiment can be scheduled for execution by a ClearML agent.
|
||||
|
||||
A previously run experiment can be put into 'Draft' state by either of two methods:
|
||||
* Using the **'Reset'** action from the experiment right-click context menu in the
|
||||
TRAINS UI - This will clear any results and artifacts the previous run had created.
|
||||
ClearML UI - This will clear any results and artifacts the previous run had created.
|
||||
* Using the **'Clone'** action from the experiment right-click context menu in the
|
||||
TRAINS UI - This will create a new 'Draft' experiment with the same configuration as the original experiment.
|
||||
ClearML UI - This will create a new 'Draft' experiment with the same configuration as the original experiment.
|
||||
|
||||
An experiment is scheduled for execution using the **'Enqueue'** action from the experiment
|
||||
right-click context menu in the TRAINS UI and selecting the execution queue.
|
||||
right-click context menu in the ClearML UI and selecting the execution queue.
|
||||
|
||||
See [creating an experiment and enqueuing it for execution](#from-scratch).
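The same clone-and-enqueue flow can also be driven from code instead of the UI, which is handy for automation. A hedged sketch using the `clearml` SDK (the task ID and queue name are placeholders; `Task.clone` / `Task.enqueue` belong to the `clearml` package, not to clearml-agent):

```python
from clearml import Task

# Fetch a previously executed experiment to use as a template (placeholder ID)
template = Task.get_task(task_id="<template_task_id>")

# Equivalent of the UI 'Clone' action: creates a new 'Draft' experiment
draft = Task.clone(source_task=template, name="cloned experiment")

# Equivalent of the UI 'Enqueue' action: an agent monitoring 'default' will pick it up
Task.enqueue(draft, queue_name="default")
```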
|
||||
|
||||
Once an experiment is enqueued, it will be picked up and executed by a TRAINS agent monitoring this queue.
|
||||
Once an experiment is enqueued, it will be picked up and executed by a ClearML agent monitoring this queue.
|
||||
|
||||
The TRAINS UI Workers & Queues page provides ongoing execution information:
|
||||
The ClearML UI Workers & Queues page provides ongoing execution information:
|
||||
- Workers Tab: Monitor your cluster
|
||||
- Review available resources
|
||||
- Monitor machines statistics (CPU / GPU / Disk / Network)
|
||||
@@ -83,154 +112,129 @@ The TRAINS UI Workers & Queues page provides ongoing execution information:
|
||||
- Cancel or abort job execution
|
||||
- Move jobs between execution queues
|
||||
|
||||
### What The TRAINS Agent Actually Does
|
||||
The TRAINS agent executes experiments using the following process:
|
||||
#### What The ClearML Agent Actually Does
|
||||
The ClearML Agent executes experiments using the following process:
|
||||
- Create a new virtual environment (or launch the selected docker image)
|
||||
- Clone the code into the virtual-environment (or inside the docker)
|
||||
- Install python packages based on the package requirements listed for the experiment
|
||||
- Special note for PyTorch: The TRAINS agent will automatically select the
|
||||
- Special note for PyTorch: The ClearML Agent will automatically select the
|
||||
torch packages based on the CUDA_VERSION environment variable of the machine
|
||||
- Execute the code, while monitoring the process
|
||||
- Log all stdout/stderr in the TRAINS UI, including the cloning and installation process, for easy debugging
|
||||
- Monitor the execution and allow you to manually abort the job using the TRAINS UI (or, in the unfortunate case of a code crash, catch the error and signal the experiment has failed)
|
||||
- Log all stdout/stderr in the ClearML UI, including the cloning and installation process, for easy debugging
|
||||
- Monitor the execution and allow you to manually abort the job using the ClearML UI (or, in the unfortunate case of a code crash, catch the error and signal the experiment has failed)
|
||||
|
||||
### System Design & Flow
|
||||
```text
|
||||
+-----------------+
|
||||
| GPU Machine |
|
||||
Development Machine | |
|
||||
+------------------------+ | +-------------+ |
|
||||
| Data Scientist's | +--------------+ | |TRAINS Agent | |
|
||||
| DL/ML Code | | WEB UI | | | | |
|
||||
| | | | | | +---------+ | |
|
||||
| | | | | | | DL/ML | | |
|
||||
| | +--------------+ | | | Code | | |
|
||||
| | User Clones Exp #1 / . . . . . . . / | | | | | |
|
||||
| +-------------------+ | into Exp #2 / . . . . . . . / | | +---------+ | |
|
||||
| | TRAINS | | +---------------/-_____________-/ | | | |
|
||||
| +---------+---------+ | | | | ^ | |
|
||||
+-----------|------------+ | | +------|------+ |
|
||||
| | +--------|--------+
|
||||
Auto-Magically | |
|
||||
Creates Exp #1 | The TRAINS Agent
|
||||
\ User Change Hyper-Parameters Pulls Exp #2, setup the
|
||||
| | environment & clone code.
|
||||
| | Start execution with the
|
||||
+------------|------------+ | +--------------------+ new set of Hyper-Parameters.
|
||||
| +---------v---------+ | | | TRAINS-SERVER | |
|
||||
| | Experiment #1 | | | | | |
|
||||
| +-------------------+ | | | Execution Queue | |
|
||||
| || | | | | |
|
||||
| +-------------------+<----------+ | | |
|
||||
| | | | | | |
|
||||
| | Experiment #2 | | | | |
|
||||
| +-------------------<------------\ | | |
|
||||
| | ------------->---------------+ | |
|
||||
| | User Send Exp #2 | |Execute Exp #2 +--------------------+
|
||||
| | For Execution | +---------------+ |
|
||||
| TRAINS-SERVER | | |
|
||||
+-------------------------+ +--------------------+
|
||||
```
|
||||
#### System Design & Flow
|
||||
|
||||
### Installing the TRAINS Agent
|
||||
<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_architecture.png" width="100%" alt="clearml-architecture">
|
||||
|
||||
|
||||
#### Installing the ClearML Agent
|
||||
|
||||
```bash
|
||||
pip install trains-agent
|
||||
pip install clearml-agent
|
||||
```
|
||||
|
||||
### TRAINS Agent Usage Examples
|
||||
#### ClearML Agent Usage Examples
|
||||
|
||||
Full Interface and capabilities are available with
|
||||
```bash
|
||||
trains-agent --help
|
||||
trains-agent daemon --help
|
||||
clearml-agent --help
|
||||
clearml-agent daemon --help
|
||||
```
|
||||
|
||||
### Configuring the TRAINS Agent
|
||||
#### Configuring the ClearML Agent
|
||||
|
||||
```bash
|
||||
trains-agent init
|
||||
clearml-agent init
|
||||
```
|
||||
|
||||
Note: The TRAINS agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default TRAINS Agent cache folder is `~/.trains`
|
||||
Note: The ClearML Agent uses a cache folder to cache pip packages, apt packages and cloned repositories. The default ClearML Agent cache folder is `~/.clearml`
|
||||
|
||||
See full details in your configuration file at `~/trains.conf`
|
||||
See full details in your configuration file at `~/clearml.conf`
|
||||
|
||||
Note: The **TRAINS agent** extends the **TRAINS** configuration file `~/trains.conf`
|
||||
They are designed to share the same configuration file, see example [here](docs/trains.conf)
|
||||
Note: The **ClearML agent** extends the **ClearML** configuration file `~/clearml.conf`
|
||||
They are designed to share the same configuration file; see the example [here](docs/clearml.conf)
|
||||
|
||||
### Running the TRAINS Agent
|
||||
#### Running the ClearML Agent
|
||||
|
||||
For debug and experimentation, start the TRAINS agent in `foreground` mode, where all the output is printed to screen
|
||||
For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen
|
||||
```bash
|
||||
trains-agent daemon --queue default --foreground
|
||||
clearml-agent daemon --queue default --foreground
|
||||
```
|
||||
|
||||
For actual service mode, all the stdout will be stored automatically into a temporary file (no need to pipe)
|
||||
Notice: with `--detached` flag, the *trains-agent* will be running in the background
|
||||
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
|
||||
```bash
|
||||
trains-agent daemon --detached --queue default
|
||||
clearml-agent daemon --detached --queue default
|
||||
```
|
||||
|
||||
GPU allocation is controlled via the standard OS environment `NVIDIA_VISIBLE_DEVICES` or `--gpus` flag (or disabled with `--cpu-only`).
|
||||
|
||||
If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU's will be allocated for the `trains-agent` <br>
|
||||
If `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no gpu will be allocated for the `trains-agent`
|
||||
If no flag is set, and the `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPUs will be allocated for the `clearml-agent` <br>
|
||||
If the `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no GPU will be allocated for the `clearml-agent`
|
||||
|
||||
Example: spin two agents, one per gpu on the same machine:
|
||||
Notice: with `--detached` flag, the *trains-agent* will be running in the background
|
||||
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0 --queue default
|
||||
trains-agent daemon --detached --gpus 1 --queue default
|
||||
clearml-agent daemon --detached --gpus 0 --queue default
|
||||
clearml-agent daemon --detached --gpus 1 --queue default
|
||||
```
|
||||
|
||||
Example: spin two agents, pulling from the dedicated `dual_gpu` queue, two GPUs per agent
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu
|
||||
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu
|
||||
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu
|
||||
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu
|
||||
```
|
||||
|
||||
#### Starting the TRAINS Agent in docker mode
|
||||
##### Starting the ClearML Agent in docker mode
|
||||
|
||||
For debug and experimentation, start the TRAINS agent in `foreground` mode, where all the output is printed to screen
|
||||
For debug and experimentation, start the ClearML agent in `foreground` mode, where all the output is printed to screen
|
||||
```bash
|
||||
trains-agent daemon --queue default --docker --foreground
|
||||
clearml-agent daemon --queue default --docker --foreground
|
||||
```
|
||||
|
||||
For actual service mode, all the stdout will be stored automatically into a file (no need to pipe)
|
||||
Notice: with `--detached` flag, the *trains-agent* will be running in the background
|
||||
Notice: with `--detached` flag, the *clearml-agent* will be running in the background
|
||||
```bash
|
||||
trains-agent daemon --detached --queue default --docker
|
||||
clearml-agent daemon --detached --queue default --docker
|
||||
```
|
||||
|
||||
Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda docker:
|
||||
Example: spin two agents, one per gpu on the same machine, with default nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 docker:
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
|
||||
trains-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
clearml-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
```
|
||||
|
||||
Example: spin two agents, pulling from dedicated `dual_gpu` queue, two gpu's per agent, with default nvidia/cuda docker:
|
||||
Example: spin two agents, pulling from the dedicated `dual_gpu` queue, two GPUs per agent, with the default nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 docker:
|
||||
```bash
|
||||
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
|
||||
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
|
||||
clearml-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
clearml-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
|
||||
```
|
||||
|
||||
#### Starting the TRAINS Agent - Priority Queues
|
||||
##### Starting the ClearML Agent - Priority Queues
|
||||
|
||||
Priority Queues are also supported, example use case:
|
||||
|
||||
High priority queue: `important_jobs` Low priority queue: `default`
|
||||
```bash
|
||||
trains-agent daemon --queue important_jobs default
|
||||
clearml-agent daemon --queue important_jobs default
|
||||
```
|
||||
The **TRAINS agent** will first try to pull jobs from the `important_jobs` queue, only then it will fetch a job from the `default` queue.
|
||||
The **ClearML Agent** will first try to pull jobs from the `important_jobs` queue, and only then fetch a job from the `default` queue.
|
||||
|
||||
Adding queues, managing job order within a queue and moving jobs between queues, is available using the Web UI, see example on our [open server](https://demoapp.trains.allegro.ai/workers-and-queues/queues)
|
||||
Adding queues, managing job order within a queue, and moving jobs between queues are all available using the Web UI; see the example on our [free server](https://app.clear.ml/workers-and-queues/queues)
|
||||
|
||||
# How do I create an experiment on the TRAINS server? <a name="from-scratch"></a>
|
||||
* Integrate [TRAINS](https://github.com/allegroai/trains) with your code
|
||||
##### Stopping the ClearML Agent
|
||||
|
||||
To stop a **ClearML Agent** running in the background, run the same command line used to start the agent with `--stop` appended.
|
||||
For example, to stop the first of the single-GPU agents shown above:
|
||||
```bash
|
||||
clearml-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 --stop
|
||||
```
|
||||
|
||||
### How do I create an experiment on the ClearML Server? <a name="from-scratch"></a>
|
||||
* Integrate [ClearML](https://github.com/allegroai/clearml) with your code
|
||||
* Execute the code on your machine (Manually / PyCharm / Jupyter Notebook)
|
||||
* As your code is running, **TRAINS** creates an experiment logging all the necessary execution information:
|
||||
* As your code is running, **ClearML** creates an experiment logging all the necessary execution information:
|
||||
- Git repository link and commit ID (or an entire jupyter notebook)
|
||||
- Git diff (we’re not saying you never commit and push, but still...)
|
||||
- Python packages used by your code (including specific versions used)
|
||||
@@ -239,7 +243,7 @@ Adding queues, managing job order within a queue and moving jobs between queues,
|
||||
|
||||
You now have a 'template' of your experiment with everything required for automated execution
|
||||
|
||||
* In the TRAINS UI, Right click on the experiment and select 'clone'. A copy of your experiment will be created.
|
||||
* In the ClearML UI, Right click on the experiment and select 'clone'. A copy of your experiment will be created.
|
||||
* You now have a new draft experiment cloned from your original experiment, feel free to edit it
|
||||
- Change the Hyper-Parameters
|
||||
- Switch to the latest code base of the repository
|
||||
@@ -248,23 +252,44 @@ Adding queues, managing job order within a queue and moving jobs between queues,
|
||||
- Or simply change nothing to run the same experiment again...
|
||||
* Schedule the newly created experiment for execution: Right-click the experiment and select 'enqueue'
|
||||
|
||||
# AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
|
||||
The TRAINS Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the TRAINS package.
|
||||
### ClearML-Agent Services Mode <a name="services"></a>
|
||||
|
||||
Sample AutoML & Orchestration examples can be found in the TRAINS [example/automl](https://github.com/allegroai/trains/tree/master/examples/automl) folder.
|
||||
ClearML-Agent Services is a special mode of ClearML-Agent that provides the ability to launch long-lasting jobs
|
||||
that previously had to be executed on local / dedicated machines. It allows a single agent to
|
||||
launch multiple dockers (Tasks) for different use cases. Use cases include an auto-scaler service (spinning up instances
when the need arises and the budget allows), controllers (implementing pipelines and more sophisticated DevOps logic),
optimizers (such as hyper-parameter optimization or sweeping), and applications (such as interactive Bokeh apps for
increased data transparency).
|
||||
|
||||
ClearML-Agent Services mode will spin **any** task enqueued into the specified queue.
|
||||
Every task launched by ClearML-Agent Services will be registered as a new node in the system,
|
||||
providing tracking and transparency capabilities.
|
||||
Currently, clearml-agent in services mode supports CPU-only configurations. ClearML-Agent services mode can be launched alongside GPU agents.
|
||||
|
||||
```bash
|
||||
clearml-agent daemon --services-mode --detached --queue services --create-queue --docker ubuntu:18.04 --cpu-only
|
||||
```
|
||||
|
||||
**Note**: It is the user's responsibility to make sure the proper tasks are pushed into the specified queue.
|
||||
|
||||
|
||||
### AutoML and Orchestration Pipelines <a name="automl-pipes"></a>
|
||||
The ClearML Agent can also be used to implement AutoML orchestration and Experiment Pipelines in conjunction with the ClearML package.
|
||||
|
||||
Sample AutoML & Orchestration examples can be found in the ClearML [example/automation](https://github.com/allegroai/clearml/tree/master/examples/automation) folder.
|
||||
|
||||
AutoML examples
|
||||
- [Toy Keras training experiment](https://github.com/allegroai/trains/blob/master/examples/automl/automl_base_template_keras_simple.py)
|
||||
- [Toy Keras training experiment](https://github.com/allegroai/clearml/blob/master/examples/optimization/hyper-parameter-optimization/base_template_keras_simple.py)
|
||||
- In order to create an experiment-template in the system, this code must be executed once manually
|
||||
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/trains/blob/master/examples/automl/automl_random_search_example.py)
|
||||
- [Random Search over the above Keras experiment-template](https://github.com/allegroai/clearml/blob/master/examples/automation/manual_random_param_search_example.py)
|
||||
- This example will create multiple copies of the Keras experiment-template, with different hyper-parameter combinations
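As a hedged sketch, such a random search boils down to cloning the template and overriding hyper-parameters before enqueuing (the template task ID, parameter name and value range below are placeholders):

```python
import random
from clearml import Task

# Placeholder ID of the manually executed Keras experiment-template
template = Task.get_task(task_id="<keras_template_task_id>")

for i in range(5):
    draft = Task.clone(source_task=template, name="random search {}".format(i))
    # Parameter name is illustrative; it must match a hyper-parameter of the template
    draft.set_parameters({"learning_rate": random.uniform(1e-4, 1e-1)})
    Task.enqueue(draft, queue_name="default")
```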
|
||||
|
||||
Experiment Pipeline examples
|
||||
- [First step experiment](https://github.com/allegroai/trains/blob/master/examples/automl/task_piping_example.py)
|
||||
- [First step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/task_piping_example.py)
|
||||
- This example will "process data", and once done, will launch a copy of the 'second step' experiment-template
|
||||
- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automl/toy_base_task.py)
|
||||
- [Second step experiment](https://github.com/allegroai/clearml/blob/master/examples/automation/toy_base_task.py)
|
||||
- In order to create an experiment-template in the system, this code must be executed once manually
|
||||
|
||||
# License
|
||||
### License
|
||||
|
||||
Apache License, Version 2.0 (see the [LICENSE](https://www.apache.org/licenses/LICENSE-2.0.html) for more information)
|
||||
|
||||
@@ -4,15 +4,15 @@ import argparse
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
from trains_agent.backend_api.session.datamodel import UnusedKwargsWarning
|
||||
from clearml_agent.backend_api.session.datamodel import UnusedKwargsWarning
|
||||
|
||||
import trains_agent
|
||||
from trains_agent.config import get_config
|
||||
from trains_agent.definitions import FileBuffering, CONFIG_FILE
|
||||
from trains_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
|
||||
from trains_agent.helper.process import ExitStatus
|
||||
import clearml_agent
|
||||
from clearml_agent.config import get_config
|
||||
from clearml_agent.definitions import FileBuffering, CONFIG_FILE
|
||||
from clearml_agent.helper.base import reverse_home_folder_expansion, chain_map, named_temporary_file
|
||||
from clearml_agent.helper.process import ExitStatus
|
||||
from . import interface, session, definitions, commands
|
||||
from .errors import ConfigFileNotFound, Sigterm, APIError
|
||||
from .errors import ConfigFileNotFound, Sigterm, APIError, CustomBuildScriptFailed
|
||||
from .helper.trace import PackageTrace
|
||||
from .interface import get_parser
|
||||
|
||||
@@ -44,10 +44,12 @@ def run_command(parser, args, command_name):
|
||||
debug = command._session.debug_mode
|
||||
func = getattr(command, command_name)
|
||||
return func(**args_dict)
|
||||
except CustomBuildScriptFailed as e:
|
||||
command_class.exit(e.message, e.errno)
|
||||
except ConfigFileNotFound:
|
||||
message = 'Cannot find configuration file in "{}".\n' \
|
||||
'To create a configuration file, run:\n' \
|
||||
'$ trains_agent init'.format(reverse_home_folder_expansion(CONFIG_FILE))
|
||||
'$ clearml_agent init'.format(reverse_home_folder_expansion(CONFIG_FILE))
|
||||
command_class.exit(message)
|
||||
except APIError as api_error:
|
||||
if not debug:
|
||||
clearml_agent/backend_api/config/default/agent.conf (new file, 311 lines)
@@ -0,0 +1,311 @@
|
||||
{
|
||||
# unique name of this worker, if None, created based on hostname:process_id
|
||||
# Override with os environment: CLEARML_WORKER_ID
|
||||
# worker_id: "clearml-agent-machine1:gpu0"
|
||||
worker_id: ""
|
||||
|
||||
# worker name, replaces the hostname when creating a unique name for this worker
|
||||
# Override with os environment: CLEARML_WORKER_NAME
|
||||
# worker_name: "clearml-agent-machine1"
|
||||
worker_name: ""
|
||||
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# **Notice**: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
|
||||
# To learn how to generate git token GitHub/Bitbucket/GitLab:
|
||||
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
|
||||
# https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
|
||||
# https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
|
||||
# git_user: ""
|
||||
# git_pass: ""
|
||||
# git_host: ""
|
||||
|
||||
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: 0
|
||||
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||
# force_git_ssh_user: git
|
||||
|
||||
# Set the python version to use when creating the virtual environment and launching the experiment
|
||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||
# The default is the python executing the clearml_agent
|
||||
python_binary: ""
|
||||
# ignore any requested python version (Default: False, if a Task was using a
|
||||
# specific python version and the system supports multiple python the agent will use the requested python version)
|
||||
# ignore_requested_python_version: true
|
||||
|
||||
# Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
|
||||
# default false, only the working directory will be added to the PYTHONPATH
|
||||
# force_git_root_python_path: false
|
||||
|
||||
# in docker mode, if container's entrypoint automatically activated a virtual environment
|
||||
# use the activated virtual environment and install everything there
|
||||
# set to False to disable, and always create a new venv inheriting from the system_site_packages
|
||||
# docker_use_activated_venv: true
|
||||
|
||||
# select python package manager:
|
||||
# currently supported: pip, conda and poetry
|
||||
# if "pip" or "conda" are used, the agent installs the required packages
|
||||
# based on the "installed packages" section of the Task. If the "installed packages" is empty,
|
||||
# it will revert to using `requirements.txt` from the repository's root directory.
|
||||
# If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
|
||||
# the "installed packages" section is ignored, and poetry is used.
|
||||
# If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
|
||||
package_manager: {
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
pip_version: "<20.2",
|
||||
# specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
|
||||
# poetry_version: "<2",
|
||||
|
||||
# virtual environment inherits packages from the system
|
||||
system_site_packages: false,
|
||||
|
||||
# install with --upgrade
|
||||
force_upgrade: false,
|
||||
|
||||
# additional artifact repositories to use when installing python packages
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||
|
||||
# If set to true, Task's "installed packages" are ignored,
|
||||
# and the repository's "requirements.txt" is used instead
|
||||
# force_repo_requirements_txt: false
|
||||
|
||||
# set the priority packages to be installed before the rest of the required packages
|
||||
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||
|
||||
# set the optional priority packages to be installed before the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# priority_optional_packages: ["pygobject", ]
|
||||
|
||||
# set the post packages to be installed after all the rest of the required packages
|
||||
# post_packages: ["horovod", ]
|
||||
|
||||
# set the optional post packages to be installed after all the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# post_optional_packages: []
|
||||
|
||||
# set to True to support torch nightly build installation,
|
||||
# notice: torch nightly builds are ephemeral and are deleted from time to time
|
||||
torch_nightly: false,
|
||||
},
|
||||
|
||||
# target folder for virtual environments builds, created when executing experiment
|
||||
venvs_dir = ~/.clearml/venvs-builds
|
||||
|
||||
# cached virtual environment folder
|
||||
venvs_cache: {
|
||||
# maximum number of cached venvs
|
||||
max_entries: 10
|
||||
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
|
||||
free_space_threshold_gb: 2.0
|
||||
# uncomment to enable virtual environment caching
|
||||
# path: ~/.clearml/venvs-cache
|
||||
},
|
||||
|
||||
# cached git clone folder
|
||||
vcs_cache: {
|
||||
enabled: true,
|
||||
path: ~/.clearml/vcs-cache
|
||||
},
|
||||
|
||||
# use venv-update in order to accelerate python virtual environment building
|
||||
# Still in beta, turned off by default
|
||||
venv_update: {
|
||||
enabled: false,
|
||||
},
|
||||
|
||||
# cached folder for specific python package download (used for pytorch package caching)
|
||||
pip_download_cache {
|
||||
enabled: true,
|
||||
path: ~/.clearml/pip-download-cache
|
||||
},
|
||||
|
||||
translate_ssh: true,
|
||||
# reload configuration file every daemon execution
|
||||
reload_config: false,
|
||||
|
||||
# pip cache folder mapped into docker, used for python package caching
|
||||
docker_pip_cache = ~/.clearml/pip-cache
|
||||
# apt cache folder mapped into docker, used for ubuntu package caching
|
||||
docker_apt_cache = ~/.clearml/apt-cache
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
|
||||
# for backwards compatibility reasons, true as default,
|
||||
# change to false to skip installation and decrease docker spin up time
|
||||
# docker_install_opencv_libs: true
|
||||
|
||||
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
|
||||
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will be idle.
|
||||
# Defined using a list of items of the format: "<hours> <days>".
|
||||
# hours - use values 0-23, single values would count as start hour and end at midnight.
|
||||
# days - use days in abbreviated format (SUN-SAT)
|
||||
# use '-' for ranges and ',' to separate singular values.
|
||||
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
|
||||
# uptime: ["17-20 SUN,TUE"]
|
||||
|
||||
# optional downtime configuration, can be used only when uptime is not used.
|
||||
# If downtime is specified, agent will be idle in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
|
||||
# Use the same format as described above for uptime
|
||||
# downtime: []
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host", ]
|
||||
}
|
||||
|
||||
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||
enable_task_env: false
|
||||
|
||||
# set the initial bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||
# docker_init_bash_script = [
|
||||
# "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
|
||||
# "chown -R root /root/.cache/pip",
|
||||
# "apt-get update",
|
||||
# "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
|
||||
# ]
|
||||
|
||||
# set the preprocessing bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# docker_preprocess_bash_script = [
|
||||
# "echo \"starting docker\"",
|
||||
#]
|
||||
|
||||
# If False replace \r with \n and display full console output
|
||||
# default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
|
||||
# suppress_carriage_return: true
|
||||
|
||||
# cuda versions used for solving pytorch wheel packages
|
||||
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
# cudnn_version: 7.6
|
||||
|
||||
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||
# docker command, set:
|
||||
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||
hide_docker_command_env_vars {
|
||||
enabled: true
|
||||
extra_keys: []
|
||||
parse_embedded_urls: true
|
||||
}
|
||||
|
||||
# allow to set internal mount points inside the docker,
|
||||
# especially useful for non-root docker container images.
|
||||
docker_internal_mounts {
|
||||
sdk_cache: "/clearml_agent_cache"
|
||||
apt_cache: "/var/cache/apt/archives"
|
||||
ssh_folder: "/root/.ssh"
|
||||
pip_cache: "/root/.cache/pip"
|
||||
poetry_cache: "/root/.cache/pypoetry"
|
||||
vcs_cache: "/root/.clearml/vcs-cache"
|
||||
venv_build: "/root/.clearml/venvs-builds"
|
||||
pip_download: "/root/.clearml/pip-download-cache"
|
||||
}
|
||||
|
||||
# Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
|
||||
# Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
|
||||
# Note: resulting name must start with an alphanumeric character and continue with alphanumeric characters,
|
||||
# underscores (_), dots (.) and/or dashes (-)
|
||||
#docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
|
||||
|
||||
# Apply top-level environment section from configuration into os.environ
|
||||
apply_environment: true
|
||||
# Top-level environment section is in the form of:
|
||||
# environment {
|
||||
# key: value
|
||||
# ...
|
||||
# }
|
||||
# and is applied to the OS environment as `key=value` for each key/value pair
|
||||
|
||||
# Apply top-level files section from configuration into local file system
|
||||
apply_files: true
|
||||
# Top-level files section allows auto-generating files at designated paths with a predefined contents
|
||||
# and target format. Options include:
|
||||
# contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
|
||||
# format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
|
||||
# base64-encoded contents string, otherwise ignored
|
||||
# path: the target file's path, may include ~ and inplace env vars
|
||||
# target_format: format used to encode contents before writing into the target file. Supported values are json,
|
||||
# yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
|
||||
# overwrite: overwrite the target file in case it exists. Default is true.
|
||||
#
|
||||
# Example:
|
||||
# files {
|
||||
# myfile1 {
|
||||
# contents: "The quick brown fox jumped over the lazy dog"
|
||||
# path: "/tmp/fox.txt"
|
||||
# }
|
||||
# myjsonfile {
|
||||
# contents: {
|
||||
# some {
|
||||
# nested {
|
||||
# value: [1, 2, 3, 4]
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# path: "/tmp/test.json"
|
||||
# target_format: json
|
||||
# }
|
||||
# }
|
||||
|
||||
# Specifies a custom environment setup script to be executed instead of installing a virtual environment.
|
||||
# If provided, this script is executed following Git cloning. The script command may include environment variables and
|
||||
# will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
|
||||
# The script can also be specified using the CLEARML_AGENT_CUSTOM_BUILD_SCRIPT environment variable.
|
||||
#
|
||||
# When running the script, the following environment variables will be set:
|
||||
# - CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
|
||||
# contents in JSON format
|
||||
# - CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
|
||||
# - CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
|
||||
# - CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
|
||||
# - CLEARML_GIT_ROOT: path to the cloned Git repository
|
||||
# - CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
|
||||
# this file must be in the following JSON format:
|
||||
# ```json
|
||||
# {
|
||||
# "binary": "/absolute/path/to/python-executable",
|
||||
# "entry_point": "/absolute/path/to/task-entrypoint-script",
|
||||
# "working_dir": "/absolute/path/to/task-working/dir"
|
||||
# }
|
||||
# ```
|
||||
# If provided, the agent will use these instead of the predefined task script section to execute the task and will
|
||||
# skip virtual environment creation.
|
||||
#
|
||||
# In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
|
||||
# In case the custom script is specified but does not exist, or if the custom script does not write valid content
|
||||
# into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
|
||||
# standard flow.
|
||||
custom_build_script: ""
|
||||
}
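To make the `custom_build_script` contract above concrete, here is a hedged sketch of such a script written in Python (the environment variable names and the output JSON keys come from the comments above; the interpreter path and the assumption that the environment is already prepared are illustrative only):

```python
#!/usr/bin/env python3
# Hypothetical custom build script: reads the documented environment variables and
# writes the CLEARML_CUSTOM_BUILD_OUTPUT JSON so the agent skips venv creation.
import json
import os
import sys

git_root = os.environ["CLEARML_GIT_ROOT"]              # path to the cloned repository
entry = os.environ["CLEARML_TASK_SCRIPT_ENTRY"]        # task entrypoint script
working_dir = os.environ["CLEARML_TASK_WORKING_DIR"]   # task working directory
output_file = os.environ["CLEARML_CUSTOM_BUILD_OUTPUT"]

# ... prepare or reuse an environment here; this sketch assumes one already exists ...
python_binary = "/usr/bin/python3"  # illustrative interpreter path

with open(output_file, "w") as f:
    json.dump({
        "binary": python_binary,
        "entry_point": os.path.join(git_root, working_dir, entry),
        "working_dir": os.path.join(git_root, working_dir),
    }, f)

sys.exit(0)  # a non-zero exit code would make the agent fail with the same code
```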
|
||||
@@ -31,7 +31,9 @@
|
||||
}
|
||||
|
||||
auth {
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token
|
||||
token_expiration_threshold_sec = 360
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token. Default 12 hours
|
||||
token_expiration_threshold_sec: 43200
|
||||
# When requesting a token, request specific expiration time. Server default (and maximum) is 30 days
|
||||
# request_token_expiration_sec: 2592000
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
# TRAINS - default SDK configuration
|
||||
# ClearML - default SDK configuration
|
||||
|
||||
storage {
|
||||
cache {
|
||||
# Defaults to system temp folder / cache
|
||||
default_base_dir: "~/.trains/cache"
|
||||
default_base_dir: "~/.clearml/cache"
|
||||
size {
|
||||
# max_used_bytes = -1
|
||||
min_free_bytes = 10GB
|
||||
@@ -31,12 +31,18 @@
|
||||
# X images are stored in the upload destination for each matplotlib plot title.
|
||||
matplotlib_untitled_history_size: 100
|
||||
|
||||
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
|
||||
# plot_max_num_digits: 5
|
||||
|
||||
# Settings for generated debug images
|
||||
images {
|
||||
format: JPEG
|
||||
quality: 87
|
||||
subsampling: 0
|
||||
}
|
||||
|
||||
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
|
||||
tensorboard_single_series_per_graph: false
|
||||
}
|
||||
|
||||
network {
|
||||
@@ -92,7 +98,7 @@
|
||||
google.storage {
|
||||
# # Default project and credentials file
|
||||
# # Will be used when no bucket configuration is found
|
||||
# project: "trains"
|
||||
# project: "clearml"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
|
||||
# # Specific credentials per bucket and sub directory
|
||||
@@ -100,7 +106,7 @@
|
||||
# {
|
||||
# bucket: "my-bucket"
|
||||
# subdir: "path/in/bucket" # Not required
|
||||
# project: "trains"
|
||||
# project: "clearml"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
# },
|
||||
# ]
|
||||
@@ -108,7 +114,7 @@
|
||||
azure.storage {
|
||||
# containers: [
|
||||
# {
|
||||
# account_name: "trains"
|
||||
# account_name: "clearml"
|
||||
# account_key: "secret"
|
||||
# # container_name:
|
||||
# }
|
||||
@@ -117,11 +123,11 @@
|
||||
|
||||
log {
|
||||
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
|
||||
null_log_propagate: False
|
||||
null_log_propagate: false
|
||||
task_log_buffer_capacity: 66
|
||||
|
||||
# disable urllib info and lower levels
|
||||
disable_urllib3_info: True
|
||||
disable_urllib3_info: true
|
||||
}
|
||||
|
||||
development {
|
||||
@@ -131,14 +137,30 @@
|
||||
task_reuse_time_window_in_hours: 72.0
|
||||
|
||||
# Run VCS repository detection asynchronously
|
||||
vcs_repo_detect_async: True
|
||||
vcs_repo_detect_async: true
|
||||
|
||||
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
|
||||
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
|
||||
store_uncommitted_code_diff_on_train: True
|
||||
store_uncommitted_code_diff: true
|
||||
|
||||
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
|
||||
support_stopping: True
|
||||
support_stopping: true
|
||||
|
||||
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
|
||||
default_output_uri: ""
|
||||
|
||||
# Default auto generated requirements optimize for smaller requirements
|
||||
# If True, analyze the entire repository regardless of the entry point.
|
||||
# If False, first analyze the entry point script; if it does not reference other local files,
|
||||
# do not analyze the entire repository.
|
||||
force_analyze_entire_repo: false
|
||||
|
||||
# If set to true, *clearml* update message will not be printed to the console
|
||||
# this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
|
||||
suppress_update_message: false
|
||||
|
||||
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
|
||||
detect_with_pip_freeze: false
|
||||
|
||||
# Development mode worker
|
||||
worker {
|
||||
@@ -149,7 +171,11 @@
|
||||
ping_period_sec: 30
|
||||
|
||||
# Log all stdout & stderr
|
||||
log_stdout: True
|
||||
log_stdout: true
|
||||
|
||||
# compatibility feature, report memory usage for the entire machine
|
||||
# default (false), report only on the running process and its sub-processes
|
||||
report_global_mem_used: false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
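For reference, a minimal sketch (not part of this diff) of how the new development-section flags above can be read; it only assumes pyhocon, which the session code later in this diff already imports, and the key names shown in the configuration snippet:

from pyhocon import ConfigFactory

# Parse a fragment equivalent to the development section above (hypothetical values)
conf = ConfigFactory.parse_string("""
development {
    detect_with_pip_freeze: false
    force_analyze_entire_repo: false
    suppress_update_message: false
}
""")
print(conf.get("development.detect_with_pip_freeze", False))  # -> False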
@@ -106,15 +106,15 @@ class StrictSession(Session):
|
||||
init()
|
||||
return
|
||||
|
||||
original = os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR, None)
|
||||
original = LOCAL_CONFIG_FILE_OVERRIDE_VAR.get() or None
|
||||
try:
|
||||
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = str(config_file)
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR.set(str(config_file))
|
||||
init()
|
||||
finally:
|
||||
if original is None:
|
||||
os.environ.pop(LOCAL_CONFIG_FILE_OVERRIDE_VAR, None)
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR.pop()
|
||||
else:
|
||||
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = original
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR.set(original)
|
||||
|
||||
def send(self, request, *args, **kwargs):
|
||||
result = super(StrictSession, self).send(request, *args, **kwargs)
|
||||
@@ -222,7 +222,7 @@ class TableResponse(Response):
|
||||
return "" if result is None else result
|
||||
|
||||
fields = fields or self.fields
|
||||
from trains_agent.helper.base import create_table
|
||||
from clearml_agent.helper.base import create_table
|
||||
return create_table(
|
||||
(dict((attr, getter(item, attr)) for attr in fields) for item in self),
|
||||
titles=fields, columns=fields, headers=True,
|
||||
clearml_agent/backend_api/session/defs.py (new file, 31 lines)
@@ -0,0 +1,31 @@
|
||||
from ...backend_config.converters import safe_text_to_bool
|
||||
from ...backend_config.environment import EnvEntry
|
||||
|
||||
|
||||
ENV_HOST = EnvEntry("CLEARML_API_HOST", "TRAINS_API_HOST")
|
||||
ENV_WEB_HOST = EnvEntry("CLEARML_WEB_HOST", "TRAINS_WEB_HOST")
|
||||
ENV_FILES_HOST = EnvEntry("CLEARML_FILES_HOST", "TRAINS_FILES_HOST")
|
||||
ENV_ACCESS_KEY = EnvEntry("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY")
|
||||
ENV_SECRET_KEY = EnvEntry("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||
ENV_AUTH_TOKEN = EnvEntry("CLEARML_AUTH_TOKEN")
|
||||
ENV_VERBOSE = EnvEntry("CLEARML_API_VERBOSE", "TRAINS_API_VERBOSE", type=bool, default=False)
|
||||
ENV_HOST_VERIFY_CERT = EnvEntry("CLEARML_API_HOST_VERIFY_CERT", "TRAINS_API_HOST_VERIFY_CERT", type=bool, default=True)
|
||||
ENV_CONDA_ENV_PACKAGE = EnvEntry("CLEARML_CONDA_ENV_PACKAGE", "TRAINS_CONDA_ENV_PACKAGE")
|
||||
ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT_SERVER", type=bool, default=True)
|
||||
ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
|
||||
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
|
||||
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
|
||||
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
|
||||
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
|
||||
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
||||
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
||||
)
|
||||
|
||||
"""
|
||||
Experimental option to set the request method for all API requests and auth login.
|
||||
This could be useful when GET requests with payloads are blocked by a server as
|
||||
POST requests can be used instead.
|
||||
|
||||
However, this has not been rigorously tested and may have unintended consequences.
|
||||
"""
|
||||
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD", default="GET")
|
||||
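A short, hedged sketch (not part of this diff) of how the experimental request-method override described above could be enabled; only the CLEARML_API_DEFAULT_REQ_METHOD variable name from the definitions above is used:

import os

# Hypothetical: force POST for all API requests; must be set before the
# session/request modules evaluate ENV_API_DEFAULT_REQ_METHOD at import time.
os.environ["CLEARML_API_DEFAULT_REQ_METHOD"] = "POST"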
clearml_agent/backend_api/session/jsonmodels/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
|
||||
# coding: utf-8
|
||||
|
||||
__author__ = 'Szczepan Cieślik'
|
||||
__email__ = 'szczepan.cieslik@gmail.com'
|
||||
__version__ = '2.4'
|
||||
|
||||
from . import models
|
||||
from . import fields
|
||||
from . import errors
|
||||
clearml_agent/backend_api/session/jsonmodels/builders.py (new file, 230 lines)
@@ -0,0 +1,230 @@
|
||||
"""Builders to generate in memory representation of model and fields tree."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import six
|
||||
|
||||
from . import errors
|
||||
from .fields import NotSet
|
||||
|
||||
|
||||
class Builder(object):
|
||||
|
||||
def __init__(self, parent=None, nullable=False, default=NotSet):
|
||||
self.parent = parent
|
||||
self.types_builders = {}
|
||||
self.types_count = defaultdict(int)
|
||||
self.definitions = set()
|
||||
self.nullable = nullable
|
||||
self.default = default
|
||||
|
||||
@property
|
||||
def has_default(self):
|
||||
return self.default is not NotSet
|
||||
|
||||
def register_type(self, type, builder):
|
||||
if self.parent:
|
||||
return self.parent.register_type(type, builder)
|
||||
|
||||
self.types_count[type] += 1
|
||||
if type not in self.types_builders:
|
||||
self.types_builders[type] = builder
|
||||
|
||||
def get_builder(self, type):
|
||||
if self.parent:
|
||||
return self.parent.get_builder(type)
|
||||
|
||||
return self.types_builders[type]
|
||||
|
||||
def count_type(self, type):
|
||||
if self.parent:
|
||||
return self.parent.count_type(type)
|
||||
|
||||
return self.types_count[type]
|
||||
|
||||
@staticmethod
|
||||
def maybe_build(value):
|
||||
return value.build() if isinstance(value, Builder) else value
|
||||
|
||||
def add_definition(self, builder):
|
||||
if self.parent:
|
||||
return self.parent.add_definition(builder)
|
||||
|
||||
self.definitions.add(builder)
|
||||
|
||||
|
||||
class ObjectBuilder(Builder):
|
||||
|
||||
def __init__(self, model_type, *args, **kwargs):
|
||||
super(ObjectBuilder, self).__init__(*args, **kwargs)
|
||||
self.properties = {}
|
||||
self.required = []
|
||||
self.type = model_type
|
||||
|
||||
self.register_type(self.type, self)
|
||||
|
||||
def add_field(self, name, field, schema):
|
||||
_apply_validators_modifications(schema, field)
|
||||
self.properties[name] = schema
|
||||
if field.required:
|
||||
self.required.append(name)
|
||||
|
||||
def build(self):
|
||||
builder = self.get_builder(self.type)
|
||||
if self.is_definition and not self.is_root:
|
||||
self.add_definition(builder)
|
||||
[self.maybe_build(value) for _, value in self.properties.items()]
|
||||
return '#/definitions/{name}'.format(name=self.type_name)
|
||||
else:
|
||||
return builder.build_definition(nullable=self.nullable)
|
||||
|
||||
@property
|
||||
def type_name(self):
|
||||
module_name = '{module}.{name}'.format(
|
||||
module=self.type.__module__,
|
||||
name=self.type.__name__,
|
||||
)
|
||||
return module_name.replace('.', '_').lower()
|
||||
|
||||
def build_definition(self, add_defintitions=True, nullable=False):
|
||||
properties = dict(
|
||||
(name, self.maybe_build(value))
|
||||
for name, value
|
||||
in self.properties.items()
|
||||
)
|
||||
schema = {
|
||||
'type': 'object',
|
||||
'additionalProperties': False,
|
||||
'properties': properties,
|
||||
}
|
||||
if self.required:
|
||||
schema['required'] = self.required
|
||||
if self.definitions and add_defintitions:
|
||||
schema['definitions'] = dict(
|
||||
(builder.type_name, builder.build_definition(False, False))
|
||||
for builder in self.definitions
|
||||
)
|
||||
return schema
|
||||
|
||||
@property
|
||||
def is_definition(self):
|
||||
if self.count_type(self.type) > 1:
|
||||
return True
|
||||
elif self.parent:
|
||||
return self.parent.is_definition
|
||||
else:
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_root(self):
|
||||
return not bool(self.parent)
|
||||
|
||||
|
||||
def _apply_validators_modifications(field_schema, field):
|
||||
for validator in field.validators:
|
||||
try:
|
||||
validator.modify_schema(field_schema)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
|
||||
class PrimitiveBuilder(Builder):
|
||||
|
||||
def __init__(self, type, *args, **kwargs):
|
||||
super(PrimitiveBuilder, self).__init__(*args, **kwargs)
|
||||
self.type = type
|
||||
|
||||
def build(self):
|
||||
schema = {}
|
||||
if issubclass(self.type, six.string_types):
|
||||
obj_type = 'string'
|
||||
elif issubclass(self.type, bool):
|
||||
obj_type = 'boolean'
|
||||
elif issubclass(self.type, int):
|
||||
obj_type = 'number'
|
||||
elif issubclass(self.type, float):
|
||||
obj_type = 'number'
|
||||
else:
|
||||
raise errors.FieldNotSupported(
|
||||
"Can't specify value schema!", self.type
|
||||
)
|
||||
|
||||
if self.nullable:
|
||||
obj_type = [obj_type, 'null']
|
||||
schema['type'] = obj_type
|
||||
|
||||
if self.has_default:
|
||||
schema["default"] = self.default
|
||||
|
||||
return schema
|
||||
|
||||
|
||||
class ListBuilder(Builder):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(ListBuilder, self).__init__(*args, **kwargs)
|
||||
self.schemas = []
|
||||
|
||||
def add_type_schema(self, schema):
|
||||
self.schemas.append(schema)
|
||||
|
||||
def build(self):
|
||||
schema = {'type': 'array'}
|
||||
if self.nullable:
|
||||
self.add_type_schema({'type': 'null'})
|
||||
|
||||
if self.has_default:
|
||||
schema["default"] = [self.to_struct(i) for i in self.default]
|
||||
|
||||
schemas = [self.maybe_build(s) for s in self.schemas]
|
||||
if len(schemas) == 1:
|
||||
items = schemas[0]
|
||||
else:
|
||||
items = {'oneOf': schemas}
|
||||
|
||||
schema['items'] = items
|
||||
return schema
|
||||
|
||||
@property
|
||||
def is_definition(self):
|
||||
return self.parent.is_definition
|
||||
|
||||
@staticmethod
|
||||
def to_struct(item):
|
||||
from .models import Base
|
||||
if isinstance(item, Base):
|
||||
return item.to_struct()
|
||||
return item
|
||||
|
||||
|
||||
class EmbeddedBuilder(Builder):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(EmbeddedBuilder, self).__init__(*args, **kwargs)
|
||||
self.schemas = []
|
||||
|
||||
def add_type_schema(self, schema):
|
||||
self.schemas.append(schema)
|
||||
|
||||
def build(self):
|
||||
if self.nullable:
|
||||
self.add_type_schema({'type': 'null'})
|
||||
|
||||
schemas = [self.maybe_build(schema) for schema in self.schemas]
|
||||
if len(schemas) == 1:
|
||||
schema = schemas[0]
|
||||
else:
|
||||
schema = {'oneOf': schemas}
|
||||
|
||||
if self.has_default:
|
||||
# The default value of EmbeddedField is expected to be an instance
|
||||
# of a subclass of models.Base, thus have `to_struct`
|
||||
schema["default"] = self.default.to_struct()
|
||||
|
||||
return schema
|
||||
|
||||
@property
|
||||
def is_definition(self):
|
||||
return self.parent.is_definition
|
||||
clearml_agent/backend_api/session/jsonmodels/collections.py (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
|
||||
|
||||
class ModelCollection(list):
|
||||
|
||||
"""`ModelCollection` is list which validates stored values.
|
||||
|
||||
Validation is made with use of field passed to `__init__` at each point,
|
||||
when new value is assigned.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, field):
|
||||
self.field = field
|
||||
|
||||
def append(self, value):
|
||||
self.field.validate_single_value(value)
|
||||
super(ModelCollection, self).append(value)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self.field.validate_single_value(value)
|
||||
super(ModelCollection, self).__setitem__(key, value)
|
||||
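An illustrative sketch (not part of this diff) of the validation behavior: appending to a ModelCollection delegates to the owning field's validate_single_value, so a wrongly typed item raises ValidationError. The imports assume the package layout introduced by this diff:

from clearml_agent.backend_api.session.jsonmodels.collections import ModelCollection
from clearml_agent.backend_api.session.jsonmodels.fields import ListField
from clearml_agent.backend_api.session.jsonmodels.errors import ValidationError

names = ModelCollection(ListField(items_types=[str]))
names.append("alice")      # str matches items_types, accepted
try:
    names.append(42)       # not a str -> ValidationError from validate_single_value
except ValidationError as err:
    print(err)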
clearml_agent/backend_api/session/jsonmodels/errors.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
|
||||
|
||||
class ValidationError(RuntimeError):
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class FieldNotFound(RuntimeError):
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class FieldNotSupported(ValueError):
|
||||
|
||||
pass
|
||||
clearml_agent/backend_api/session/jsonmodels/fields.py (new file, 488 lines)
@@ -0,0 +1,488 @@
|
||||
import datetime
|
||||
import re
|
||||
from weakref import WeakKeyDictionary
|
||||
|
||||
import six
|
||||
from dateutil.parser import parse
|
||||
|
||||
from .errors import ValidationError
|
||||
from .collections import ModelCollection
|
||||
|
||||
|
||||
# unique marker for "no default value specified". None is not good enough since
|
||||
# it is a completely valid default value.
|
||||
NotSet = object()
|
||||
|
||||
|
||||
class BaseField(object):
|
||||
|
||||
"""Base class for all fields."""
|
||||
|
||||
types = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
required=False,
|
||||
nullable=False,
|
||||
help_text=None,
|
||||
validators=None,
|
||||
default=NotSet,
|
||||
name=None):
|
||||
self.memory = WeakKeyDictionary()
|
||||
self.required = required
|
||||
self.help_text = help_text
|
||||
self.nullable = nullable
|
||||
self._assign_validators(validators)
|
||||
self.name = name
|
||||
self._validate_name()
|
||||
if default is not NotSet:
|
||||
self.validate(default)
|
||||
self._default = default
|
||||
|
||||
@property
|
||||
def has_default(self):
|
||||
return self._default is not NotSet
|
||||
|
||||
def _assign_validators(self, validators):
|
||||
if validators and not isinstance(validators, list):
|
||||
validators = [validators]
|
||||
self.validators = validators or []
|
||||
|
||||
def __set__(self, instance, value):
|
||||
self._finish_initialization(type(instance))
|
||||
value = self.parse_value(value)
|
||||
self.validate(value)
|
||||
self.memory[instance._cache_key] = value
|
||||
|
||||
def __get__(self, instance, owner=None):
|
||||
if instance is None:
|
||||
self._finish_initialization(owner)
|
||||
return self
|
||||
|
||||
self._finish_initialization(type(instance))
|
||||
|
||||
self._check_value(instance)
|
||||
return self.memory[instance._cache_key]
|
||||
|
||||
def _finish_initialization(self, owner):
|
||||
pass
|
||||
|
||||
def _check_value(self, obj):
|
||||
if obj._cache_key not in self.memory:
|
||||
self.__set__(obj, self.get_default_value())
|
||||
|
||||
def validate_for_object(self, obj):
|
||||
value = self.__get__(obj)
|
||||
self.validate(value)
|
||||
|
||||
def validate(self, value):
|
||||
self._check_types()
|
||||
self._validate_against_types(value)
|
||||
self._check_against_required(value)
|
||||
self._validate_with_custom_validators(value)
|
||||
|
||||
def _check_against_required(self, value):
|
||||
if value is None and self.required:
|
||||
raise ValidationError('Field is required!')
|
||||
|
||||
def _validate_against_types(self, value):
|
||||
if value is not None and not isinstance(value, self.types):
|
||||
raise ValidationError(
|
||||
'Value is wrong, expected type "{types}"'.format(
|
||||
types=', '.join([t.__name__ for t in self.types])
|
||||
),
|
||||
value,
|
||||
)
|
||||
|
||||
def _check_types(self):
|
||||
if self.types is None:
|
||||
raise ValidationError(
|
||||
'Field "{type}" is not usable, try '
|
||||
'different field type.'.format(type=type(self).__name__))
|
||||
|
||||
def to_struct(self, value):
|
||||
"""Cast value to Python structure."""
|
||||
return value
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Parse value from primitive to desired format.
|
||||
|
||||
Each field can parse value to form it wants it to be (like string or
|
||||
int).
|
||||
|
||||
"""
|
||||
return value
|
||||
|
||||
def _validate_with_custom_validators(self, value):
|
||||
if value is None and self.nullable:
|
||||
return
|
||||
|
||||
for validator in self.validators:
|
||||
try:
|
||||
validator.validate(value)
|
||||
except AttributeError:
|
||||
validator(value)
|
||||
|
||||
def get_default_value(self):
|
||||
"""Get default value for field.
|
||||
|
||||
Each field can specify its default.
|
||||
|
||||
"""
|
||||
return self._default if self.has_default else None
|
||||
|
||||
def _validate_name(self):
|
||||
if self.name is None:
|
||||
return
|
||||
if not re.match(r'^[A-Za-z_](([\w\-]*)?\w+)?$', self.name):
|
||||
raise ValueError('Wrong name', self.name)
|
||||
|
||||
def structue_name(self, default):
|
||||
return self.name if self.name is not None else default
|
||||
|
||||
|
||||
class StringField(BaseField):
|
||||
|
||||
"""String field."""
|
||||
|
||||
types = six.string_types
|
||||
|
||||
|
||||
class IntField(BaseField):
|
||||
|
||||
"""Integer field."""
|
||||
|
||||
types = (int,)
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Cast value to `int`, e.g. from string or long"""
|
||||
parsed = super(IntField, self).parse_value(value)
|
||||
if parsed is None:
|
||||
return parsed
|
||||
return int(parsed)
|
||||
|
||||
|
||||
class FloatField(BaseField):
|
||||
|
||||
"""Float field."""
|
||||
|
||||
types = (float, int)
|
||||
|
||||
|
||||
class BoolField(BaseField):
|
||||
|
||||
"""Bool field."""
|
||||
|
||||
types = (bool,)
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Cast value to `bool`."""
|
||||
parsed = super(BoolField, self).parse_value(value)
|
||||
return bool(parsed) if parsed is not None else None
|
||||
|
||||
|
||||
class ListField(BaseField):
|
||||
|
||||
"""List field."""
|
||||
|
||||
types = (list,)
|
||||
|
||||
def __init__(self, items_types=None, *args, **kwargs):
|
||||
"""Init.
|
||||
|
||||
`ListField` is **always not required**. If you want to control number
|
||||
of items use validators.
|
||||
|
||||
"""
|
||||
self._assign_types(items_types)
|
||||
super(ListField, self).__init__(*args, **kwargs)
|
||||
self.required = False
|
||||
|
||||
def get_default_value(self):
|
||||
default = super(ListField, self).get_default_value()
|
||||
if default is None:
|
||||
return ModelCollection(self)
|
||||
return default
|
||||
|
||||
def _assign_types(self, items_types):
|
||||
if items_types:
|
||||
try:
|
||||
self.items_types = tuple(items_types)
|
||||
except TypeError:
|
||||
self.items_types = items_types,
|
||||
else:
|
||||
self.items_types = tuple()
|
||||
|
||||
types = []
|
||||
for type_ in self.items_types:
|
||||
if isinstance(type_, six.string_types):
|
||||
types.append(_LazyType(type_))
|
||||
else:
|
||||
types.append(type_)
|
||||
self.items_types = tuple(types)
|
||||
|
||||
def validate(self, value):
|
||||
super(ListField, self).validate(value)
|
||||
|
||||
if len(self.items_types) == 0:
|
||||
return
|
||||
|
||||
for item in value:
|
||||
self.validate_single_value(item)
|
||||
|
||||
def validate_single_value(self, item):
|
||||
if len(self.items_types) == 0:
|
||||
return
|
||||
|
||||
if not isinstance(item, self.items_types):
|
||||
raise ValidationError(
|
||||
'All items must be instances '
|
||||
'of "{types}", and not "{type}".'.format(
|
||||
types=', '.join([t.__name__ for t in self.items_types]),
|
||||
type=type(item).__name__,
|
||||
))
|
||||
|
||||
def parse_value(self, values):
|
||||
"""Cast value to proper collection."""
|
||||
result = self.get_default_value()
|
||||
|
||||
if not values:
|
||||
return result
|
||||
|
||||
if not isinstance(values, list):
|
||||
return values
|
||||
|
||||
return [self._cast_value(value) for value in values]
|
||||
|
||||
def _cast_value(self, value):
|
||||
if isinstance(value, self.items_types):
|
||||
return value
|
||||
else:
|
||||
if len(self.items_types) != 1:
|
||||
tpl = 'Cannot decide which type to choose from "{types}".'
|
||||
raise ValidationError(
|
||||
tpl.format(
|
||||
types=', '.join([t.__name__ for t in self.items_types])
|
||||
)
|
||||
)
|
||||
return self.items_types[0](**value)
|
||||
|
||||
def _finish_initialization(self, owner):
|
||||
super(ListField, self)._finish_initialization(owner)
|
||||
|
||||
types = []
|
||||
for type in self.items_types:
|
||||
if isinstance(type, _LazyType):
|
||||
types.append(type.evaluate(owner))
|
||||
else:
|
||||
types.append(type)
|
||||
self.items_types = tuple(types)
|
||||
|
||||
def _elem_to_struct(self, value):
|
||||
try:
|
||||
return value.to_struct()
|
||||
except AttributeError:
|
||||
return value
|
||||
|
||||
def to_struct(self, values):
|
||||
return [self._elem_to_struct(v) for v in values]
|
||||
|
||||
|
||||
class EmbeddedField(BaseField):
|
||||
|
||||
"""Field for embedded models."""
|
||||
|
||||
def __init__(self, model_types, *args, **kwargs):
|
||||
self._assign_model_types(model_types)
|
||||
super(EmbeddedField, self).__init__(*args, **kwargs)
|
||||
|
||||
def _assign_model_types(self, model_types):
|
||||
if not isinstance(model_types, (list, tuple)):
|
||||
model_types = (model_types,)
|
||||
|
||||
types = []
|
||||
for type_ in model_types:
|
||||
if isinstance(type_, six.string_types):
|
||||
types.append(_LazyType(type_))
|
||||
else:
|
||||
types.append(type_)
|
||||
self.types = tuple(types)
|
||||
|
||||
def _finish_initialization(self, owner):
|
||||
super(EmbeddedField, self)._finish_initialization(owner)
|
||||
|
||||
types = []
|
||||
for type in self.types:
|
||||
if isinstance(type, _LazyType):
|
||||
types.append(type.evaluate(owner))
|
||||
else:
|
||||
types.append(type)
|
||||
self.types = tuple(types)
|
||||
|
||||
def validate(self, value):
|
||||
super(EmbeddedField, self).validate(value)
|
||||
try:
|
||||
value.validate()
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Parse value to proper model type."""
|
||||
if not isinstance(value, dict):
|
||||
return value
|
||||
|
||||
embed_type = self._get_embed_type()
|
||||
return embed_type(**value)
|
||||
|
||||
def _get_embed_type(self):
|
||||
if len(self.types) != 1:
|
||||
raise ValidationError(
|
||||
'Cannot decide which type to choose from "{types}".'.format(
|
||||
types=', '.join([t.__name__ for t in self.types])
|
||||
)
|
||||
)
|
||||
return self.types[0]
|
||||
|
||||
def to_struct(self, value):
|
||||
return value.to_struct()
|
||||
|
||||
|
||||
class _LazyType(object):
|
||||
|
||||
def __init__(self, path):
|
||||
self.path = path
|
||||
|
||||
def evaluate(self, base_cls):
|
||||
module, type_name = _evaluate_path(self.path, base_cls)
|
||||
return _import(module, type_name)
|
||||
|
||||
|
||||
def _evaluate_path(relative_path, base_cls):
|
||||
base_module = base_cls.__module__
|
||||
|
||||
modules = _get_modules(relative_path, base_module)
|
||||
|
||||
type_name = modules.pop()
|
||||
module = '.'.join(modules)
|
||||
if not module:
|
||||
module = base_module
|
||||
return module, type_name
|
||||
|
||||
|
||||
def _get_modules(relative_path, base_module):
|
||||
canonical_path = relative_path.lstrip('.')
|
||||
canonical_modules = canonical_path.split('.')
|
||||
|
||||
if not relative_path.startswith('.'):
|
||||
return canonical_modules
|
||||
|
||||
parents_amount = len(relative_path) - len(canonical_path)
|
||||
parent_modules = base_module.split('.')
|
||||
parents_amount = max(0, parents_amount - 1)
|
||||
if parents_amount > len(parent_modules):
|
||||
raise ValueError("Can't evaluate path '{}'".format(relative_path))
|
||||
return parent_modules[:parents_amount * -1] + canonical_modules
|
||||
|
||||
|
||||
def _import(module_name, type_name):
|
||||
module = __import__(module_name, fromlist=[type_name])
|
||||
try:
|
||||
return getattr(module, type_name)
|
||||
except AttributeError:
|
||||
raise ValueError(
|
||||
"Can't find type '{}.{}'.".format(module_name, type_name))
|
||||
|
||||
|
||||
class TimeField(StringField):
|
||||
|
||||
"""Time field."""
|
||||
|
||||
types = (datetime.time,)
|
||||
|
||||
def __init__(self, str_format=None, *args, **kwargs):
|
||||
"""Init.
|
||||
|
||||
:param str str_format: Format to cast time to (if `None` - casting to
|
||||
ISO 8601 format).
|
||||
|
||||
"""
|
||||
self.str_format = str_format
|
||||
super(TimeField, self).__init__(*args, **kwargs)
|
||||
|
||||
def to_struct(self, value):
|
||||
"""Cast `time` object to string."""
|
||||
if self.str_format:
|
||||
return value.strftime(self.str_format)
|
||||
return value.isoformat()
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Parse string into instance of `time`."""
|
||||
if value is None:
|
||||
return value
|
||||
if isinstance(value, datetime.time):
|
||||
return value
|
||||
return parse(value).timetz()
|
||||
|
||||
|
||||
class DateField(StringField):
|
||||
|
||||
"""Date field."""
|
||||
|
||||
types = (datetime.date,)
|
||||
default_format = '%Y-%m-%d'
|
||||
|
||||
def __init__(self, str_format=None, *args, **kwargs):
|
||||
"""Init.
|
||||
|
||||
:param str str_format: Format to cast date to (if `None` - casting to
|
||||
%Y-%m-%d format).
|
||||
|
||||
"""
|
||||
self.str_format = str_format
|
||||
super(DateField, self).__init__(*args, **kwargs)
|
||||
|
||||
def to_struct(self, value):
|
||||
"""Cast `date` object to string."""
|
||||
if self.str_format:
|
||||
return value.strftime(self.str_format)
|
||||
return value.strftime(self.default_format)
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Parse string into instance of `date`."""
|
||||
if value is None:
|
||||
return value
|
||||
if isinstance(value, datetime.date):
|
||||
return value
|
||||
return parse(value).date()
|
||||
|
||||
|
||||
class DateTimeField(StringField):
|
||||
|
||||
"""Datetime field."""
|
||||
|
||||
types = (datetime.datetime,)
|
||||
|
||||
def __init__(self, str_format=None, *args, **kwargs):
|
||||
"""Init.
|
||||
|
||||
:param str str_format: Format to cast datetime to (if `None` - casting
|
||||
to ISO 8601 format).
|
||||
|
||||
"""
|
||||
self.str_format = str_format
|
||||
super(DateTimeField, self).__init__(*args, **kwargs)
|
||||
|
||||
def to_struct(self, value):
|
||||
"""Cast `datetime` object to string."""
|
||||
if self.str_format:
|
||||
return value.strftime(self.str_format)
|
||||
return value.isoformat()
|
||||
|
||||
def parse_value(self, value):
|
||||
"""Parse string into instance of `datetime`."""
|
||||
if isinstance(value, datetime.datetime):
|
||||
return value
|
||||
if value:
|
||||
return parse(value)
|
||||
else:
|
||||
return None
|
||||
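An illustrative sketch (not part of this diff) of how these field descriptors behave once attached to the Base model class added in models.py below; the Person model and its values are hypothetical:

from clearml_agent.backend_api.session.jsonmodels import models, fields, errors

class Person(models.Base):
    name = fields.StringField(required=True)
    age = fields.IntField()

p = Person(name="Ada", age="36")   # IntField.parse_value casts "36" -> 36
print(p.age + 1)                   # 37
try:
    p.name = 7                     # wrong type -> ValidationError raised on assignment
except errors.ValidationError as err:
    print(err)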
clearml_agent/backend_api/session/jsonmodels/models.py (new file, 154 lines)
@@ -0,0 +1,154 @@
|
||||
import six
|
||||
|
||||
from . import parsers, errors
|
||||
from .fields import BaseField
|
||||
from .errors import ValidationError
|
||||
|
||||
|
||||
class JsonmodelMeta(type):
|
||||
|
||||
def __new__(cls, name, bases, attributes):
|
||||
cls.validate_fields(attributes)
|
||||
return super(cls, cls).__new__(cls, name, bases, attributes)
|
||||
|
||||
@staticmethod
|
||||
def validate_fields(attributes):
|
||||
fields = {
|
||||
key: value for key, value in attributes.items()
|
||||
if isinstance(value, BaseField)
|
||||
}
|
||||
taken_names = set()
|
||||
for name, field in fields.items():
|
||||
structue_name = field.structue_name(name)
|
||||
if structue_name in taken_names:
|
||||
raise ValueError('Name taken', structue_name, name)
|
||||
taken_names.add(structue_name)
|
||||
|
||||
|
||||
class Base(six.with_metaclass(JsonmodelMeta, object)):
|
||||
|
||||
"""Base class for all models."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._cache_key = _CacheKey()
|
||||
self.populate(**kwargs)
|
||||
|
||||
def populate(self, **values):
|
||||
"""Populate values to fields. Skip non-existing."""
|
||||
values = values.copy()
|
||||
fields = list(self.iterate_with_name())
|
||||
for _, structure_name, field in fields:
|
||||
if structure_name in values:
|
||||
field.__set__(self, values.pop(structure_name))
|
||||
for name, _, field in fields:
|
||||
if name in values:
|
||||
field.__set__(self, values.pop(name))
|
||||
|
||||
def get_field(self, field_name):
|
||||
"""Get field associated with given attribute."""
|
||||
for attr_name, field in self:
|
||||
if field_name == attr_name:
|
||||
return field
|
||||
|
||||
raise errors.FieldNotFound('Field not found', field_name)
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate through fields and values."""
|
||||
for name, field in self.iterate_over_fields():
|
||||
yield name, field
|
||||
|
||||
def validate(self):
|
||||
"""Explicitly validate all the fields."""
|
||||
for name, field in self:
|
||||
try:
|
||||
field.validate_for_object(self)
|
||||
except ValidationError as error:
|
||||
raise ValidationError(
|
||||
"Error for field '{name}'.".format(name=name),
|
||||
error,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def iterate_over_fields(cls):
|
||||
"""Iterate through fields as `(attribute_name, field_instance)`."""
|
||||
for attr in dir(cls):
|
||||
clsattr = getattr(cls, attr)
|
||||
if isinstance(clsattr, BaseField):
|
||||
yield attr, clsattr
|
||||
|
||||
@classmethod
|
||||
def iterate_with_name(cls):
|
||||
"""Iterate over fields, but also give `structure_name`.
|
||||
|
||||
Format is `(attribute_name, structue_name, field_instance)`.
|
||||
Structure name is name under which value is seen in structure and
|
||||
schema (in primitives) and only there.
|
||||
"""
|
||||
for attr_name, field in cls.iterate_over_fields():
|
||||
structure_name = field.structue_name(attr_name)
|
||||
yield attr_name, structure_name, field
|
||||
|
||||
def to_struct(self):
|
||||
"""Cast model to Python structure."""
|
||||
return parsers.to_struct(self)
|
||||
|
||||
@classmethod
|
||||
def to_json_schema(cls):
|
||||
"""Generate JSON schema for model."""
|
||||
return parsers.to_json_schema(cls)
|
||||
|
||||
def __repr__(self):
|
||||
attrs = {}
|
||||
for name, _ in self:
|
||||
try:
|
||||
attr = getattr(self, name)
|
||||
if attr is not None:
|
||||
attrs[name] = repr(attr)
|
||||
except ValidationError:
|
||||
pass
|
||||
|
||||
return '{class_name}({fields})'.format(
|
||||
class_name=self.__class__.__name__,
|
||||
fields=', '.join(
|
||||
'{0[0]}={0[1]}'.format(x) for x in sorted(attrs.items())
|
||||
),
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return '{name} object'.format(name=self.__class__.__name__)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
try:
|
||||
return super(Base, self).__setattr__(name, value)
|
||||
except ValidationError as error:
|
||||
raise ValidationError(
|
||||
"Error for field '{name}'.".format(name=name),
|
||||
error
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
if type(other) is not type(self):
|
||||
return False
|
||||
|
||||
for name, _ in self.iterate_over_fields():
|
||||
try:
|
||||
our = getattr(self, name)
|
||||
except errors.ValidationError:
|
||||
our = None
|
||||
|
||||
try:
|
||||
their = getattr(other, name)
|
||||
except errors.ValidationError:
|
||||
their = None
|
||||
|
||||
if our != their:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def __ne__(self, other):
|
||||
return not (self == other)
|
||||
|
||||
|
||||
class _CacheKey(object):
|
||||
"""Object to identify model in memory."""
|
||||
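A brief sketch (not part of this diff) of the serialization helpers above, using a hypothetical model built from the fields module earlier in this diff:

from clearml_agent.backend_api.session.jsonmodels import models, fields

class Point(models.Base):
    x = fields.FloatField(required=True)
    y = fields.FloatField(required=True)

print(Point(x=1.0, y=2.5).to_struct())         # {'x': 1.0, 'y': 2.5}
print(Point.to_json_schema().get("required"))  # ['x', 'y']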
clearml_agent/backend_api/session/jsonmodels/parsers.py (new file, 106 lines)
@@ -0,0 +1,106 @@
|
||||
"""Parsers to change model structure into different ones."""
|
||||
import inspect
|
||||
|
||||
from . import fields, builders, errors
|
||||
|
||||
|
||||
def to_struct(model):
|
||||
"""Cast instance of model to python structure.
|
||||
|
||||
:param model: Model to be casted.
|
||||
:rtype: ``dict``
|
||||
|
||||
"""
|
||||
model.validate()
|
||||
|
||||
resp = {}
|
||||
for _, name, field in model.iterate_with_name():
|
||||
value = field.__get__(model)
|
||||
if value is None:
|
||||
continue
|
||||
|
||||
value = field.to_struct(value)
|
||||
resp[name] = value
|
||||
return resp
|
||||
|
||||
|
||||
def to_json_schema(cls):
|
||||
"""Generate JSON schema for given class.
|
||||
|
||||
:param cls: Class to be casted.
|
||||
:rtype: ``dict``
|
||||
|
||||
"""
|
||||
builder = build_json_schema(cls)
|
||||
return builder.build()
|
||||
|
||||
|
||||
def build_json_schema(value, parent_builder=None):
|
||||
from .models import Base
|
||||
|
||||
cls = value if inspect.isclass(value) else value.__class__
|
||||
if issubclass(cls, Base):
|
||||
return build_json_schema_object(cls, parent_builder)
|
||||
else:
|
||||
return build_json_schema_primitive(cls, parent_builder)
|
||||
|
||||
|
||||
def build_json_schema_object(cls, parent_builder=None):
|
||||
builder = builders.ObjectBuilder(cls, parent_builder)
|
||||
if builder.count_type(builder.type) > 1:
|
||||
return builder
|
||||
for _, name, field in cls.iterate_with_name():
|
||||
if isinstance(field, fields.EmbeddedField):
|
||||
builder.add_field(name, field, _parse_embedded(field, builder))
|
||||
elif isinstance(field, fields.ListField):
|
||||
builder.add_field(name, field, _parse_list(field, builder))
|
||||
else:
|
||||
builder.add_field(
|
||||
name, field, _create_primitive_field_schema(field))
|
||||
return builder
|
||||
|
||||
|
||||
def _parse_list(field, parent_builder):
|
||||
builder = builders.ListBuilder(
|
||||
parent_builder, field.nullable, default=field._default)
|
||||
for type in field.items_types:
|
||||
builder.add_type_schema(build_json_schema(type, builder))
|
||||
return builder
|
||||
|
||||
|
||||
def _parse_embedded(field, parent_builder):
|
||||
builder = builders.EmbeddedBuilder(
|
||||
parent_builder, field.nullable, default=field._default)
|
||||
for type in field.types:
|
||||
builder.add_type_schema(build_json_schema(type, builder))
|
||||
return builder
|
||||
|
||||
|
||||
def build_json_schema_primitive(cls, parent_builder):
|
||||
builder = builders.PrimitiveBuilder(cls, parent_builder)
|
||||
return builder
|
||||
|
||||
|
||||
def _create_primitive_field_schema(field):
|
||||
if isinstance(field, fields.StringField):
|
||||
obj_type = 'string'
|
||||
elif isinstance(field, fields.IntField):
|
||||
obj_type = 'number'
|
||||
elif isinstance(field, fields.FloatField):
|
||||
obj_type = 'float'
|
||||
elif isinstance(field, fields.BoolField):
|
||||
obj_type = 'boolean'
|
||||
else:
|
||||
raise errors.FieldNotSupported(
|
||||
'Field {field} is not supported!'.format(
|
||||
field=type(field).__class__.__name__))
|
||||
|
||||
if field.nullable:
|
||||
obj_type = [obj_type, 'null']
|
||||
|
||||
schema = {'type': obj_type}
|
||||
|
||||
if field.has_default:
|
||||
schema["default"] = field._default
|
||||
|
||||
return schema
|
||||
clearml_agent/backend_api/session/jsonmodels/utilities.py (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import six
|
||||
import re
|
||||
from collections import namedtuple
|
||||
|
||||
SCALAR_TYPES = tuple(list(six.string_types) + [int, float, bool])
|
||||
|
||||
ECMA_TO_PYTHON_FLAGS = {
|
||||
'i': re.I,
|
||||
'm': re.M,
|
||||
}
|
||||
|
||||
PYTHON_TO_ECMA_FLAGS = dict(
|
||||
(value, key) for key, value in ECMA_TO_PYTHON_FLAGS.items()
|
||||
)
|
||||
|
||||
PythonRegex = namedtuple('PythonRegex', ['regex', 'flags'])
|
||||
|
||||
|
||||
def _normalize_string_type(value):
|
||||
if isinstance(value, six.string_types):
|
||||
return six.text_type(value)
|
||||
else:
|
||||
return value
|
||||
|
||||
|
||||
def _compare_dicts(one, two):
|
||||
if len(one) != len(two):
|
||||
return False
|
||||
|
||||
for key, value in one.items():
|
||||
if key not in one or key not in two:
|
||||
return False
|
||||
|
||||
if not compare_schemas(one[key], two[key]):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _compare_lists(one, two):
|
||||
if len(one) != len(two):
|
||||
return False
|
||||
|
||||
they_match = False
|
||||
for first_item in one:
|
||||
for second_item in two:
|
||||
if they_match:
|
||||
continue
|
||||
they_match = compare_schemas(first_item, second_item)
|
||||
return they_match
|
||||
|
||||
|
||||
def _assert_same_types(one, two):
|
||||
if not isinstance(one, type(two)) or not isinstance(two, type(one)):
|
||||
raise RuntimeError('Types mismatch! "{type1}" and "{type2}".'.format(
|
||||
type1=type(one).__name__, type2=type(two).__name__))
|
||||
|
||||
|
||||
def compare_schemas(one, two):
|
||||
"""Compare two structures that represents JSON schemas.
|
||||
|
||||
For comparison you can't use normal comparison, because in JSON schema
|
||||
lists DO NOT keep order (and Python lists do), so this must be taken into
|
||||
account during comparison.
|
||||
|
||||
Note this won't check all configurations, only the first one that seems to
|
||||
match, which can lead to wrong results.
|
||||
|
||||
:param one: First schema to compare.
|
||||
:param two: Second schema to compare.
|
||||
:rtype: `bool`
|
||||
|
||||
"""
|
||||
one = _normalize_string_type(one)
|
||||
two = _normalize_string_type(two)
|
||||
|
||||
_assert_same_types(one, two)
|
||||
|
||||
if isinstance(one, list):
|
||||
return _compare_lists(one, two)
|
||||
elif isinstance(one, dict):
|
||||
return _compare_dicts(one, two)
|
||||
elif isinstance(one, SCALAR_TYPES):
|
||||
return one == two
|
||||
elif one is None:
|
||||
return one is two
|
||||
else:
|
||||
raise RuntimeError('Not allowed type "{type}"'.format(
|
||||
type=type(one).__name__))
|
||||
|
||||
|
||||
def is_ecma_regex(regex):
|
||||
"""Check if given regex is of type ECMA 262 or not.
|
||||
|
||||
:rtype: bool
|
||||
|
||||
"""
|
||||
parts = regex.split('/')
|
||||
|
||||
if len(parts) == 1:
|
||||
return False
|
||||
|
||||
if len(parts) < 3:
|
||||
raise ValueError('Given regex isn\'t ECMA regex nor Python regex.')
|
||||
parts.pop()
|
||||
parts.append('')
|
||||
|
||||
raw_regex = '/'.join(parts)
|
||||
if raw_regex.startswith('/') and raw_regex.endswith('/'):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def convert_ecma_regex_to_python(value):
|
||||
"""Convert ECMA 262 regex to Python tuple with regex and flags.
|
||||
|
||||
If given value is already Python regex it will be returned unchanged.
|
||||
|
||||
:param string value: ECMA regex.
|
||||
:return: 2-tuple with `regex` and `flags`
|
||||
:rtype: namedtuple
|
||||
|
||||
"""
|
||||
if not is_ecma_regex(value):
|
||||
return PythonRegex(value, [])
|
||||
|
||||
parts = value.split('/')
|
||||
flags = parts.pop()
|
||||
|
||||
try:
|
||||
result_flags = [ECMA_TO_PYTHON_FLAGS[f] for f in flags]
|
||||
except KeyError:
|
||||
raise ValueError('Wrong flags "{}".'.format(flags))
|
||||
|
||||
return PythonRegex('/'.join(parts[1:]), result_flags)
|
||||
|
||||
|
||||
def convert_python_regex_to_ecma(value, flags=[]):
|
||||
"""Convert Python regex to ECMA 262 regex.
|
||||
|
||||
If given value is already ECMA regex it will be returned unchanged.
|
||||
|
||||
:param string value: Python regex.
|
||||
:param list flags: List of flags (allowed flags: `re.I`, `re.M`)
|
||||
:return: ECMA 262 regex
|
||||
:rtype: str
|
||||
|
||||
"""
|
||||
if is_ecma_regex(value):
|
||||
return value
|
||||
|
||||
result_flags = [PYTHON_TO_ECMA_FLAGS[f] for f in flags]
|
||||
result_flags = ''.join(result_flags)
|
||||
|
||||
return '/{value}/{flags}'.format(value=value, flags=result_flags)
|
||||
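An illustrative round-trip (not part of this diff) through the ECMA/Python regex helpers above:

import re
from clearml_agent.backend_api.session.jsonmodels.utilities import (
    convert_ecma_regex_to_python,
    convert_python_regex_to_ecma,
    is_ecma_regex,
)

print(is_ecma_regex('/^[a-z]+$/i'))                      # True
print(convert_ecma_regex_to_python('/^[a-z]+$/i'))       # pattern '^[a-z]+$' with the re.I flag
print(convert_python_regex_to_ecma('^[a-z]+$', [re.I]))  # '/^[a-z]+$/i'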
clearml_agent/backend_api/session/jsonmodels/validators.py (new file, 202 lines)
@@ -0,0 +1,202 @@
|
||||
"""Predefined validators."""
|
||||
import re
|
||||
|
||||
from six.moves import reduce
|
||||
|
||||
from .errors import ValidationError
|
||||
from . import utilities
|
||||
|
||||
|
||||
class Min(object):
|
||||
|
||||
"""Validator for minimum value."""
|
||||
|
||||
def __init__(self, minimum_value, exclusive=False):
|
||||
"""Init.
|
||||
|
||||
:param minimum_value: Minimum value for validator.
|
||||
:param bool exclusive: If `True`, then validated value must be strongly
|
||||
greater than the given threshold.
|
||||
|
||||
"""
|
||||
self.minimum_value = minimum_value
|
||||
self.exclusive = exclusive
|
||||
|
||||
def validate(self, value):
|
||||
"""Validate value."""
|
||||
if self.exclusive:
|
||||
if value <= self.minimum_value:
|
||||
tpl = "'{value}' is lower or equal than minimum ('{min}')."
|
||||
raise ValidationError(
|
||||
tpl.format(value=value, min=self.minimum_value))
|
||||
else:
|
||||
if value < self.minimum_value:
|
||||
raise ValidationError(
|
||||
"'{value}' is lower than minimum ('{min}').".format(
|
||||
value=value, min=self.minimum_value))
|
||||
|
||||
def modify_schema(self, field_schema):
|
||||
"""Modify field schema."""
|
||||
field_schema['minimum'] = self.minimum_value
|
||||
if self.exclusive:
|
||||
field_schema['exclusiveMinimum'] = True
|
||||
|
||||
|
||||
class Max(object):
|
||||
|
||||
"""Validator for maximum value."""
|
||||
|
||||
def __init__(self, maximum_value, exclusive=False):
|
||||
"""Init.
|
||||
|
||||
:param maximum_value: Maximum value for validator.
|
||||
:param bool exclusive: If `True`, then validated value must be strongly
|
||||
lower than the given threshold.
|
||||
|
||||
"""
|
||||
self.maximum_value = maximum_value
|
||||
self.exclusive = exclusive
|
||||
|
||||
def validate(self, value):
|
||||
"""Validate value."""
|
||||
if self.exclusive:
|
||||
if value >= self.maximum_value:
|
||||
tpl = "'{val}' is bigger or equal than maximum ('{max}')."
|
||||
raise ValidationError(
|
||||
tpl.format(val=value, max=self.maximum_value))
|
||||
else:
|
||||
if value > self.maximum_value:
|
||||
raise ValidationError(
|
||||
"'{value}' is bigger than maximum ('{max}').".format(
|
||||
value=value, max=self.maximum_value))
|
||||
|
||||
def modify_schema(self, field_schema):
|
||||
"""Modify field schema."""
|
||||
field_schema['maximum'] = self.maximum_value
|
||||
if self.exclusive:
|
||||
field_schema['exclusiveMaximum'] = True
|
||||
|
||||
|
||||
class Regex(object):
|
||||
|
||||
"""Validator for regular expressions."""
|
||||
|
||||
FLAGS = {
|
||||
'ignorecase': re.I,
|
||||
'multiline': re.M,
|
||||
}
|
||||
|
||||
def __init__(self, pattern, **flags):
|
||||
"""Init.
|
||||
|
||||
Note, that if given pattern is ECMA regex, given flags will be
|
||||
**completely ignored** and taken from given regex.
|
||||
|
||||
|
||||
:param string pattern: Pattern of regex.
|
||||
:param bool flags: Flags used for the regex matching.
|
||||
Allowed flag names are in the `FLAGS` attribute. The flag value
|
||||
does not matter as long as it evaluates to True.
|
||||
Flags with False values will be ignored.
|
||||
Invalid flags will be ignored.
|
||||
|
||||
"""
|
||||
if utilities.is_ecma_regex(pattern):
|
||||
result = utilities.convert_ecma_regex_to_python(pattern)
|
||||
self.pattern, self.flags = result
|
||||
else:
|
||||
self.pattern = pattern
|
||||
self.flags = [self.FLAGS[key] for key, value in flags.items()
|
||||
if key in self.FLAGS and value]
|
||||
|
||||
def validate(self, value):
|
||||
"""Validate value."""
|
||||
flags = self._calculate_flags()
|
||||
|
||||
try:
|
||||
result = re.search(self.pattern, value, flags)
|
||||
except TypeError as te:
|
||||
raise ValidationError(*te.args)
|
||||
|
||||
if not result:
|
||||
raise ValidationError(
|
||||
'Value "{value}" did not match pattern "{pattern}".'.format(
|
||||
value=value, pattern=self.pattern
|
||||
))
|
||||
|
||||
def _calculate_flags(self):
|
||||
return reduce(lambda x, y: x | y, self.flags, 0)
|
||||
|
||||
def modify_schema(self, field_schema):
|
||||
"""Modify field schema."""
|
||||
field_schema['pattern'] = utilities.convert_python_regex_to_ecma(
|
||||
self.pattern, self.flags)
|
||||
|
||||
|
||||
class Length(object):
|
||||
|
||||
"""Validator for length."""
|
||||
|
||||
def __init__(self, minimum_value=None, maximum_value=None):
|
||||
"""Init.
|
||||
|
||||
Note that if neither `minimum_value` nor `maximum_value` is
|
||||
specified, `ValueError` will be raised.
|
||||
|
||||
:param int minimum_value: Minimum value (optional).
|
||||
:param int maximum_value: Maximum value (optional).
|
||||
|
||||
"""
|
||||
if minimum_value is None and maximum_value is None:
|
||||
raise ValueError(
|
||||
"Either 'minimum_value' or 'maximum_value' must be specified.")
|
||||
|
||||
self.minimum_value = minimum_value
|
||||
self.maximum_value = maximum_value
|
||||
|
||||
def validate(self, value):
|
||||
"""Validate value."""
|
||||
len_ = len(value)
|
||||
|
||||
if self.minimum_value is not None and len_ < self.minimum_value:
|
||||
tpl = "Value '{val}' length is lower than allowed minimum '{min}'."
|
||||
raise ValidationError(tpl.format(
|
||||
val=value, min=self.minimum_value
|
||||
))
|
||||
|
||||
if self.maximum_value is not None and len_ > self.maximum_value:
|
||||
raise ValidationError(
|
||||
"Value '{val}' length is bigger than "
|
||||
"allowed maximum '{max}'.".format(
|
||||
val=value,
|
||||
max=self.maximum_value,
|
||||
))
|
||||
|
||||
def modify_schema(self, field_schema):
|
||||
"""Modify field schema."""
|
||||
if self.minimum_value:
|
||||
field_schema['minLength'] = self.minimum_value
|
||||
|
||||
if self.maximum_value:
|
||||
field_schema['maxLength'] = self.maximum_value
|
||||
|
||||
|
||||
class Enum(object):
|
||||
|
||||
"""Validator for enums."""
|
||||
|
||||
def __init__(self, *choices):
|
||||
"""Init.
|
||||
|
||||
:param [] choices: Valid choices for the field.
|
||||
"""
|
||||
|
||||
self.choices = list(choices)
|
||||
|
||||
def validate(self, value):
|
||||
if value not in self.choices:
|
||||
tpl = "Value '{val}' is not a valid choice."
|
||||
raise ValidationError(tpl.format(val=value))
|
||||
|
||||
def modify_schema(self, field_schema):
|
||||
field_schema['enum'] = self.choices
|
||||
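A short sketch (not part of this diff) combining these validators with the fields and models modules shown earlier; the Worker model and its limits are hypothetical:

from clearml_agent.backend_api.session.jsonmodels import models, fields, errors
from clearml_agent.backend_api.session.jsonmodels.validators import Length, Max, Min

class Worker(models.Base):
    # validators run inside field.validate() on every assignment
    name = fields.StringField(required=True, validators=Length(1, 64))
    slots = fields.IntField(validators=[Min(1), Max(128)])

w = Worker(name="gpu-worker", slots=8)
try:
    w.slots = 0    # below Min(1) -> ValidationError
except errors.ValidationError as err:
    print(err)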
@@ -5,10 +5,17 @@ import six
|
||||
|
||||
from .apimodel import ApiModel
|
||||
from .datamodel import DataModel
|
||||
from .defs import ENV_API_DEFAULT_REQ_METHOD
|
||||
|
||||
|
||||
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
|
||||
raise ValueError(
|
||||
"CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get' or 'post' (any case is allowed)."
|
||||
)
|
||||
|
||||
|
||||
class Request(ApiModel):
|
||||
_method = 'get'
|
||||
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs:
|
||||
@@ -1,10 +1,8 @@
|
||||
import requests
|
||||
|
||||
import six
|
||||
import jsonmodels.models
|
||||
import jsonmodels.fields
|
||||
import jsonmodels.errors
|
||||
|
||||
from . import jsonmodels
|
||||
from .apimodel import ApiModel
|
||||
from .datamodel import NonStrictDataModelMixin
|
||||
|
||||
@@ -1,17 +1,21 @@
|
||||
|
||||
import json as json_lib
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
from socket import gethostname
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
from typing import Optional
|
||||
|
||||
import jwt
|
||||
import requests
|
||||
import six
|
||||
from pyhocon import ConfigTree
|
||||
from pyhocon import ConfigTree, ConfigFactory
|
||||
from requests.auth import HTTPBasicAuth
|
||||
from six.moves.urllib.parse import urlparse, urlunparse
|
||||
|
||||
from .callresult import CallResult
|
||||
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST
|
||||
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
|
||||
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
|
||||
from .request import Request, BatchRequest
|
||||
from .token_manager import TokenManager
|
||||
from ..config import load
|
||||
@@ -29,24 +33,26 @@ class MaxRequestSizeError(Exception):
|
||||
|
||||
|
||||
class Session(TokenManager):
|
||||
""" TRAINS API Session class. """
|
||||
""" ClearML API Session class. """
|
||||
|
||||
_AUTHORIZATION_HEADER = "Authorization"
|
||||
_WORKER_HEADER = "X-Trains-Worker"
|
||||
_ASYNC_HEADER = "X-Trains-Async"
|
||||
_CLIENT_HEADER = "X-Trains-Agent"
|
||||
_WORKER_HEADER = ("X-ClearML-Worker", "X-Trains-Worker", )
|
||||
_ASYNC_HEADER = ("X-ClearML-Async", "X-Trains-Async", )
|
||||
_CLIENT_HEADER = ("X-ClearML-Agent", "X-Trains-Agent", )
|
||||
|
||||
_async_status_code = 202
|
||||
_session_requests = 0
|
||||
_session_initial_timeout = (3.0, 10.)
|
||||
_session_timeout = (10.0, 30.)
|
||||
_session_initial_retry_connect_override = 4
|
||||
_write_session_data_size = 15000
|
||||
_write_session_timeout = (30.0, 30.)
|
||||
|
||||
api_version = '2.1'
|
||||
default_host = "https://demoapi.trains.allegro.ai"
|
||||
default_web = "https://demoapp.trains.allegro.ai"
|
||||
default_files = "https://demofiles.trains.allegro.ai"
|
||||
feature_set = 'basic'
|
||||
default_host = "https://demoapi.demo.clear.ml"
|
||||
default_web = "https://demoapp.demo.clear.ml"
|
||||
default_files = "https://demofiles.demo.clear.ml"
|
||||
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
|
||||
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
|
||||
|
||||
@@ -85,6 +91,7 @@ class Session(TokenManager):
|
||||
initialize_logging=True,
|
||||
client=None,
|
||||
config=None,
|
||||
http_retries_config=None,
|
||||
**kwargs
|
||||
):
|
||||
# add backward compatibility support for old environment variables
|
||||
@@ -95,45 +102,50 @@ class Session(TokenManager):
|
||||
else:
|
||||
self.config = load()
|
||||
if initialize_logging:
|
||||
self.config.initialize_logging()
|
||||
self.config.initialize_logging(debug=kwargs.get('debug', False))
|
||||
|
||||
token_expiration_threshold_sec = self.config.get(
|
||||
"auth.token_expiration_threshold_sec", 60
|
||||
)
|
||||
|
||||
super(Session, self).__init__(
|
||||
token_expiration_threshold_sec=token_expiration_threshold_sec, **kwargs
|
||||
)
|
||||
super(Session, self).__init__(config=config, **kwargs)
|
||||
|
||||
self._verbose = verbose if verbose is not None else ENV_VERBOSE.get()
|
||||
self._logger = logger
|
||||
self.__auth_token = None
|
||||
|
||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key)
|
||||
)
|
||||
if not self.access_key:
|
||||
raise ValueError(
|
||||
"Missing access_key. Please set in configuration file or pass in session init."
|
||||
if ENV_AUTH_TOKEN.get(
|
||||
value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
|
||||
):
|
||||
self.set_auth_token(ENV_AUTH_TOKEN.get())
|
||||
else:
|
||||
self.__access_key = api_key or ENV_ACCESS_KEY.get(
|
||||
default=(self.config.get("api.credentials.access_key", None) or self.default_key),
|
||||
value_cb=lambda key, value: print("Using environment access key {}={}".format(key, value))
|
||||
)
|
||||
if not self.access_key:
|
||||
raise ValueError(
|
||||
"Missing access_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
|
||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret)
|
||||
)
|
||||
if not self.secret_key:
|
||||
raise ValueError(
|
||||
"Missing secret_key. Please set in configuration file or pass in session init."
|
||||
self.__secret_key = secret_key or ENV_SECRET_KEY.get(
|
||||
default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
|
||||
value_cb=lambda key, value: print("Using environment secret key {}=********".format(key))
|
||||
)
|
||||
if not self.secret_key:
|
||||
raise ValueError(
|
||||
"Missing secret_key. Please set in configuration file or pass in session init."
|
||||
)
|
||||
|
||||
if self.access_key == self.default_key and self.secret_key == self.default_secret:
|
||||
print("Using built-in ClearML default key/secret")
|
||||
|
||||
host = host or self.get_api_server_host(config=self.config)
|
||||
if not host:
|
||||
raise ValueError("host is required in init or config")
|
||||
raise ValueError(
|
||||
"Could not find host server definition "
|
||||
"(missing `~/clearml.conf` or Environment CLEARML_API_HOST)\n"
|
||||
"To get started with ClearML: setup your own `clearml-server`, "
|
||||
"or create a free account at https://app.clear.ml and run `clearml-agent init`"
|
||||
)
|
||||
|
||||
self.__host = host.strip("/")
|
||||
http_retries_config = self.config.get(
|
||||
"api.http.retries", ConfigTree()
|
||||
).as_plain_ordered_dict()
|
||||
http_retries_config["status_forcelist"] = self._retry_codes
|
||||
self.__http_session = get_http_session_with_retry(**http_retries_config)
|
||||
|
||||
self.__worker = worker or gethostname()
|
||||
|
||||
@@ -143,16 +155,26 @@ class Session(TokenManager):
|
||||
|
||||
self.client = client or "api-{}".format(__version__)
|
||||
|
||||
# limit the reconnect retries, so we get an error if we are starting the session
|
||||
_, self.__http_session = self._setup_session(
|
||||
http_retries_config,
|
||||
initial_session=True,
|
||||
default_initial_connect_override=(False if kwargs.get("command") == "execute" else None)
|
||||
)
|
||||
# try to connect with the server
|
||||
self.refresh_token()
|
||||
# create the default session with many retries
|
||||
http_retries_config, self.__http_session = self._setup_session(http_retries_config)
|
||||
|
||||
# update api version from server response
|
||||
try:
|
||||
token_dict = jwt.decode(self.token, verify=False)
|
||||
token_dict = TokenManager.get_decoded_token(self.token, verify=False)
|
||||
api_version = token_dict.get('api_version')
|
||||
if not api_version:
|
||||
api_version = '2.2' if token_dict.get('env', '') == 'prod' else Session.api_version
|
||||
|
||||
Session.api_version = str(api_version)
|
||||
Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
|
||||
except (jwt.DecodeError, ValueError):
|
||||
pass
|
||||
|
||||
@@ -161,6 +183,69 @@ class Session(TokenManager):
|
||||
# notice: this is across the board warning omission
|
||||
urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)
|
||||
|
||||
def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
|
||||
# type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
|
||||
http_retries_config = http_retries_config or self.config.get(
|
||||
"api.http.retries", ConfigTree()
|
||||
).as_plain_ordered_dict()
|
||||
http_retries_config["status_forcelist"] = self._retry_codes
|
||||
|
||||
if initial_session:
|
||||
kwargs = {} if default_initial_connect_override is None else {
|
||||
"default": default_initial_connect_override
|
||||
}
|
||||
if ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(**kwargs):
|
||||
connect_retries = self._session_initial_retry_connect_override
|
||||
try:
|
||||
value = ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(converter=str)
|
||||
if not isinstance(value, bool):
|
||||
connect_retries = abs(int(value))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
http_retries_config = dict(**http_retries_config)
|
||||
http_retries_config['connect'] = connect_retries
|
||||
|
||||
return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)
|
||||
|
||||
def load_vaults(self):
|
||||
if not self.check_min_api_version("2.15") or self.feature_set == "basic":
|
||||
return
|
||||
|
||||
if ENV_DISABLE_VAULT_SUPPORT.get():
|
||||
print("Vault support is disabled")
|
||||
return
|
||||
|
||||
def parse(vault):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
d = vault.get('data', None)
|
||||
if d:
|
||||
r = ConfigFactory.parse_string(d)
|
||||
if isinstance(r, (ConfigTree, dict)):
|
||||
return r
|
||||
except Exception as e:
|
||||
print("Failed parsing vault {}: {}".format(vault.get("description", "<unknown>"), e))
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
res = self.send_request("users", "get_vaults", json={"enabled": True, "types": ["config"]})
|
||||
if res.ok:
|
||||
vaults = res.json().get("data", {}).get("vaults", [])
|
||||
data = list(filter(None, map(parse, vaults)))
|
||||
if data:
|
||||
self.config.set_overrides(*data)
|
||||
elif res.status_code != 404:
|
||||
raise Exception(res.json().get("meta", {}).get("result_msg", res.text))
|
||||
except Exception as ex:
|
||||
print("Failed getting vaults: {}".format(ex))
|
||||
|
||||
def verify_feature_set(self, feature_set):
|
||||
if isinstance(feature_set, str):
|
||||
feature_set = [feature_set]
|
||||
if self.feature_set not in feature_set:
|
||||
raise ValueError('ClearML-server does not support requested feature set {}'.format(feature_set))
|
||||
|
||||
def _send_request(
|
||||
self,
|
||||
service,
|
||||
@@ -184,8 +269,10 @@ class Session(TokenManager):
|
||||
"""
|
||||
host = self.host
|
||||
headers = headers.copy() if headers else {}
|
||||
headers[self._WORKER_HEADER] = self.worker
|
||||
headers[self._CLIENT_HEADER] = self.client
|
||||
for h in self._WORKER_HEADER:
|
||||
headers[h] = self.worker
|
||||
for h in self._CLIENT_HEADER:
|
||||
headers[h] = self.client
|
||||
|
||||
token_refreshed_on_error = False
|
||||
url = (
|
||||
@@ -232,6 +319,10 @@ class Session(TokenManager):
|
||||
headers[self._AUTHORIZATION_HEADER] = "Bearer {}".format(self.token)
|
||||
return headers
|
||||
|
||||
def set_auth_token(self, auth_token):
|
||||
self.__access_key = self.__secret_key = None
|
||||
self._set_token(auth_token)
|
||||
|
||||
def send_request(
|
||||
self,
|
||||
service,
|
||||
@@ -260,7 +351,8 @@ class Session(TokenManager):
|
||||
headers.copy() if headers else {}
|
||||
)
|
||||
if async_enable:
|
||||
headers[self._ASYNC_HEADER] = "1"
|
||||
for h in self._ASYNC_HEADER:
|
||||
headers[h] = "1"
|
||||
return self._send_request(
|
||||
service=service,
|
||||
action=action,
|
||||
@@ -426,16 +518,18 @@ class Session(TokenManager):
|
||||
@classmethod
|
||||
def get_api_server_host(cls, config=None):
|
||||
if not config:
|
||||
from ...config import config_obj
|
||||
config = config_obj
|
||||
return ENV_HOST.get(default=(config.get("api.api_server", None) or
|
||||
config.get("api.host", None) or cls.default_host))
|
||||
return None
|
||||
|
||||
default = config.get("api.api_server", None) or config.get("api.host", None)
|
||||
if not ENV_NO_DEFAULT_SERVER.get():
|
||||
default = default or cls.default_host
|
||||
|
||||
return ENV_HOST.get(default=default)
|
||||
|
||||
@classmethod
|
||||
def get_app_server_host(cls, config=None):
|
||||
if not config:
|
||||
from ...config import config_obj
|
||||
config = config_obj
|
||||
return None
|
||||
|
||||
# get from config/environment
|
||||
web_host = ENV_WEB_HOST.get(default=config.get("api.web_server", None))
|
||||
@@ -457,13 +551,13 @@ class Session(TokenManager):
|
||||
if parsed.port == 8008:
|
||||
return host.replace(':8008', ':8080', 1)
|
||||
|
||||
raise ValueError('Could not detect TRAINS web application server')
|
||||
raise ValueError('Could not detect ClearML web application server')
|
||||
|
||||
@classmethod
|
||||
def get_files_server_host(cls, config=None):
|
||||
if not config:
|
||||
from ...config import config_obj
|
||||
config = config_obj
|
||||
return None
|
||||
|
||||
# get from config/environment
|
||||
files_host = ENV_FILES_HOST.get(default=(config.get("api.files_server", None)))
|
||||
if files_host:
|
||||
@@ -498,7 +592,7 @@ class Session(TokenManager):
|
||||
return v + (0,) * max(0, 3 - len(v))
|
||||
return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
|
||||
|
||||
def _do_refresh_token(self, old_token, exp=None):
|
||||
def _do_refresh_token(self, current_token, exp=None):
|
||||
""" TokenManager abstract method implementation.
|
||||
Here we ignore the old token and simply obtain a new token.
|
||||
"""
|
||||
@@ -510,15 +604,23 @@ class Session(TokenManager):
|
||||
)
|
||||
)
|
||||
|
||||
auth = HTTPBasicAuth(self.access_key, self.secret_key)
|
||||
auth = None
|
||||
headers = None
|
||||
if self.access_key and self.secret_key:
|
||||
auth = HTTPBasicAuth(self.access_key, self.secret_key)
|
||||
elif current_token:
|
||||
headers = dict(Authorization="Bearer {}".format(current_token))
|
||||
|
||||
res = None
|
||||
try:
|
||||
data = {"expiration_sec": exp} if exp else {}
|
||||
res = self._send_request(
|
||||
method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
|
||||
service="auth",
|
||||
action="login",
|
||||
auth=auth,
|
||||
json=data,
|
||||
headers=headers,
|
||||
refresh_token_if_unauthorized=False,
|
||||
)
|
||||
try:
|
||||
@@ -534,17 +636,23 @@ class Session(TokenManager):
|
||||
)
|
||||
if verbose:
|
||||
self._logger.info("Received new token")
|
||||
return resp["data"]["token"]
|
||||
token = resp["data"]["token"]
|
||||
if ENV_AUTH_TOKEN.get():
|
||||
os.environ[ENV_AUTH_TOKEN.key] = token
|
||||
return token
|
||||
except LoginError:
|
||||
six.reraise(*sys.exc_info())
|
||||
except KeyError as ex:
|
||||
# check if this is a misconfigured api server (getting 200 without the data section)
|
||||
if res and res.status_code == 200:
|
||||
raise ValueError('It seems *api_server* is misconfigured. '
|
||||
'Is this the TRAINS API server {} ?'.format(self.get_api_server_host()))
|
||||
'Is this the ClearML API server {} ?'.format(self.get_api_server_host()))
|
||||
else:
|
||||
raise LoginError("Response data mismatch: No 'token' in 'data' value from res, receive : {}, "
|
||||
"exception: {}".format(res, ex))
|
||||
except requests.ConnectionError as ex:
|
||||
raise ValueError('Connection Error: it seems *api_server* is misconfigured. '
|
||||
'Is this the ClearML API server {} ?'.format('/'.join(ex.request.url.split('/')[:3])))
|
||||
except Exception as ex:
|
||||
raise LoginError('Unrecognized Authentication Error: {} {}'.format(type(ex), ex))
|
||||
|
||||
@@ -3,11 +3,14 @@ from abc import ABCMeta, abstractmethod
|
||||
from time import time
|
||||
|
||||
import jwt
|
||||
from jwt.algorithms import get_default_algorithms
|
||||
import six
|
||||
|
||||
|
||||
@six.add_metaclass(ABCMeta)
|
||||
class TokenManager(object):
|
||||
_default_token_exp_threshold_sec = 12 * 60 * 60
|
||||
_default_req_token_expiration_sec = None
|
||||
|
||||
@property
|
||||
def token_expiration_threshold_sec(self):
|
||||
@@ -40,17 +43,30 @@ class TokenManager(object):
|
||||
return self.__token
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
token=None,
|
||||
req_token_expiration_sec=None,
|
||||
token_history=None,
|
||||
token_expiration_threshold_sec=60,
|
||||
**kwargs
|
||||
self,
|
||||
token=None,
|
||||
req_token_expiration_sec=None,
|
||||
token_history=None,
|
||||
token_expiration_threshold_sec=None,
|
||||
config=None,
|
||||
**kwargs
|
||||
):
|
||||
super(TokenManager, self).__init__()
|
||||
assert isinstance(token_history, (type(None), dict))
|
||||
self.token_expiration_threshold_sec = token_expiration_threshold_sec
|
||||
self.req_token_expiration_sec = req_token_expiration_sec
|
||||
if config:
|
||||
req_token_expiration_sec = req_token_expiration_sec or config.get(
|
||||
"api.auth.request_token_expiration_sec", None
|
||||
)
|
||||
token_expiration_threshold_sec = (
|
||||
token_expiration_threshold_sec
|
||||
or config.get("api.auth.token_expiration_threshold_sec", None)
|
||||
)
|
||||
self.token_expiration_threshold_sec = (
|
||||
token_expiration_threshold_sec or self._default_token_exp_threshold_sec
|
||||
)
|
||||
self.req_token_expiration_sec = (
|
||||
req_token_expiration_sec or self._default_req_token_expiration_sec
|
||||
)
|
||||
self._set_token(token)
|
||||
|
||||
def _calc_token_valid_period_sec(self, token, exp=None, at_least_sec=None):
|
||||
@@ -58,7 +74,9 @@ class TokenManager(object):
|
||||
try:
|
||||
exp = exp or self._get_token_exp(token)
|
||||
if at_least_sec:
|
||||
at_least_sec = max(at_least_sec, self.token_expiration_threshold_sec)
|
||||
at_least_sec = max(
|
||||
at_least_sec, self.token_expiration_threshold_sec
|
||||
)
|
||||
else:
|
||||
at_least_sec = self.token_expiration_threshold_sec
|
||||
return max(0, (exp - time() - at_least_sec))
|
||||
@@ -66,10 +84,26 @@ class TokenManager(object):
|
||||
pass
|
||||
return 0
|
||||
|
||||
@classmethod
|
||||
def get_decoded_token(cls, token, verify=False):
|
||||
""" Get token expiration time. If not present, assume forever """
|
||||
if hasattr(jwt, '__version__') and jwt.__version__[0] == '1':
|
||||
return jwt.decode(
|
||||
token,
|
||||
verify=verify,
|
||||
algorithms=get_default_algorithms(),
|
||||
)
|
||||
|
||||
return jwt.decode(
|
||||
token,
|
||||
options=dict(verify_signature=verify),
|
||||
algorithms=get_default_algorithms(),
|
||||
)
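As a usage sketch (the claim names come from the session code above; the token itself is whatever the server issued), the decoded payload is a plain dict:

```python
# Read the claims the Session constructor above relies on
claims = TokenManager.get_decoded_token(session.token)
api_version = claims.get("api_version")            # may be missing on older servers
feature_set = claims.get("feature_set", "basic")
expires_at = claims.get("exp")                     # unix timestamp; absent means "never"
```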
|
||||
|
||||
@classmethod
|
||||
def _get_token_exp(cls, token):
|
||||
""" Get token expiration time. If not present, assume forever """
|
||||
return jwt.decode(token, verify=False).get('exp', sys.maxsize)
|
||||
return cls.get_decoded_token(token).get("exp", sys.maxsize)
|
||||
|
||||
def _set_token(self, token):
|
||||
if token:
|
||||
@@ -80,7 +114,9 @@ class TokenManager(object):
|
||||
self.__token_expiration_sec = 0
|
||||
|
||||
def get_token_valid_period_sec(self):
|
||||
return self._calc_token_valid_period_sec(self.__token, self.token_expiration_sec)
|
||||
return self._calc_token_valid_period_sec(
|
||||
self.__token, self.token_expiration_sec
|
||||
)
|
||||
|
||||
def _get_token(self):
|
||||
if self.get_token_valid_period_sec() <= 0:
|
||||
@@ -92,4 +128,6 @@ class TokenManager(object):
|
||||
pass
|
||||
|
||||
def refresh_token(self):
|
||||
self._set_token(self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec))
|
||||
self._set_token(
|
||||
self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec)
|
||||
)
|
||||
@@ -6,16 +6,9 @@ import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util import Retry
|
||||
from urllib3 import PoolManager
|
||||
import six
|
||||
|
||||
from .session.defs import ENV_HOST_VERIFY_CERT
|
||||
|
||||
if six.PY3:
|
||||
from functools import lru_cache
|
||||
elif six.PY2:
|
||||
# python 2 support
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
|
||||
|
||||
__disable_certificate_verification_warning = 0
|
||||
|
||||
@@ -107,7 +100,7 @@ def get_http_session_with_retry(
|
||||
if not session.verify and __disable_certificate_verification_warning < 2:
|
||||
# show warning
|
||||
__disable_certificate_verification_warning += 1
|
||||
logging.getLogger('TRAINS').warning(
|
||||
logging.getLogger('ClearML').warning(
|
||||
msg='InsecureRequestWarning: Certificate verification is disabled! Adding '
|
||||
'certificate verification is strongly advised. See: '
|
||||
'https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings')
|
||||
@@ -1,4 +1,3 @@
|
||||
from .defs import Environment
|
||||
from .config import Config, ConfigEntry
|
||||
from .errors import ConfigurationError
|
||||
from .environment import EnvEntry
|
||||
@@ -4,15 +4,13 @@ import functools
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from fnmatch import fnmatch
|
||||
from os.path import expanduser
|
||||
from typing import Any
|
||||
|
||||
import pyhocon
|
||||
import six
|
||||
from pathlib2 import Path
|
||||
from pyhocon import ConfigTree
|
||||
from pyhocon import ConfigTree, ConfigFactory
|
||||
from pyparsing import (
|
||||
ParseFatalException,
|
||||
ParseException,
|
||||
@@ -71,6 +69,10 @@ class Config(object):
|
||||
|
||||
# used in place of None in Config.get as default value because None is a valid value
|
||||
_MISSING = object()
|
||||
extra_config_values_env_key_sep = "__"
|
||||
extra_config_values_env_key_prefix = [
|
||||
"CLEARML_AGENT" + extra_config_values_env_key_sep,
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -90,6 +92,7 @@ class Config(object):
|
||||
self._env = env or os.environ.get("TRAINS_ENV", Environment.default)
|
||||
self.config_paths = set()
|
||||
self.is_server = is_server
|
||||
self._overrides_configs = None
|
||||
|
||||
if self._verbose:
|
||||
print("Config env:%s" % str(self._env))
|
||||
@@ -100,6 +103,7 @@ class Config(object):
|
||||
)
|
||||
if self._env not in get_options(Environment):
|
||||
raise ValueError("Invalid environment %s" % env)
|
||||
|
||||
if relative_to is not None:
|
||||
self.load_relative_to(relative_to)
|
||||
|
||||
@@ -138,7 +142,7 @@ class Config(object):
|
||||
else:
|
||||
env_config_paths = []
|
||||
|
||||
env_config_path_override = os.environ.get(ENV_CONFIG_PATH_OVERRIDE_VAR)
|
||||
env_config_path_override = ENV_CONFIG_PATH_OVERRIDE_VAR.get()
|
||||
if env_config_path_override:
|
||||
env_config_paths = [expanduser(env_config_path_override)]
|
||||
|
||||
@@ -158,14 +162,16 @@ class Config(object):
|
||||
if LOCAL_CONFIG_PATHS:
|
||||
config = functools.reduce(
|
||||
lambda cfg, path: ConfigTree.merge_configs(
|
||||
cfg, self._read_recursive(path, verbose=self._verbose), copy_trees=True
|
||||
cfg,
|
||||
self._read_recursive(path, verbose=self._verbose),
|
||||
copy_trees=True,
|
||||
),
|
||||
LOCAL_CONFIG_PATHS,
|
||||
config,
|
||||
)
|
||||
|
||||
local_config_files = LOCAL_CONFIG_FILES
|
||||
local_config_override = os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR)
|
||||
local_config_override = LOCAL_CONFIG_FILE_OVERRIDE_VAR.get()
|
||||
if local_config_override:
|
||||
local_config_files = [expanduser(local_config_override)]
|
||||
|
||||
@@ -181,16 +187,45 @@ class Config(object):
|
||||
config,
|
||||
)
|
||||
|
||||
config = ConfigTree.merge_configs(
|
||||
config, self._read_extra_env_config_values(), copy_trees=True
|
||||
)
|
||||
|
||||
if self._overrides_configs:
|
||||
config = functools.reduce(
|
||||
lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
|
||||
self._overrides_configs,
|
||||
config,
|
||||
)
|
||||
|
||||
config["env"] = env
|
||||
return config
|
||||
|
||||
def _read_extra_env_config_values(self) -> ConfigTree:
|
||||
""" Loads extra configuration from environment-injected values """
|
||||
result = ConfigTree()
|
||||
|
||||
for prefix in self.extra_config_values_env_key_prefix:
|
||||
keys = sorted(k for k in os.environ if k.startswith(prefix))
|
||||
for key in keys:
|
||||
path = (
|
||||
key[len(prefix) :]
|
||||
.replace(self.extra_config_values_env_key_sep, ".")
|
||||
.lower()
|
||||
)
|
||||
result = ConfigTree.merge_configs(
|
||||
result, ConfigFactory.parse_string("{}: {}".format(path, os.environ[key]))
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
def replace(self, config):
|
||||
self._config = config
|
||||
|
||||
def reload(self):
|
||||
self.replace(self._reload())
|
||||
|
||||
def initialize_logging(self):
|
||||
def initialize_logging(self, debug=False):
|
||||
logging_config = self._config.get("logging", None)
|
||||
if not logging_config:
|
||||
return False
|
||||
@@ -217,6 +252,8 @@ class Config(object):
|
||||
)
|
||||
for logger in loggers:
|
||||
handlers = logger.get("handlers", None)
|
||||
if debug:
|
||||
logger['level'] = 'DEBUG'
|
||||
if not handlers:
|
||||
continue
|
||||
logger["handlers"] = [h for h in handlers if h not in deleted]
|
||||
@@ -338,3 +375,10 @@ class Config(object):
|
||||
except Exception as ex:
|
||||
print("Failed loading %s: %s" % (file_path, ex))
|
||||
raise
|
||||
|
||||
def set_overrides(self, *dicts):
|
||||
""" Set several override dictionaries or ConfigTree objects which should be merged onto the configuration """
|
||||
self._overrides_configs = [
|
||||
d if isinstance(d, ConfigTree) else pyhocon.ConfigFactory.from_dict(d) for d in dicts
|
||||
]
|
||||
self.reload()
|
||||
@@ -24,6 +24,14 @@ def text_to_bool(value):
|
||||
return bool(strtobool(value))
|
||||
|
||||
|
||||
def safe_text_to_bool(value):
|
||||
# type: (Text) -> bool
|
||||
try:
|
||||
return text_to_bool(value)
|
||||
except ValueError:
|
||||
return bool(value)
|
||||
|
||||
|
||||
def any_to_bool(value):
|
||||
# type: (Optional[Union[int, float, Text]]) -> bool
|
||||
if isinstance(value, six.text_type):
|
||||
@@ -1,6 +1,8 @@
|
||||
from os.path import expanduser
|
||||
from pathlib2 import Path
|
||||
|
||||
from ..backend_config.environment import EnvEntry
|
||||
|
||||
ENV_VAR = 'TRAINS_ENV'
|
||||
""" Name of system environment variable that can be used to specify the config environment name """
|
||||
|
||||
@@ -17,23 +19,24 @@ ENV_CONFIG_PATHS = [
|
||||
|
||||
|
||||
LOCAL_CONFIG_PATHS = [
|
||||
# '/etc/opt/trains', # used by servers for docker-generated configuration
|
||||
# expanduser('~/.trains/config'),
|
||||
# '/etc/opt/clearml', # used by servers for docker-generated configuration
|
||||
# expanduser('~/.clearml/config'),
|
||||
]
|
||||
""" Local config paths, not related to environment """
|
||||
|
||||
|
||||
LOCAL_CONFIG_FILES = [
|
||||
expanduser('~/trains.conf'), # used for workstation configuration (end-users, workers)
|
||||
expanduser('~/clearml.conf'), # used for workstation configuration (end-users, workers)
|
||||
]
|
||||
""" Local config files (not paths) """
|
||||
|
||||
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR = 'TRAINS_CONFIG_FILE'
|
||||
LOCAL_CONFIG_FILE_OVERRIDE_VAR = EnvEntry('CLEARML_CONFIG_FILE', 'TRAINS_CONFIG_FILE', )
|
||||
""" Local config file override environment variable. If this is set, no other local config files will be used. """
|
||||
|
||||
|
||||
ENV_CONFIG_PATH_OVERRIDE_VAR = 'TRAINS_CONFIG_PATH'
|
||||
ENV_CONFIG_PATH_OVERRIDE_VAR = EnvEntry('CLEARML_CONFIG_PATH', 'TRAINS_CONFIG_PATH', )
|
||||
"""
|
||||
Environment-related config path override environment variable. If this is set, no other env config path will be used.
|
||||
"""
|
||||
@@ -46,6 +49,15 @@ class Environment(object):
|
||||
local = 'local'
|
||||
|
||||
|
||||
class UptimeConf(object):
|
||||
min_api_version = "2.10"
|
||||
queue_tag_on = "force_workers:on"
|
||||
queue_tag_off = "force_workers:off"
|
||||
worker_key = "force"
|
||||
worker_value_off = ["off"]
|
||||
worker_value_on = ["on"]
|
||||
|
||||
|
||||
CONFIG_FILE_EXTENSION = '.conf'
|
||||
|
||||
|
||||
@@ -64,8 +64,8 @@ class Entry(object):
|
||||
converter = self.default_conversions().get(self.type, self.type)
|
||||
return converter(value)
|
||||
|
||||
def get_pair(self, default=NotSet, converter=None):
|
||||
# type: (Any, Converter) -> Optional[Tuple[Text, Any]]
|
||||
def get_pair(self, default=NotSet, converter=None, value_cb=None):
|
||||
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]]
|
||||
for key in self.keys:
|
||||
value = self._get(key)
|
||||
if value is NotSet:
|
||||
@@ -75,18 +75,26 @@ class Entry(object):
|
||||
except Exception as ex:
|
||||
self.error("invalid value {key}={value}: {ex}".format(**locals()))
|
||||
break
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if value_cb:
|
||||
value_cb(key, value)
|
||||
except Exception:
|
||||
pass
|
||||
return key, value
|
||||
|
||||
result = self.default if default is NotSet else default
|
||||
return self.key, result
|
||||
|
||||
def get(self, default=NotSet, converter=None):
|
||||
# type: (Any, Converter) -> Optional[Any]
|
||||
return self.get_pair(default=default, converter=converter)[1]
|
||||
def get(self, default=NotSet, converter=None, value_cb=None):
|
||||
# type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any]
|
||||
return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1]
|
||||
|
||||
def set(self, value):
|
||||
# type: (Any, Any) -> (Text, Any)
|
||||
key, _ = self.get_pair(default=None, converter=None)
|
||||
self._set(key, str(value))
|
||||
# key, _ = self.get_pair(default=None, converter=None)
|
||||
for k in self.keys:
|
||||
self._set(k, str(value))
|
||||
|
||||
def _set(self, key, value):
|
||||
# type: (Text, Text) -> None
|
||||
clearml_agent/backend_config/environment.py (new file, 64 lines)
@@ -0,0 +1,64 @@
|
||||
from os import getenv, environ
|
||||
|
||||
from .converters import text_to_bool
|
||||
from .entry import Entry, NotSet
|
||||
|
||||
|
||||
class EnvEntry(Entry):
|
||||
@classmethod
|
||||
def default_conversions(cls):
|
||||
conversions = super(EnvEntry, cls).default_conversions().copy()
|
||||
conversions[bool] = text_to_bool
|
||||
return conversions
|
||||
|
||||
def pop(self):
|
||||
for k in self.keys:
|
||||
environ.pop(k, None)
|
||||
|
||||
def _get(self, key):
|
||||
value = getenv(key, "").strip()
|
||||
return value or NotSet
|
||||
|
||||
def _set(self, key, value):
|
||||
environ[key] = value
|
||||
|
||||
def __str__(self):
|
||||
return "env:{}".format(super(EnvEntry, self).__str__())
|
||||
|
||||
def error(self, message):
|
||||
print("Environment configuration: {}".format(message))
|
||||
|
||||
|
||||
def backward_compatibility_support():
|
||||
from ..definitions import ENVIRONMENT_CONFIG, ENVIRONMENT_SDK_PARAMS, ENVIRONMENT_BACKWARD_COMPATIBLE
|
||||
if ENVIRONMENT_BACKWARD_COMPATIBLE.get():
|
||||
# Add TRAINS_ prefix on every CLEARML_ os environment we support
|
||||
for k, v in ENVIRONMENT_CONFIG.items():
|
||||
try:
|
||||
trains_vars = [var for var in v.vars if var.startswith('CLEARML_')]
|
||||
if not trains_vars:
|
||||
continue
|
||||
alg_var = trains_vars[0].replace('CLEARML_', 'TRAINS_', 1)
|
||||
if alg_var not in v.vars:
|
||||
v.vars = tuple(list(v.vars) + [alg_var])
|
||||
except:
|
||||
continue
|
||||
for k, v in ENVIRONMENT_SDK_PARAMS.items():
|
||||
try:
|
||||
trains_vars = [var for var in v if var.startswith('CLEARML_')]
|
||||
if not trains_vars:
|
||||
continue
|
||||
alg_var = trains_vars[0].replace('CLEARML_', 'TRAINS_', 1)
|
||||
if alg_var not in v:
|
||||
ENVIRONMENT_SDK_PARAMS[k] = tuple(list(v) + [alg_var])
|
||||
except:
|
||||
continue
|
||||
|
||||
# set OS environ:
|
||||
keys = list(environ.keys())
|
||||
for k in keys:
|
||||
if not k.startswith('CLEARML_'):
|
||||
continue
|
||||
backwards_k = k.replace('CLEARML_', 'TRAINS_', 1)
|
||||
if backwards_k not in keys:
|
||||
environ[backwards_k] = environ[k]
|
||||
@@ -4,11 +4,11 @@ from pathlib2 import Path
|
||||
|
||||
|
||||
def logger(path=None):
|
||||
name = "trains"
|
||||
name = "clearml"
|
||||
if path:
|
||||
p = Path(path)
|
||||
module = (p.parent if p.stem.startswith('_') else p).stem
|
||||
name = "trains.%s" % module
|
||||
name = "clearml.%s" % module
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
clearml_agent/backend_config/utils.py (new file, 112 lines)
@@ -0,0 +1,112 @@
|
||||
import base64
|
||||
import os
|
||||
from os.path import expandvars, expanduser
|
||||
from pathlib import Path
|
||||
from typing import List, TYPE_CHECKING
|
||||
|
||||
from pyhocon import HOCONConverter, ConfigTree
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .config import Config
|
||||
|
||||
|
||||
def get_items(cls):
|
||||
""" get key/value items from an enum-like class (members represent enumeration key/value) """
|
||||
return {k: v for k, v in vars(cls).items() if not k.startswith('_')}
|
||||
|
||||
|
||||
def get_options(cls):
|
||||
""" get options from an enum-like class (members represent enumeration key/value) """
|
||||
return get_items(cls).values()
|
||||
|
||||
|
||||
def apply_environment(config):
|
||||
# type: (Config) -> List[str]
|
||||
env_vars = config.get("environment", None)
|
||||
if not env_vars:
|
||||
return []
|
||||
if isinstance(env_vars, (list, tuple)):
|
||||
env_vars = dict(env_vars)
|
||||
|
||||
keys = list(filter(None, env_vars.keys()))
|
||||
|
||||
for key in keys:
|
||||
os.environ[str(key)] = str(env_vars[key] or "")
|
||||
|
||||
return keys
|
||||
|
||||
|
||||
def apply_files(config):
|
||||
# type: (Config) -> None
|
||||
files = config.get("files", None)
|
||||
if not files:
|
||||
return
|
||||
|
||||
if isinstance(files, (list, tuple)):
|
||||
files = dict(files)
|
||||
|
||||
print("Creating files from configuration")
|
||||
for key, data in files.items():
|
||||
path = data.get("path")
|
||||
fmt = data.get("format", "string")
|
||||
target_fmt = data.get("target_format", "string")
|
||||
overwrite = bool(data.get("overwrite", True))
|
||||
contents = data.get("contents")
|
||||
|
||||
target = Path(expanduser(expandvars(path)))
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if target.is_dir():
|
||||
print("Skipped [{}]: is a directory {}".format(key, target))
|
||||
continue
|
||||
|
||||
if not overwrite and target.is_file():
|
||||
print("Skipped [{}]: file exists {}".format(key, target))
|
||||
continue
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: can't access {} ({})".format(key, target, ex))
|
||||
continue
|
||||
|
||||
if contents:
|
||||
try:
|
||||
if fmt == "base64":
|
||||
contents = base64.b64decode(contents)
|
||||
if target_fmt != "bytes":
|
||||
contents = contents.decode("utf-8")
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed decoding {} ({})".format(key, fmt, ex))
|
||||
continue
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed creating path {} ({})".format(key, target.parent, ex))
|
||||
continue
|
||||
|
||||
try:
|
||||
if target_fmt == "bytes":
|
||||
try:
|
||||
target.write_bytes(contents)
|
||||
except TypeError:
|
||||
# simpler error so the user won't get confused
|
||||
raise TypeError("a bytes-like object is required")
|
||||
else:
|
||||
try:
|
||||
if target_fmt == "json":
|
||||
text = HOCONConverter.to_json(contents)
|
||||
elif target_fmt in ("yaml", "yml"):
|
||||
text = HOCONConverter.to_yaml(contents)
|
||||
else:
|
||||
if isinstance(contents, ConfigTree):
|
||||
contents = contents.as_plain_ordered_dict()
|
||||
text = str(contents)
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed encoding to {} ({})".format(key, target_fmt, ex))
|
||||
continue
|
||||
target.write_text(text)
|
||||
print("Saved [{}]: {}".format(key, target))
|
||||
except Exception as ex:
|
||||
print("Skipped [{}]: failed saving file {} ({})".format(key, target, ex))
|
||||
continue
|
||||
@@ -9,16 +9,16 @@ from operator import attrgetter
|
||||
from traceback import print_exc
|
||||
from typing import Text
|
||||
|
||||
from trains_agent.helper.console import ListFormatter, print_text
|
||||
from trains_agent.helper.dicts import filter_keys
|
||||
from clearml_agent.helper.console import ListFormatter, print_text
|
||||
from clearml_agent.helper.dicts import filter_keys
|
||||
|
||||
import six
|
||||
from trains_agent.backend_api import services
|
||||
from clearml_agent.backend_api import services
|
||||
|
||||
from trains_agent.errors import APIError, CommandFailedError
|
||||
from trains_agent.helper.base import Singleton, return_list, print_parameters, dump_yaml, load_yaml, error, warning
|
||||
from trains_agent.interface.base import ObjectID
|
||||
from trains_agent.session import Session
|
||||
from clearml_agent.errors import APIError, CommandFailedError
|
||||
from clearml_agent.helper.base import Singleton, return_list, print_parameters, dump_yaml, load_yaml, error, warning
|
||||
from clearml_agent.interface.base import ObjectID
|
||||
from clearml_agent.session import Session
|
||||
|
||||
|
||||
class NameResolutionError(CommandFailedError):
|
||||
@@ -74,7 +74,7 @@ class BaseCommandSection(object):
|
||||
|
||||
@staticmethod
|
||||
def log(message, *args):
|
||||
print("trains-agent: {}".format(message % args))
|
||||
print("clearml-agent: {}".format(message % args))
|
||||
|
||||
@classmethod
|
||||
def exit(cls, message, code=1): # type: (Text, int) -> ()
|
||||
@@ -118,11 +118,13 @@ class ServiceCommandSection(BaseCommandSection):
|
||||
""" The name of the REST service used by this command """
|
||||
pass
|
||||
|
||||
def get(self, endpoint, *args, **kwargs):
|
||||
return self._session.get(service=self.service, action=endpoint, *args, **kwargs)
|
||||
def get(self, endpoint, *args, session=None, **kwargs):
|
||||
session = session or self._session
|
||||
return session.get(service=self.service, action=endpoint, *args, **kwargs)
|
||||
|
||||
def post(self, endpoint, *args, **kwargs):
|
||||
return self._session.post(service=self.service, action=endpoint, *args, **kwargs)
|
||||
def post(self, endpoint, *args, session=None, **kwargs):
|
||||
session = session or self._session
|
||||
return session.post(service=self.service, action=endpoint, *args, **kwargs)
|
||||
|
||||
def get_with_act_as(self, endpoint, *args, **kwargs):
|
||||
return self._session.get_with_act_as(service=self.service, action=endpoint, *args, **kwargs)
|
||||
@@ -1,4 +1,4 @@
|
||||
from trains_agent.commands.base import ServiceCommandSection
|
||||
from clearml_agent.commands.base import ServiceCommandSection
|
||||
|
||||
|
||||
class Config(ServiceCommandSection):
|
||||
@@ -5,14 +5,16 @@ from pyhocon import ConfigFactory, ConfigMissingException
|
||||
from pathlib2 import Path
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from trains_agent.backend_api.session import Session
|
||||
from trains_agent.backend_api.session.defs import ENV_HOST
|
||||
from trains_agent.backend_config.defs import LOCAL_CONFIG_FILES
|
||||
from clearml_agent.backend_api.session import Session
|
||||
from clearml_agent.backend_api.session.defs import ENV_HOST
|
||||
from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES
|
||||
|
||||
|
||||
description = """
|
||||
Please create new trains credentials through the profile page in your trains web app (e.g. https://demoapp.trains.allegro.ai/profile)
|
||||
In the profile page, press "Create new credentials", then press "Copy to clipboard".
|
||||
Please create new clearml credentials through the settings page in your `clearml-server` web app,
|
||||
or create a free account at https://app.clear.ml/settings/webapp-configuration
|
||||
|
||||
In the settings > workspace page, press "Create new credentials", then press "Copy to clipboard".
|
||||
|
||||
Paste copied configuration here:
|
||||
"""
|
||||
@@ -25,16 +27,20 @@ except Exception:
|
||||
|
||||
host_description = """
|
||||
Editing configuration file: {CONFIG_FILE}
|
||||
Enter the url of the trains-server's Web service, for example: {HOST}
|
||||
Enter the url of the clearml-server's Web service, for example: {HOST} or https://app.clear.ml
|
||||
""".format(
|
||||
CONFIG_FILE=LOCAL_CONFIG_FILES[0],
|
||||
CONFIG_FILE=LOCAL_CONFIG_FILES[-1],
|
||||
HOST=def_host,
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
print('TRAINS-AGENT setup process')
|
||||
conf_file = Path(LOCAL_CONFIG_FILES[0]).absolute()
|
||||
print('CLEARML-AGENT setup process')
|
||||
for f in LOCAL_CONFIG_FILES:
|
||||
conf_file = Path(f).absolute()
|
||||
if conf_file.exists():
|
||||
break
|
||||
|
||||
if conf_file.exists() and conf_file.is_file() and conf_file.stat().st_size > 0:
|
||||
print('Configuration file already exists: {}'.format(str(conf_file)))
|
||||
print('Leaving setup, feel free to edit the configuration file.')
|
||||
@@ -42,7 +48,12 @@ def main():
|
||||
|
||||
print(description, end='')
|
||||
sentinel = ''
|
||||
parse_input = '\n'.join(iter(input, sentinel))
|
||||
parse_input = ''
|
||||
for line in iter(input, sentinel):
|
||||
parse_input += line+'\n'
|
||||
if line.rstrip() == '}':
|
||||
break
|
||||
|
||||
credentials = None
|
||||
api_server = None
|
||||
web_server = None
|
||||
@@ -73,7 +84,7 @@ def main():
|
||||
host = input_url('API Host', api_server)
|
||||
else:
|
||||
print(host_description)
|
||||
host = input_url('WEB Host', '')
|
||||
host = input_url('WEB Host', 'https://app.clear.ml')
|
||||
|
||||
parsed_host = verify_url(host)
|
||||
api_host, files_host, web_host = parse_host(parsed_host, allow_input=True)
|
||||
@@ -86,7 +97,7 @@ def main():
|
||||
|
||||
files_host = input_url('File Store Host', files_host)
|
||||
|
||||
print('\nTRAINS Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
|
||||
print('\nClearML Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
|
||||
web_host, api_host, files_host))
|
||||
|
||||
retry = 1
|
||||
@@ -105,9 +116,15 @@ def main():
|
||||
print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
|
||||
git_user = input()
|
||||
if git_user.strip():
|
||||
print('Enter password for user \'{}\': '.format(git_user), end='')
|
||||
print(
|
||||
"Git personal token is equivalent to a password, to learn how to generate a token:\n"
|
||||
" GitHub: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token\n" # noqa
|
||||
" Bitbucket: https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/\n"
|
||||
" GitLab: https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html\n"
|
||||
)
|
||||
print('Enter git password token for user \'{}\': '.format(git_user), end='')
|
||||
git_pass = input()
|
||||
print('Git repository cloning will be using user={} password={}'.format(git_user, git_pass))
|
||||
print('Git repository cloning will be using user={} token={}'.format(git_user, git_pass))
|
||||
else:
|
||||
git_user = None
|
||||
git_pass = None
|
||||
@@ -140,13 +157,14 @@ def main():
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
with open(str(conf_file), 'wt') as f:
|
||||
header = '# TRAINS-AGENT configuration file\n' \
|
||||
header = '# CLEARML-AGENT configuration file\n' \
|
||||
'api {\n' \
|
||||
' # Notice: \'host\' is the api server (default port 8008), not the web server.\n' \
|
||||
' api_server: %s\n' \
|
||||
' web_server: %s\n' \
|
||||
' files_server: %s\n' \
|
||||
' # Credentials are generated using the webapp, %s/profile\n' \
|
||||
' # Override with os environment: TRAINS_API_ACCESS_KEY / TRAINS_API_SECRET_KEY\n' \
|
||||
' # Credentials are generated using the webapp, %s/settings\n' \
|
||||
' # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY\n' \
|
||||
' credentials {"access_key": "%s", "secret_key": "%s"}\n' \
|
||||
'}\n\n' % (api_host, web_host, files_host,
|
||||
web_host, credentials['access_key'], credentials['secret_key'])
|
||||
@@ -157,7 +175,7 @@ def main():
|
||||
'agent.git_pass=\"{}\"\n' \
|
||||
'\n'.format(git_user or '', git_pass or '')
|
||||
f.write(git_credentials)
|
||||
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]\n' \
|
||||
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]\n' \
|
||||
'agent.package_manager.extra_index_url= ' \
|
||||
'[\n{}\n]\n\n'.format("\n".join(map("\"{}\"".format, extra_index_urls)))
|
||||
f.write(extra_index_str)
|
||||
@@ -167,7 +185,7 @@ def main():
|
||||
return
|
||||
|
||||
print('\nNew configuration stored in {}'.format(str(conf_file)))
|
||||
print('TRAINS-AGENT setup completed successfully.')
|
||||
print('CLEARML-AGENT setup completed successfully.')
|
||||
|
||||
|
||||
def parse_host(parsed_host, allow_input=True):
|
||||
@@ -233,7 +251,8 @@ def verify_credentials(api_host, credentials):
|
||||
try:
|
||||
print('Verifying credentials ...')
|
||||
if api_host:
|
||||
Session(api_key=credentials['access_key'], secret_key=credentials['secret_key'], host=api_host)
|
||||
Session(api_key=credentials['access_key'], secret_key=credentials['secret_key'], host=api_host,
|
||||
http_retries_config={"total": 2})
|
||||
print('Credentials verified!')
|
||||
return True
|
||||
else:
|
||||
@@ -275,7 +294,7 @@ def read_manual_credentials():
|
||||
|
||||
def input_url(host_type, host=None):
|
||||
while True:
|
||||
print('{} configured to: [{}] '.format(host_type, host), end='')
|
||||
print('{} configured to: {}'.format(host_type, '[{}] '.format(host) if host else ''), end='')
|
||||
parse_input = input()
|
||||
if host and (not parse_input or parse_input.lower() == 'yes' or parse_input.lower() == 'y'):
|
||||
break
|
||||
@@ -289,11 +308,12 @@ def input_url(host_type, host=None):
|
||||
def input_host_port(host_type, parsed_host):
|
||||
print('Enter port for {} host '.format(host_type), end='')
|
||||
replace_port = input().lower()
|
||||
return parsed_host.scheme + "://" + parsed_host.netloc + (':{}'.format(replace_port) if replace_port else '') + \
|
||||
parsed_host.path
|
||||
return parsed_host.scheme + "://" + parsed_host.netloc + (
|
||||
':{}'.format(replace_port) if replace_port else '') + parsed_host.path
|
||||
|
||||
|
||||
def verify_url(parse_input):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not parse_input.startswith('http://') and not parse_input.startswith('https://'):
|
||||
# if we have a specific port, use http prefix, otherwise assume https
|
||||
@@ -306,7 +326,7 @@ def verify_url(parse_input):
|
||||
parsed_host = None
|
||||
except Exception:
|
||||
parsed_host = None
|
||||
print('Could not parse url {}\nEnter your trains-server host: '.format(parse_input), end='')
|
||||
print('Could not parse url {}\nEnter your clearml-server host: '.format(parse_input), end='')
|
||||
return parsed_host
|
||||
|
||||
|
||||
@@ -5,8 +5,8 @@ import time
|
||||
|
||||
from future.builtins import super
|
||||
|
||||
from trains_agent.commands.base import ServiceCommandSection
|
||||
from trains_agent.helper.base import return_list
|
||||
from clearml_agent.commands.base import ServiceCommandSection
|
||||
from clearml_agent.helper.base import return_list
|
||||
|
||||
|
||||
class Events(ServiceCommandSection):
|
||||
@@ -21,14 +21,16 @@ class Events(ServiceCommandSection):
|
||||
""" Events command service endpoint """
|
||||
return 'events'
|
||||
|
||||
def send_events(self, list_events):
|
||||
def send_events(self, list_events, session=None):
|
||||
def send_packet(jsonlines):
|
||||
if not jsonlines:
|
||||
return 0
|
||||
num_lines = len(jsonlines)
|
||||
jsonlines = '\n'.join(jsonlines)
|
||||
|
||||
new_events = self.post('add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'})
|
||||
new_events = self.post(
|
||||
'add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'}, session=session
|
||||
)
|
||||
if new_events['added'] != num_lines:
|
||||
print('Error (%s) sending events only %d of %d registered' %
|
||||
(new_events['errors'], new_events['added'], num_lines))
|
||||
@@ -57,7 +59,7 @@ class Events(ServiceCommandSection):
|
||||
# print('Sending events done: %d / %d events sent' % (sent_events, len(list_events)))
|
||||
return sent_events
|
||||
|
||||
def send_log_events(self, worker_id, task_id, lines, level='DEBUG'):
|
||||
def send_log_events(self, worker_id, task_id, lines, level='DEBUG', session=None):
|
||||
log_events = []
|
||||
base_timestamp = int(time.time() * 1000)
|
||||
base_log_items = {
|
||||
@@ -94,4 +96,4 @@ class Events(ServiceCommandSection):
|
||||
log_events.append(get_event(count))
|
||||
|
||||
# now send the events
|
||||
return self.send_events(list_events=log_events)
|
||||
return self.send_events(list_events=log_events, session=session)
|
||||
clearml_agent/commands/resolver.py (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
import json
|
||||
import re
|
||||
import shlex
|
||||
from clearml_agent.helper.package.requirements import (
|
||||
RequirementsManager, MarkerRequirement,
|
||||
compare_version_rules, )
|
||||
|
||||
|
||||
def resolve_default_container(session, task_id, container_config):
|
||||
container_lookup = session.config.get('agent.default_docker.match_rules', None)
|
||||
if not session.check_min_api_version("2.13") or not container_lookup:
|
||||
return container_config
|
||||
|
||||
# check backend support before sending any more requests (because they will fail and crash the Task)
|
||||
try:
|
||||
session.verify_feature_set('advanced')
|
||||
except ValueError:
|
||||
return container_config
|
||||
|
||||
result = session.send_request(
|
||||
service='tasks',
|
||||
action='get_all',
|
||||
version='2.14',
|
||||
json={'id': [task_id],
|
||||
'only_fields': ['script.requirements', 'script.binary',
|
||||
'script.repository', 'script.branch',
|
||||
'project', 'container'],
|
||||
'search_hidden': True},
|
||||
method='get',
|
||||
async_enable=False,
|
||||
)
|
||||
try:
|
||||
task_info = result.json()['data']['tasks'][0] if result.ok else {}
|
||||
except (ValueError, TypeError):
|
||||
return container_config
|
||||
|
||||
from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
|
||||
# store tasks repository
|
||||
repository = task_info.get('script', {}).get('repository') or ''
|
||||
branch = task_info.get('script', {}).get('branch') or ''
|
||||
binary = task_info.get('script', {}).get('binary') or ''
|
||||
requested_container = task_info.get('container', {})
|
||||
|
||||
# get project full path
|
||||
project_full_name = ''
|
||||
if task_info.get('project', None):
|
||||
result = session.send_request(
|
||||
service='projects',
|
||||
action='get_all',
|
||||
version='2.13',
|
||||
json={
|
||||
'id': [task_info.get('project')],
|
||||
'only_fields': ['name'],
|
||||
},
|
||||
method='get',
|
||||
async_enable=False,
|
||||
)
|
||||
try:
|
||||
if result.ok:
|
||||
project_full_name = result.json()['data']['projects'][0]['name'] or ''
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
task_packages_lookup = {}
|
||||
for entry in container_lookup:
|
||||
match = entry.get('match', None)
|
||||
if not match:
|
||||
continue
|
||||
if match.get('project', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('project', None), project_full_name):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('project', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('script.repository', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('script.repository', None), repository):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('script.repository', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('script.branch', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('script.branch', None), branch):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('script.branch', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('script.binary', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('script.binary', None), binary):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('script.binary', None), entry))
|
||||
continue
|
||||
|
||||
if match.get('container', None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
if not re.search(match.get('container', None), requested_container.get('image', '')):
|
||||
continue
|
||||
except Exception:
|
||||
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
|
||||
match.get('container', None), entry))
|
||||
continue
|
||||
|
||||
matched = True
|
||||
for req_section in ['script.requirements.pip', 'script.requirements.conda']:
|
||||
if not match.get(req_section, None):
|
||||
continue
|
||||
|
||||
match_pip_reqs = [MarkerRequirement(Requirement.parse('{} {}'.format(k, v)))
|
||||
for k, v in match.get(req_section, None).items()]
|
||||
|
||||
if not task_packages_lookup.get(req_section):
|
||||
req_section_parts = req_section.split('.')
|
||||
task_packages_lookup[req_section] = \
|
||||
RequirementsManager.parse_requirements_section_to_marker_requirements(
|
||||
requirements=task_info.get(req_section_parts[0], {}).get(
|
||||
req_section_parts[1], {}).get(req_section_parts[2], None)
|
||||
)
|
||||
|
||||
matched_all_reqs = True
|
||||
for mr in match_pip_reqs:
|
||||
matched_req = False
|
||||
for pr in task_packages_lookup[req_section]:
|
||||
if mr.req.name != pr.req.name:
|
||||
continue
|
||||
if compare_version_rules(mr.specs, pr.specs):
|
||||
matched_req = True
|
||||
break
|
||||
if not matched_req:
|
||||
matched_all_reqs = False
|
||||
break
|
||||
|
||||
# if we have a match, check the next requirements section
|
||||
if matched_all_reqs:
|
||||
continue
|
||||
# no match, stop checking
|
||||
matched = False
|
||||
break
|
||||
|
||||
if matched:
|
||||
if not container_config.get('container'):
|
||||
container_config['container'] = entry.get('image', None)
|
||||
if not container_config.get('arguments'):
|
||||
container_config['arguments'] = entry.get('arguments', None)
|
||||
container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
|
||||
print('Matching default container with rule:\n{}'.format(json.dumps(entry)))
|
||||
return container_config
|
||||
|
||||
return container_config
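For illustration, a single `agent.default_docker.match_rules` entry shaped the way the resolver above reads it (the image, arguments and regexes are invented; in `clearml.conf` this would be written as HOCON rather than Python):

```python
match_rules = [
    {
        "image": "nvidia/cuda:11.6.2-runtime-ubuntu20.04",    # used when 'container' is not set
        "arguments": "--ipc=host",                            # split with shlex before use
        "match": {
            "project": "^examples/",               # regex against the full project path
            "script.repository": "github\\.com/",  # regex against the task repository URL
            "script.binary": "python3",            # regex against the requested python binary
        },
    },
]
```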
|
||||
|
||||
clearml_agent/commands/worker.py (new file, 3958 lines; file diff suppressed because it is too large)
@@ -1,8 +1,8 @@
|
||||
"""
|
||||
Script for generating command-line completion.
|
||||
Called by trains_agent/utilities/complete.sh (or a copy of it) like so:
|
||||
Called by clearml_agent/utilities/complete.sh (or a copy of it) like so:
|
||||
|
||||
python -m trains_agent.complete "current command line"
|
||||
python -m clearml_agent.complete "current command line"
|
||||
|
||||
And writes line-separated completion targets to stdout.
|
||||
Results are line-separated in order to enable other whitespace in results.
|
||||
@@ -13,7 +13,7 @@ from __future__ import print_function
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from trains_agent.interface import get_parser
|
||||
from clearml_agent.interface import get_parser
|
||||
|
||||
|
||||
def is_argument_required(action):
|
||||
@@ -1,7 +1,7 @@
|
||||
from pyhocon import ConfigTree
|
||||
|
||||
import six
|
||||
from trains_agent.helper.base import Singleton
|
||||
from clearml_agent.helper.base import Singleton
|
||||
|
||||
|
||||
@six.add_metaclass(Singleton)
|
||||
clearml_agent/definitions.py (new file, 197 lines)
@@ -0,0 +1,197 @@
|
||||
import shlex
|
||||
from datetime import timedelta
|
||||
from distutils.util import strtobool
|
||||
from enum import IntEnum
|
||||
from os import getenv, environ
|
||||
from typing import Text, Optional, Union, Tuple, Any
|
||||
|
||||
from pathlib2 import Path
|
||||
|
||||
import six
|
||||
from clearml_agent.helper.base import normalize_path
|
||||
|
||||
PROGRAM_NAME = "clearml-agent"
|
||||
FROM_FILE_PREFIX_CHARS = "@"
|
||||
|
||||
CONFIG_DIR = normalize_path("~/.clearml")
|
||||
TOKEN_CACHE_FILE = normalize_path("~/.clearml.clearml_agent.tmp")
|
||||
|
||||
CONFIG_FILE_CANDIDATES = ["~/clearml.conf"]
|
||||
|
||||
|
||||
def find_config_path():
|
||||
for candidate in CONFIG_FILE_CANDIDATES:
|
||||
if Path(candidate).expanduser().exists():
|
||||
return candidate
|
||||
return CONFIG_FILE_CANDIDATES[0]
|
||||
|
||||
|
||||
CONFIG_FILE = normalize_path(find_config_path())
|
||||
|
||||
|
||||
class EnvironmentConfig(object):
|
||||
|
||||
conversions = {
|
||||
bool: lambda value: bool(strtobool(value)),
|
||||
six.text_type: lambda s: six.text_type(s).strip(),
|
||||
list: lambda s: shlex.split(s.strip()),
|
||||
}
|
||||
|
||||
def __init__(self, *names, **kwargs):
|
||||
self.vars = names
|
||||
self.type = kwargs.pop("type", six.text_type)
|
||||
|
||||
def pop(self):
|
||||
for k in self.vars:
|
||||
environ.pop(k, None)
|
||||
|
||||
def set(self, value):
|
||||
for k in self.vars:
|
||||
environ[k] = str(value)
|
||||
|
||||
def convert(self, value):
|
||||
return self.conversions.get(self.type, self.type)(value)
|
||||
|
||||
def get(self, key=False): # type: (bool) -> Optional[Union[Any, Tuple[Text, Any]]]
|
||||
for name in self.vars:
|
||||
value = getenv(name)
|
||||
if value:
|
||||
value = self.convert(value)
|
||||
if key:
|
||||
return name, value
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
|
||||
ENV_AGENT_AUTH_TOKEN = EnvironmentConfig("CLEARML_AUTH_TOKEN")
|
||||
ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
|
||||
ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")
|
||||
|
||||
ENVIRONMENT_CONFIG = {
|
||||
"api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
|
||||
"api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
|
||||
"api.web_server": EnvironmentConfig("CLEARML_WEB_HOST", "TRAINS_WEB_HOST", ),
|
||||
"api.credentials.access_key": EnvironmentConfig(
|
||||
"CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
|
||||
),
|
||||
"api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
|
||||
"agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
|
||||
"agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
|
||||
"agent.cuda_version": EnvironmentConfig(
|
||||
"CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"
|
||||
),
|
||||
"agent.cudnn_version": EnvironmentConfig(
|
||||
"CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"
|
||||
),
|
||||
"agent.cpu_only": EnvironmentConfig(
|
||||
names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
|
||||
),
|
||||
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
|
||||
"sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
|
||||
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
|
||||
"sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
|
||||
'account_key': ENV_AZURE_ACCOUNT_KEY},
|
||||
"sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
|
||||
}
|
||||
|
||||
ENVIRONMENT_SDK_PARAMS = {
|
||||
"task_id": ("CLEARML_TASK_ID", "TRAINS_TASK_ID", ),
|
||||
"config_file": ("CLEARML_CONFIG_FILE", "TRAINS_CONFIG_FILE", ),
|
||||
"log_level": ("CLEARML_LOG_LEVEL", "TRAINS_LOG_LEVEL", ),
|
||||
"log_to_backend": ("CLEARML_LOG_TASK_TO_BACKEND", "TRAINS_LOG_TASK_TO_BACKEND", ),
|
||||
}
|
||||
|
||||
ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig(
|
||||
names=("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV"), type=bool)
|
||||
|
||||
VIRTUAL_ENVIRONMENT_PATH = {
|
||||
"python2": normalize_path(CONFIG_DIR, "py2venv"),
|
||||
"python3": normalize_path(CONFIG_DIR, "py3venv"),
|
||||
}
|
||||
|
||||
DEFAULT_BASE_DIR = normalize_path(CONFIG_DIR, "data_cache")
|
||||
DEFAULT_HOST = "https://demoapi.demo.clear.ml"
|
||||
MAX_DATASET_SOURCES_COUNT = 50000
|
||||
|
||||
INVALID_WORKER_ID = (400, 1001)
|
||||
WORKER_ALREADY_REGISTERED = (400, 1003)
|
||||
|
||||
API_VERSION = "v1.5"
|
||||
TOKEN_EXPIRATION_SECONDS = int(timedelta(days=2).total_seconds())
|
||||
|
||||
METADATA_EXTENSION = ".json"
|
||||
|
||||
DEFAULT_VENV_UPDATE_URL = (
|
||||
"https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
|
||||
)
|
||||
WORKING_REPOSITORY_DIR = "task_repository"
|
||||
WORKING_STANDALONE_DIR = "code"
|
||||
DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
|
||||
PIP_EXTRA_INDICES = [
|
||||
]
|
||||
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
|
||||
ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAGE')
|
||||
ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
|
||||
ENV_WORKER_TAGS = EnvironmentConfig('CLEARML_WORKER_TAGS')
|
||||
ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PIP_VENV_INSTALL')
|
||||
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL', type=bool)
|
||||
ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
|
||||
ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
|
||||
ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
|
||||
ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
|
||||
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool)
|
||||
ENV_SSH_AUTH_SOCK = EnvironmentConfig('SSH_AUTH_SOCK')
|
||||
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
|
||||
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
|
||||
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
|
||||
'TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
|
||||
ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
|
||||
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
|
||||
|
||||
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
|
||||
"""
|
||||
Specifies a custom environment setup script to be executed instead of installing a virtual environment.
|
||||
If provided, this script is executed after Git cloning. The script command may include environment variables and
|
||||
will be expanded before execution (e.g. "$CLEARML_GIT_ROOT/script.sh").
|
||||
The script can also be specified using the `agent.custom_build_script` configuration setting.
|
||||
|
||||
When running the script, the following environment variables will be set:
|
||||
- CLEARML_CUSTOM_BUILD_TASK_CONFIG_JSON: specifies a path to a temporary file containing the complete task
|
||||
contents in JSON format
|
||||
- CLEARML_TASK_SCRIPT_ENTRY: task entrypoint script as defined in the task's script section
|
||||
- CLEARML_TASK_WORKING_DIR: task working directory as defined in the task's script section
|
||||
- CLEARML_VENV_PATH: path to the agent's default virtual environment path (as defined in the configuration)
|
||||
- CLEARML_GIT_ROOT: path to the cloned Git repository
|
||||
- CLEARML_CUSTOM_BUILD_OUTPUT: a path to a non-existing file that may be created by the script. If created,
|
||||
this file must be in the following JSON format:
|
||||
```json
|
||||
{
|
||||
"binary": "/absolute/path/to/python-executable",
|
||||
"entry_point": "/absolute/path/to/task-entrypoint-script",
|
||||
"working_dir": "/absolute/path/to/task-working/dir"
|
||||
}
|
||||
```
|
||||
If provided, the agent will use these instead of the predefined task script section to execute the task and will
|
||||
skip virtual environment creation.
|
||||
|
||||
In case the custom script returns with a non-zero exit code, the agent will fail with the same exit code.
|
||||
In case the custom script is specified but does not exist, or if the custom script does not write valid content
|
||||
into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
|
||||
standard flow.
|
||||
"""
|
||||
|
||||
|
||||
class FileBuffering(IntEnum):
|
||||
"""
|
||||
File buffering options:
|
||||
- UNSET: follows the defaults for the type of file,
|
||||
line-buffered for interactive (tty) text files and with a default chunk size otherwise
|
||||
- UNBUFFERED: no buffering at all
|
||||
- LINE_BUFFERED: per-line buffering, only valid for text files
|
||||
- values bigger than 1 indicate the size of the buffer in bytes and are not represented by the enum
|
||||
"""
|
||||
|
||||
UNSET = -1
|
||||
UNBUFFERED = 0
|
||||
LINE_BUFFERING = 1
|
||||
@@ -84,3 +84,13 @@ class MissingPackageError(CommandFailedError):
|
||||
def __str__(self):
|
||||
return '{self.__class__.__name__}: ' \
|
||||
'"{self.name}" package is required. Please run "pip install {self.name}"'.format(self=self)
|
||||
|
||||
|
||||
class CustomBuildScriptFailed(CommandFailedError):
|
||||
def __init__(self, errno, *args, **kwargs):
|
||||
super(CustomBuildScriptFailed, self).__init__(*args, **kwargs)
|
||||
self.errno = errno
|
||||
|
||||
|
||||
class SkippedCustomBuildScript(CommandFailedError):
|
||||
pass
|
||||
clearml_agent/external/requirements_parser/__init__.py (vendored, new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
from .parser import parse # noqa
|
||||
|
||||
_MAJOR = 0
|
||||
_MINOR = 2
|
||||
_PATCH = 0
|
||||
|
||||
|
||||
def version_tuple():
|
||||
'''
|
||||
Returns a 3-tuple of ints that represent the version
|
||||
'''
|
||||
return (_MAJOR, _MINOR, _PATCH)
|
||||
|
||||
|
||||
def version():
|
||||
'''
|
||||
Returns a string representation of the version
|
||||
'''
|
||||
return '%d.%d.%d' % (version_tuple())
|
||||
|
||||
|
||||
__version__ = version()
|
||||
clearml_agent/external/requirements_parser/fragment.py (vendored, new file, 44 lines)
@@ -0,0 +1,44 @@
|
||||
import re
|
||||
|
||||
# Copied from pip
|
||||
# https://github.com/pypa/pip/blob/281eb61b09d87765d7c2b92f6982b3fe76ccb0af/pip/index.py#L947
|
||||
HASH_ALGORITHMS = set(['sha1', 'sha224', 'sha384', 'sha256', 'sha512', 'md5'])
|
||||
|
||||
extras_require_search = re.compile(
|
||||
r'(?P<name>.+)\[(?P<extras>[^\]]+)\]').search
|
||||
|
||||
|
||||
def parse_fragment(fragment_string):
|
||||
"""Takes a fragment string nd returns a dict of the components"""
|
||||
fragment_string = fragment_string.lstrip('#')
|
||||
|
||||
try:
|
||||
return dict(
|
||||
key_value_string.split('=')
|
||||
for key_value_string in fragment_string.split('&')
|
||||
)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
'Invalid fragment string {fragment_string}'.format(
|
||||
fragment_string=fragment_string
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def get_hash_info(d):
|
||||
"""Returns the first matching hashlib name and value from a dict"""
|
||||
for key in d.keys():
|
||||
if key.lower() in HASH_ALGORITHMS:
|
||||
return key, d[key]
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def parse_extras_require(egg):
|
||||
if egg is not None:
|
||||
match = extras_require_search(egg)
|
||||
if match is not None:
|
||||
name = match.group('name')
|
||||
extras = match.group('extras')
|
||||
return name, [extra.strip() for extra in extras.split(',')]
|
||||
return egg, []
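A quick, illustrative exercise of the fragment helpers above (values are hypothetical):

```python
from clearml_agent.external.requirements_parser.fragment import (
    get_hash_info,
    parse_extras_require,
    parse_fragment,
)

# "#egg=..." style fragment as found at the end of a requirement URI
fragment = parse_fragment("#egg=mypkg[extra1,extra2]&sha256=abc123")
# -> {'egg': 'mypkg[extra1,extra2]', 'sha256': 'abc123'}

name, extras = parse_extras_require(fragment.get("egg"))
# -> ('mypkg', ['extra1', 'extra2'])

hash_name, hash_value = get_hash_info(fragment)
# -> ('sha256', 'abc123')
```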
|
||||
53
clearml_agent/external/requirements_parser/parser.py
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
import os
|
||||
import warnings
|
||||
|
||||
from .requirement import Requirement
|
||||
|
||||
|
||||
def parse(reqstr, cwd=None):
|
||||
"""
|
||||
Parse a requirements file into a list of Requirements
|
||||
|
||||
See: pip/req.py:parse_requirements()
|
||||
|
||||
:param reqstr: a string or file like object containing requirements
|
||||
:param cwd: Optional current working dir for -r file.txt loading
|
||||
:returns: a *generator* of Requirement objects
|
||||
"""
|
||||
filename = getattr(reqstr, 'name', None)
|
||||
try:
|
||||
# Python 2.x compatibility
|
||||
if not isinstance(reqstr, basestring): # noqa
|
||||
reqstr = reqstr.read()
|
||||
except NameError:
|
||||
# Python 3.x only
|
||||
if not isinstance(reqstr, str):
|
||||
reqstr = reqstr.read()
|
||||
|
||||
for line in reqstr.splitlines():
|
||||
line = line.strip()
|
||||
if line == '':
|
||||
continue
|
||||
elif not line or line.startswith('#'):
|
||||
# comments are lines that start with # only
|
||||
continue
|
||||
elif line.startswith('-r ') or line.startswith('--requirement '):
|
||||
_, new_filename = line.split()
|
||||
new_file_path = os.path.join(
|
||||
os.path.dirname(filename or '.') if filename or not cwd else cwd, new_filename)
|
||||
if not os.path.exists(new_file_path):
|
||||
continue
|
||||
with open(new_file_path) as f:
|
||||
for requirement in parse(f):
|
||||
yield requirement
|
||||
elif line.startswith('-f') or line.startswith('--find-links') or \
|
||||
line.startswith('-i') or line.startswith('--index-url') or \
|
||||
line.startswith('--extra-index-url') or \
|
||||
line.startswith('--no-index'):
|
||||
warnings.warn('Private repos not supported. Skipping.')
|
||||
continue
|
||||
elif line.startswith('-Z') or line.startswith('--always-unzip'):
|
||||
warnings.warn('Unused option --always-unzip. Skipping.')
|
||||
continue
|
||||
else:
|
||||
yield Requirement.parse(line)
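As an illustration (the requirement strings are hypothetical), the generator returned by parse() can be consumed like this:

```python
from clearml_agent.external.requirements_parser import parse

requirements_txt = """
# comments and blank lines are skipped
six>=1.10
requests[security]==2.25.1
-e git+https://github.com/example/project.git@v1.0#egg=project
"""

for req in parse(requirements_txt):
    # e.g. name='six', specs=[('>=', '1.10')], extras=[], editable=False
    print(req.name, req.specs, req.extras, req.editable)
```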
|
||||
250
clearml_agent/external/requirements_parser/requirement.py
vendored
Normal file
@@ -0,0 +1,250 @@
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
from pkg_resources import Requirement as Req
|
||||
|
||||
from .fragment import get_hash_info, parse_fragment, parse_extras_require
|
||||
from .vcs import VCS, VCS_SCHEMES
|
||||
|
||||
|
||||
URI_REGEX = re.compile(
|
||||
r'^(?P<scheme>https?|file|ftps?)://(?P<path>[^#]+)'
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
VCS_REGEX = re.compile(
|
||||
r'^(?P<scheme>{0})://'.format(r'|'.join(
|
||||
[scheme.replace('+', r'\+') for scheme in VCS_SCHEMES])) +
|
||||
r'((?P<login>[^/@]+)@)?'
|
||||
r'(?P<path>[^#@]+)'
|
||||
r'(@(?P<revision>[^#]+))?'
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
VCS_EXT_REGEX = re.compile(
|
||||
r'^(?P<scheme>{0})(@)'.format(r'|'.join(
|
||||
[scheme.replace('+', r'\+') for scheme in ['git+git']])) +
|
||||
r'((?P<login>[^/@]+)@)?'
|
||||
r'(?P<path>[^#@]+)'
|
||||
r'(@(?P<revision>[^#]+))?'
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
# This matches just about everything
|
||||
LOCAL_REGEX = re.compile(
|
||||
r'^((?P<scheme>file)://)?'
|
||||
r'(?P<path>[^#]+)' +
|
||||
r'(#(?P<fragment>\S+))?'
|
||||
)
|
||||
|
||||
|
||||
class Requirement(object):
|
||||
"""
|
||||
Represents a single requirement
|
||||
|
||||
Typically instances of this class are created with ``Requirement.parse``.
|
||||
For local file requirements, there's no verification that the file
|
||||
exists. This class attempts to be *dict-like*.
|
||||
|
||||
See: http://www.pip-installer.org/en/latest/logic.html
|
||||
|
||||
**Members**:
|
||||
|
||||
* ``line`` - the actual requirement line being parsed
|
||||
* ``editable`` - a boolean whether this requirement is "editable"
|
||||
* ``local_file`` - a boolean whether this requirement is a local file/path
|
||||
* ``specifier`` - a boolean whether this requirement used a requirement
|
||||
specifier (eg. "django>=1.5" or "requirements")
|
||||
* ``vcs`` - a string specifying the version control system
|
||||
* ``revision`` - a version control system specifier
|
||||
* ``name`` - the name of the requirement
|
||||
* ``uri`` - the URI if this requirement was specified by URI
|
||||
* ``subdirectory`` - the subdirectory fragment of the URI
|
||||
* ``path`` - the local path to the requirement
|
||||
* ``hash_name`` - the type of hashing algorithm indicated in the line
|
||||
* ``hash`` - the hash value indicated by the requirement line
|
||||
* ``extras`` - a list of extras for this requirement
|
||||
(eg. "mymodule[extra1, extra2]")
|
||||
* ``specs`` - a list of specs for this requirement
|
||||
(eg. "mymodule>1.5,<1.6" => [('>', '1.5'), ('<', '1.6')])
|
||||
"""
|
||||
|
||||
def __init__(self, line):
|
||||
# Do not call this private method
|
||||
self.line = line
|
||||
self.editable = False
|
||||
self.local_file = False
|
||||
self.specifier = False
|
||||
self.vcs = None
|
||||
self.name = None
|
||||
self.subdirectory = None
|
||||
self.uri = None
|
||||
self.path = None
|
||||
self.revision = None
|
||||
self.hash_name = None
|
||||
self.hash = None
|
||||
self.extras = []
|
||||
self.specs = []
|
||||
|
||||
def __repr__(self):
|
||||
return '<Requirement: "{0}">'.format(self.line)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return getattr(self, key)
|
||||
|
||||
def keys(self):
|
||||
return self.__dict__.keys()
|
||||
|
||||
@classmethod
|
||||
def parse_editable(cls, line):
|
||||
"""
|
||||
Parses a Requirement from an "editable" requirement which is either
|
||||
a local project path or a VCS project URI.
|
||||
|
||||
See: pip/req.py:from_editable()
|
||||
|
||||
:param line: an "editable" requirement
|
||||
:returns: a Requirement instance for the given line
|
||||
:raises: ValueError on an invalid requirement
|
||||
"""
|
||||
|
||||
req = cls('-e {0}'.format(line))
|
||||
req.editable = True
|
||||
vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
|
||||
local_match = LOCAL_REGEX.match(line)
|
||||
|
||||
if vcs_match is not None:
|
||||
groups = vcs_match.groupdict()
|
||||
if groups.get('login'):
|
||||
req.uri = '{scheme}://{login}@{path}'.format(**groups)
|
||||
else:
|
||||
req.uri = '{scheme}://{path}'.format(**groups)
|
||||
req.revision = groups['revision']
|
||||
if groups['fragment']:
|
||||
fragment = parse_fragment(groups['fragment'])
|
||||
egg = fragment.get('egg')
|
||||
req.name, req.extras = parse_extras_require(egg)
|
||||
req.hash_name, req.hash = get_hash_info(fragment)
|
||||
req.subdirectory = fragment.get('subdirectory')
|
||||
for vcs in VCS:
|
||||
if req.uri.startswith(vcs):
|
||||
req.vcs = vcs
|
||||
else:
|
||||
assert local_match is not None, 'This should match everything'
|
||||
groups = local_match.groupdict()
|
||||
req.local_file = True
|
||||
if groups['fragment']:
|
||||
fragment = parse_fragment(groups['fragment'])
|
||||
egg = fragment.get('egg')
|
||||
req.name, req.extras = parse_extras_require(egg)
|
||||
req.hash_name, req.hash = get_hash_info(fragment)
|
||||
req.subdirectory = fragment.get('subdirectory')
|
||||
req.path = groups['path']
|
||||
|
||||
return req
|
||||
|
||||
@classmethod
|
||||
def parse_line(cls, line):
|
||||
"""
|
||||
Parses a Requirement from a non-editable requirement.
|
||||
|
||||
See: pip/req.py:from_line()
|
||||
|
||||
:param line: a "non-editable" requirement
|
||||
:returns: a Requirement instance for the given line
|
||||
:raises: ValueError on an invalid requirement
|
||||
"""
|
||||
|
||||
req = cls(line)
|
||||
|
||||
vcs_match = VCS_REGEX.match(line) or VCS_EXT_REGEX.match(line)
|
||||
uri_match = URI_REGEX.match(line)
|
||||
local_match = LOCAL_REGEX.match(line)
|
||||
|
||||
if vcs_match is not None:
|
||||
groups = vcs_match.groupdict()
|
||||
if groups.get('login'):
|
||||
req.uri = '{scheme}://{login}@{path}'.format(**groups)
|
||||
else:
|
||||
req.uri = '{scheme}://{path}'.format(**groups)
|
||||
req.revision = groups['revision']
|
||||
if groups['fragment']:
|
||||
fragment = parse_fragment(groups['fragment'])
|
||||
egg = fragment.get('egg')
|
||||
req.name, req.extras = parse_extras_require(egg)
|
||||
req.hash_name, req.hash = get_hash_info(fragment)
|
||||
req.subdirectory = fragment.get('subdirectory')
|
||||
for vcs in VCS:
|
||||
if req.uri.startswith(vcs):
|
||||
req.vcs = vcs
|
||||
elif uri_match is not None:
|
||||
groups = uri_match.groupdict()
|
||||
req.uri = '{scheme}://{path}'.format(**groups)
|
||||
if groups['fragment']:
|
||||
fragment = parse_fragment(groups['fragment'])
|
||||
egg = fragment.get('egg')
|
||||
req.name, req.extras = parse_extras_require(egg)
|
||||
req.hash_name, req.hash = get_hash_info(fragment)
|
||||
req.subdirectory = fragment.get('subdirectory')
|
||||
if groups['scheme'] == 'file':
|
||||
req.local_file = True
|
||||
elif '#egg=' in line:
|
||||
# Assume a local file match
|
||||
assert local_match is not None, 'This should match everything'
|
||||
groups = local_match.groupdict()
|
||||
req.local_file = True
|
||||
if groups['fragment']:
|
||||
fragment = parse_fragment(groups['fragment'])
|
||||
egg = fragment.get('egg')
|
||||
name, extras = parse_extras_require(egg)
|
||||
req.name = fragment.get('egg')
|
||||
req.hash_name, req.hash = get_hash_info(fragment)
|
||||
req.subdirectory = fragment.get('subdirectory')
|
||||
req.path = groups['path']
|
||||
else:
|
||||
# This is a requirement specifier.
|
||||
# Delegate to pkg_resources and hope for the best
|
||||
req.specifier = True
|
||||
pkg_req = Req.parse(line)
|
||||
req.name = pkg_req.unsafe_name
|
||||
req.extras = list(pkg_req.extras)
|
||||
req.specs = pkg_req.specs
|
||||
return req
|
||||
|
||||
@classmethod
|
||||
def parse(cls, line):
|
||||
"""
|
||||
Parses a Requirement from a line of a requirement file.
|
||||
|
||||
:param line: a line of a requirement file
|
||||
:returns: a Requirement instance for the given line
|
||||
:raises: ValueError on an invalid requirement
|
||||
"""
|
||||
line = line.lstrip()
|
||||
if line.startswith('-e') or line.startswith('--editable'):
|
||||
# Editable installs are either a local project path
|
||||
# or a VCS project URI
|
||||
return cls.parse_editable(
|
||||
re.sub(r'^(-e|--editable=?)\s*', '', line))
|
||||
elif '@' in line and ('#' not in line or line.index('#') > line.index('@')):
|
||||
# Allegro bug fix: support 'name @ git+' entries
|
||||
name, uri = line.split('@', 1)
|
||||
name = name.strip()
|
||||
uri = uri.strip()
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
# check if the name is valid & parsed
|
||||
Req.parse(name)
|
||||
# if we are here, name is a valid package name, check if the vcs part is valid
|
||||
if VCS_REGEX.match(uri) or VCS_EXT_REGEX.match(uri):
|
||||
req = cls.parse_line(uri)
|
||||
req.name = name
|
||||
return req
|
||||
elif URI_REGEX.match(uri):
|
||||
req = cls.parse_line(uri)
|
||||
req.name = name
|
||||
req.line = line
|
||||
return req
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return cls.parse_line(line)
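For illustration, the 'name @ VCS URI' handling added above (the Allegro bug fix) behaves as follows with a hypothetical package:

```python
from clearml_agent.external.requirements_parser.requirement import Requirement

req = Requirement.parse("mypkg @ git+https://github.com/example/mypkg.git@1.2.3")
print(req.name)      # 'mypkg' (taken from the left-hand side of the '@')
print(req.vcs)       # 'git'
print(req.uri)       # 'git+https://github.com/example/mypkg.git'
print(req.revision)  # '1.2.3'
```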
|
||||
30
clearml_agent/external/requirements_parser/vcs.py
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
VCS = [
|
||||
'git',
|
||||
'hg',
|
||||
'svn',
|
||||
'bzr',
|
||||
]
|
||||
|
||||
VCS_SCHEMES = [
|
||||
'git',
|
||||
'git+https',
|
||||
'git+ssh',
|
||||
'git+git',
|
||||
'hg+http',
|
||||
'hg+https',
|
||||
'hg+static-http',
|
||||
'hg+ssh',
|
||||
'svn',
|
||||
'svn+svn',
|
||||
'svn+http',
|
||||
'svn+https',
|
||||
'svn+ssh',
|
||||
'bzr+http',
|
||||
'bzr+https',
|
||||
'bzr+ssh',
|
||||
'bzr+sftp',
|
||||
'bzr+ftp',
|
||||
'bzr+lp',
|
||||
]
|
||||
816
clearml_agent/glue/k8s.py
Normal file
@@ -0,0 +1,816 @@
|
||||
from __future__ import print_function, division, unicode_literals
|
||||
|
||||
import base64
|
||||
import functools
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from threading import Thread
|
||||
from time import sleep
|
||||
from typing import Text, List, Callable, Any, Collection, Optional, Union
|
||||
|
||||
import yaml
|
||||
|
||||
from clearml_agent.commands.events import Events
|
||||
from clearml_agent.commands.worker import Worker, get_task_container, set_task_container
|
||||
from clearml_agent.definitions import ENV_DOCKER_IMAGE
|
||||
from clearml_agent.errors import APIError
|
||||
from clearml_agent.helper.base import safe_remove_file
|
||||
from clearml_agent.helper.dicts import merge_dicts
|
||||
from clearml_agent.helper.process import get_bash_output
|
||||
from clearml_agent.helper.resource_monitor import ResourceMonitor
|
||||
from clearml_agent.interface.base import ObjectID
|
||||
|
||||
|
||||
class K8sIntegration(Worker):
|
||||
K8S_PENDING_QUEUE = "k8s_scheduler"
|
||||
|
||||
K8S_DEFAULT_NAMESPACE = "clearml"
|
||||
AGENT_LABEL = "CLEARML=agent"
|
||||
LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
|
||||
|
||||
KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
|
||||
|
||||
KUBECTL_RUN_CMD = "kubectl run clearml-id-{task_id} " \
|
||||
"--image {docker_image} {docker_args} " \
|
||||
"--restart=Never " \
|
||||
"--namespace={namespace}"
|
||||
|
||||
KUBECTL_DELETE_CMD = "kubectl delete pods " \
|
||||
"--selector={selector} " \
|
||||
"--field-selector=status.phase!=Pending,status.phase!=Running " \
|
||||
"--namespace={namespace}"
|
||||
|
||||
BASH_INSTALL_SSH_CMD = [
|
||||
"apt-get update",
|
||||
"apt-get install -y openssh-server",
|
||||
"mkdir -p /var/run/sshd",
|
||||
"echo 'root:training' | chpasswd",
|
||||
"echo 'PermitRootLogin yes' >> /etc/ssh/sshd_config",
|
||||
"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
|
||||
r"sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd",
|
||||
"echo 'AcceptEnv TRAINS_API_ACCESS_KEY TRAINS_API_SECRET_KEY CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY' "
|
||||
">> /etc/ssh/sshd_config",
|
||||
'echo "export VISIBLE=now" >> /etc/profile',
|
||||
'echo "export PATH=$PATH" >> /etc/profile',
|
||||
'echo "ldconfig" >> /etc/profile',
|
||||
"/usr/sbin/sshd -p {port}"]
|
||||
|
||||
CONTAINER_BASH_SCRIPT = [
|
||||
"export DEBIAN_FRONTEND='noninteractive'",
|
||||
"echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
|
||||
"chown -R root /root/.cache/pip",
|
||||
"apt-get update",
|
||||
"apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||
"declare LOCAL_PYTHON",
|
||||
"[ ! -z $LOCAL_PYTHON ] || for i in {{15..5}}; do which python3.$i && python3.$i -m pip --version && "
|
||||
"export LOCAL_PYTHON=$(which python3.$i) && break ; done",
|
||||
"[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
|
||||
"[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
|
||||
"$LOCAL_PYTHON -m pip install clearml-agent",
|
||||
"{extra_bash_init_cmd}",
|
||||
"{extra_docker_bash_script}",
|
||||
"$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
|
||||
]
|
||||
|
||||
_edit_hyperparams_version = "2.9"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
k8s_pending_queue_name=None,
|
||||
kubectl_cmd=None,
|
||||
container_bash_script=None,
|
||||
debug=False,
|
||||
ports_mode=False,
|
||||
num_of_services=20,
|
||||
base_pod_num=1,
|
||||
user_props_cb=None,
|
||||
overrides_yaml=None,
|
||||
template_yaml=None,
|
||||
clearml_conf_file=None,
|
||||
extra_bash_init_script=None,
|
||||
namespace=None,
|
||||
max_pods_limit=None,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the k8s integration glue layer daemon
|
||||
|
||||
:param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
|
||||
:param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
|
||||
example: "task={task_id} image={docker_image} queue_id={queue_id}"
|
||||
or a callable function: kubectl_cmd(task_id, docker_image, docker_args, queue_id, task_data)
|
||||
:param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
|
||||
Notice this string will be used in a format() call; if you need literal curly brackets they should be doubled { -> {{
|
||||
Format arguments passed: {task_id} and {extra_bash_init_cmd}
|
||||
:param bool debug: Switch logging on
|
||||
:param bool ports_mode: Adds a label to each pod which can be used in services in order to expose ports.
|
||||
Requires the `num_of_services` parameter.
|
||||
:param int num_of_services: Number of k8s services configured in the cluster. Required if `ports_mode` is True.
|
||||
(default: 20)
|
||||
:param int base_pod_num: Used when `ports_mode` is True, sets the base pod number to a given value (default: 1)
|
||||
:param callable user_props_cb: An Optional callable allowing additional user properties to be specified
|
||||
when scheduling a task to run in a pod. Callable can receive an optional pod number and should return
|
||||
a dictionary of user properties (name and value). Signature is [[Optional[int]], Dict[str,str]]
|
||||
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
|
||||
:param str template_yaml: YAML file containing the template for the pod (optional).
|
||||
If provided, the pod is scheduled with kubectl apply and overrides are ignored; otherwise kubectl run is used.
|
||||
:param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional)
|
||||
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
|
||||
:param str namespace: K8S namespace to be used when creating the new pods (default: clearml)
|
||||
:param int max_pods_limit: Maximum number of pods that K8S glue can run at the same time
|
||||
"""
|
||||
super(K8sIntegration, self).__init__()
|
||||
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
|
||||
self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
|
||||
self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
|
||||
# Always use system packages, because we will be running inside a docker container
|
||||
self._session.config.put("agent.package_manager.system_site_packages", True)
|
||||
# Add debug logging
|
||||
if debug:
|
||||
self.log.logger.disabled = False
|
||||
self.log.logger.setLevel(logging.INFO)
|
||||
self.ports_mode = ports_mode
|
||||
self.num_of_services = num_of_services
|
||||
self.base_pod_num = base_pod_num
|
||||
self._edit_hyperparams_support = None
|
||||
self._user_props_cb = user_props_cb
|
||||
self.conf_file_content = None
|
||||
self.overrides_json_string = None
|
||||
self.template_dict = None
|
||||
self.extra_bash_init_script = extra_bash_init_script or None
|
||||
if self.extra_bash_init_script and not isinstance(self.extra_bash_init_script, str):
|
||||
self.extra_bash_init_script = ' ; '.join(self.extra_bash_init_script) # noqa
|
||||
self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE
|
||||
self.pod_limits = []
|
||||
self.pod_requests = []
|
||||
self.max_pods_limit = max_pods_limit if not self.ports_mode else None
|
||||
if overrides_yaml:
|
||||
with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
|
||||
overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
||||
if overrides:
|
||||
containers = overrides.get('spec', {}).get('containers', [])
|
||||
for c in containers:
|
||||
resources = {str(k).lower(): v for k, v in c.get('resources', {}).items()}
|
||||
if not resources:
|
||||
continue
|
||||
if resources.get('limits'):
|
||||
self.pod_limits += ['{}={}'.format(k, v) for k, v in resources['limits'].items()]
|
||||
if resources.get('requests'):
|
||||
self.pod_requests += ['{}={}'.format(k, v) for k, v in resources['requests'].items()]
|
||||
# remove double entries
|
||||
self.pod_limits = list(set(self.pod_limits))
|
||||
self.pod_requests = list(set(self.pod_requests))
|
||||
if self.pod_limits or self.pod_requests:
|
||||
self.log.warning('Found pod container requests={} limits={}'.format(
|
||||
self.pod_limits, self.pod_requests))
|
||||
if containers:
|
||||
self.log.warning('Removing containers section: {}'.format(overrides['spec'].pop('containers')))
|
||||
self.overrides_json_string = json.dumps(overrides)
|
||||
if template_yaml:
|
||||
with open(os.path.expandvars(os.path.expanduser(str(template_yaml))), 'rt') as f:
|
||||
self.template_dict = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
||||
|
||||
clearml_conf_file = clearml_conf_file or kwargs.get('trains_conf_file')
|
||||
|
||||
if clearml_conf_file:
|
||||
with open(os.path.expandvars(os.path.expanduser(str(clearml_conf_file))), 'rt') as f:
|
||||
self.conf_file_content = f.read()
|
||||
# make sure we use system packages!
|
||||
self.conf_file_content += '\nagent.package_manager.system_site_packages=true\n'
|
||||
|
||||
self._agent_label = None
|
||||
|
||||
self._monitor_hanging_pods()
|
||||
|
||||
def _monitor_hanging_pods(self):
|
||||
_check_pod_thread = Thread(target=self._monitor_hanging_pods_daemon)
|
||||
_check_pod_thread.daemon = True
|
||||
_check_pod_thread.start()
|
||||
|
||||
@staticmethod
|
||||
def _get_path(d, *path, default=None):
|
||||
try:
|
||||
return functools.reduce(
|
||||
lambda a, b: a[b], path, d
|
||||
)
|
||||
except (IndexError, KeyError):
|
||||
return default
|
||||
|
||||
def _monitor_hanging_pods_daemon(self):
|
||||
last_tasks_msgs = {} # last msg updated for every task
|
||||
|
||||
while True:
|
||||
output = get_bash_output('kubectl get pods -n {namespace} -o=JSON'.format(
|
||||
namespace=self.namespace
|
||||
))
|
||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||
try:
|
||||
output_config = json.loads(output)
|
||||
except Exception as ex:
|
||||
self.log.warning('K8S Glue pods monitor: Failed parsing kubectl output:\n{}\nEx: {}'.format(output, ex))
|
||||
sleep(self._polling_interval)
|
||||
continue
|
||||
pods = output_config.get('items', [])
|
||||
task_ids = set()
|
||||
for pod in pods:
|
||||
if self._get_path(pod, 'status', 'phase') != "Pending":
|
||||
continue
|
||||
|
||||
pod_name = pod.get('metadata', {}).get('name', None)
|
||||
if not pod_name:
|
||||
continue
|
||||
|
||||
task_id = pod_name.rpartition('-')[-1]
|
||||
if not task_id:
|
||||
continue
|
||||
|
||||
task_ids.add(task_id)
|
||||
|
||||
msg = None
|
||||
|
||||
waiting = self._get_path(pod, 'status', 'containerStatuses', 0, 'state', 'waiting')
|
||||
if not waiting:
|
||||
condition = self._get_path(pod, 'status', 'conditions', 0)
|
||||
if condition:
|
||||
reason = condition.get('reason')
|
||||
if reason == 'Unschedulable':
|
||||
message = condition.get('message')
|
||||
msg = reason + (" ({})".format(message) if message else "")
|
||||
else:
|
||||
reason = waiting.get("reason", None)
|
||||
message = waiting.get("message", None)
|
||||
|
||||
msg = reason + (" ({})".format(message) if message else "")
|
||||
|
||||
if reason == 'ImagePullBackOff':
|
||||
delete_pod_cmd = 'kubectl delete pods {} -n {}'.format(pod_name, self.namespace)
|
||||
get_bash_output(delete_pod_cmd)
|
||||
try:
|
||||
self._session.api_client.tasks.failed(
|
||||
task=task_id,
|
||||
status_reason="K8S glue error: {}".format(msg),
|
||||
status_message="Changed by K8S glue",
|
||||
force=True
|
||||
)
|
||||
except Exception as ex:
|
||||
self.log.warning(
|
||||
'K8S Glue pods monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
|
||||
)
|
||||
|
||||
# clean up any msg for this task
|
||||
last_tasks_msgs.pop(task_id, None)
|
||||
continue
|
||||
if msg and last_tasks_msgs.get(task_id, None) != msg:
|
||||
try:
|
||||
result = self._session.send_request(
|
||||
service='tasks',
|
||||
action='update',
|
||||
json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
|
||||
method='get',
|
||||
async_enable=False,
|
||||
)
|
||||
if not result.ok:
|
||||
result_msg = self._get_path(result.json(), 'meta', 'result_msg')
|
||||
raise Exception(result_msg or result.text)
|
||||
|
||||
# update last msg for this task
|
||||
last_tasks_msgs[task_id] = msg
|
||||
except Exception as ex:
|
||||
self.log.warning(
|
||||
'K8S Glue pods monitor: Failed setting status message for task "{}"\nEX: {}'.format(
|
||||
task_id, ex
|
||||
)
|
||||
)
|
||||
|
||||
# clean up any last message for a task that wasn't seen as a pod
|
||||
last_tasks_msgs = {k: v for k, v in last_tasks_msgs.items() if k in task_ids}
|
||||
|
||||
sleep(self._polling_interval)
|
||||
|
||||
def _set_task_user_properties(self, task_id: str, **properties: str):
|
||||
if self._edit_hyperparams_support is not True:
|
||||
# either not supported or never tested
|
||||
if self._edit_hyperparams_support == self._session.api_version:
|
||||
# tested against latest api_version, not supported
|
||||
return
|
||||
if not self._session.check_min_api_version(self._edit_hyperparams_version):
|
||||
# not supported due to insufficient api_version
|
||||
self._edit_hyperparams_support = self._session.api_version
|
||||
return
|
||||
try:
|
||||
self._session.get(
|
||||
service="tasks",
|
||||
action="edit_hyper_params",
|
||||
task=task_id,
|
||||
hyperparams=[
|
||||
{
|
||||
"section": "properties",
|
||||
"name": k,
|
||||
"value": str(v),
|
||||
}
|
||||
for k, v in properties.items()
|
||||
],
|
||||
)
|
||||
# definitely supported
|
||||
self._runtime_props_support = True
|
||||
except APIError as error:
|
||||
if error.code == 404:
|
||||
self._edit_hyperparams_support = self._session.api_version
|
||||
|
||||
def _get_agent_label(self):
|
||||
if not self.worker_id:
|
||||
print('WARNING! no worker ID found!!!')
|
||||
return self.AGENT_LABEL
|
||||
|
||||
if not self._agent_label:
|
||||
h = hashlib.md5()
|
||||
h.update(str(self.worker_id).encode('utf-8'))
|
||||
self._agent_label = '{}-{}'.format(self.AGENT_LABEL, h.hexdigest()[:8])
|
||||
|
||||
return self._agent_label
|
||||
|
||||
def _get_number_used_pods(self):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
|
||||
agent_label=self._get_agent_label(),
|
||||
namespace=self.namespace,
|
||||
)
|
||||
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||
error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
|
||||
|
||||
if not output:
|
||||
# No pods exist, so the number of used pods is 0
|
||||
return 0
|
||||
|
||||
try:
|
||||
current_pod_count = len(json.loads(output).get("items", []))
|
||||
except (ValueError, TypeError) as ex:
|
||||
return -1
|
||||
|
||||
return current_pod_count
|
||||
except Exception as ex:
|
||||
print('Failed getting number of used pods: {}'.format(ex))
|
||||
return -2
|
||||
|
||||
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
|
||||
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
|
||||
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
|
||||
|
||||
# push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
|
||||
try:
|
||||
print('Pushing task {} into temporary pending queue'.format(task_id))
|
||||
res = self._session.api_client.tasks.stop(task_id, force=True)
|
||||
res = self._session.api_client.tasks.enqueue(
|
||||
task_id,
|
||||
queue=self.k8s_pending_queue_name,
|
||||
status_reason='k8s pending scheduler',
|
||||
)
|
||||
if res.meta.result_code != 200:
|
||||
raise Exception(res.meta.result_msg)
|
||||
except Exception as e:
|
||||
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
|
||||
task_id, self.k8s_pending_queue_name, e))
|
||||
return
|
||||
|
||||
container = get_task_container(self._session, task_id)
|
||||
if not container.get('image'):
|
||||
container['image'] = str(
|
||||
ENV_DOCKER_IMAGE.get() or self._session.config.get("agent.default_docker.image", "nvidia/cuda")
|
||||
)
|
||||
container['arguments'] = self._session.config.get("agent.default_docker.arguments", None)
|
||||
set_task_container(
|
||||
self._session, task_id, docker_image=container['image'], docker_arguments=container['arguments']
|
||||
)
|
||||
|
||||
# get the clearml.conf encoded file
|
||||
# noinspection PyProtectedMember
|
||||
hocon_config_encoded = (
|
||||
self.conf_file_content
|
||||
or Path(self._session._config_file).read_text()
|
||||
).encode("ascii")
|
||||
create_clearml_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
||||
base64.b64encode(
|
||||
hocon_config_encoded
|
||||
).decode('ascii')
|
||||
)
|
||||
|
||||
if self.ports_mode:
|
||||
print("Kubernetes looking for available pod to use")
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
queue_name = self._session.api_client.queues.get_by_id(queue=queue).name
|
||||
except Exception:
|
||||
queue_name = 'k8s'
|
||||
|
||||
# Search for a free pod number
|
||||
pod_count = 0
|
||||
pod_number = self.base_pod_num
|
||||
while self.ports_mode or self.max_pods_limit:
|
||||
pod_number = self.base_pod_num + pod_count
|
||||
if self.ports_mode:
|
||||
kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
|
||||
pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
|
||||
agent_label=self._get_agent_label(),
|
||||
namespace=self.namespace,
|
||||
)
|
||||
else:
|
||||
kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
|
||||
agent_label=self._get_agent_label(),
|
||||
namespace=self.namespace,
|
||||
)
|
||||
process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||
error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
|
||||
|
||||
if not output:
|
||||
# No such pod exists, so we can use the pod_number we found
|
||||
break
|
||||
|
||||
if self.max_pods_limit:
|
||||
try:
|
||||
current_pod_count = len(json.loads(output).get("items", []))
|
||||
except (ValueError, TypeError) as ex:
|
||||
self.log.warning(
|
||||
"K8S Glue pods monitor: Failed parsing kubectl output:\n{}\ntask '{}' "
|
||||
"will be enqueued back to queue '{}'\nEx: {}".format(
|
||||
output, task_id, queue, ex
|
||||
)
|
||||
)
|
||||
self._session.api_client.tasks.stop(task_id, force=True)
|
||||
self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
|
||||
return
|
||||
max_count = self.max_pods_limit
|
||||
else:
|
||||
current_pod_count = pod_count
|
||||
max_count = self.num_of_services - 1
|
||||
|
||||
if current_pod_count >= max_count:
|
||||
# All pods are taken, exit
|
||||
self.log.debug(
|
||||
"kubectl last result: {}\n{}".format(error, output))
|
||||
self.log.warning(
|
||||
"All k8s services are in use, task '{}' "
|
||||
"will be enqueued back to queue '{}'".format(
|
||||
task_id, queue
|
||||
)
|
||||
)
|
||||
self._session.api_client.tasks.stop(task_id, force=True)
|
||||
self._session.api_client.tasks.enqueue(
|
||||
task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
|
||||
return
|
||||
elif self.max_pods_limit:
|
||||
# max pods limit hasn't been reached yet, so we can create the pod
|
||||
break
|
||||
pod_count += 1
|
||||
|
||||
labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + \
|
||||
[self._get_agent_label()]
|
||||
labels.append("clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)))
|
||||
labels.append("clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name)))
|
||||
|
||||
if self.ports_mode:
|
||||
print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
|
||||
else:
|
||||
print("Kubernetes scheduling task id={}".format(task_id))
|
||||
|
||||
kubectl_kwargs = dict(
|
||||
create_clearml_conf=create_clearml_conf,
|
||||
labels=labels,
|
||||
docker_image=container['image'],
|
||||
docker_args=container['arguments'],
|
||||
docker_bash=container.get('setup_shell_script'),
|
||||
task_id=task_id,
|
||||
queue=queue
|
||||
)
|
||||
|
||||
if self.template_dict:
|
||||
output, error = self._kubectl_apply(**kubectl_kwargs)
|
||||
else:
|
||||
output, error = self._kubectl_run(task_data=task_data, **kubectl_kwargs)
|
||||
|
||||
error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
|
||||
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
|
||||
print('kubectl output:\n{}\n{}'.format(error, output))
|
||||
if error:
|
||||
send_log = "Running kubectl encountered an error: {}".format(error)
|
||||
self.log.error(send_log)
|
||||
self.send_logs(task_id, send_log.splitlines())
|
||||
|
||||
user_props = {"k8s-queue": str(queue_name)}
|
||||
if self.ports_mode:
|
||||
user_props.update(
|
||||
{
|
||||
"k8s-pod-number": pod_number,
|
||||
"k8s-pod-label": labels[0],
|
||||
"k8s-internal-pod-count": pod_count,
|
||||
}
|
||||
)
|
||||
|
||||
if self._user_props_cb:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
custom_props = self._user_props_cb(pod_number) if self.ports_mode else self._user_props_cb()
|
||||
user_props.update(custom_props)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if user_props:
|
||||
self._set_task_user_properties(
|
||||
task_id=task_id,
|
||||
**user_props
|
||||
)
|
||||
|
||||
def _get_docker_args(self, docker_args, flags, target=None, convert=None):
|
||||
# type: (List[str], Collection[str], Optional[str], Callable[[str], Any]) -> Union[dict, List[str]]
|
||||
"""
|
||||
Get docker args matching specific flags.
|
||||
|
||||
:argument docker_args: List of docker argument strings (flags and values)
|
||||
:argument flags: List of flags/names to intercept (e.g. "--env" etc.)
|
||||
:argument target: Controls return format. If provided, returns a dict with a target field containing a list
|
||||
of result strings, otherwise returns a list of result strings
|
||||
:argument convert: Optional conversion function for each result string
|
||||
"""
|
||||
args = docker_args[:] if docker_args else []
|
||||
results = []
|
||||
while args:
|
||||
cmd = args.pop(0).strip()
|
||||
if cmd in flags:
|
||||
env = args.pop(0).strip()
|
||||
if convert:
|
||||
env = convert(env)
|
||||
results.append(env)
|
||||
else:
|
||||
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
|
||||
if target:
|
||||
return {target: results} if results else {}
|
||||
return results
|
||||
|
||||
def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, docker_bash, labels, queue, task_id):
|
||||
template = deepcopy(self.template_dict)
|
||||
template.setdefault('apiVersion', 'v1')
|
||||
template['kind'] = 'Pod'
|
||||
template.setdefault('metadata', {})
|
||||
name = 'clearml-id-{task_id}'.format(task_id=task_id)
|
||||
template['metadata']['name'] = name
|
||||
template.setdefault('spec', {})
|
||||
template['spec'].setdefault('containers', [])
|
||||
template['spec'].setdefault('restartPolicy', 'Never')
|
||||
if labels:
|
||||
labels_dict = dict(pair.split('=', 1) for pair in labels)
|
||||
template['metadata'].setdefault('labels', {})
|
||||
template['metadata']['labels'].update(labels_dict)
|
||||
|
||||
container = self._get_docker_args(
|
||||
docker_args,
|
||||
target="env",
|
||||
flags={"-e", "--env"},
|
||||
convert=lambda env: {'name': env.partition("=")[0], 'value': env.partition("=")[2]},
|
||||
)
|
||||
|
||||
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
|
||||
else self.container_bash_script
|
||||
|
||||
extra_docker_bash_script = '\n'.join(self._session.config.get("agent.extra_docker_shell_script", None) or [])
|
||||
if docker_bash:
|
||||
extra_docker_bash_script += '\n' + str(docker_bash) + '\n'
|
||||
|
||||
script_encoded = '\n'.join(
|
||||
['#!/bin/bash', ] +
|
||||
[line.format(extra_bash_init_cmd=self.extra_bash_init_script or '',
|
||||
task_id=task_id,
|
||||
extra_docker_bash_script=extra_docker_bash_script)
|
||||
for line in container_bash_script])
|
||||
|
||||
create_init_script = \
|
||||
"echo '{}' | base64 --decode >> ~/__start_agent__.sh ; " \
|
||||
"/bin/bash ~/__start_agent__.sh".format(
|
||||
base64.b64encode(
|
||||
script_encoded.encode('ascii')
|
||||
).decode('ascii'))
|
||||
|
||||
# Notice: we always leave with exit code 0, so pods are never restarted
|
||||
container = self._merge_containers(
|
||||
container,
|
||||
dict(name=name, image=docker_image,
|
||||
command=['/bin/bash'],
|
||||
args=['-c', '{} ; {} ; exit 0'.format(create_clearml_conf, create_init_script)])
|
||||
)
|
||||
|
||||
if template['spec']['containers']:
|
||||
template['spec']['containers'][0] = self._merge_containers(template['spec']['containers'][0], container)
|
||||
else:
|
||||
template['spec']['containers'].append(container)
|
||||
|
||||
if self._docker_force_pull:
|
||||
for c in template['spec']['containers']:
|
||||
c.setdefault('imagePullPolicy', 'Always')
|
||||
|
||||
fp, yaml_file = tempfile.mkstemp(prefix='clearml_k8stmpl_', suffix='.yml')
|
||||
os.close(fp)
|
||||
with open(yaml_file, 'wt') as f:
|
||||
yaml.dump(template, f)
|
||||
|
||||
kubectl_cmd = self.KUBECTL_APPLY_CMD.format(
|
||||
task_id=task_id,
|
||||
docker_image=docker_image,
|
||||
queue_id=queue,
|
||||
namespace=self.namespace
|
||||
)
|
||||
# make sure we provide a list
|
||||
if isinstance(kubectl_cmd, str):
|
||||
kubectl_cmd = kubectl_cmd.split()
|
||||
|
||||
# add the template file at the end
|
||||
kubectl_cmd += [yaml_file]
|
||||
try:
|
||||
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
except Exception as ex:
|
||||
return None, str(ex)
|
||||
finally:
|
||||
safe_remove_file(yaml_file)
|
||||
|
||||
return output, error
|
||||
|
||||
def _kubectl_run(
|
||||
self, create_clearml_conf, docker_image, docker_args, docker_bash, labels, queue, task_data, task_id
|
||||
):
|
||||
if callable(self.kubectl_cmd):
|
||||
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, docker_args, queue, task_data)
|
||||
else:
|
||||
kubectl_cmd = self.kubectl_cmd.format(
|
||||
task_id=task_id,
|
||||
docker_image=docker_image,
|
||||
docker_args=" ".join(self._get_docker_args(
|
||||
docker_args, flags={"-e", "--env"}, convert=lambda env: '--env={}'.format(env))
|
||||
),
|
||||
queue_id=queue,
|
||||
namespace=self.namespace,
|
||||
)
|
||||
# make sure we provide a list
|
||||
if isinstance(kubectl_cmd, str):
|
||||
kubectl_cmd = kubectl_cmd.split()
|
||||
|
||||
if self.overrides_json_string:
|
||||
kubectl_cmd += ['--overrides=' + self.overrides_json_string]
|
||||
|
||||
if self.pod_limits:
|
||||
kubectl_cmd += ['--limits', ",".join(self.pod_limits)]
|
||||
if self.pod_requests:
|
||||
kubectl_cmd += ['--requests', ",".join(self.pod_requests)]
|
||||
|
||||
if self._docker_force_pull and not any(x.startswith("--image-pull-policy=") for x in kubectl_cmd):
|
||||
kubectl_cmd += ["--image-pull-policy='always'"]
|
||||
|
||||
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
|
||||
else self.container_bash_script
|
||||
container_bash_script = ' ; '.join(container_bash_script)
|
||||
|
||||
kubectl_cmd += [
|
||||
"--labels=" + ",".join(labels),
|
||||
"--command",
|
||||
"--",
|
||||
"/bin/sh",
|
||||
"-c",
|
||||
"{} ; {}".format(create_clearml_conf, container_bash_script.format(
|
||||
extra_bash_init_cmd=self.extra_bash_init_script or "",
|
||||
extra_docker_bash_script=docker_bash or "",
|
||||
task_id=task_id
|
||||
)),
|
||||
]
|
||||
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, error = process.communicate()
|
||||
return output, error
|
||||
|
||||
def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
|
||||
"""
|
||||
:summary: Pull and run tasks from queues.
|
||||
:description: 1. Go through ``queues`` by order.
|
||||
2. Try getting the next task for each and run the first one that returns.
|
||||
3. Go to step 1
|
||||
:param queues: IDs of queues to pull tasks from
|
||||
:type queues: list of ``Text``
|
||||
:param worker_params: Worker command line arguments
|
||||
:type worker_params: ``clearml_agent.helper.process.WorkerParams``
|
||||
"""
|
||||
events_service = self.get_service(Events)
|
||||
|
||||
# make sure we have a k8s pending queue
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._session.api_client.queues.create(self.k8s_pending_queue_name)
|
||||
except Exception:
|
||||
pass
|
||||
# get queue id
|
||||
self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")
|
||||
|
||||
_last_machine_update_ts = 0
|
||||
while True:
|
||||
# check if we have a pod limit, then check if we hit it.
|
||||
if self.max_pods_limit:
|
||||
current_pods = self._get_number_used_pods()
|
||||
if current_pods >= self.max_pods_limit:
|
||||
print("Maximum pod limit reached {}/{}, sleeping for {:.1f} seconds".format(
|
||||
current_pods, self.max_pods_limit, self._polling_interval))
|
||||
# delete old completed / failed pods
|
||||
get_bash_output(
|
||||
self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self._get_agent_label())
|
||||
)
|
||||
# go to sleep
|
||||
sleep(self._polling_interval)
|
||||
continue
|
||||
|
||||
# iterate over queues (priority style, queues[0] is highest)
|
||||
for queue in queues:
|
||||
# delete old completed / failed pods
|
||||
get_bash_output(
|
||||
self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self._get_agent_label())
|
||||
)
|
||||
|
||||
# get next task in queue
|
||||
try:
|
||||
response = self._session.api_client.queues.get_next_task(queue=queue)
|
||||
except Exception as e:
|
||||
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
|
||||
continue
|
||||
else:
|
||||
try:
|
||||
task_id = response.entry.task
|
||||
except AttributeError:
|
||||
print("No tasks in queue {}".format(queue))
|
||||
continue
|
||||
events_service.send_log_events(
|
||||
self.worker_id,
|
||||
task_id=task_id,
|
||||
lines="task {} pulled from {} by worker {}".format(
|
||||
task_id, queue, self.worker_id
|
||||
),
|
||||
level="INFO",
|
||||
)
|
||||
|
||||
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
|
||||
self.run_one_task(queue, task_id, worker_params)
|
||||
self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
|
||||
break
|
||||
else:
|
||||
# sleep and retry polling
|
||||
print("No tasks in Queues, sleeping for {:.1f} seconds".format(self._polling_interval))
|
||||
sleep(self._polling_interval)
|
||||
|
||||
if self._session.config["agent.reload_config"]:
|
||||
self.reload_config()
|
||||
|
||||
def k8s_daemon(self, queue):
|
||||
"""
|
||||
Start the k8s Glue service.
|
||||
This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
|
||||
Notice all scheduled tasks are pushed back into K8S_PENDING_QUEUE,
|
||||
and popped when execution actually starts. This creates full visibility into the k8s scheduler.
|
||||
Manually popping a task from the K8S_PENDING_QUEUE will cause the k8s scheduler
to skip its execution once the scheduled task needs to be executed
|
||||
|
||||
:param str queue: queue name to pull from
|
||||
"""
|
||||
return self.daemon(queues=[ObjectID(name=queue)] if queue else None,
|
||||
log_level=logging.INFO, foreground=True, docker=False)
|
||||
|
||||
@classmethod
|
||||
def get_ssh_server_bash(cls, ssh_port_number):
|
||||
return ' ; '.join(line.format(port=ssh_port_number) for line in cls.BASH_INSTALL_SSH_CMD)
|
||||
|
||||
@staticmethod
|
||||
def _merge_containers(c1, c2):
|
||||
def merge_env(k, d1, d2, not_set):
|
||||
if k != "env":
|
||||
return not_set
|
||||
# Merge environment lists, second list overrides first
|
||||
return list({
|
||||
item['name']: item for envs in (d1, d2) for item in envs
|
||||
}.values())
|
||||
|
||||
return merge_dicts(
|
||||
c1, c2, custom_merge_func=merge_env
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _safe_k8s_label_value(value):
|
||||
""" Conform string to k8s standards for a label value """
|
||||
value = value.lower().strip()
|
||||
value = re.sub(r'^[^A-Za-z0-9]+', '', value) # strip leading non-alphanumeric chars
|
||||
value = re.sub(r'[^A-Za-z0-9]+$', '', value) # strip trailing non-alphanumeric chars
|
||||
value = re.sub(r'\W+', '-', value)  # allow only word chars (this also drops ".", which k8s supports, but that is acceptable)
|
||||
value = re.sub(r'-+', '-', value) # don't leave messy "--" after replacing previous chars
|
||||
return value[:63]
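A minimal sketch of wiring this glue layer into a daemon, assuming a configured ClearML environment; the queue name, template path, and conf path are placeholders:

```python
from clearml_agent.glue.k8s import K8sIntegration

k8s = K8sIntegration(
    ports_mode=False,
    namespace="clearml",                          # k8s namespace for the spawned pods
    template_yaml="/path/to/pod-template.yaml",   # optional: schedule with `kubectl apply`
    clearml_conf_file="/path/to/clearml.conf",    # optional: conf content injected into each pod
    max_pods_limit=10,
)

# Pull tasks from a single queue and schedule each one as a pod
k8s.k8s_daemon(queue="default")
```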
|
||||
@@ -1,4 +1,4 @@
|
||||
""" TRAINS-AGENT Stdout Helper Functions """
|
||||
""" CLEARML-AGENT Stdout Helper Functions """
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import io
|
||||
@@ -24,12 +24,11 @@ import pyhocon
|
||||
import yaml
|
||||
from attr import fields_dict
|
||||
from pathlib2 import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
import six
|
||||
from six.moves import reduce
|
||||
from trains_agent.errors import CommandFailedError
|
||||
from trains_agent.helper.dicts import filter_keys
|
||||
from clearml_agent.errors import CommandFailedError
|
||||
from clearml_agent.helper.dicts import filter_keys
|
||||
|
||||
pretty_lines = False
|
||||
|
||||
@@ -173,28 +172,49 @@ def normalize_path(*paths):
|
||||
|
||||
|
||||
def safe_remove_file(filename, error_message=None):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
os.remove(filename)
|
||||
if filename:
|
||||
os.remove(filename)
|
||||
except Exception:
|
||||
if error_message:
|
||||
print(error_message)
|
||||
|
||||
|
||||
def get_python_path(script_dir, entry_point, package_api):
|
||||
def safe_remove_tree(filename):
|
||||
if not filename:
|
||||
return
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
shutil.rmtree(filename, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
os.remove(filename)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def get_python_path(script_dir, entry_point, package_api, is_conda_env=False):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
python_path_sep = ';' if is_windows_platform() else ':'
|
||||
python_path_cmd = package_api.get_python_command(
|
||||
["-c", "import sys; print('{}'.join(sys.path))".format(python_path_sep)])
|
||||
org_python_path = python_path_cmd.get_output(cwd=script_dir)
|
||||
# Add path of the script directory and executable directory
|
||||
python_path = '{}{python_path_sep}{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(),
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
if is_windows_platform():
|
||||
return python_path.replace('/', '\\') + org_python_path
|
||||
python_path = '{}{python_path_sep}'.format(
|
||||
Path(script_dir).absolute().as_posix(), python_path_sep=python_path_sep)
|
||||
if entry_point:
|
||||
python_path += '{}{python_path_sep}'.format(
|
||||
(Path(script_dir) / Path(entry_point)).parent.absolute().as_posix(),
|
||||
python_path_sep=python_path_sep)
|
||||
|
||||
return python_path + org_python_path
|
||||
if is_windows_platform():
|
||||
python_path = python_path.replace('/', '\\')
|
||||
|
||||
return python_path if is_conda_env else (python_path + org_python_path)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@@ -362,11 +382,11 @@ AllDumper.add_multi_representer(object, lambda dumper, data: dumper.represent_st
|
||||
|
||||
|
||||
def error(message):
|
||||
print('\ntrains_agent: ERROR: {}\n'.format(message))
|
||||
print('\nclearml_agent: ERROR: {}\n'.format(message))
|
||||
|
||||
|
||||
def warning(message):
|
||||
print('trains_agent: Warning: {}'.format(message))
|
||||
print('clearml_agent: Warning: {}'.format(message))
|
||||
|
||||
|
||||
class TqdmStream(object):
|
||||
@@ -381,12 +401,6 @@ class TqdmStream(object):
|
||||
self.buffer.write('\n')
|
||||
|
||||
|
||||
class TqdmLog(tqdm):
|
||||
|
||||
def __init__(self, iterable=None, file=None, **kwargs):
|
||||
super(TqdmLog, self).__init__(iterable, file=TqdmStream(file or sys.stderr), **kwargs)
|
||||
|
||||
|
||||
def url_join(first, *rest):
|
||||
"""
|
||||
Join url parts similarly to Path.join
|
||||
@@ -442,9 +456,9 @@ def chain_map(*args):
|
||||
return reduce(lambda x, y: x.update(y) or x, args, {})
|
||||
|
||||
|
||||
def check_directory_path(path):
|
||||
def check_directory_path(path, check_whitespace_in_path=True):
|
||||
message = 'Could not create directory "{}": {}'
|
||||
if not is_windows_platform():
|
||||
if not is_windows_platform() and check_whitespace_in_path:
|
||||
match = re.search(r'\s', path)
|
||||
if match:
|
||||
raise CommandFailedError(
|
||||
@@ -492,6 +506,38 @@ def is_conda(config):
|
||||
return config['agent.package_manager.type'].lower() == 'conda'
|
||||
|
||||
|
||||
def convert_cuda_version_to_float_single_digit_str(cuda_version):
|
||||
"""
|
||||
Convert a cuda_version (string/float/int) into a float representation, e.g. 11.4
|
||||
Notice returns String Single digit only!
|
||||
:return str:
|
||||
"""
|
||||
cuda_version = str(cuda_version or 0)
|
||||
# if we have patch version we parse it here
|
||||
cuda_version_parts = [int(v) for v in cuda_version.split('.')]
|
||||
if len(cuda_version_parts) > 1 or cuda_version_parts[0] < 60:
|
||||
cuda_version = 10 * cuda_version_parts[0]
|
||||
if len(cuda_version_parts) > 1:
|
||||
cuda_version += float(".{:d}".format(cuda_version_parts[1]))*10
|
||||
|
||||
cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
|
||||
else:
|
||||
cuda_version = cuda_version_parts[0]
|
||||
cuda_version_full = "{:.1f}".format(float(cuda_version) / 10.)
|
||||
|
||||
return cuda_version_full
|
||||
|
||||
|
||||
def convert_cuda_version_to_int_10_base_str(cuda_version):
|
||||
"""
|
||||
Convert a cuda_version (string/float/int) into an integer version, e.g. 112 for cuda 11.2
|
||||
Return string
|
||||
:return str:
|
||||
"""
|
||||
cuda_version = convert_cuda_version_to_float_single_digit_str(cuda_version)
|
||||
return str(int(float(cuda_version)*10))
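For reference, a couple of conversions these helpers produce (module path assumed to be clearml_agent.helper.base, as in this diff):

```python
from clearml_agent.helper.base import (
    convert_cuda_version_to_float_single_digit_str,
    convert_cuda_version_to_int_10_base_str,
)

print(convert_cuda_version_to_float_single_digit_str("11.5"))  # '11.5'
print(convert_cuda_version_to_float_single_digit_str(115))     # '11.5'
print(convert_cuda_version_to_int_10_base_str("11.5"))         # '115'
```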
|
||||
|
||||
|
||||
class NonStrictAttrs(object):
|
||||
|
||||
@classmethod
|
||||
@@ -537,6 +583,7 @@ class ExecutionInfo(NonStrictAttrs):
|
||||
branch = nullable_string
|
||||
version_num = nullable_string
|
||||
tag = nullable_string
|
||||
docker_cmd = nullable_string
|
||||
|
||||
@classmethod
|
||||
def from_task(cls, task_info):
|
||||
@@ -554,4 +601,24 @@ class ExecutionInfo(NonStrictAttrs):
|
||||
execution.entry_point = entry_point
|
||||
execution.working_dir = working_dir or ""
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
execution.docker_cmd = task_info.execution.docker_cmd
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return execution
|
||||
|
||||
|
||||
class safe_furl(furl.furl):
|
||||
|
||||
@property
|
||||
def port(self):
|
||||
return self._port
|
||||
|
||||
@port.setter
|
||||
def port(self, port):
|
||||
"""
|
||||
Any port value is valid
|
||||
"""
|
||||
self._port = port
|
||||
@@ -21,14 +21,14 @@ def start_check_update_daemon():
|
||||
|
||||
def _check_new_version_available():
|
||||
cur_version = __version__
|
||||
update_server_releases = requests.get('https://updates.trains.allegro.ai/updates',
|
||||
data=json.dumps({"versions": {"trains-agent": str(cur_version)}}),
|
||||
update_server_releases = requests.get('https://updates.clear.ml/updates',
|
||||
data=json.dumps({"versions": {"clearml-agent": str(cur_version)}}),
|
||||
timeout=3.0)
|
||||
if update_server_releases.ok:
|
||||
update_server_releases = update_server_releases.json()
|
||||
else:
|
||||
return None
|
||||
trains_answer = update_server_releases.get("trains-agent", {})
|
||||
trains_answer = update_server_releases.get("clearml-agent", {})
|
||||
latest_version = trains_answer.get("version")
|
||||
cur_version = cur_version
|
||||
latest_version = latest_version or ''
|
||||
@@ -48,7 +48,7 @@ def _check_update_daemon():
|
||||
if latest_version:
|
||||
if latest_version[1]:
|
||||
sep = os.linesep
|
||||
print('TRAINS-AGENT new package available: UPGRADE to v{} is recommended!\nRelease Notes:\n{}'.format(
|
||||
print('CLEARML-AGENT new package available: UPGRADE to v{} is recommended!\nRelease Notes:\n{}'.format(
|
||||
latest_version[0], sep.join(latest_version[2])))
|
||||
else:
|
||||
print('TRAINS-SERVER new version available: upgrade to v{} is recommended!'.format(
|
||||
@@ -2,14 +2,14 @@ from __future__ import unicode_literals, print_function
|
||||
|
||||
import csv
|
||||
import sys
|
||||
from collections import Iterable
|
||||
from collections.abc import Iterable
|
||||
from typing import List, Dict, Text, Any
|
||||
|
||||
from attr import attrs, attrib
|
||||
|
||||
import six
|
||||
from six import binary_type, text_type
|
||||
from trains_agent.helper.base import nonstrict_in_place_sort, create_tree
|
||||
from clearml_agent.helper.base import nonstrict_in_place_sort
|
||||
|
||||
|
||||
def print_text(text, newline=True):
|
||||
@@ -22,15 +22,21 @@ def print_text(text, newline=True):
|
||||
sys.stdout.write(data)
|
||||
|
||||
|
||||
def decode_binary_lines(binary_lines, encoding='utf-8'):
|
||||
def decode_binary_lines(binary_lines, encoding='utf-8', replace_cr=False, overwrite_cr=False):
|
||||
# decode per line, if we failed decoding skip the line
|
||||
lines = []
|
||||
for b in binary_lines:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
l = b.decode(encoding=encoding, errors='replace').replace('\r', '\n')
|
||||
except:
|
||||
l = ''
|
||||
lines.append(l + '\n' if l and l[-1] != '\n' else l)
|
||||
line = b.decode(encoding=encoding, errors='replace')
|
||||
if replace_cr:
|
||||
line = line.replace('\r', '\n')
|
||||
elif overwrite_cr:
|
||||
cr_lines = line.split('\r')
|
||||
line = cr_lines[-1] if cr_lines[-1] or len(cr_lines) < 2 else cr_lines[-2]
|
||||
except Exception:
|
||||
line = ''
|
||||
lines.append(line + '\n' if not line or line[-1] != '\n' else line)
|
||||
return lines
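To illustrate the difference between the two new flags (module path assumed from this diff):

```python
from clearml_agent.helper.console import decode_binary_lines

raw = [b"progress 10%\rprogress 90%\r"]

decode_binary_lines(raw, replace_cr=True)
# -> ['progress 10%\nprogress 90%\n']  every carriage return becomes a newline

decode_binary_lines(raw, overwrite_cr=True)
# -> ['progress 90%\n']  only the last overwritten status line is kept
```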
|
||||
|
||||
|
||||
23
clearml_agent/helper/dicts.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
|
||||
_not_set = object()
|
||||
|
||||
|
||||
def filter_keys(filter_, dct): # type: (Callable[[Any], bool], Dict) -> Dict
|
||||
return {key: value for key, value in dct.items() if filter_(key)}
|
||||
|
||||
|
||||
def merge_dicts(dict1, dict2, custom_merge_func=None):
|
||||
# type: (Any, Any, Optional[Callable[[str, Any, Any, Any], Any]]) -> Any
|
||||
""" Recursively merges dict2 into dict1 """
|
||||
if not isinstance(dict1, dict) or not isinstance(dict2, dict):
|
||||
return dict2
|
||||
for k in dict2:
|
||||
if k in dict1:
|
||||
res = None
|
||||
if custom_merge_func:
|
||||
res = custom_merge_func(k, dict1[k], dict2[k], _not_set)
|
||||
dict1[k] = merge_dicts(dict1[k], dict2[k], custom_merge_func) if res is _not_set else res
|
||||
else:
|
||||
dict1[k] = dict2[k]
|
||||
return dict1
|
||||
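A small usage sketch of the helpers above; the config-like values are made up for illustration:

# Illustrative example of merge_dicts() and filter_keys() from the file above.
base = {"agent": {"venvs_cache": {"max_entries": 5}}, "api": {"verify": True}}
override = {"agent": {"venvs_cache": {"path": "~/.clearml/venvs-cache"}}}  # hypothetical value

merged = merge_dicts(base, override)
# merged["agent"]["venvs_cache"] now contains both "max_entries" and "path"

public = filter_keys(lambda k: not k.startswith("_"), {"_private": 1, "public": 2})
# public == {"public": 2}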
@@ -20,6 +20,7 @@ import platform
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import psutil
|
||||
from ..gpu import pynvml as N
|
||||
@@ -200,24 +201,30 @@ class GPUStatCollection(object):
|
||||
GPUStatCollection.global_processes[nv_process.pid] = \
|
||||
psutil.Process(pid=nv_process.pid)
|
||||
ps_process = GPUStatCollection.global_processes[nv_process.pid]
|
||||
process['username'] = ps_process.username()
|
||||
# cmdline returns full path;
|
||||
# as in `ps -o comm`, get short cmdnames.
|
||||
_cmdline = ps_process.cmdline()
|
||||
if not _cmdline:
|
||||
# sometimes, zombie or unknown (e.g. [kworker/8:2H])
|
||||
process['command'] = '?'
|
||||
process['full_command'] = ['?']
|
||||
else:
|
||||
process['command'] = os.path.basename(_cmdline[0])
|
||||
process['full_command'] = _cmdline
|
||||
# Bytes to MBytes
|
||||
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
|
||||
process['cpu_percent'] = ps_process.cpu_percent()
|
||||
process['cpu_memory_usage'] = \
|
||||
round((ps_process.memory_percent() / 100.0) *
|
||||
psutil.virtual_memory().total)
|
||||
process['pid'] = nv_process.pid
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
# we do not actually use these, so no point in collecting them
|
||||
# process['username'] = ps_process.username()
|
||||
# # cmdline returns full path;
|
||||
# # as in `ps -o comm`, get short cmdnames.
|
||||
# _cmdline = ps_process.cmdline()
|
||||
# if not _cmdline:
|
||||
# # sometimes, zombie or unknown (e.g. [kworker/8:2H])
|
||||
# process['command'] = '?'
|
||||
# process['full_command'] = ['?']
|
||||
# else:
|
||||
# process['command'] = os.path.basename(_cmdline[0])
|
||||
# process['full_command'] = _cmdline
|
||||
# process['cpu_percent'] = ps_process.cpu_percent()
|
||||
# process['cpu_memory_usage'] = \
|
||||
# round((ps_process.memory_percent() / 100.0) *
|
||||
# psutil.virtual_memory().total)
|
||||
# Bytes to MBytes
|
||||
process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
|
||||
except Exception:
|
||||
# insufficient permissions
|
||||
pass
|
||||
return process
|
||||
|
||||
if not GPUStatCollection._gpu_device_info.get(index):
|
||||
@@ -285,12 +292,13 @@ class GPUStatCollection(object):
|
||||
# e.g. nvidia-smi reset or reboot the system
|
||||
pass
|
||||
|
||||
# TODO: Do not block if full process info is not requested
|
||||
time.sleep(0.1)
|
||||
for process in processes:
|
||||
pid = process['pid']
|
||||
cache_process = GPUStatCollection.global_processes[pid]
|
||||
process['cpu_percent'] = cache_process.cpu_percent()
|
||||
# we do not actually use these, so no point in collecting them
|
||||
# # TODO: Do not block if full process info is not requested
|
||||
# time.sleep(0.1)
|
||||
# for process in processes:
|
||||
# pid = process['pid']
|
||||
# cache_process = GPUStatCollection.global_processes[pid]
|
||||
# process['cpu_percent'] = cache_process.cpu_percent()
|
||||
|
||||
index = N.nvmlDeviceGetIndex(handle)
|
||||
gpu_info = {
|
||||
@@ -383,3 +391,38 @@ def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||
'''
|
||||
return GPUStatCollection.new_query(shutdown=shutdown, per_process_stats=per_process_stats,
|
||||
get_driver_info=get_driver_info)
|
||||
|
||||
|
||||
def get_driver_cuda_version():
|
||||
# type: () -> Optional[str]
|
||||
"""
|
||||
:return: Detected CUDA version from the driver. On failure the return value is None.
|
||||
Example: `110` is cuda version 11.0
|
||||
"""
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
N.nvmlInit()
|
||||
except BaseException:
|
||||
return None
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = str(N.nvmlSystemGetCudaDriverVersion())
|
||||
except BaseException:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = str(N.nvmlSystemGetCudaDriverVersion_v2())
|
||||
except BaseException:
|
||||
cuda_version = ''
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
N.nvmlShutdown()
|
||||
except BaseException:
|
||||
return None
|
||||
|
||||
# for some reason we get CUDA version 11020 instead of 11200, so this is the fix
|
||||
if cuda_version and len(cuda_version) >= 4 and cuda_version[2] == '0' and cuda_version[3] != '0':
|
||||
return cuda_version[:2]+cuda_version[3]
|
||||
|
||||
return cuda_version[:3] if cuda_version else None
|
||||
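A short sketch of the version-string fix described in the comment above (a driver reporting 11020 where 11200 was expected); the helper name below is hypothetical and only mirrors the logic at the end of get_driver_cuda_version():

# Sketch of the normalization applied at the end of get_driver_cuda_version() above.
def _normalize_cuda_version(cuda_version):
    # '11020' -> '112' (CUDA 11.2), '11200' -> '112', '10010' -> '101' (CUDA 10.1)
    if cuda_version and len(cuda_version) >= 4 and cuda_version[2] == '0' and cuda_version[3] != '0':
        return cuda_version[:2] + cuda_version[3]
    return cuda_version[:3] if cuda_version else None

assert _normalize_cuda_version('11020') == '112'
assert _normalize_cuda_version('11200') == '112'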
clearml_agent/helper/gpu/pynvml.py (new file, 3735 lines; file diff suppressed because it is too large)
clearml_agent/helper/os/folder_cache.py (new file, 225 lines)
@@ -0,0 +1,225 @@
|
||||
import os
|
||||
import shutil
|
||||
from logging import warning
|
||||
from random import random
|
||||
from time import time
|
||||
from typing import List, Optional, Sequence
|
||||
|
||||
import psutil
|
||||
from pathlib2 import Path
|
||||
|
||||
from .locks import FileLock
|
||||
|
||||
|
||||
class FolderCache(object):
|
||||
_lock_filename = '.clearml.lock'
|
||||
_lock_timeout_seconds = 30
|
||||
_temp_entry_prefix = '_temp.'
|
||||
|
||||
def __init__(self, cache_folder, max_cache_entries=5, min_free_space_gb=None):
|
||||
self._cache_folder = Path(os.path.expandvars(cache_folder)).expanduser().absolute()
|
||||
self._cache_folder.mkdir(parents=True, exist_ok=True)
|
||||
self._max_cache_entries = max_cache_entries
|
||||
self._last_copied_entry_folder = None
|
||||
self._min_free_space_gb = min_free_space_gb if min_free_space_gb and min_free_space_gb > 0 else None
|
||||
self._lock = FileLock((self._cache_folder / self._lock_filename).as_posix())
|
||||
|
||||
def get_cache_folder(self):
|
||||
# type: () -> Path
|
||||
"""
|
||||
:return: Return the base cache folder
|
||||
"""
|
||||
return self._cache_folder
|
||||
|
||||
def copy_cached_entry(self, keys, destination):
|
||||
# type: (List[str], Path) -> Optional[Path]
|
||||
"""
|
||||
Copy a cached entry into a destination directory, if the cached entry does not exist return None
|
||||
:param keys:
|
||||
:param destination:
|
||||
:return: Target path, None if cached entry does not exist
|
||||
"""
|
||||
self._last_copied_entry_folder = None
|
||||
if not keys:
|
||||
return None
|
||||
|
||||
# lock so we make sure no one deletes it before we copy it
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout_seconds)
|
||||
except BaseException as ex:
|
||||
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
|
||||
return None
|
||||
|
||||
src = None
|
||||
try:
|
||||
src = self.get_entry(keys)
|
||||
if src:
|
||||
destination = Path(destination).absolute()
|
||||
destination.mkdir(parents=True, exist_ok=True)
|
||||
shutil.rmtree(destination.as_posix())
|
||||
shutil.copytree(src.as_posix(), dst=destination.as_posix(), symlinks=True)
|
||||
except BaseException as ex:
|
||||
warning('Could not copy cache folder {} to {}: {}'.format(src, destination, ex))
|
||||
self._lock.release()
|
||||
return None
|
||||
|
||||
# release Lock
|
||||
self._lock.release()
|
||||
|
||||
self._last_copied_entry_folder = src
|
||||
return destination if src else None
|
||||
|
||||
def get_entry(self, keys):
|
||||
# type: (List[str]) -> Optional[Path]
|
||||
"""
|
||||
Return a folder (a sub-folder inside the cache_folder) matching one of the keys
:param keys: List of keys; return the first match to one of the keys. Note that keys cannot contain '.'
|
||||
:return: Path to the sub-folder or None if none was found
|
||||
"""
|
||||
if not keys:
|
||||
return None
|
||||
# conform keys
|
||||
keys = [keys] if isinstance(keys, str) else keys
|
||||
keys = sorted([k.replace('.', '_') for k in keys])
|
||||
for cache_folder in self._cache_folder.glob('*'):
|
||||
if cache_folder.is_dir() and any(True for k in cache_folder.name.split('.') if k in keys):
|
||||
cache_folder.touch()
|
||||
return cache_folder
|
||||
return None
|
||||
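As the docstring notes, keys cannot contain '.', since '.' joins the keys into the entry folder name; a brief sketch of the naming convention with hypothetical key values:

# Sketch of the cache-entry naming convention used by get_entry()/add_entry() above.
keys = ["py3.6", "cuda110"]                              # requested keys (hypothetical)
conformed = sorted(k.replace('.', '_') for k in keys)    # ['cuda110', 'py3_6']
entry_folder_name = '.'.join(conformed)                  # 'cuda110.py3_6'
# get_entry() matches a folder if any of its '.'-separated parts equals one of the keys.
assert any(part in conformed for part in entry_folder_name.split('.'))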
|
||||
def add_entry(self, keys, source_folder, exclude_sub_folders=None):
|
||||
# type: (List[str], Path, Optional[Sequence[str]]) -> bool
|
||||
"""
|
||||
Add a local folder into the cache, copy all sub-folders inside `source_folder`
|
||||
excluding folders matching `exclude_sub_folders` list
|
||||
:param keys: Cache entry keys list (str)
|
||||
:param source_folder: Folder to copy into the cache
|
||||
:param exclude_sub_folders: List of sub-folders to exclude from the copy operation
|
||||
:return: return True if a new entry was added to the cache
|
||||
"""
|
||||
if not keys:
|
||||
return False
|
||||
|
||||
keys = [keys] if isinstance(keys, str) else keys
|
||||
keys = sorted([k.replace('.', '_') for k in keys])
|
||||
|
||||
# If entry already exists skip it
|
||||
cached_entry = self.get_entry(keys)
|
||||
if cached_entry:
|
||||
# make sure the entry contains all keys
|
||||
cached_keys = cached_entry.name.split('.')
|
||||
if set(keys) - set(cached_keys):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout_seconds)
|
||||
except BaseException as ex:
|
||||
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
|
||||
# locking failed, do nothing
|
||||
return True
|
||||
keys = sorted(list(set(keys) | set(cached_keys)))
|
||||
dst = cached_entry.parent / '.'.join(keys)
|
||||
# rename
|
||||
try:
|
||||
shutil.move(src=cached_entry.as_posix(), dst=dst.as_posix())
|
||||
except BaseException as ex:
|
||||
warning('Could not rename cache entry {} to {}: {}'.format(
cached_entry.as_posix(), dst.as_posix(), ex))
|
||||
# release lock
|
||||
self._lock.release()
|
||||
return True
|
||||
|
||||
# make sure we remove old entries
|
||||
self._remove_old_entries()
|
||||
|
||||
# if we do not have enough free space, do nothing.
|
||||
if not self._check_min_free_space():
|
||||
warning('Could not add cache entry, not enough free space on drive, '
|
||||
'free space threshold {} GB. Clearing all cache entries!'.format(self._min_free_space_gb))
|
||||
self._remove_old_entries(max_cache_entries=0)
|
||||
return False
|
||||
|
||||
# create the new entry for us
|
||||
exclude_sub_folders = exclude_sub_folders or []
|
||||
source_folder = Path(source_folder).absolute()
|
||||
# create temp folder
|
||||
temp_folder = \
|
||||
self._temp_entry_prefix + \
|
||||
'{}.{}'.format(str(time()).replace('.', '_'), str(random()).replace('.', '_'))
|
||||
temp_folder = self._cache_folder / temp_folder
|
||||
temp_folder.mkdir(parents=True, exist_ok=False)
|
||||
|
||||
for f in source_folder.glob('*'):
|
||||
if f.name in exclude_sub_folders:
|
||||
continue
|
||||
if f.is_dir():
|
||||
shutil.copytree(
|
||||
src=f.as_posix(), dst=(temp_folder / f.name).as_posix(),
|
||||
symlinks=True, ignore_dangling_symlinks=True)
|
||||
else:
|
||||
shutil.copy(
|
||||
src=f.as_posix(), dst=(temp_folder / f.name).as_posix(),
|
||||
follow_symlinks=False)
|
||||
|
||||
# rename the target folder
|
||||
target_cache_folder = self._cache_folder / '.'.join(keys)
|
||||
# if the move failed it means someone else created the cached entry before us, so we can just leave it
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
shutil.move(src=temp_folder.as_posix(), dst=target_cache_folder.as_posix())
|
||||
except BaseException:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
shutil.rmtree(path=temp_folder.as_posix())
|
||||
except BaseException:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_last_copied_entry(self):
|
||||
# type: () -> Optional[Path]
|
||||
"""
|
||||
:return: the last copied cached entry folder inside the cache
|
||||
"""
|
||||
return self._last_copied_entry_folder
|
||||
|
||||
def _remove_old_entries(self, max_cache_entries=None):
|
||||
# type: (Optional[int]) -> ()
|
||||
"""
|
||||
Notice we only keep self._max_cache_entries-1, assuming we will be adding a new entry soon
|
||||
:param int max_cache_entries: if not None use instead of self._max_cache_entries
|
||||
"""
|
||||
folder_entries = [(cache_folder, cache_folder.stat().st_mtime)
|
||||
for cache_folder in self._cache_folder.glob('*')
|
||||
if cache_folder.is_dir() and not cache_folder.name.startswith(self._temp_entry_prefix)]
|
||||
folder_entries = sorted(folder_entries, key=lambda x: x[1], reverse=True)
|
||||
|
||||
# lock so we make sure no one deletes it before we copy it
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._lock.acquire(timeout=self._lock_timeout_seconds)
|
||||
except BaseException as ex:
|
||||
warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
|
||||
return
|
||||
|
||||
number_of_entries_to_keep = self._max_cache_entries - 1 \
|
||||
if max_cache_entries is None else max(0, int(max_cache_entries))
|
||||
for folder, ts in folder_entries[number_of_entries_to_keep:]:
|
||||
try:
|
||||
shutil.rmtree(folder.as_posix(), ignore_errors=True)
|
||||
except BaseException as ex:
|
||||
warning('Could not delete cache entry {}: {}'.format(folder.as_posix(), ex))
|
||||
|
||||
self._lock.release()
|
||||
|
||||
def _check_min_free_space(self):
|
||||
# type: () -> bool
|
||||
"""
|
||||
:return: return False if we hit the free space limit.
If no free space limit was provided, always return True
|
||||
"""
|
||||
if not self._min_free_space_gb or not self._cache_folder:
|
||||
return True
|
||||
free_space = float(psutil.disk_usage(self._cache_folder.as_posix()).free)
|
||||
free_space /= 2**30
|
||||
return free_space > self._min_free_space_gb
|
||||
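A minimal end-to-end usage sketch of FolderCache; the folder paths and keys below are made up:

# Illustrative FolderCache usage; folder names and keys are hypothetical.
from pathlib2 import Path

cache = FolderCache('/tmp/venv-cache', max_cache_entries=5, min_free_space_gb=2.0)

# store a prepared virtualenv under a couple of hash-like keys
cache.add_entry(keys=['abc123', 'py3_6'], source_folder=Path('/tmp/my-venv'),
                exclude_sub_folders=['__pycache__'])

# later, restore it into a fresh working directory
restored = cache.copy_cached_entry(['abc123'], Path('/tmp/task-venv'))
if restored is None:
    print('cache miss, building environment from scratch')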
clearml_agent/helper/os/locks.py (new file, 211 lines)
@@ -0,0 +1,211 @@
|
||||
import os
|
||||
import time
|
||||
import tempfile
|
||||
import contextlib
|
||||
|
||||
from .portalocker import constants, exceptions, lock, unlock
|
||||
|
||||
|
||||
current_time = getattr(time, "monotonic", time.time)
|
||||
|
||||
DEFAULT_TIMEOUT = 10 ** 8
|
||||
DEFAULT_CHECK_INTERVAL = 0.25
|
||||
LOCK_METHOD = constants.LOCK_EX | constants.LOCK_NB
|
||||
|
||||
__all__ = [
|
||||
'FileLock',
|
||||
'open_atomic',
|
||||
]
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def open_atomic(filename, binary=True):
|
||||
"""Open a file for atomic writing. Instead of locking this method allows
|
||||
you to write the entire file and move it to the actual location. Note that
|
||||
this makes the assumption that a rename is atomic on your platform which
|
||||
is generally the case but not a guarantee.
|
||||
|
||||
http://docs.python.org/library/os.html#os.rename
|
||||
|
||||
>>> filename = 'test_file.txt'
|
||||
>>> if os.path.exists(filename):
|
||||
... os.remove(filename)
|
||||
|
||||
>>> with open_atomic(filename) as fh:
|
||||
... written = fh.write(b'test')
|
||||
>>> assert os.path.exists(filename)
|
||||
>>> os.remove(filename)
|
||||
|
||||
"""
|
||||
assert not os.path.exists(filename), '%r exists' % filename
|
||||
path, name = os.path.split(filename)
|
||||
|
||||
# Create the parent directory if it doesn't exist
|
||||
if path and not os.path.isdir(path): # pragma: no cover
|
||||
os.makedirs(path)
|
||||
|
||||
temp_fh = tempfile.NamedTemporaryFile(
|
||||
mode=binary and 'wb' or 'w',
|
||||
dir=path,
|
||||
delete=False,
|
||||
)
|
||||
yield temp_fh
|
||||
temp_fh.flush()
|
||||
os.fsync(temp_fh.fileno())
|
||||
temp_fh.close()
|
||||
try:
|
||||
os.rename(temp_fh.name, filename)
|
||||
finally:
|
||||
try:
|
||||
os.remove(temp_fh.name)
|
||||
except Exception: # noqa
|
||||
pass
|
||||
|
||||
|
||||
class FileLock(object):
|
||||
|
||||
def __init__(
|
||||
self, filename, mode='a', timeout=DEFAULT_TIMEOUT,
|
||||
check_interval=DEFAULT_CHECK_INTERVAL, fail_when_locked=False,
|
||||
flags=LOCK_METHOD, **file_open_kwargs):
|
||||
"""Lock manager with build-in timeout
|
||||
|
||||
filename -- filename
|
||||
mode -- the open mode, 'a' or 'ab' should be used for writing
|
||||
truncate -- use truncate to emulate 'w' mode, None is disabled, 0 is
|
||||
truncate to 0 bytes
|
||||
timeout -- timeout when trying to acquire a lock
|
||||
check_interval -- check interval while waiting
|
||||
fail_when_locked -- after the initial lock failed, return an error
|
||||
or lock the file
|
||||
**file_open_kwargs -- The kwargs for the `open(...)` call
|
||||
|
||||
fail_when_locked is useful when multiple threads/processes can race
|
||||
when creating a file. If set to true then the system will wait until
the lock is acquired and then raise an AlreadyLocked exception.
|
||||
|
||||
Note that the file is opened first and locked later. So using 'w' as
|
||||
mode will result in truncate _BEFORE_ the lock is checked.
|
||||
"""
|
||||
|
||||
if 'w' in mode:
|
||||
truncate = True
|
||||
mode = mode.replace('w', 'a')
|
||||
else:
|
||||
truncate = False
|
||||
|
||||
self.fh = None
|
||||
self.filename = filename
|
||||
self.mode = mode
|
||||
self.truncate = truncate
|
||||
self.timeout = timeout
|
||||
self.check_interval = check_interval
|
||||
self.fail_when_locked = fail_when_locked
|
||||
self.flags = flags
|
||||
self.file_open_kwargs = file_open_kwargs
|
||||
|
||||
def acquire(
|
||||
self, timeout=None, check_interval=None, fail_when_locked=None):
|
||||
"""Acquire the locked filehandle"""
|
||||
if timeout is None:
|
||||
timeout = self.timeout
|
||||
if timeout is None:
|
||||
timeout = 0
|
||||
|
||||
if check_interval is None:
|
||||
check_interval = self.check_interval
|
||||
|
||||
if fail_when_locked is None:
|
||||
fail_when_locked = self.fail_when_locked
|
||||
|
||||
# If we already have a filehandle, return it
|
||||
fh = self.fh
|
||||
if fh:
|
||||
return fh
|
||||
|
||||
# Get a new filehandler
|
||||
fh = self._get_fh()
|
||||
try:
|
||||
# Try to lock
|
||||
fh = self._get_lock(fh)
|
||||
except exceptions.LockException as exception:
|
||||
# Try till the timeout has passed
|
||||
timeoutend = current_time() + timeout
|
||||
while timeoutend > current_time():
|
||||
# Wait a bit
|
||||
time.sleep(check_interval)
|
||||
|
||||
# Try again
|
||||
try:
|
||||
|
||||
# We already tried to get the lock
|
||||
# If fail_when_locked is true, then stop trying
|
||||
if fail_when_locked:
|
||||
raise exceptions.AlreadyLocked(exception)
|
||||
|
||||
else: # pragma: no cover
|
||||
# We've got the lock
|
||||
fh = self._get_lock(fh)
|
||||
break
|
||||
|
||||
except exceptions.LockException:
|
||||
pass
|
||||
|
||||
else:
|
||||
# We got a timeout... reraising
|
||||
raise exceptions.LockException(exception)
|
||||
|
||||
# Prepare the filehandle (truncate if needed)
|
||||
fh = self._prepare_fh(fh)
|
||||
|
||||
self.fh = fh
|
||||
return fh
|
||||
|
||||
def release(self):
|
||||
"""Releases the currently locked file handle"""
|
||||
if self.fh:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
unlock(self.fh)
|
||||
except Exception:
|
||||
pass
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self.fh.close()
|
||||
except Exception:
|
||||
pass
|
||||
self.fh = None
|
||||
|
||||
def _get_fh(self):
|
||||
"""Get a new filehandle"""
|
||||
return open(self.filename, self.mode, **self.file_open_kwargs)
|
||||
|
||||
def _get_lock(self, fh):
|
||||
"""
|
||||
Try to lock the given filehandle
|
||||
|
||||
returns LockException if it fails"""
|
||||
lock(fh, self.flags)
|
||||
return fh
|
||||
|
||||
def _prepare_fh(self, fh):
|
||||
"""
|
||||
Prepare the filehandle for usage
|
||||
|
||||
If truncate is a number, the file will be truncated to that amount of
|
||||
bytes
|
||||
"""
|
||||
if self.truncate:
|
||||
fh.seek(0)
|
||||
fh.truncate(0)
|
||||
|
||||
return fh
|
||||
|
||||
def __enter__(self):
|
||||
return self.acquire()
|
||||
|
||||
def __exit__(self, type_, value, tb):
|
||||
self.release()
|
||||
|
||||
def __delete__(self, instance): # pragma: no cover
|
||||
instance.release()
|
||||
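FileLock can be used directly or as a context manager via __enter__/__exit__; a short sketch, with an arbitrary lock file path:

# Illustrative use of FileLock defined above; the lock file path is arbitrary.
lock = FileLock('/tmp/.example.lock', timeout=30, check_interval=0.25)
try:
    lock.acquire()          # retries up to `timeout` seconds, then raises LockException
    # ... critical section: read/modify shared state on disk ...
finally:
    lock.release()

# or, equivalently, as a context manager:
with FileLock('/tmp/.example.lock', timeout=30):
    pass  # critical section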
clearml_agent/helper/os/portalocker.py (new file, 193 lines)
@@ -0,0 +1,193 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
class exceptions:
|
||||
class BaseLockException(Exception):
|
||||
# Error codes:
|
||||
LOCK_FAILED = 1
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.fh = kwargs.pop('fh', None)
|
||||
Exception.__init__(self, *args, **kwargs)
|
||||
|
||||
class LockException(BaseLockException):
|
||||
pass
|
||||
|
||||
class AlreadyLocked(BaseLockException):
|
||||
pass
|
||||
|
||||
class FileToLarge(BaseLockException):
|
||||
pass
|
||||
|
||||
|
||||
class constants:
|
||||
# The actual tests will execute the code anyhow so the following code can
|
||||
# safely be ignored from the coverage tests
|
||||
if os.name == 'nt': # pragma: no cover
|
||||
import msvcrt
|
||||
|
||||
LOCK_EX = 0x1 #: exclusive lock
|
||||
LOCK_SH = 0x2 #: shared lock
|
||||
LOCK_NB = 0x4 #: non-blocking
|
||||
LOCK_UN = msvcrt.LK_UNLCK #: unlock
|
||||
|
||||
LOCKFILE_FAIL_IMMEDIATELY = 1
|
||||
LOCKFILE_EXCLUSIVE_LOCK = 2
|
||||
|
||||
elif os.name == 'posix': # pragma: no cover
|
||||
import fcntl
|
||||
|
||||
LOCK_EX = fcntl.LOCK_EX #: exclusive lock
|
||||
LOCK_SH = fcntl.LOCK_SH #: shared lock
|
||||
LOCK_NB = fcntl.LOCK_NB #: non-blocking
|
||||
LOCK_UN = fcntl.LOCK_UN #: unlock
|
||||
|
||||
else: # pragma: no cover
|
||||
raise RuntimeError('PortaLocker only defined for nt and posix platforms')
|
||||
|
||||
|
||||
if os.name == 'nt': # pragma: no cover
|
||||
import msvcrt
|
||||
|
||||
if sys.version_info.major == 2:
|
||||
lock_length = -1
|
||||
else:
|
||||
lock_length = int(2**31 - 1)
|
||||
|
||||
def lock(file_, flags):
|
||||
if flags & constants.LOCK_SH:
|
||||
import win32file
|
||||
import pywintypes
|
||||
import winerror
|
||||
__overlapped = pywintypes.OVERLAPPED()
|
||||
if sys.version_info.major == 2:
|
||||
if flags & constants.LOCK_NB:
|
||||
mode = constants.LOCKFILE_FAIL_IMMEDIATELY
|
||||
else:
|
||||
mode = 0
|
||||
|
||||
else:
|
||||
if flags & constants.LOCK_NB:
|
||||
mode = msvcrt.LK_NBRLCK
|
||||
else:
|
||||
mode = msvcrt.LK_RLCK
|
||||
|
||||
# is there any reason not to reuse the following structure?
|
||||
hfile = win32file._get_osfhandle(file_.fileno())
|
||||
try:
|
||||
win32file.LockFileEx(hfile, mode, 0, -0x10000, __overlapped)
|
||||
except pywintypes.error as exc_value:
|
||||
# error: (33, 'LockFileEx', 'The process cannot access the file
|
||||
# because another process has locked a portion of the file.')
|
||||
if exc_value.winerror == winerror.ERROR_LOCK_VIOLATION:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED,
|
||||
exc_value.strerror,
|
||||
fh=file_)
|
||||
else:
|
||||
# Q: Are there exceptions/codes we should be dealing with
|
||||
# here?
|
||||
raise
|
||||
else:
|
||||
mode = constants.LOCKFILE_EXCLUSIVE_LOCK
|
||||
if flags & constants.LOCK_NB:
|
||||
mode |= constants.LOCKFILE_FAIL_IMMEDIATELY
|
||||
|
||||
if flags & constants.LOCK_NB:
|
||||
mode = msvcrt.LK_NBLCK
|
||||
else:
|
||||
mode = msvcrt.LK_LOCK
|
||||
|
||||
# windows locks byte ranges, so make sure to lock from file start
|
||||
try:
|
||||
savepos = file_.tell()
|
||||
if savepos:
|
||||
# [ ] test exclusive lock fails on seek here
|
||||
# [ ] test if shared lock passes this point
|
||||
file_.seek(0)
|
||||
# [x] check if 0 param locks entire file (not documented in
|
||||
# Python)
|
||||
# [x] fails with "IOError: [Errno 13] Permission denied",
|
||||
# but -1 seems to do the trick
|
||||
|
||||
try:
|
||||
msvcrt.locking(file_.fileno(), mode, lock_length)
|
||||
except IOError as exc_value:
|
||||
# [ ] be more specific here
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED,
|
||||
exc_value.strerror,
|
||||
fh=file_)
|
||||
finally:
|
||||
if savepos:
|
||||
file_.seek(savepos)
|
||||
except IOError as exc_value:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED, exc_value.strerror,
|
||||
fh=file_)
|
||||
|
||||
def unlock(file_):
|
||||
try:
|
||||
savepos = file_.tell()
|
||||
if savepos:
|
||||
file_.seek(0)
|
||||
|
||||
try:
|
||||
msvcrt.locking(file_.fileno(), constants.LOCK_UN, lock_length)
|
||||
except IOError as exc_value:
|
||||
if exc_value.strerror == 'Permission denied':
|
||||
import pywintypes
|
||||
import win32file
|
||||
import winerror
|
||||
__overlapped = pywintypes.OVERLAPPED()
|
||||
hfile = win32file._get_osfhandle(file_.fileno())
|
||||
try:
|
||||
win32file.UnlockFileEx(
|
||||
hfile, 0, -0x10000, __overlapped)
|
||||
except pywintypes.error as exc_value:
|
||||
if exc_value.winerror == winerror.ERROR_NOT_LOCKED:
|
||||
# error: (158, 'UnlockFileEx',
|
||||
# 'The segment is already unlocked.')
|
||||
# To match the 'posix' implementation, silently
|
||||
# ignore this error
|
||||
pass
|
||||
else:
|
||||
# Q: Are there exceptions/codes we should be
|
||||
# dealing with here?
|
||||
raise
|
||||
else:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED,
|
||||
exc_value.strerror,
|
||||
fh=file_)
|
||||
finally:
|
||||
if savepos:
|
||||
file_.seek(savepos)
|
||||
except IOError as exc_value:
|
||||
raise exceptions.LockException(
|
||||
exceptions.LockException.LOCK_FAILED, exc_value.strerror,
|
||||
fh=file_)
|
||||
|
||||
elif os.name == 'posix': # pragma: no cover
|
||||
import fcntl
|
||||
|
||||
def lock(file_, flags):
|
||||
locking_exceptions = IOError,
|
||||
try: # pragma: no cover
|
||||
locking_exceptions += BlockingIOError,
|
||||
except NameError: # pragma: no cover
|
||||
pass
|
||||
|
||||
try:
|
||||
fcntl.flock(file_.fileno(), flags)
|
||||
except locking_exceptions as exc_value:
|
||||
# The exception code varies on different systems so we'll catch
|
||||
# every IO error
|
||||
raise exceptions.LockException(exc_value, fh=file_)
|
||||
|
||||
def unlock(file_):
|
||||
fcntl.flock(file_.fileno(), constants.LOCK_UN)
|
||||
|
||||
else: # pragma: no cover
|
||||
raise RuntimeError('PortaLocker only defined for nt and posix platforms')
|
||||
clearml_agent/helper/package/base.py (new file, 268 lines)
@@ -0,0 +1,268 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
from hashlib import md5
|
||||
from typing import Text, Iterable, Union, Optional, Dict, List
|
||||
|
||||
import six
|
||||
from pathlib2 import Path
|
||||
|
||||
from clearml_agent.definitions import ENV_VENV_CACHE_PATH
|
||||
from clearml_agent.helper.base import mkstemp, safe_remove_file, join_lines, select_for_platform
|
||||
from clearml_agent.helper.console import ensure_binary
|
||||
from clearml_agent.helper.os.folder_cache import FolderCache
|
||||
from clearml_agent.helper.process import Executable, Argv, PathLike
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
|
||||
class PackageManager(object):
|
||||
"""
|
||||
ABC for classes providing python package management interface
|
||||
"""
|
||||
|
||||
_selected_manager = None
|
||||
_cwd = None
|
||||
_pip_version = None
|
||||
_config_cache_folder = 'agent.venvs_cache.path'
|
||||
_config_cache_max_entries = 'agent.venvs_cache.max_entries'
|
||||
_config_cache_free_space_threshold = 'agent.venvs_cache.free_space_threshold_gb'
|
||||
|
||||
def __init__(self):
|
||||
self._cache_manager = None
|
||||
|
||||
@abc.abstractproperty
|
||||
def bin(self):
|
||||
# type: () -> PathLike
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def create(self):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def remove(self):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def install_from_file(self, path):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def freeze(self):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def load_requirements(self, requirements):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def install_packages(self, *packages):
|
||||
# type: (Iterable[Text]) -> None
|
||||
"""
|
||||
Install packages; whether to upgrade depends on the config
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def _install(self, *packages):
|
||||
# type: (Iterable[Text]) -> None
|
||||
"""
|
||||
Run install command
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def uninstall_packages(self, *packages):
|
||||
# type: (Iterable[Text]) -> None
|
||||
pass
|
||||
|
||||
def upgrade_pip(self):
|
||||
result = self._install(
|
||||
select_for_platform(windows='pip{}', linux='pip{}').format(self.get_pip_version()), "--upgrade")
|
||||
packages = self.run_with_env(('list',), output=True).splitlines()
|
||||
# p.split is ('pip', 'x.y.z')
|
||||
pip = [p.split() for p in packages if len(p.split()) == 2 and p.split()[0] == 'pip']
|
||||
if pip:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
from .requirements import MarkerRequirement
|
||||
pip = pip[0][1].split('.')
|
||||
MarkerRequirement.pip_new_version = bool(int(pip[0]) >= 20)
|
||||
except Exception:
|
||||
pass
|
||||
return result
|
||||
|
||||
def get_python_command(self, extra=()):
|
||||
# type: (...) -> Executable
|
||||
return Argv(self.bin, *extra)
|
||||
|
||||
@contextmanager
|
||||
def temp_file(self, prefix, contents, suffix=".txt"):
|
||||
# type: (Union[Text, Iterable[Text]], Iterable[Text], Text) -> Text
|
||||
"""
|
||||
Write contents to a temporary file, yielding its path. Finally, delete it.
|
||||
:param prefix: file name prefix
|
||||
:param contents: text lines to write
|
||||
:param suffix: file name suffix
|
||||
"""
|
||||
f, temp_path = mkstemp(suffix=suffix, prefix=prefix)
|
||||
with f:
|
||||
f.write(
|
||||
contents
|
||||
if isinstance(contents, six.text_type)
|
||||
else join_lines(contents)
|
||||
)
|
||||
try:
|
||||
yield temp_path
|
||||
finally:
|
||||
if not self.session.debug_mode:
|
||||
safe_remove_file(temp_path)
|
||||
|
||||
def set_selected_package_manager(self):
|
||||
# set this instance as the selected package manager
|
||||
# this is helpful when we want out-of-context requirement installations
|
||||
PackageManager._selected_manager = self
|
||||
|
||||
@property
|
||||
def cwd(self):
|
||||
return self._cwd
|
||||
|
||||
@cwd.setter
|
||||
def cwd(self, value):
|
||||
self._cwd = value
|
||||
|
||||
@classmethod
|
||||
def out_of_scope_install_package(cls, package_name, *args):
|
||||
if PackageManager._selected_manager is not None:
|
||||
try:
|
||||
result = PackageManager._selected_manager._install(package_name, *args)
|
||||
if result not in (0, None, True):
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def out_of_scope_freeze(cls):
|
||||
if PackageManager._selected_manager is not None:
|
||||
try:
|
||||
return PackageManager._selected_manager.freeze()
|
||||
except Exception:
|
||||
pass
|
||||
return []
|
||||
|
||||
@classmethod
|
||||
def set_pip_version(cls, version):
|
||||
if not version:
|
||||
return
|
||||
version = version.replace(' ', '')
|
||||
if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
|
||||
cls._pip_version = version
|
||||
else:
|
||||
cls._pip_version = "=="+version
|
||||
|
||||
@classmethod
|
||||
def get_pip_version(cls):
|
||||
return cls._pip_version or ''
|
||||
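A sketch of how the pip version pinning above normalizes its input; the version values are illustrative:

# Illustrative behavior of PackageManager.set_pip_version()/get_pip_version() above.
PackageManager.set_pip_version("20.1")      # bare version -> pinned exactly
assert PackageManager.get_pip_version() == "==20.1"

PackageManager.set_pip_version("<21")       # a version with an operator is kept as-is
assert PackageManager.get_pip_version() == "<21"

# e.g. used later as: "pip{}".format(PackageManager.get_pip_version()) -> "pip==20.1"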
|
||||
def get_cached_venv(self, requirements, docker_cmd, python_version, cuda_version, destination_folder):
|
||||
# type: (Dict, Optional[Union[dict, str]], Optional[str], Optional[str], Path) -> Optional[Path]
|
||||
"""
|
||||
Copy a cached copy of the venv (based on the requirements) into destination_folder.
|
||||
Return None if failed or cached entry does not exist
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return None
|
||||
|
||||
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
|
||||
return self._get_cache_manager().copy_cached_entry(keys, destination_folder)
|
||||
|
||||
def add_cached_venv(
|
||||
self,
|
||||
requirements, # type: Union[Dict, List[Dict]]
|
||||
docker_cmd, # type: Optional[Union[dict, str]]
|
||||
python_version, # type: Optional[str]
|
||||
cuda_version, # type: Optional[str]
|
||||
source_folder, # type: Path
|
||||
exclude_sub_folders=None # type: Optional[List[str]]
|
||||
):
|
||||
# type: (...) -> ()
|
||||
"""
|
||||
Copy the local venv folder into the venv cache (keys are based on the requirements+python+docker).
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
|
||||
return self._get_cache_manager().add_entry(
|
||||
keys=keys, source_folder=source_folder, exclude_sub_folders=exclude_sub_folders)
|
||||
|
||||
def get_cache_folder(self):
|
||||
# type: () -> Optional[Path]
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
return self._get_cache_manager().get_cache_folder()
|
||||
|
||||
def get_last_used_entry_cache(self):
|
||||
# type: () -> Optional[Path]
|
||||
"""
|
||||
:return: the last used cached folder entry
|
||||
"""
|
||||
if not self._get_cache_manager():
|
||||
return
|
||||
return self._get_cache_manager().get_last_copied_entry()
|
||||
|
||||
@classmethod
|
||||
def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
|
||||
# type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
|
||||
requirements_list = requirements_list or dict()
|
||||
if not isinstance(requirements_list, (list, tuple)):
|
||||
requirements_list = [requirements_list]
|
||||
docker_cmd = dict(docker_cmd=docker_cmd) if isinstance(docker_cmd, str) else docker_cmd or dict()
|
||||
docker_cmd = OrderedDict(sorted(docker_cmd.items(), key=lambda t: t[0]))
|
||||
if 'docker_cmd' in docker_cmd:
|
||||
# we only take the first part of the docker_cmd which is the docker image name
|
||||
docker_cmd['docker_cmd'] = docker_cmd['docker_cmd'].strip('\r\n\t ').split(' ')[0]
|
||||
|
||||
keys = []
|
||||
strip_chars = '\n\r\t '
|
||||
for requirements in requirements_list:
|
||||
pip, conda = ('pip', 'conda')
|
||||
pip_reqs = requirements.get(pip, '')
|
||||
conda_reqs = requirements.get(conda, '')
|
||||
if isinstance(pip_reqs, str):
|
||||
pip_reqs = pip_reqs.split('\n')
|
||||
if isinstance(conda_reqs, str):
|
||||
conda_reqs = conda_reqs.split('\n')
|
||||
pip_reqs = sorted([p.strip(strip_chars) for p in pip_reqs
|
||||
if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
|
||||
conda_reqs = sorted([p.strip(strip_chars) for p in conda_reqs
|
||||
if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
|
||||
if not pip_reqs and not conda_reqs:
|
||||
continue
|
||||
# do not process "-r" or "--requirement" because we cannot know what we have in the git repo.
|
||||
if any(r.strip().startswith('-r ') or r.strip().startswith('--requirement ') for r in pip_reqs):
|
||||
continue
|
||||
hash_text = '{class_type}\n{docker_cmd}\n{cuda_ver}\n{python_version}\n{pip_reqs}\n{conda_reqs}'.format(
|
||||
class_type=str(cls),
|
||||
docker_cmd=str(docker_cmd or ''),
|
||||
cuda_ver=str(cuda_version or ''),
|
||||
python_version=str(python_version or ''),
|
||||
pip_reqs=str(pip_reqs or ''),
|
||||
conda_reqs=str(conda_reqs or ''),
|
||||
)
|
||||
keys.append(md5(ensure_binary(hash_text)).hexdigest())
|
||||
return sorted(list(set(keys)))
|
||||
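A hedged sketch of what the venv cache keys above are built from; the requirement and docker values are made up, and the class_type string is hypothetical:

# Illustrative sketch of the hash-key text assembled by _generate_reqs_hash_keys() above.
from hashlib import md5

pip_reqs = sorted(['numpy==1.19.5', 'requests>=2.0'])
hash_text = '{class_type}\n{docker_cmd}\n{cuda_ver}\n{python_version}\n{pip_reqs}\n{conda_reqs}'.format(
    class_type="<class 'VirtualenvPip'>",   # hypothetical manager class
    docker_cmd='nvidia/cuda:11.0-runtime',  # only the image name part of docker_cmd is used
    cuda_ver='110',
    python_version='3.6',
    pip_reqs=str(pip_reqs),
    conda_reqs='',
)
key = md5(hash_text.encode('utf-8')).hexdigest()  # one component of the cache entry name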
|
||||
def _get_cache_manager(self):
|
||||
if not self._cache_manager:
|
||||
cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
|
||||
if not cache_folder:
|
||||
return None
|
||||
|
||||
max_entries = int(self.session.config.get(self._config_cache_max_entries, 10))
|
||||
free_space_threshold = float(self.session.config.get(self._config_cache_free_space_threshold, 0))
|
||||
self._cache_manager = FolderCache(
|
||||
cache_folder, max_cache_entries=max_entries, min_free_space_gb=free_space_threshold)
|
||||
return self._cache_manager
|
||||
clearml_agent/helper/package/conda_api.py (new file, 760 lines)
@@ -0,0 +1,760 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import subprocess
|
||||
from collections import OrderedDict
|
||||
from distutils.spawn import find_executable
|
||||
from functools import partial
|
||||
from itertools import chain
|
||||
from typing import Text, Iterable, Union, Dict, Set, Sequence, Any
|
||||
|
||||
import six
|
||||
import yaml
|
||||
from time import time
|
||||
from attr import attrs, attrib, Factory
|
||||
from pathlib2 import Path
|
||||
from clearml_agent.external.requirements_parser import parse
|
||||
from clearml_agent.external.requirements_parser.requirement import Requirement
|
||||
|
||||
from clearml_agent.errors import CommandFailedError
|
||||
from clearml_agent.helper.base import (
|
||||
rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform, ExecutionInfo,
|
||||
convert_cuda_version_to_float_single_digit_str, convert_cuda_version_to_int_10_base_str, )
|
||||
from clearml_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
|
||||
from clearml_agent.helper.package.requirements import SimpleVersion
|
||||
from clearml_agent.session import Session
|
||||
from .base import PackageManager
|
||||
from .pip_api.venv import VirtualenvPip
|
||||
from .requirements import RequirementsManager, MarkerRequirement
|
||||
from ...backend_api.session.defs import ENV_CONDA_ENV_PACKAGE
|
||||
|
||||
package_normalize = partial(re.compile(r"""\[version=['"](.*)['"]\]""").sub, r"\1")
|
||||
|
||||
|
||||
def package_set(packages):
|
||||
return set(map(package_normalize, packages))
|
||||
|
||||
|
||||
def _package_diff(path, packages):
|
||||
# type: (Union[Path, Text], Iterable[Text]) -> Set[Text]
|
||||
return package_set(Path(path).read_text().splitlines()) - package_set(packages)
|
||||
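For clarity, a quick sketch of what package_normalize / package_set do, assuming conda's bracketed version syntax in the package listing:

# Illustrative behavior of package_normalize()/package_set() defined above:
# the bracketed conda version spec is flattened into a plain spec string.
assert package_normalize("cudatoolkit[version='>=11.0,<11.1']") == "cudatoolkit>=11.0,<11.1"
assert package_set(["numpy==1.19.5", "cudatoolkit[version='>=11.0']"]) == {"numpy==1.19.5", "cudatoolkit>=11.0"}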
|
||||
|
||||
class CondaPip(VirtualenvPip):
|
||||
def __init__(self, source=None, *args, **kwargs):
|
||||
super(CondaPip, self).__init__(*args, interpreter=Path(kwargs.get('path'), "python.exe")
|
||||
if is_windows_platform() and kwargs.get('path') else None, **kwargs)
|
||||
self.source = source
|
||||
|
||||
def run_with_env(self, command, output=False, **kwargs):
|
||||
if not self.source:
|
||||
return super(CondaPip, self).run_with_env(command, output=output, **kwargs)
|
||||
command = CommandSequence(self.source, Argv("pip", *command))
|
||||
return (command.get_output if output else command.check_call)(
|
||||
stdin=DEVNULL, **kwargs
|
||||
)
|
||||
|
||||
|
||||
class CondaAPI(PackageManager):
|
||||
|
||||
"""
|
||||
A programmatic interface for controlling conda
|
||||
"""
|
||||
|
||||
MINIMUM_VERSION = "4.3.30"
|
||||
|
||||
def __init__(self, session, path, python, requirements_manager, execution_info=None, **kwargs):
|
||||
# type: (Session, PathLike, float, RequirementsManager, ExecutionInfo, Any) -> None
|
||||
"""
|
||||
:param python: base python version to use (e.g python3.6)
|
||||
:param path: path of env
|
||||
"""
|
||||
super(CondaAPI, self).__init__()
|
||||
self.session = session
|
||||
self.python = python
|
||||
self.source = None
|
||||
self.requirements_manager = requirements_manager
|
||||
self.path = path
|
||||
self.env_read_only = False
|
||||
self.extra_channels = self.session.config.get('agent.package_manager.conda_channels', [])
|
||||
self.conda_env_as_base_docker = \
|
||||
self.session.config.get('agent.package_manager.conda_env_as_base_docker', None) or \
|
||||
bool(ENV_CONDA_ENV_PACKAGE.get())
|
||||
if ENV_CONDA_ENV_PACKAGE.get():
|
||||
self.conda_pre_build_env_path = ENV_CONDA_ENV_PACKAGE.get()
|
||||
else:
|
||||
self.conda_pre_build_env_path = execution_info.docker_cmd if execution_info else None
|
||||
self.pip = CondaPip(
|
||||
session=self.session,
|
||||
source=self.source,
|
||||
python=self.python,
|
||||
requirements_manager=self.requirements_manager,
|
||||
path=self.path,
|
||||
)
|
||||
try:
|
||||
self.conda = (
|
||||
find_executable("conda") or
|
||||
Argv(select_for_platform(windows="where", linux="which"), "conda").get_output(
|
||||
shell=select_for_platform(windows=True, linux=False)).strip()
|
||||
)
|
||||
except Exception:
|
||||
raise ValueError("ERROR: package manager \"conda\" selected, "
|
||||
"but \'conda\' executable could not be located")
|
||||
try:
|
||||
output = Argv(self.conda, "--version").get_output(stderr=subprocess.STDOUT)
|
||||
except subprocess.CalledProcessError as ex:
|
||||
raise CommandFailedError(
|
||||
"Unable to determine conda version: {ex}, output={ex.output}".format(
|
||||
ex=ex
|
||||
)
|
||||
)
|
||||
self.conda_version = self.get_conda_version(output)
|
||||
if SimpleVersion.compare_versions(self.conda_version, '<', self.MINIMUM_VERSION):
|
||||
raise CommandFailedError(
|
||||
"conda version '{}' is smaller than minimum supported conda version '{}'".format(
|
||||
self.conda_version, self.MINIMUM_VERSION
|
||||
)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def get_conda_version(output):
|
||||
match = re.search(r"(\d+\.){0,2}\d+", output)
|
||||
if not match:
|
||||
raise CommandFailedError("Unidentified conda version string:", output)
|
||||
return match.group(0)
|
||||
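The version regex above accepts one to three dot-separated numeric components; a short sketch with example output strings:

# Illustrative behavior of CondaAPI.get_conda_version() defined above.
assert CondaAPI.get_conda_version("conda 4.10.3") == "4.10.3"
assert CondaAPI.get_conda_version("conda 4.3.30\n") == "4.3.30"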
|
||||
@property
|
||||
def bin(self):
|
||||
return self.pip.bin
|
||||
|
||||
# noinspection SpellCheckingInspection
|
||||
def upgrade_pip(self):
|
||||
# do not change pip version if a pre-built environment is used
|
||||
if self.env_read_only:
|
||||
print('Conda environment in read-only mode, skipping pip upgrade.')
|
||||
return ''
|
||||
return self._install(select_for_platform(windows='pip{}', linux='pip{}').format(self.pip.get_pip_version()))
|
||||
|
||||
def create(self):
|
||||
"""
|
||||
Create a new environment
|
||||
"""
|
||||
if self.conda_env_as_base_docker and self.conda_pre_build_env_path:
|
||||
if Path(self.conda_pre_build_env_path).is_dir():
|
||||
self._init_existing_environment(self.conda_pre_build_env_path)
|
||||
return self
|
||||
elif Path(self.conda_pre_build_env_path).is_file():
|
||||
print("Restoring Conda environment from {}".format(self.conda_pre_build_env_path))
|
||||
tar_path = find_executable("tar")
|
||||
self.path.mkdir(parents=True, exist_ok=True)
|
||||
output = Argv(
|
||||
tar_path,
|
||||
"-xzf",
|
||||
self.conda_pre_build_env_path,
|
||||
"-C",
|
||||
self.path,
|
||||
).get_output()
|
||||
|
||||
self.source = self.pip.source = ("conda", "activate", self.path.as_posix())
|
||||
conda_env = self._get_conda_sh()
|
||||
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
|
||||
# unpack cleanup
|
||||
print("Fixing prefix in Conda environment {}".format(self.path))
|
||||
CommandSequence(('source', conda_env.as_posix()),
|
||||
((self.path / 'bin' / 'conda-unpack').as_posix(), )).get_output()
|
||||
return self
|
||||
else:
|
||||
raise ValueError("Could not restore Conda environment, cannot find {}".format(
|
||||
self.conda_pre_build_env_path))
|
||||
|
||||
command = Argv(
|
||||
self.conda,
|
||||
"create",
|
||||
"--yes",
|
||||
"--mkdir",
|
||||
"--prefix",
|
||||
self.path,
|
||||
"python={}".format(self.python),
|
||||
)
|
||||
print('Executing Conda: {}'.format(command.serialize()))
|
||||
output = command.get_output(stderr=DEVNULL)
|
||||
match = re.search(
|
||||
r"\W*(.*activate) ({})".format(re.escape(str(self.path))), output
|
||||
)
|
||||
self.source = self.pip.source = (
|
||||
tuple(match.group(1).split()) + (match.group(2),)
|
||||
if match
|
||||
else ("conda", "activate", self.path.as_posix())
|
||||
)
|
||||
|
||||
conda_env = self._get_conda_sh()
|
||||
if conda_env.is_file() and not is_windows_platform():
|
||||
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
|
||||
|
||||
return self
|
||||
|
||||
def _init_existing_environment(self, conda_pre_build_env_path):
|
||||
print("Using pre-existing Conda environment from {}".format(conda_pre_build_env_path))
|
||||
self.path = Path(conda_pre_build_env_path)
|
||||
self.source = ("conda", "activate", self.path.as_posix())
|
||||
self.pip = CondaPip(
|
||||
session=self.session,
|
||||
source=self.source,
|
||||
python=self.python,
|
||||
requirements_manager=self.requirements_manager,
|
||||
path=self.path,
|
||||
)
|
||||
conda_env = self._get_conda_sh()
|
||||
self.source = self.pip.source = CommandSequence(('source', conda_env.as_posix()), self.source)
|
||||
self.env_read_only = True
|
||||
|
||||
def remove(self):
|
||||
"""
|
||||
Delete a conda environment.
|
||||
Use 'conda env remove', then 'rm_tree' to be safe.
|
||||
|
||||
Conda seems to load "vcruntime140.dll" from all its environments on startup.
This means environments have to be deleted using 'conda env remove'.
|
||||
If necessary, conda can be fooled into deleting a partially-deleted environment by creating an empty file
|
||||
in '<ENV>\conda-meta\history' (value found in 'conda.gateways.disk.test.PREFIX_MAGIC_FILE').
|
||||
Otherwise, it complains that said directory is not a conda environment.
|
||||
|
||||
See: https://github.com/conda/conda/issues/7682
|
||||
"""
|
||||
try:
|
||||
self._run_command(("env", "remove", "-p", self.path))
|
||||
except Exception:
|
||||
pass
|
||||
rm_tree(self.path)
|
||||
# if we failed removing the path, change its name
|
||||
if is_windows_platform() and Path(self.path).exists():
|
||||
try:
|
||||
Path(self.path).rename(Path(self.path).as_posix() + '_' + str(time()))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _install_from_file(self, path):
|
||||
"""
|
||||
Install packages from requirement file.
|
||||
"""
|
||||
self._install("--file", path)
|
||||
|
||||
def _install(self, *args):
|
||||
# type: (*PathLike) -> ()
|
||||
# if we are in read only mode, do not install anything
|
||||
if self.env_read_only:
|
||||
print('Conda environment in read-only mode, skipping package installing: {}'.format(args))
|
||||
return
|
||||
channels_args = tuple(
|
||||
chain.from_iterable(("-c", channel) for channel in self.extra_channels)
|
||||
)
|
||||
self._run_command(("install", "-p", self.path) + channels_args + args)
|
||||
|
||||
def _get_pip_packages(self, packages):
|
||||
# type: (Iterable[Text]) -> Sequence[Text]
|
||||
"""
|
||||
Return subset of ``packages`` which are not available on conda
|
||||
"""
|
||||
pips = []
|
||||
while True:
|
||||
with self.temp_file("conda_reqs", packages) as path:
|
||||
try:
|
||||
self._install_from_file(path)
|
||||
except PackageNotFoundError as e:
|
||||
pips.append(e.pkg)
|
||||
packages = _package_diff(path, {e.pkg})
|
||||
else:
|
||||
break
|
||||
return pips
|
||||
|
||||
def install_packages(self, *packages):
|
||||
# type: (*Text) -> ()
|
||||
return self._install(*packages)
|
||||
|
||||
def uninstall_packages(self, *packages):
|
||||
# if we are in read only mode, do not uninstall anything
|
||||
if self.env_read_only:
|
||||
print('Conda environment in read-only mode, skipping package uninstalling: {}'.format(packages))
|
||||
return ''
|
||||
return self._run_command(("uninstall", "-p", self.path))
|
||||
|
||||
def install_from_file(self, path):
|
||||
"""
|
||||
Try to install packages from conda. Install packages which are not available from conda with pip.
|
||||
"""
|
||||
requirements = {}
|
||||
# assume requirements.txt
|
||||
with open(path, 'rt') as f:
|
||||
requirements['pip'] = f.read()
|
||||
self.load_requirements(requirements)
|
||||
|
||||
def freeze(self, freeze_full_environment=False):
|
||||
requirements = self.pip.freeze()
|
||||
req_lines = []
|
||||
conda_lines = []
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
pip_lines = requirements['pip']
|
||||
conda_packages_json = json.loads(
|
||||
self._run_command((self.conda, "list", "--json", "-p", self.path), raw=True))
|
||||
for r in conda_packages_json:
|
||||
# check if this is a pypi package, if it is, leave it outside
|
||||
if not r.get('channel') or r.get('channel') == 'pypi':
|
||||
name = (r['name'].replace('-', '_'), r['name'])
|
||||
pip_req_line = [l for l in pip_lines
|
||||
if l.split('==', 1)[0].strip() in name or l.split('@', 1)[0].strip() in name]
|
||||
if pip_req_line and \
|
||||
('@' not in pip_req_line[0] or
|
||||
not pip_req_line[0].split('@', 1)[1].strip().startswith('file://')):
|
||||
req_lines.append(pip_req_line[0])
|
||||
continue
|
||||
|
||||
req_lines.append(
|
||||
'{}=={}'.format(name[1], r['version']) if r.get('version') else '{}'.format(name[1]))
|
||||
continue
|
||||
|
||||
# check if we have it in our required packages
|
||||
name = r['name']
|
||||
# hack: support the different pytorch/torch naming conventions
|
||||
if name == 'pytorch':
|
||||
name = 'torch'
|
||||
# skip over packages with _
|
||||
if name.startswith('_'):
|
||||
continue
|
||||
conda_lines.append('{}=={}'.format(name, r['version']) if r.get('version') else '{}'.format(name))
|
||||
# make sure we see the conda packages, put them into the pip as well
|
||||
if conda_lines:
|
||||
req_lines = ['# Conda Packages', ''] + conda_lines + ['', '# pip Packages', ''] + req_lines
|
||||
|
||||
requirements['pip'] = req_lines
|
||||
requirements['conda'] = conda_lines
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if freeze_full_environment:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
conda_env_json = json.loads(
|
||||
self._run_command((self.conda, "env", "export", "--json", "-p", self.path), raw=True))
|
||||
conda_env_json.pop('name', None)
|
||||
conda_env_json.pop('prefix', None)
|
||||
conda_env_json.pop('channels', None)
|
||||
requirements['conda_env_json'] = json.dumps(conda_env_json)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return requirements
|
||||
|
||||
def _load_conda_full_env(self, conda_env_dict, requirements):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
cuda_version = int(self.session.config.get('agent.cuda_version', 0))
|
||||
except Exception:
|
||||
cuda_version = 0
|
||||
|
||||
conda_env_dict['channels'] = self.extra_channels
|
||||
if 'dependencies' not in conda_env_dict:
|
||||
conda_env_dict['dependencies'] = []
|
||||
new_dependencies = OrderedDict()
|
||||
pip_requirements = None
|
||||
for line in conda_env_dict['dependencies']:
|
||||
if isinstance(line, dict):
|
||||
pip_requirements = line.pop('pip', None)
|
||||
continue
|
||||
name = line.strip().split('=', 1)[0].lower()
|
||||
if name == 'pip':
|
||||
continue
|
||||
elif name == 'python':
|
||||
line = 'python={}'.format('.'.join(line.split('=')[1].split('.')[:2]))
|
||||
elif name == 'tensorflow-gpu' and cuda_version == 0:
|
||||
line = 'tensorflow={}'.format(line.split('=')[1])
|
||||
elif name == 'tensorflow' and cuda_version > 0:
|
||||
line = 'tensorflow-gpu={}'.format(line.split('=')[1])
|
||||
elif name in ('cupti', 'cudnn'):
|
||||
# cudatoolkit should pull them based on the cudatoolkit version
|
||||
continue
|
||||
elif name.startswith('_'):
|
||||
continue
|
||||
new_dependencies[line.split('=', 1)[0].strip()] = line
|
||||
|
||||
# fix packages:
|
||||
conda_env_dict['dependencies'] = list(new_dependencies.values())
|
||||
|
||||
with self.temp_file("conda_env", yaml.dump(conda_env_dict), suffix=".yml") as name:
|
||||
print('Conda: Trying to install requirements:\n{}'.format(conda_env_dict['dependencies']))
|
||||
result = self._run_command(
|
||||
("env", "update", "-p", self.path, "--file", name)
|
||||
)
|
||||
|
||||
# check if we need to remove specific packages
|
||||
bad_req = self._parse_conda_result_bad_packges(result)
|
||||
if bad_req:
|
||||
print('failed installing the following conda packages: {}'.format(bad_req))
|
||||
return False
|
||||
|
||||
if pip_requirements:
|
||||
# create a list of vcs packages that we need to replace in the pip section
|
||||
vcs_reqs = {}
|
||||
if 'pip' in requirements:
|
||||
pip_lines = requirements['pip'].splitlines() \
|
||||
if isinstance(requirements['pip'], six.string_types) else requirements['pip']
|
||||
for line in pip_lines:
|
||||
try:
|
||||
marker = list(parse(line))
|
||||
except Exception:
|
||||
marker = None
|
||||
if not marker:
|
||||
continue
|
||||
|
||||
m = MarkerRequirement(marker[0])
|
||||
if m.vcs:
|
||||
vcs_reqs[m.name] = m
|
||||
try:
|
||||
pip_req_str = [str(vcs_reqs.get(r.split('=', 1)[0], r)) for r in pip_requirements
|
||||
if not r.startswith('pip=') and not r.startswith('virtualenv=')]
|
||||
print('Conda: Installing requirements: step 2 - using pip:\n{}'.format(pip_req_str))
|
||||
PackageManager._selected_manager = self.pip
|
||||
self.pip.load_requirements({'pip': '\n'.join(pip_req_str)})
|
||||
except Exception as e:
|
||||
print(e)
|
||||
raise e
|
||||
finally:
|
||||
PackageManager._selected_manager = self
|
||||
|
||||
self.requirements_manager.post_install(self.session, package_manager=self)
|
||||
|
||||
def load_requirements(self, requirements):
|
||||
# if we are in read-only mode, do not install anything
|
||||
if self.env_read_only:
|
||||
print('Conda environment in read-only mode, skipping requirements installation.')
|
||||
return None
|
||||
|
||||
# if we have a full conda environment, use it and pass the pip to pip
|
||||
if requirements.get('conda_env_json'):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
conda_env_json = json.loads(requirements.get('conda_env_json'))
|
||||
print('Conda restoring full yaml environment')
|
||||
return self._load_conda_full_env(conda_env_json, requirements)
|
||||
except Exception:
|
||||
print('Could not load fully stored conda environment, falling back to requirements')
|
||||
|
||||
# create new environment file
|
||||
conda_env = dict()
|
||||
conda_env['channels'] = self.extra_channels
|
||||
reqs = []
|
||||
if isinstance(requirements['pip'], six.string_types):
|
||||
requirements['pip'] = requirements['pip'].split('\n')
|
||||
if isinstance(requirements.get('conda'), six.string_types):
|
||||
requirements['conda'] = requirements['conda'].split('\n')
|
||||
has_torch = False
|
||||
has_matplotlib = False
|
||||
has_cudatoolkit = False
|
||||
cuda_version_full = 0
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
# notice this is an integer version: 112 (means 11.2)
|
||||
cuda_version = str(self.session.config.get('agent.cuda_version', "")).strip()
|
||||
if not cuda_version:
|
||||
cuda_version = 0
|
||||
else:
|
||||
cuda_version_full = convert_cuda_version_to_float_single_digit_str(cuda_version)
|
||||
cuda_version = int(convert_cuda_version_to_int_10_base_str(cuda_version))
|
||||
except Exception:
|
||||
cuda_version = 0
|
||||
|
||||
# notice 'conda' entry with empty string is a valid conda requirements list, it means pip only
|
||||
# this should happen if the experiment was executed on a non-conda machine or with an old trains client
|
||||
conda_supported_req = requirements['pip'] if requirements.get('conda', None) is None else requirements['conda']
|
||||
conda_supported_req_names = []
|
||||
pip_requirements = []
|
||||
for r in conda_supported_req:
|
||||
try:
|
||||
marker = list(parse(r))
|
||||
except:
|
||||
marker = None
|
||||
if not marker:
|
||||
continue
|
||||
|
||||
m = MarkerRequirement(marker[0])
|
||||
m.validate_local_file_ref()
|
||||
# conda does not support version control links
|
||||
if m.vcs:
|
||||
pip_requirements.append(m)
|
||||
continue
|
||||
# Skip over pip
|
||||
if m.name in ('pip', 'virtualenv', ):
|
||||
continue
|
||||
# python version, only major.minor
|
||||
if m.name == 'python' and m.specs:
|
||||
m.specs = [(m.specs[0][0], '.'.join(m.specs[0][1].split('.')[:2])), ]
|
||||
if '.' not in m.specs[0][1]:
|
||||
continue
|
||||
|
||||
if m.name.lower() == 'cudatoolkit':
|
||||
# skip cuda if we are running on CPU
|
||||
if not cuda_version:
|
||||
continue
|
||||
|
||||
has_cudatoolkit = True
|
||||
# cuda version, only major.minor
|
||||
requested_cuda_version = '.'.join(m.specs[0][1].split('.')[:2])
|
||||
# make sure that the cuda_version we support can install the requested cuda (major version)
|
||||
if int(float(requested_cuda_version)) > int(float(cuda_version)/10.0):
|
||||
continue
|
||||
m.specs = [(m.specs[0][0], str(requested_cuda_version)), ]
|
||||
|
||||
conda_supported_req_names.append(m.name.lower())
|
||||
if m.req.name.lower() == 'matplotlib':
|
||||
has_matplotlib = True
|
||||
elif m.req.name.lower().startswith('torch'):
|
||||
has_torch = True
|
||||
|
||||
if m.req.name.lower() in ('torch', 'pytorch'):
|
||||
has_torch = True
|
||||
m.req.name = 'pytorch'
|
||||
|
||||
if m.req.name.lower() in ('tensorflow_gpu', 'tensorflow-gpu', 'tensorflow'):
|
||||
has_torch = True
|
||||
m.req.name = 'tensorflow-gpu' if cuda_version > 0 else 'tensorflow'
|
||||
|
||||
reqs.append(m)
|
||||
|
||||
if not has_cudatoolkit and cuda_version:
|
||||
m = MarkerRequirement(Requirement.parse("cudatoolkit == {}".format(cuda_version_full)))
|
||||
has_cudatoolkit = True
|
||||
reqs.append(m)
|
||||
|
||||
        # if we have a conda list, the rest should be installed with pip,
        # this means any experiment that was executed with a pip environment
        # will be installed using pip
        if requirements.get('conda', None) is not None:
            for r in requirements['pip']:
                try:
                    marker = list(parse(r))
                except:
                    marker = None
                if not marker:
                    continue

                m = MarkerRequirement(marker[0])
                # remove local files reference if it does not exist (leave the package name)
                m.validate_local_file_ref()

                m_name = (m.name or '').lower()
                if m_name in conda_supported_req_names:
                    # this package is in the conda list,
                    # make sure that if we changed the version we also match it in conda
                    ## conda_supported_req_names.remove(m_name)
                    for cr in reqs:
                        if m_name.lower().replace('_', '-') == cr.name.lower().replace('_', '-'):
                            # match versions
                            cr.specs = m.specs
                            # # conda always likes "-" not "_" but only on pypi packages
                            # cr.name = cr.name.lower().replace('_', '-')
                            break
                else:
                    # not in conda, it is a pip package
                    pip_requirements.append(m)
                    if m_name == 'matplotlib':
                        has_matplotlib = True

        # Conda requirements Hacks:
        if has_matplotlib:
            reqs.append(MarkerRequirement(Requirement.parse('graphviz')))
            reqs.append(MarkerRequirement(Requirement.parse('python-graphviz')))
            reqs.append(MarkerRequirement(Requirement.parse('kiwisolver')))

        # remove specific cudatoolkit derivatives, they should have been preinstalled.
        # allow overriding the default cudatoolkit, but not the derivative packages; cudatoolkit should pull them
        reqs = [r for r in reqs if r.name not in ('cudnn', 'cupti')]

        if has_torch and cuda_version == 0:
            reqs.append(MarkerRequirement(Requirement.parse('cpuonly')))

        # make sure we have no double entries
        reqs = list(OrderedDict((r.name, r) for r in reqs).values())

        # conform conda packages (version/name)
        for r in reqs:
            # change _ to - in the name, but not a prefix _ (as this is a conda prefix)
            if r.name and not r.name.startswith('_') and not requirements.get('conda', None):
                r.name = r.name.replace('_', '-')

            if has_cudatoolkit and r.specs and len(r.specs[0]) > 1 and r.name == 'cudatoolkit':
                # select the specific cuda version if it came from the requirements
                r.specs = [(r.specs[0][0].replace('==', '='), r.specs[0][1].split('.post')[0])]
            elif r.specs and r.specs[0] and len(r.specs[0]) > 1:
                # remove .post from version numbers (it fails with ~= versions) and change == to ~=
                r.specs = [(r.specs[0][0].replace('==', '~='), r.specs[0][1].split('.post')[0])]

        while reqs:
            # notice, we give conda more freedom in version selection, to help it choose the best combination
            def clean_ver(ar):
                if not ar.specs:
                    return ar.tostr()
                ar.specs = [(ar.specs[0][0], ar.specs[0][1] + '.0' if '.' not in ar.specs[0][1] else ar.specs[0][1])]
                return ar.tostr()
            conda_env['dependencies'] = [clean_ver(r) for r in reqs]
            with self.temp_file("conda_env", yaml.dump(conda_env), suffix=".yml") as name:
                print('Conda: Trying to install requirements:\n{}'.format(conda_env['dependencies']))
                if self.session.debug_mode:
                    print('{}:\n{}'.format(name, yaml.dump(conda_env)))
                result = self._run_command(
                    ("env", "update", "-p", self.path, "--file", name)
                )
            # check if we need to remove specific packages
            bad_req = self._parse_conda_result_bad_packges(result)
            if not bad_req:
                break

            solved = False
            for bad_r in bad_req:
                name = bad_r.split('[')[0].split('=')[0].split('~')[0].split('<')[0].split('>')[0]
                # look for name in requirements
                for r in reqs:
                    if r.name.lower() == name.lower():
                        pip_requirements.append(r)
                        reqs.remove(r)
                        solved = True
                        break

            # we couldn't remove even one package,
            # nothing we can do but try pip
            if not solved:
                pip_requirements.extend(reqs)
                break

        if pip_requirements:
            try:
                pip_req_str = [r.tostr() for r in pip_requirements if r.name not in ('pip', 'virtualenv', )]
                print('Conda: Installing requirements: step 2 - using pip:\n{}'.format(pip_req_str))
                PackageManager._selected_manager = self.pip
                if self.session.debug_mode:
                    print('pip requirements.txt:\n{}'.format('\n'.join(pip_req_str)))
                self.pip.load_requirements({'pip': '\n'.join(pip_req_str)})
            except Exception as e:
                print(e)
                raise e
            finally:
                PackageManager._selected_manager = self

        self.requirements_manager.post_install(self.session, package_manager=self)
        return True

    def _parse_conda_result_bad_packges(self, result_dict):
        if not result_dict:
            return None

        if 'bad_deps' in result_dict and result_dict['bad_deps']:
            return result_dict['bad_deps']

        if result_dict.get('error'):
            error_lines = result_dict['error'].split('\n')
            if error_lines[0].strip().lower().startswith("unsatisfiableerror:"):
                empty_lines = [i for i, l in enumerate(error_lines) if not l.strip()]
                if len(empty_lines) >= 2:
                    deps = error_lines[empty_lines[0]+1:empty_lines[1]]
                    try:
                        return yaml.load('\n'.join(deps), Loader=yaml.SafeLoader)
                    except:
                        return None
        return None

    def _run_command(self, command, raw=False, **kwargs):
        # type: (Iterable[Text], bool, Any) -> Union[Dict, Text]
        """
        Run a conda command, returning JSON output.
        The command is prepended with 'conda' and run with JSON output flags.
        :param command: command to run
        :param raw: return text output and don't change command
        :param kwargs: kwargs for Argv.get_output()
        :return: JSON output or text output
        """
        def escape_ansi(line):
            ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
            return ansi_escape.sub('', line)

        # make sure we are not running it with our own PYTHONPATH
        env = dict(**os.environ)
        env.pop('PYTHONPATH', None)

        command = Argv(*command)  # type: Executable
        if not raw:
            command = (self.conda,) + command + ("--quiet", "--json")
        try:
            print('Executing Conda: {}'.format(command.serialize()))
            result = command.get_output(stdin=DEVNULL, env=env, **kwargs)
            if self.session.debug_mode:
                print(result)
        except Exception as e:
            result = e.output if hasattr(e, 'output') else ''
            if self.session.debug_mode:
                print(result)
            if raw:
                raise
        if raw:
            return result

        result = json.loads(escape_ansi(result)) if result else {}
        if result.get('success', False):
            print('Pass')
        elif result.get('error'):
            print('Conda error: {}'.format(result.get('error')))
        return result

    def get_python_command(self, extra=()):
        if not self.source:
            self._init_existing_environment(self.path)
        return CommandSequence(self.source, self.pip.get_python_command(extra=extra))

    def _get_conda_sh(self):
        # type: () -> Path
        base_conda_env = Path(self.conda).parent.parent / 'etc' / 'profile.d' / 'conda.sh'
        if base_conda_env.is_file():
            return base_conda_env
        for path in os.environ.get('PATH', '').split(select_for_platform(windows=';', linux=':')):
            conda = find_executable("conda", path=path)
            if not conda:
                continue
            conda_env = Path(conda).parent.parent / 'etc' / 'profile.d' / 'conda.sh'
            if conda_env.is_file():
                return conda_env
        return base_conda_env


# enable hashing with cmp=False because pdb fails on un-hashable exceptions
exception = attrs(str=True, cmp=False)


@exception
class CondaException(Exception, NonStrictAttrs):
    command = attrib()
    message = attrib(default=None)


@exception
class UnknownCondaError(CondaException):
    data = attrib(default=Factory(dict))


@exception
class PackagesNotFoundError(CondaException):
    """
    Conda 4.5 exception - this reports all missing packages.
    """

    packages = attrib(default=())


@exception
class PackageNotFoundError(CondaException):
    """
    Conda 4.3 exception - this reports one missing package at a time,
    as a singleton YAML list.
    """

    pkg = attrib(default="", converter=lambda val: yaml.load(val, Loader=yaml.SafeLoader)[0].replace(" ", ""))
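
For reference, the `--quiet --json` conda invocation pattern used by `_run_command` above can be reproduced standalone. The sketch below is illustrative only, not the agent's code path; the conda binary path, environment prefix, and file name in the usage comment are hypothetical placeholders, not values taken from this diff.

import json
import os
import re
import subprocess

# same ANSI-escape stripping pattern as escape_ansi() above
ANSI_ESCAPE = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')

def run_conda_json(conda_bin, *args):
    """Run a conda sub-command with --quiet --json and return the parsed JSON result."""
    env = dict(os.environ)
    env.pop('PYTHONPATH', None)  # keep the caller's PYTHONPATH out of the child process
    cmd = [conda_bin] + list(args) + ['--quiet', '--json']
    proc = subprocess.run(cmd, capture_output=True, text=True, env=env)
    out = ANSI_ESCAPE.sub('', proc.stdout or '')
    return json.loads(out) if out else {}

# usage (hypothetical paths):
# result = run_conda_json('/opt/conda/bin/conda', 'env', 'update', '-p', '/srv/envs/task', '--file', 'conda_env.yml')
# if not result.get('success', False):
#     print('Conda error: {}'.format(result.get('error')))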
184 clearml_agent/helper/package/external_req.py (new file)
@@ -0,0 +1,184 @@
import re
from collections import OrderedDict
from typing import Text

from pathlib2 import Path

from .base import PackageManager
from .requirements import SimpleSubstitution
from ..base import safe_furl as furl


class ExternalRequirements(SimpleSubstitution):

    name = "external_link"
    cwd = None

    def __init__(self, *args, **kwargs):
        super(ExternalRequirements, self).__init__(*args, **kwargs)
        self.post_install_req = []
        self.post_install_req_lookup = OrderedDict()
        self.post_install_local_req_lookup = OrderedDict()

    def match(self, req):
        # match local folder building:
        if self.is_local_folder_package(req):
            # noinspection PyBroadException
            try:
                folder_path = req.req.line.strip().split('#')[0].strip()
                if self.cwd and not Path(folder_path).is_absolute():
                    folder_path = (Path(self.cwd) / Path(folder_path)).absolute().as_posix()
                self.post_install_local_req_lookup['file://{}'.format(folder_path)] = req.req.line
            except Exception:
                pass
            return True

        # match both editable or code or unparsed
        if not (not req.name or req.req and (req.req.editable or req.req.vcs)):
            return False
        if not req.req or not req.req.line or not req.req.line.strip() or req.req.line.strip().startswith('#'):
            return False
        if req.pip_new_version and not (req.req.editable or req.req.vcs):
            return False
        return True

    def post_install(self, session):
        post_install_req = self.post_install_req
        self.post_install_req = []
        for req in post_install_req:
            if self.is_already_installed(req):
                print("No need to reinstall \'{}\' from VCS, "
                      "the exact same version is already installed".format(req.name))
                continue
            req_line = self._add_vcs_credentials(req, session)

            # if we have an older pip version we have to make sure we replace the package name back with the
            # git repository link. In new versions this is supported and we get "package @ git+https://..."
            if not req.pip_new_version:
                # baseline freeze before installing, used to detect the resolved package name below
                freeze_base = PackageManager.out_of_scope_freeze() or ''
                PackageManager.out_of_scope_install_package(req_line, "--no-deps")
                # noinspection PyBroadException
                try:
                    freeze_post = PackageManager.out_of_scope_freeze() or ''
                    package_name = list(set(freeze_post['pip']) - set(freeze_base['pip']))
                    if package_name and package_name[0] not in self.post_install_req_lookup:
                        self.post_install_req_lookup[package_name[0]] = req.req.line
                except Exception:
                    pass

            # no need to force a reinstall, pip will always rebuild if the package comes from git,
            # and make sure the required packages are installed (if they are not it will install them)
            if not PackageManager.out_of_scope_install_package(req_line):
                raise ValueError("Failed installing GIT/HTTPs package \'{}\'".format(req_line))

    @staticmethod
    def _add_vcs_credentials(req, session):
        req_line = req.tostr(markers=False)
        if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
            req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
        if req.req.vcs and req_line.startswith('git+'):
            try:
                url_no_frag = furl(req_line)
                url_no_frag.set(fragment=None)
                # reverse replace
                fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
                vcs_url = req_line[4:]
                # reverse replace
                vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
                # remove ssh:// or git:// prefix for git detection and credentials
                scheme = ''
                if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
                    scheme = 'ssh://'  # notice git:// is actually ssh://
                    vcs_url = vcs_url[6:]

                from ..repo import Git
                vcs = Git(session=session, url=vcs_url, location=None, revision=None)
                vcs._set_ssh_url()
                new_req_line = 'git+{}{}{}'.format(
                    '' if scheme and '://' in vcs.url else scheme,
                    vcs.url_with_auth, fragment
                )
                if new_req_line != req_line:
                    furl_line = furl(new_req_line)
                    print('Replacing original pip vcs \'{}\' with \'{}\''.format(
                        req_line,
                        furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
                    req_line = new_req_line
            except Exception:
                print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
        return req_line

    def replace(self, req):
        """
        Replace a requirement
        :raises: ValueError if version is pre-release
        """
        # Store in post req install, and return nothing
        self.post_install_req.append(req)
        # mark skip package, we will install it in post install hook
        return Text('')

    def replace_back(self, list_of_requirements):
        if not list_of_requirements:
            return list_of_requirements

        for k in list_of_requirements:
            # k is either pip/conda
            if k not in ('pip', 'conda'):
                continue

            original_requirements = list_of_requirements[k]
            list_of_requirements[k] = [r for r in original_requirements
                                       if r not in self.post_install_req_lookup]
            list_of_requirements[k] += [self.post_install_req_lookup.get(r, '')
                                        for r in self.post_install_req_lookup.keys() if r in original_requirements]

            if self.post_install_local_req_lookup:
                original_requirements = list_of_requirements[k]
                list_of_requirements[k] = [
                    r for r in original_requirements
                    if len(r.split('@', 1)) != 2 or r.split('@', 1)[1].strip() not in self.post_install_local_req_lookup]

                list_of_requirements[k] += [
                    self.post_install_local_req_lookup.get(r.split('@', 1)[1].strip(), '')
                    for r in original_requirements
                    if len(r.split('@', 1)) == 2 and r.split('@', 1)[1].strip() in self.post_install_local_req_lookup]

        return list_of_requirements

    @classmethod
    def is_local_folder_package(cls, req):
        # noinspection PyBroadException
        try:
            if not req.name and req.req and not req.req.editable and not req.req.vcs and \
                    req.req.line and req.req.line.strip().split('#')[0] and \
                    not req.req.line.strip().split('#')[0].lower().endswith('.whl') and \
                    not (req.req.line.strip().startswith('-r ') or req.req.line.strip().startswith('--requirement ')):
                return True
        except Exception:
            pass
        return False


class OnlyExternalRequirements(ExternalRequirements):
    def __init__(self, *args, **kwargs):
        super(OnlyExternalRequirements, self).__init__(*args, **kwargs)

    def match(self, req):
        return True

    def replace(self, req):
        """
        Replace a requirement
        :raises: ValueError if version is pre-release
        """
        # Do not store the skipped requirements
        # mark skip package
        if super(OnlyExternalRequirements, self).match(req):
            if self.is_already_installed(req):
                print("No need to reinstall \'{}\' from VCS, "
                      "the exact same version is already installed".format(req.name))
                return Text('')

            return self._add_vcs_credentials(req, self._session)

        return Text('')
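
As a rough illustration of the replace_back() bookkeeping above: resolved entries that appear in the lookup tables are swapped back to the original requirement lines before the requirements are reported. The package names and repository URL below are hypothetical, not taken from this changeset.

from collections import OrderedDict

# hypothetical lookup, as would be filled by post_install() above
post_install_req_lookup = OrderedDict(
    [('my-vcs-package', 'git+https://github.com/example/pkg.git#egg=my-vcs-package')]
)

original = ['numpy==1.24.4', 'my-vcs-package']
# drop the resolved name, then append the original VCS line it came from
cleaned = [r for r in original if r not in post_install_req_lookup]
cleaned += [post_install_req_lookup[r] for r in post_install_req_lookup if r in original]
print(cleaned)
# ['numpy==1.24.4', 'git+https://github.com/example/pkg.git#egg=my-vcs-package']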
0 clearml_agent/helper/package/pip_api/__init__.py (new file)

@@ -1,11 +1,13 @@
import os
import sys
from itertools import chain
from pathlib import Path
from typing import Text, Optional

from trains_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
from trains_agent.helper.package.base import PackageManager
from trains_agent.helper.process import Argv, DEVNULL
from trains_agent.session import Session
from clearml_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
from clearml_agent.helper.package.base import PackageManager
from clearml_agent.helper.process import Argv, DEVNULL
from clearml_agent.session import Session


class SystemPip(PackageManager):
@@ -17,7 +19,8 @@ class SystemPip(PackageManager):
        """
        Program interface to the system pip.
        """
        self._bin = interpreter or sys.executable
        super(SystemPip, self).__init__()
        self._bin = Path(interpreter or sys.executable)
        self.session = session

    @property
@@ -81,7 +84,10 @@ class SystemPip(PackageManager):
        :param kwargs: kwargs for get_output/check_output command
        """
        command = self._make_command(command)
        return (command.get_output if output else command.check_call)(stdin=DEVNULL, **kwargs)
        # make sure we are not running it with our own PYTHONPATH
        env = dict(**os.environ)
        env.pop('PYTHONPATH', None)
        return (command.get_output if output else command.check_call)(stdin=DEVNULL, env=env, **kwargs)

    def _make_command(self, command):
        return Argv(self.bin, '-m', 'pip', '--disable-pip-version-check', *command)
@@ -1,16 +1,18 @@
from typing import Any

from pathlib2 import Path

from trains_agent.helper.base import select_for_platform, rm_tree
from trains_agent.helper.package.base import PackageManager
from trains_agent.helper.process import Argv, PathLike
from trains_agent.session import Session
from clearml_agent.helper.base import select_for_platform, rm_tree, ExecutionInfo
from clearml_agent.helper.package.base import PackageManager
from clearml_agent.helper.process import Argv, PathLike
from clearml_agent.session import Session
from ..pip_api.system import SystemPip
from ..requirements import RequirementsManager


class VirtualenvPip(SystemPip, PackageManager):
    def __init__(self, session, python, requirements_manager, path, interpreter=None):
        # type: (Session, float, RequirementsManager, PathLike, PathLike) -> ()
    def __init__(self, session, python, requirements_manager, path, interpreter=None, execution_info=None, **kwargs):
        # type: (Session, str, RequirementsManager, PathLike, PathLike, ExecutionInfo, Any) -> ()
        """
        Program interface to virtualenv pip.
        Must be given either path to virtualenv or source command.
@@ -37,7 +39,7 @@ class VirtualenvPip(SystemPip, PackageManager):
        if isinstance(requirements, dict) and requirements.get("pip"):
            requirements["pip"] = self.requirements_manager.replace(requirements["pip"])
        super(VirtualenvPip, self).load_requirements(requirements)
        self.requirements_manager.post_install()
        self.requirements_manager.post_install(self.session, package_manager=self)

    def create_flags(self):
        """
@@ -5,8 +5,9 @@ import attr
import sys
import os
from pathlib2 import Path
from trains_agent.helper.process import Argv, DEVNULL, check_if_command_exists
from trains_agent.session import Session, POETRY

from clearml_agent.helper.process import Argv, DEVNULL, check_if_command_exists
from clearml_agent.session import Session, POETRY


def prop_guard(prop, log_prop=None):
@@ -81,6 +82,32 @@ class PoetryConfig:
    @_guard_enabled
    def initialize(self, cwd=None):
        if not self._initialized:
            if self.session.config.get("agent.package_manager.poetry_version", None) is not None:
                version = str(self.session.config.get("agent.package_manager.poetry_version"))
                print('Upgrading Poetry package {}'.format(version))
                # first upgrade pip if we need to
                try:
                    from clearml_agent.helper.package.pip_api.venv import VirtualenvPip
                    pip = VirtualenvPip(
                        session=self.session, python=self._python,
                        requirements_manager=None, path=None, interpreter=self._python)
                    pip.upgrade_pip()
                except Exception as ex:
                    self.log.warning("failed upgrading pip: {}".format(ex))

                # now install poetry
                try:
                    version = version.replace(' ', '')
                    if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
                        version = version
                    elif version:
                        version = "==" + version
                    argv = Argv(self._python, "-m", "pip", "install", "poetry{}".format(version),
                                "--upgrade", "--disable-pip-version-check")
                    print(argv.get_output())
                except Exception as ex:
                    self.log.warning("failed upgrading poetry: {}".format(ex))

            self._initialized = True
            try:
                self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)
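
A minimal sketch of the version handling shown in the hunk above: a bare agent.package_manager.poetry_version value is pinned with '==', while a value that already carries a comparison operator is passed to pip unchanged. The function name below is illustrative, not part of the diff.

def normalize_poetry_version(version):
    """Mirror the spec normalization above: bare versions are pinned with '=='."""
    version = str(version or '').replace(' ', '')
    if any(op in version for op in ('=', '~', '<', '>')):
        return version
    return '==' + version if version else ''

assert normalize_poetry_version('1.2.1') == '==1.2.1'
assert normalize_poetry_version('>=1.2,<1.3') == '>=1.2,<1.3'
assert normalize_poetry_version('') == ''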
Some files were not shown because too many files have changed in this diff.