Compare commits

...

72 Commits

Author SHA1 Message Date
allegroai
c58ffdb9f8 Version bump to v0.15.0 2020-06-01 19:56:59 +03:00
allegroai
54d9d77294 Allow services mode to re-register (docker can kill it without exiting gracefully) 2020-06-01 16:34:33 +03:00
allegroai
ce02385420 Fix services mode aborting docker while installing; detect docker crashes 2020-06-01 16:33:47 +03:00
allegroai
87ffd95eaa Upgrade default pip version to <20.2 2020-06-01 16:33:00 +03:00
allegroai
522dd85d7b Fix docker build with no --entry-point to use bash as an entrypoint 2020-06-01 11:05:06 +03:00
allegroai
3651c85fcd Fix print if no repo (standalone script) 2020-05-31 14:03:31 +03:00
allegroai
566427d550 Fix build failing due to missing session 2020-05-31 14:02:42 +03:00
allegroai
cc99077c92 Do not monitor GPU when running with --cpu-only 2020-05-31 14:01:14 +03:00
allegroai
5f112447f7 CUDA_VISIBLE_DEVICES should not be set to "all" 2020-05-31 14:00:51 +03:00
allegroai
22c5f043aa Fix detached mode to correctly use cache folder slots 2020-05-31 14:00:14 +03:00
allegroai
860ff8911c Fix status message check containing "worker" (deprecated test) 2020-05-31 13:58:39 +03:00
allegroai
799b292146 Support running code from a module (i.e. '-m' in the execution entry point) 2020-05-31 13:54:13 +03:00
allegroai
fffe8e1c3f Fix init wizard, correctly display the input servers 2020-05-31 13:53:34 +03:00
allegroai
8245293f7f Fix request endpoint constant version numbers 2020-05-31 13:52:53 +03:00
allegroai
6563ce70c8 Update README 2020-05-09 20:12:53 +03:00
allegroai
829b1d8f15 Use deep copy to clone configuration, always write configuration before launching a docker 2020-05-09 20:12:29 +03:00
allegroai
f6be64a4b5 Print conda install output if running in debug mode, turn on debugging if --debug flag is used 2020-05-09 20:11:01 +03:00
allegroai
21f6a73f66 Include CUDA version in the pytorch package fail error 2020-05-09 20:09:18 +03:00
allegroai
77c4c79a2f Support pip 20.1 local/http package reference in pip freeze 2020-05-09 20:08:17 +03:00
allegroai
2ad929fa00 Add torch_nightly flag support (if a torch wheel is not found in the stable channel, try the nightly builds); improve torch support in freeze (add the actually used HTTP link as a comment to the original package) 2020-05-09 20:08:05 +03:00
allegroai
53f511f536 Improve docker host-mount support, use TRAINS_AGENT_DOCKER_HOST_MOUNT env var 2020-05-09 20:02:46 +03:00
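A hedged sketch of the host-mount override named above; the `host_path:agent_path` value format and the paths themselves are assumptions, not taken from this changeset:
```bash
# assumption: maps the agent's cache folder (as seen inside its container)
# to the matching path on the docker host, so sibling containers can mount it
export TRAINS_AGENT_DOCKER_HOST_MOUNT=/opt/trains/agent:/root/.trains
```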
allegroai
7c87797a40 Pass git credentials to dockerized task execution 2020-05-09 19:59:58 +03:00
allegroai
272fa07c29 Fix and enhance "build --docker"
- Fix standalone docker execution
- Add --install-globally option to install required packages in the docker's system python
- Add --entry-point option to allow automatic task cloning when running the docker
2020-05-09 19:57:25 +03:00
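A minimal usage sketch for the enhanced `build --docker` flow; the `--entry-point` value is an assumption (check `trains-agent build --help` for the accepted values):
```bash
# sketch: build a self-contained docker for an existing experiment,
# installing its required packages into the docker's system python;
# TASK_ID is a placeholder for an existing experiment id
trains-agent build --id "$TASK_ID" --docker nvidia/cuda \
    --install-globally \
    --entry-point clone_task
```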
allegroai
6ce9cf7c2a Fix version control links in requirements when using conda 2020-05-09 19:52:51 +03:00
allegroai
abb30ac2b8 Move --gpus and --cpu-only to worker args (used by daemon, execute and build) 2020-05-09 19:51:45 +03:00
allegroai
5bb257c46c Add daemon --create-queue to automatically create the queue and use it if the queue name doesn't exist on the server 2020-05-09 19:50:53 +03:00
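For example (a sketch; `my_new_queue` is a placeholder):
```bash
# creates "my_new_queue" on the server if it doesn't exist, then pulls from it
trains-agent daemon --queue my_new_queue --create-queue
```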
allegroai
c65b28ed92 Update venv_update URL 2020-05-09 19:47:00 +03:00
allegroai
fce8eb6782 Add OS environment configuration for git user/pass using TRAINS_AGENT_GIT_USER/TRAINS_AGENT_GIT_PASS 2020-05-09 19:46:46 +03:00
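A hedged sketch using the variable names from the commit above (values are placeholders; a personal access token typically stands in for the password):
```bash
export TRAINS_AGENT_GIT_USER=git_username
export TRAINS_AGENT_GIT_PASS=git_password_or_token
trains-agent daemon --queue default
```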
allegroai
9cb71b9526 Add daemon service mode to allow multiple tasks to be launched simultaneously on the same machine (--service-mode) 2020-05-09 19:45:14 +03:00
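A sketch of the flag named in the commit (spelling taken from the commit message; verify against `trains-agent daemon --help`):
```bash
# one agent launching multiple concurrent tasks from a "services" queue
trains-agent daemon --queue services --service-mode --docker
```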
allegroai
38e02ca5cd Add worker command state conformance enforcement and verification callback 2020-05-09 19:42:51 +03:00
allegroai
06bfea80bc Fix read file scope 2020-04-09 11:27:04 +03:00
allegroai
e660c7f2be Fix comments in config files 2020-04-09 11:23:45 +03:00
allegroai
fc28467080 Improve error message when failing to locate a task 2020-04-09 11:23:13 +03:00
allegroai
8d47905982 Show host information when failing to obtain a task 2020-04-01 19:12:45 +03:00
allegroai
a6a0b01f71 Remove deprecated OS environment variables 2020-04-01 19:11:37 +03:00
allegroai
2b561f6066 Version bump to v0.14.1 2020-03-24 20:37:18 +02:00
allegroai
61232d05dd Fix run-as-user support in Windows and add a fallback for created user folders 2020-03-22 19:16:11 +02:00
allegroai
b3418e4496 Add daemon detached mode (--detached, -d) that runs agent in the background and returns immediately 2020-03-22 19:00:29 +02:00
allegroai
5ef627165c Fix PyTorch support to ignore minor versions when looking for package to install or to download 2020-03-20 10:48:48 +02:00
allegroai
98a983d9a2 Add TRAINS_AGENT_EXTRA_PYTHON_PATH to allow adding additional python path for task execution (helpful when using extra untracked modules) 2020-03-20 10:46:56 +02:00
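A hedged sketch; the path is a placeholder, and a single path is shown (whether colon-separated lists are supported is not stated here):
```bash
# make untracked local modules importable from executed tasks
export TRAINS_AGENT_EXTRA_PYTHON_PATH=/home/user/extra_modules
trains-agent daemon --queue default
```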
allegroai
482007c4ce Fix run as user feature (TRAINS_AGENT_EXEC_USER) 2020-03-20 10:42:32 +02:00
allegroai
98198b8006 Auto mount ~/.git-credentials into docker container if file exists 2020-03-20 10:39:59 +02:00
allegroai
94bb11a81a Change message when using local torch 2020-03-20 10:37:42 +02:00
allegroai
4158d08f6f Fix test 2020-03-20 10:36:20 +02:00
allegroai
58ab67ea31 Fix execution output handling 2020-03-20 10:35:25 +02:00
allegroai
ea0ed4807e Version bump to v0.14.0 2020-03-12 19:42:32 +02:00
allegroai
389600b91e Fix git checkout with submodules 2020-03-12 18:39:47 +02:00
allegroai
5fb2550212 Update to backend API v2.5 2020-03-12 18:39:10 +02:00
allegroai
15e9e6b778 Fix "execute --clone" support 2020-03-12 18:38:35 +02:00
allegroai
aa75b92e46 Prefer docker image from command line over the one in the experiment 2020-03-12 18:35:49 +02:00
allegroai
757210d5b3 Add support for "execute --docker" and for cloning an experiment before execution 2020-03-12 18:33:07 +02:00
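A sketch combining this with the `--clone` fix above (TASK_ID is a placeholder for an existing experiment id):
```bash
# clone the experiment first, then run the copy inside a docker container
trains-agent execute --id "$TASK_ID" --clone --docker
```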
allegroai
00eb2f10ec Version bump to v0.13.3 2020-03-09 16:07:50 +02:00
allegroai
3393372b9c Do not share apt cache among agents on the same machine 2020-03-09 12:38:51 +02:00
allegroai
f2d2d702de Fix k8s support to allow a specific network for the docker (do not use the parent daemon network definition) 2020-03-09 12:38:32 +02:00
allegroai
e3d0680d39 Improve Unicode/UTF stdout handling 2020-03-09 12:34:48 +02:00
allegroai
618c2ac5c4 Add default storage environment vars to generated agent configuration 2020-03-09 12:33:03 +02:00
allegroai
0272c4c79c Add "--force-current-version" daemon command-line flag 2020-03-09 12:31:43 +02:00
allegroai
ff8cf63abf Add "--force-current-version" daemon command-line flag 2020-03-09 12:27:39 +02:00
allegroai
2c7c7f5b44 Add K8s/trains glue service example 2020-03-05 14:10:08 +02:00
allegroai
01f57c1e44 Create missing queues when starting the AWS dynamic cluster management service 2020-03-05 14:08:32 +02:00
allegroai
47bcd3839a Pass correct GPU limit when skipping gpus flag in docker mode 2020-03-05 14:07:44 +02:00
allegroai
0a3a8a1c52 Add support for mounting dockerized experiment folders to host when running on K8s in daemon mode 2020-03-05 13:13:03 +02:00
allegroai
231a907cff Add support for running daemon inside a K8s pod in daemon mode 2020-03-05 13:03:36 +02:00
allegroai
8f95eecf2e Add TRAINS_AGENT_EXEC_USER support for multiple daemon instances 2020-03-05 12:46:53 +02:00
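A sketch using the variable named above (the username is a placeholder):
```bash
# run executed experiments as this pre-existing user
export TRAINS_AGENT_EXEC_USER=trainsuser
trains-agent daemon --queue default
```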
allegroai
81008ee00e Add support for launching a specific python version based on Task.script.binary 2020-03-01 17:15:18 +02:00
allegroai
25bc44c0cf Add poetry to the list of supported package managers 2020-03-01 17:13:15 +02:00
allegroai
f838c8fc70 Allow providing queue names to daemon 2020-02-26 16:58:25 +02:00
allegroai
596093aac6 Version bump to v0.13.2 2020-02-23 16:25:14 +02:00
allegroai
8f23f3b4c0 Add support for pulling recursive git submodules as well as the main project 2020-02-23 15:48:12 +02:00
allegroai
95d503afdd Fix pip install or upgrade with limit in conda 2020-02-23 15:47:28 +02:00
allegroai
73ee33be99 Print error in case Poetry configuration failed 2020-02-23 14:43:21 +02:00
allegroai
ee3adf625f Add single-series-per-graph setting to the configuration example 2020-02-23 12:38:14 +02:00
49 changed files with 19960 additions and 473 deletions

View File

@@ -8,6 +8,8 @@
[![PyPI version shields.io](https://img.shields.io/pypi/v/trains-agent.svg)](https://img.shields.io/pypi/v/trains-agent.svg)
[![PyPI status](https://img.shields.io/pypi/status/trains-agent.svg)](https://pypi.python.org/pypi/trains-agent/)
### Help improve Trains by filling out our 2-min [user survey](https://allegro.ai/lp/trains-user-survey/)
**TRAINS Agent is an AI experiment cluster solution.**
It is a zero-configuration, fire-and-forget execution agent which, combined with trains-server, provides a full AI cluster solution.
@@ -165,8 +167,9 @@ trains-agent daemon --queue default --foreground
```
For actual service mode, all stdout is automatically stored in a temporary file (no need to pipe)
Notice: with the `--detached` flag, *trains-agent* runs in the background
```bash
trains-agent daemon --queue default
trains-agent daemon --detached --queue default
```
GPU allocation is controlled via the standard OS environment variable `NVIDIA_VISIBLE_DEVICES` or the `--gpus` flag (or disabled with `--cpu-only`).
@@ -175,15 +178,16 @@ If no flag is set, and `NVIDIA_VISIBLE_DEVICES` variable doesn't exist, all GPU'
If the `--cpu-only` flag is set, or `NVIDIA_VISIBLE_DEVICES` is an empty string (""), no GPU will be allocated for the `trains-agent`
Example: spin two agents, one per GPU on the same machine:
Notice: with the `--detached` flag, *trains-agent* runs in the background
```bash
trains-agent daemon --gpus 0 --queue default &
trains-agent daemon --gpus 1 --queue default &
trains-agent daemon --detached --gpus 0 --queue default
trains-agent daemon --detached --gpus 1 --queue default
```
Example: spin two agents, pulling from a dedicated `dual_gpu` queue, two GPUs per agent
```bash
trains-agent daemon --gpus 0,1 --queue dual_gpu &
trains-agent daemon --gpus 2,3 --queue dual_gpu &
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu
```
#### Starting the TRAINS Agent in docker mode
@@ -194,20 +198,21 @@ trains-agent daemon --queue default --docker --foreground
```
For actual service mode, all stdout is automatically stored in a file (no need to pipe)
Notice: with the `--detached` flag, *trains-agent* runs in the background
```bash
trains-agent daemon --queue default --docker
trains-agent daemon --detached --queue default --docker
```
Example: spin two agents, one per GPU on the same machine, with the default nvidia/cuda docker:
```bash
trains-agent daemon --gpus 0 --queue default --docker nvidia/cuda &
trains-agent daemon --gpus 1 --queue default --docker nvidia/cuda &
trains-agent daemon --detached --gpus 0 --queue default --docker nvidia/cuda
trains-agent daemon --detached --gpus 1 --queue default --docker nvidia/cuda
```
Example: spin two agents, pulling from a dedicated `dual_gpu` queue, two GPUs per agent, with the default nvidia/cuda docker:
```bash
trains-agent daemon --gpus 0,1 --queue dual_gpu --docker nvidia/cuda &
trains-agent daemon --gpus 2,3 --queue dual_gpu --docker nvidia/cuda &
trains-agent daemon --detached --gpus 0,1 --queue dual_gpu --docker nvidia/cuda
trains-agent daemon --detached --gpus 2,3 --queue dual_gpu --docker nvidia/cuda
```
#### Starting the TRAINS Agent - Priority Queues
@@ -259,3 +264,7 @@ Experiment Pipeline examples
- This example will "process data", and once done, will launch a copy of the 'second step' experiment-template
- [Second step experiment](https://github.com/allegroai/trains/blob/master/examples/automl/toy_base_task.py)
- In order to create an experiment-template in the system, this code must be executed once manually
# License
Apache License, Version 2.0 (see the [LICENSE](https://www.apache.org/licenses/LICENSE-2.0.html) for more information)

View File

@@ -38,7 +38,7 @@ agent {
# currently supported pip and conda
# poetry is used if pip selected and repository contains poetry.lock file
package_manager: {
# supported options: pip, conda
# supported options: pip, conda, poetry
type: pip,
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
@@ -55,6 +55,10 @@ agent {
# additional conda channels to use when installing with conda package manager
conda_channels: ["pytorch", "conda-forge", ]
# set to True to support torch nightly build installation,
# notice: torch nightly builds are ephemeral and are deleted from time to time
torch_nightly: false,
},
# target folder for virtual environments builds, created when executing experiment
@@ -82,9 +86,9 @@ agent {
# reload configuration file every daemon execution
reload_config: false,
# pip cache folder used mapped into docker, for python package caching
# pip cache folder mapped into docker, used for python package caching
docker_pip_cache = ~/.trains/pip-cache
# apt cache folder used mapped into docker, for ubuntu package caching
# apt cache folder mapped into docker, used for ubuntu package caching
docker_apt_cache = ~/.trains/apt-cache
# optional arguments to pass to docker image
@@ -141,6 +145,9 @@ sdk {
quality: 87
subsampling: 0
}
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to True, each series should have its own graph)
tensorboard_single_series_per_graph: False
}
network {

View File

@@ -444,6 +444,12 @@
" os.environ[\"TRAINS_API_SECRET_KEY\"] = TRAINS_SECRET_KEY\n",
" api_client = APIClient()\n",
"\n",
" # Verify the requested queues exist and create those that doesn't exist\n",
" all_queues = [q.name for q in list(api_client.queues.get_all())]\n",
" missing_queues = [q for q in QUEUES if q not in all_queues]\n",
" for q in missing_queues:\n",
" api_client.queues.create(q)\n",
"\n",
" idle_workers = {}\n",
" while True:\n",
" queue_name_to_id = {\n",

View File

@@ -5,7 +5,6 @@ future>=0.16.0
humanfriendly>=2.1
jsonmodels>=2.2
jsonschema>=2.6.0
packaging>=16.0
pathlib2>=2.3.0
psutil>=3.4.2
pyhocon>=0.3.38

View File

@@ -11,7 +11,7 @@ from contextlib import contextmanager
from typing import Iterator, ContextManager, Sequence, IO, Text
from uuid import uuid4
from trains_agent.backend_api.services.tasks import Script
from trains_agent.backend_api.services import tasks
from trains_agent.backend_api.session.client import APIClient
from pathlib2 import Path
from pytest import fixture
@@ -154,7 +154,7 @@ def test_entry_point_warning(client):
"""
with create_task(
client,
script=Script(diff="print('hello')", entry_point="foo.py", repository=""),
script=tasks.Script(diff="print('hello')", entry_point="foo.py", repository=""),
**DEFAULT_TASK_ARGS
) as task, iterate_output(SHORT_TIMEOUT, run_task(task)) as output:
for line in output:
@@ -172,7 +172,7 @@ def test_run_no_dirs(client):
script = "print('{}')".format(uuid)
with create_task(
client,
script=Script(diff=script, entry_point="", repository="", working_dir=""),
script=tasks.Script(diff=script, entry_point="", repository="", working_dir=""),
**DEFAULT_TASK_ARGS
) as task, iterate_output(SHORT_TIMEOUT, run_task(task)) as output:
search_lines(
@@ -196,7 +196,7 @@ def test_run_working_dir(client):
script = "print('{}')".format(uuid)
with create_task(
client,
script=Script(
script=tasks.Script(
diff=script,
entry_point="",
repository="git@bitbucket.org:seematics/roee_test_git.git",
@@ -223,7 +223,7 @@ def test_regular_task(client):
"""
with create_task(
client,
script=Script(
script=tasks.Script(
entry_point="noop.py",
repository="git@bitbucket.org:seematics/roee_test_git.git",
),
@@ -241,7 +241,7 @@ def test_regular_task_nested(client):
"""
with create_task(
client,
script=Script(
script=tasks.Script(
entry_point="noop_nested.py",
working_dir="no_reqs",
repository="git@bitbucket.org:seematics/roee_test_git.git",

View File

@@ -20,6 +20,8 @@ from .interface import get_parser
def run_command(parser, args, command_name):
debug = args.debug
session.Session.set_debug_mode(debug)
if command_name and command_name.lower() in ('config', 'init'):
command_class = commands.Config
elif len(command_name.split('.')) < 2:

View File

@@ -26,7 +26,7 @@
type: pip,
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
pip_version: "<20",
pip_version: "<20.2",
# virtual environment inheres packages from system
system_site_packages: false,
@@ -39,6 +39,10 @@
# additional conda channels to use when installing with conda package manager
conda_channels: ["defaults", "conda-forge", "pytorch", ]
# set to True to support torch nightly build installation,
# notice: torch nightly builds are ephemeral and are deleted from time to time
torch_nightly: false,
},
# target folder for virtual environments builds, created when executing experiment
@@ -66,9 +70,9 @@
# reload configuration file every daemon execution
reload_config: false,
# pip cache folder used mapped into docker, for python package caching
# pip cache folder mapped into docker, used for python package caching
docker_pip_cache = ~/.trains/pip-cache
# apt cache folder used mapped into docker, for ubuntu package caching
# apt cache folder mapped into docker, used for ubuntu package caching
docker_apt_cache = ~/.trains/apt-cache
# optional arguments to pass to docker image

View File

@@ -1,10 +1,10 @@
from .v2_4 import auth
from .v2_4 import debug
from .v2_4 import queues
from .v2_4 import tasks
from .v2_4 import workers
from .v2_4 import events
from .v2_4 import models
from .v2_5 import auth
from .v2_5 import debug
from .v2_5 import queues
from .v2_5 import tasks
from .v2_5 import workers
from .v2_5 import events
from .v2_5 import models
__all__ = [
'auth',

View File

@@ -151,7 +151,7 @@ class CreateCredentialsRequest(Request):
_service = "auth"
_action = "create_credentials"
_version = "2.1"
_version = "2.4"
_schema = {
'additionalProperties': False,
'definitions': {},
@@ -169,7 +169,7 @@ class CreateCredentialsResponse(Response):
"""
_service = "auth"
_action = "create_credentials"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -230,7 +230,7 @@ class EditUserRequest(Request):
_service = "auth"
_action = "edit_user"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -287,7 +287,7 @@ class EditUserResponse(Response):
"""
_service = "auth"
_action = "edit_user"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -347,7 +347,7 @@ class GetCredentialsRequest(Request):
_service = "auth"
_action = "get_credentials"
_version = "2.1"
_version = "2.4"
_schema = {
'additionalProperties': False,
'definitions': {},
@@ -365,7 +365,7 @@ class GetCredentialsResponse(Response):
"""
_service = "auth"
_action = "get_credentials"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -433,7 +433,7 @@ class LoginRequest(Request):
_service = "auth"
_action = "login"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -474,7 +474,7 @@ class LoginResponse(Response):
"""
_service = "auth"
_action = "login"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -510,7 +510,7 @@ class LogoutRequest(Request):
_service = "auth"
_action = "logout"
_version = "2.2"
_version = "2.4"
_schema = {'additionalProperties': False, 'definitions': {}, 'type': 'object'}
@@ -521,7 +521,7 @@ class LogoutResponse(Response):
"""
_service = "auth"
_action = "logout"
_version = "2.2"
_version = "2.4"
_schema = {'additionalProperties': False, 'definitions': {}, 'type': 'object'}
@@ -537,7 +537,7 @@ class RevokeCredentialsRequest(Request):
_service = "auth"
_action = "revoke_credentials"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -577,7 +577,7 @@ class RevokeCredentialsResponse(Response):
"""
_service = "auth"
_action = "revoke_credentials"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},

View File

@@ -19,7 +19,7 @@ class ApiexRequest(Request):
_service = "debug"
_action = "apiex"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'required': [], 'type': 'object'}
@@ -30,7 +30,7 @@ class ApiexResponse(Response):
"""
_service = "debug"
_action = "apiex"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
@@ -43,7 +43,7 @@ class EchoRequest(Request):
_service = "debug"
_action = "echo"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
@@ -54,7 +54,7 @@ class EchoResponse(Response):
"""
_service = "debug"
_action = "echo"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
@@ -65,7 +65,7 @@ class ExRequest(Request):
_service = "debug"
_action = "ex"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'required': [], 'type': 'object'}
@@ -76,7 +76,7 @@ class ExResponse(Response):
"""
_service = "debug"
_action = "ex"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
@@ -89,7 +89,7 @@ class PingRequest(Request):
_service = "debug"
_action = "ping"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
@@ -102,7 +102,7 @@ class PingResponse(Response):
"""
_service = "debug"
_action = "ping"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},
@@ -141,7 +141,7 @@ class PingAuthRequest(Request):
_service = "debug"
_action = "ping_auth"
_version = "1.5"
_version = "2.4"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
@@ -154,7 +154,7 @@ class PingAuthResponse(Response):
"""
_service = "debug"
_action = "ping_auth"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},

View File

@@ -734,7 +734,7 @@ class AddRequest(CompoundRequest):
_service = "events"
_action = "add"
_version = "2.1"
_version = "2.4"
_item_prop_name = "event"
_schema = {
'anyOf': [
@@ -926,7 +926,7 @@ class AddResponse(Response):
"""
_service = "events"
_action = "add"
_version = "2.1"
_version = "2.4"
_schema = {'additionalProperties': True, 'definitions': {}, 'type': 'object'}
@@ -939,7 +939,7 @@ class AddBatchRequest(BatchRequest):
_service = "events"
_action = "add_batch"
_version = "2.1"
_version = "2.4"
_batched_request_cls = AddRequest
@@ -954,7 +954,7 @@ class AddBatchResponse(Response):
"""
_service = "events"
_action = "add_batch"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1015,7 +1015,7 @@ class DebugImagesRequest(Request):
_service = "events"
_action = "debug_images"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1098,7 +1098,7 @@ class DebugImagesResponse(Response):
"""
_service = "events"
_action = "debug_images"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1213,7 +1213,7 @@ class DeleteForTaskRequest(Request):
_service = "events"
_action = "delete_for_task"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'task': {'description': 'Task ID', 'type': 'string'}},
@@ -1248,7 +1248,7 @@ class DeleteForTaskResponse(Response):
"""
_service = "events"
_action = "delete_for_task"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1293,7 +1293,7 @@ class DownloadTaskLogRequest(Request):
_service = "events"
_action = "download_task_log"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1366,7 +1366,7 @@ class DownloadTaskLogResponse(Response):
"""
_service = "events"
_action = "download_task_log"
_version = "2.1"
_version = "2.4"
_schema = {'definitions': {}, 'type': 'string'}
@@ -1385,7 +1385,7 @@ class GetMultiTaskPlotsRequest(Request):
_service = "events"
_action = "get_multi_task_plots"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1472,7 +1472,7 @@ class GetMultiTaskPlotsResponse(Response):
"""
_service = "events"
_action = "get_multi_task_plots"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1571,7 +1571,7 @@ class GetScalarMetricDataRequest(Request):
_service = "events"
_action = "get_scalar_metric_data"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1628,7 +1628,7 @@ class GetScalarMetricDataResponse(Response):
"""
_service = "events"
_action = "get_scalar_metric_data"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1730,7 +1730,7 @@ class GetScalarMetricsAndVariantsRequest(Request):
_service = "events"
_action = "get_scalar_metrics_and_variants"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'task': {'description': 'task ID', 'type': 'string'}},
@@ -1765,7 +1765,7 @@ class GetScalarMetricsAndVariantsResponse(Response):
"""
_service = "events"
_action = "get_scalar_metrics_and_variants"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1811,7 +1811,7 @@ class GetTaskEventsRequest(Request):
_service = "events"
_action = "get_task_events"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1928,7 +1928,7 @@ class GetTaskEventsResponse(Response):
"""
_service = "events"
_action = "get_task_events"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2028,7 +2028,7 @@ class GetTaskLatestScalarValuesRequest(Request):
_service = "events"
_action = "get_task_latest_scalar_values"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'task': {'description': 'Task ID', 'type': 'string'}},
@@ -2063,7 +2063,7 @@ class GetTaskLatestScalarValuesResponse(Response):
"""
_service = "events"
_action = "get_task_latest_scalar_values"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2141,7 +2141,7 @@ class GetTaskLogRequest(Request):
_service = "events"
_action = "get_task_log"
_version = "1.7"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2254,7 +2254,7 @@ class GetTaskLogResponse(Response):
"""
_service = "events"
_action = "get_task_log"
_version = "1.7"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2358,7 +2358,7 @@ class GetTaskPlotsRequest(Request):
_service = "events"
_action = "get_task_plots"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2439,7 +2439,7 @@ class GetTaskPlotsResponse(Response):
"""
_service = "events"
_action = "get_task_plots"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2537,7 +2537,7 @@ class GetVectorMetricsAndVariantsRequest(Request):
_service = "events"
_action = "get_vector_metrics_and_variants"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'task': {'description': 'Task ID', 'type': 'string'}},
@@ -2572,7 +2572,7 @@ class GetVectorMetricsAndVariantsResponse(Response):
"""
_service = "events"
_action = "get_vector_metrics_and_variants"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2623,7 +2623,7 @@ class MultiTaskScalarMetricsIterHistogramRequest(Request):
_service = "events"
_action = "multi_task_scalar_metrics_iter_histogram"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'scalar_key_enum': {'enum': ['iter', 'timestamp', 'iso_time'], 'type': 'string'},
@@ -2712,7 +2712,7 @@ class MultiTaskScalarMetricsIterHistogramResponse(Response):
"""
_service = "events"
_action = "multi_task_scalar_metrics_iter_histogram"
_version = "2.1"
_version = "2.4"
_schema = {'additionalProperties': True, 'definitions': {}, 'type': 'object'}
@@ -2734,7 +2734,7 @@ class ScalarMetricsIterHistogramRequest(Request):
_service = "events"
_action = "scalar_metrics_iter_histogram"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'scalar_key_enum': {'enum': ['iter', 'timestamp', 'iso_time'], 'type': 'string'},
@@ -2816,7 +2816,7 @@ class ScalarMetricsIterHistogramResponse(Response):
"""
_service = "events"
_action = "scalar_metrics_iter_histogram"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2860,7 +2860,7 @@ class VectorMetricsIterHistogramRequest(Request):
_service = "events"
_action = "vector_metrics_iter_histogram"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2927,7 +2927,7 @@ class VectorMetricsIterHistogramResponse(Response):
"""
_service = "events"
_action = "vector_metrics_iter_histogram"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},

View File

@@ -464,7 +464,7 @@ class CreateRequest(Request):
_service = "models"
_action = "create"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -720,7 +720,7 @@ class CreateResponse(Response):
"""
_service = "models"
_action = "create"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -779,7 +779,7 @@ class DeleteRequest(Request):
_service = "models"
_action = "delete"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -834,7 +834,7 @@ class DeleteResponse(Response):
"""
_service = "models"
_action = "delete"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -904,7 +904,7 @@ class EditRequest(Request):
_service = "models"
_action = "edit"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1175,7 +1175,7 @@ class EditResponse(Response):
"""
_service = "models"
_action = "edit"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1279,7 +1279,7 @@ class GetAllRequest(Request):
_service = "models"
_action = "get_all"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'multi_field_pattern_data': {
@@ -1647,7 +1647,7 @@ class GetAllResponse(Response):
"""
_service = "models"
_action = "get_all"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -1770,7 +1770,7 @@ class GetByIdRequest(Request):
_service = "models"
_action = "get_by_id"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'model': {'description': 'Model id', 'type': 'string'}},
@@ -1805,7 +1805,7 @@ class GetByIdResponse(Response):
"""
_service = "models"
_action = "get_by_id"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -1925,7 +1925,7 @@ class GetByTaskIdRequest(Request):
_service = "models"
_action = "get_by_task_id"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1961,7 +1961,7 @@ class GetByTaskIdResponse(Response):
"""
_service = "models"
_action = "get_by_task_id"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -2087,7 +2087,7 @@ class SetReadyRequest(Request):
_service = "models"
_action = "set_ready"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2164,7 +2164,7 @@ class SetReadyResponse(Response):
"""
_service = "models"
_action = "set_ready"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2276,7 +2276,7 @@ class UpdateRequest(Request):
_service = "models"
_action = "update"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2502,7 +2502,7 @@ class UpdateResponse(Response):
"""
_service = "models"
_action = "update"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2581,7 +2581,7 @@ class UpdateForTaskRequest(Request):
_service = "models"
_action = "update_for_task"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2752,7 +2752,7 @@ class UpdateForTaskResponse(Response):
"""
_service = "models"
_action = "update_for_task"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},

View File

@@ -1518,7 +1518,7 @@ class CloseRequest(Request):
_service = "tasks"
_action = "close"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1612,7 +1612,7 @@ class CloseResponse(Response):
"""
_service = "tasks"
_action = "close"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1682,7 +1682,7 @@ class CompletedRequest(Request):
_service = "tasks"
_action = "completed"
_version = "2.2"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -1776,7 +1776,7 @@ class CompletedResponse(Response):
"""
_service = "tasks"
_action = "completed"
_version = "2.2"
_version = "2.4"
_schema = {
'definitions': {},
@@ -1862,7 +1862,7 @@ class CreateRequest(Request):
_service = "tasks"
_action = "create"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'artifact': {
@@ -2229,7 +2229,7 @@ class CreateResponse(Response):
"""
_service = "tasks"
_action = "create"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2280,7 +2280,7 @@ class DeleteRequest(Request):
_service = "tasks"
_action = "delete"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2403,7 +2403,7 @@ class DeleteResponse(Response):
"""
_service = "tasks"
_action = "delete"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2547,7 +2547,7 @@ class DequeueRequest(Request):
_service = "tasks"
_action = "dequeue"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -2624,7 +2624,7 @@ class DequeueResponse(Response):
"""
_service = "tasks"
_action = "dequeue"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},
@@ -2733,7 +2733,7 @@ class EditRequest(Request):
_service = "tasks"
_action = "edit"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'artifact': {
@@ -3123,7 +3123,7 @@ class EditResponse(Response):
"""
_service = "tasks"
_action = "edit"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -3201,7 +3201,7 @@ class EnqueueRequest(Request):
_service = "tasks"
_action = "enqueue"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -3296,7 +3296,7 @@ class EnqueueResponse(Response):
"""
_service = "tasks"
_action = "enqueue"
_version = "1.5"
_version = "2.4"
_schema = {
'definitions': {},
@@ -3386,7 +3386,7 @@ class FailedRequest(Request):
_service = "tasks"
_action = "failed"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -3480,7 +3480,7 @@ class FailedResponse(Response):
"""
_service = "tasks"
_action = "failed"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -3587,7 +3587,7 @@ class GetAllRequest(Request):
_service = "tasks"
_action = "get_all"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'multi_field_pattern_data': {
@@ -3986,7 +3986,7 @@ class GetAllResponse(Response):
"""
_service = "tasks"
_action = "get_all"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -4373,7 +4373,7 @@ class GetByIdRequest(Request):
_service = "tasks"
_action = "get_by_id"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'task': {'description': 'Task ID', 'type': 'string'}},
@@ -4408,7 +4408,7 @@ class GetByIdResponse(Response):
"""
_service = "tasks"
_action = "get_by_id"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
@@ -4792,7 +4792,7 @@ class PingRequest(Request):
_service = "tasks"
_action = "ping"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {'task': {'description': 'Task ID', 'type': 'string'}},
@@ -4825,7 +4825,7 @@ class PingResponse(Response):
"""
_service = "tasks"
_action = "ping"
_version = "2.1"
_version = "2.4"
_schema = {'additionalProperties': False, 'definitions': {}, 'type': 'object'}
@@ -4853,7 +4853,7 @@ class PublishRequest(Request):
_service = "tasks"
_action = "publish"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -4967,7 +4967,7 @@ class PublishResponse(Response):
"""
_service = "tasks"
_action = "publish"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -5057,7 +5057,7 @@ class ResetRequest(Request):
_service = "tasks"
_action = "reset"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -5160,7 +5160,7 @@ class ResetResponse(Response):
"""
_service = "tasks"
_action = "reset"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -5305,7 +5305,7 @@ class SetRequirementsRequest(Request):
_service = "tasks"
_action = "set_requirements"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -5362,7 +5362,7 @@ class SetRequirementsResponse(Response):
"""
_service = "tasks"
_action = "set_requirements"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -5431,7 +5431,7 @@ class StartedRequest(Request):
_service = "tasks"
_action = "started"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -5527,7 +5527,7 @@ class StartedResponse(Response):
"""
_service = "tasks"
_action = "started"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -5617,7 +5617,7 @@ class StopRequest(Request):
_service = "tasks"
_action = "stop"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -5711,7 +5711,7 @@ class StopResponse(Response):
"""
_service = "tasks"
_action = "stop"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -5780,7 +5780,7 @@ class StoppedRequest(Request):
_service = "tasks"
_action = "stopped"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -5874,7 +5874,7 @@ class StoppedResponse(Response):
"""
_service = "tasks"
_action = "stopped"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -5952,7 +5952,7 @@ class UpdateRequest(Request):
_service = "tasks"
_action = "update"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
'properties': {
@@ -6120,7 +6120,7 @@ class UpdateResponse(Response):
"""
_service = "tasks"
_action = "update"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -6183,7 +6183,7 @@ class UpdateBatchRequest(BatchRequest):
_service = "tasks"
_action = "update_batch"
_version = "2.1"
_version = "2.4"
_batched_request_cls = UpdateRequest
@@ -6196,7 +6196,7 @@ class UpdateBatchResponse(Response):
"""
_service = "tasks"
_action = "update_batch"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {},
@@ -6261,7 +6261,7 @@ class ValidateRequest(Request):
_service = "tasks"
_action = "validate"
_version = "2.1"
_version = "2.4"
_schema = {
'definitions': {
'artifact': {
@@ -6614,7 +6614,7 @@ class ValidateResponse(Response):
"""
_service = "tasks"
_action = "validate"
_version = "2.1"
_version = "2.4"
_schema = {'additionalProperties': False, 'definitions': {}, 'type': 'object'}

View File

@@ -0,0 +1,623 @@
"""
auth service
This service provides authentication management and authorization
validation for the entire system.
"""
import six
import types
from datetime import datetime
import enum
from dateutil.parser import parse as parse_datetime
from ....backend_api.session import Request, BatchRequest, Response, DataModel, NonStrictDataModel, CompoundRequest, schema_property, StringEnum
class Credentials(NonStrictDataModel):
"""
:param access_key: Credentials access key
:type access_key: str
:param secret_key: Credentials secret key
:type secret_key: str
"""
_schema = {
'properties': {
'access_key': {
'description': 'Credentials access key',
'type': ['string', 'null'],
},
'secret_key': {
'description': 'Credentials secret key',
'type': ['string', 'null'],
},
},
'type': 'object',
}
def __init__(
self, access_key=None, secret_key=None, **kwargs):
super(Credentials, self).__init__(**kwargs)
self.access_key = access_key
self.secret_key = secret_key
@schema_property('access_key')
def access_key(self):
return self._property_access_key
@access_key.setter
def access_key(self, value):
if value is None:
self._property_access_key = None
return
self.assert_isinstance(value, "access_key", six.string_types)
self._property_access_key = value
@schema_property('secret_key')
def secret_key(self):
return self._property_secret_key
@secret_key.setter
def secret_key(self, value):
if value is None:
self._property_secret_key = None
return
self.assert_isinstance(value, "secret_key", six.string_types)
self._property_secret_key = value
class CredentialKey(NonStrictDataModel):
"""
:param access_key:
:type access_key: str
:param last_used:
:type last_used: datetime.datetime
:param last_used_from:
:type last_used_from: str
"""
_schema = {
'properties': {
'access_key': {'description': '', 'type': ['string', 'null']},
'last_used': {
'description': '',
'format': 'date-time',
'type': ['string', 'null'],
},
'last_used_from': {'description': '', 'type': ['string', 'null']},
},
'type': 'object',
}
def __init__(
self, access_key=None, last_used=None, last_used_from=None, **kwargs):
super(CredentialKey, self).__init__(**kwargs)
self.access_key = access_key
self.last_used = last_used
self.last_used_from = last_used_from
@schema_property('access_key')
def access_key(self):
return self._property_access_key
@access_key.setter
def access_key(self, value):
if value is None:
self._property_access_key = None
return
self.assert_isinstance(value, "access_key", six.string_types)
self._property_access_key = value
@schema_property('last_used')
def last_used(self):
return self._property_last_used
@last_used.setter
def last_used(self, value):
if value is None:
self._property_last_used = None
return
self.assert_isinstance(value, "last_used", six.string_types + (datetime,))
if not isinstance(value, datetime):
value = parse_datetime(value)
self._property_last_used = value
@schema_property('last_used_from')
def last_used_from(self):
return self._property_last_used_from
@last_used_from.setter
def last_used_from(self, value):
if value is None:
self._property_last_used_from = None
return
self.assert_isinstance(value, "last_used_from", six.string_types)
self._property_last_used_from = value
class CreateCredentialsRequest(Request):
"""
Creates a new set of credentials for the authenticated user.
New key/secret is returned.
Note: Secret will never be returned in any other API call.
If a secret is lost or compromised, the key should be revoked
and a new set of credentials can be created.
"""
_service = "auth"
_action = "create_credentials"
_version = "2.5"
_schema = {
'additionalProperties': False,
'definitions': {},
'properties': {},
'type': 'object',
}
class CreateCredentialsResponse(Response):
"""
Response of auth.create_credentials endpoint.
:param credentials: Created credentials
:type credentials: Credentials
"""
_service = "auth"
_action = "create_credentials"
_version = "2.5"
_schema = {
'definitions': {
'credentials': {
'properties': {
'access_key': {
'description': 'Credentials access key',
'type': ['string', 'null'],
},
'secret_key': {
'description': 'Credentials secret key',
'type': ['string', 'null'],
},
},
'type': 'object',
},
},
'properties': {
'credentials': {
'description': 'Created credentials',
'oneOf': [{'$ref': '#/definitions/credentials'}, {'type': 'null'}],
},
},
'type': 'object',
}
def __init__(
self, credentials=None, **kwargs):
super(CreateCredentialsResponse, self).__init__(**kwargs)
self.credentials = credentials
@schema_property('credentials')
def credentials(self):
return self._property_credentials
@credentials.setter
def credentials(self, value):
if value is None:
self._property_credentials = None
return
if isinstance(value, dict):
value = Credentials.from_dict(value)
else:
self.assert_isinstance(value, "credentials", Credentials)
self._property_credentials = value
class EditUserRequest(Request):
"""
Edit a users' auth data properties
:param user: User ID
:type user: str
:param role: The new user's role within the company
:type role: str
"""
_service = "auth"
_action = "edit_user"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'role': {
'description': "The new user's role within the company",
'enum': ['admin', 'superuser', 'user', 'annotator'],
'type': ['string', 'null'],
},
'user': {'description': 'User ID', 'type': ['string', 'null']},
},
'type': 'object',
}
def __init__(
self, user=None, role=None, **kwargs):
super(EditUserRequest, self).__init__(**kwargs)
self.user = user
self.role = role
@schema_property('user')
def user(self):
return self._property_user
@user.setter
def user(self, value):
if value is None:
self._property_user = None
return
self.assert_isinstance(value, "user", six.string_types)
self._property_user = value
@schema_property('role')
def role(self):
return self._property_role
@role.setter
def role(self, value):
if value is None:
self._property_role = None
return
self.assert_isinstance(value, "role", six.string_types)
self._property_role = value
class EditUserResponse(Response):
"""
Response of auth.edit_user endpoint.
:param updated: Number of users updated (0 or 1)
:type updated: float
:param fields: Updated fields names and values
:type fields: dict
"""
_service = "auth"
_action = "edit_user"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'fields': {
'additionalProperties': True,
'description': 'Updated fields names and values',
'type': ['object', 'null'],
},
'updated': {
'description': 'Number of users updated (0 or 1)',
'enum': [0, 1],
'type': ['number', 'null'],
},
},
'type': 'object',
}
def __init__(
self, updated=None, fields=None, **kwargs):
super(EditUserResponse, self).__init__(**kwargs)
self.updated = updated
self.fields = fields
@schema_property('updated')
def updated(self):
return self._property_updated
@updated.setter
def updated(self, value):
if value is None:
self._property_updated = None
return
self.assert_isinstance(value, "updated", six.integer_types + (float,))
self._property_updated = value
@schema_property('fields')
def fields(self):
return self._property_fields
@fields.setter
def fields(self, value):
if value is None:
self._property_fields = None
return
self.assert_isinstance(value, "fields", (dict,))
self._property_fields = value
class GetCredentialsRequest(Request):
"""
Returns all existing credential keys for the authenticated user.
Note: Only credential keys are returned.
"""
_service = "auth"
_action = "get_credentials"
_version = "2.5"
_schema = {
'additionalProperties': False,
'definitions': {},
'properties': {},
'type': 'object',
}
class GetCredentialsResponse(Response):
"""
Response of auth.get_credentials endpoint.
:param credentials: List of credentials, each with an empty secret field.
:type credentials: Sequence[CredentialKey]
"""
_service = "auth"
_action = "get_credentials"
_version = "2.5"
_schema = {
'definitions': {
'credential_key': {
'properties': {
'access_key': {'description': '', 'type': ['string', 'null']},
'last_used': {
'description': '',
'format': 'date-time',
'type': ['string', 'null'],
},
'last_used_from': {
'description': '',
'type': ['string', 'null'],
},
},
'type': 'object',
},
},
'properties': {
'credentials': {
'description': 'List of credentials, each with an empty secret field.',
'items': {'$ref': '#/definitions/credential_key'},
'type': ['array', 'null'],
},
},
'type': 'object',
}
def __init__(
self, credentials=None, **kwargs):
super(GetCredentialsResponse, self).__init__(**kwargs)
self.credentials = credentials
@schema_property('credentials')
def credentials(self):
return self._property_credentials
@credentials.setter
def credentials(self, value):
if value is None:
self._property_credentials = None
return
self.assert_isinstance(value, "credentials", (list, tuple))
if any(isinstance(v, dict) for v in value):
value = [CredentialKey.from_dict(v) if isinstance(v, dict) else v for v in value]
else:
self.assert_isinstance(value, "credentials", CredentialKey, is_array=True)
self._property_credentials = value
class LoginRequest(Request):
"""
Get a token based on supplied credentials (key/secret).
Intended for use by users with key/secret credentials that wish to obtain a token
for use with other services. Token will be limited by the same permissions that
exist for the credentials used in this call.
:param expiration_sec: Requested token expiration time in seconds. Not
guaranteed, might be overridden by the service
:type expiration_sec: int
"""
_service = "auth"
_action = "login"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'expiration_sec': {
'description': 'Requested token expiration time in seconds. \n Not guaranteed, might be overridden by the service',
'type': ['integer', 'null'],
},
},
'type': 'object',
}
def __init__(
self, expiration_sec=None, **kwargs):
super(LoginRequest, self).__init__(**kwargs)
self.expiration_sec = expiration_sec
@schema_property('expiration_sec')
def expiration_sec(self):
return self._property_expiration_sec
@expiration_sec.setter
def expiration_sec(self, value):
if value is None:
self._property_expiration_sec = None
return
if isinstance(value, float) and value.is_integer():
value = int(value)
self.assert_isinstance(value, "expiration_sec", six.integer_types)
self._property_expiration_sec = value
class LoginResponse(Response):
"""
Response of auth.login endpoint.
:param token: Token string
:type token: str
"""
_service = "auth"
_action = "login"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'token': {'description': 'Token string', 'type': ['string', 'null']},
},
'type': 'object',
}
def __init__(
self, token=None, **kwargs):
super(LoginResponse, self).__init__(**kwargs)
self.token = token
@schema_property('token')
def token(self):
return self._property_token
@token.setter
def token(self, value):
if value is None:
self._property_token = None
return
self.assert_isinstance(value, "token", six.string_types)
self._property_token = value
class LogoutRequest(Request):
"""
Removes the authentication cookie from the current session
"""
_service = "auth"
_action = "logout"
_version = "2.5"
_schema = {'additionalProperties': False, 'definitions': {}, 'type': 'object'}
class LogoutResponse(Response):
"""
Response of auth.logout endpoint.
"""
_service = "auth"
_action = "logout"
_version = "2.5"
_schema = {'additionalProperties': False, 'definitions': {}, 'type': 'object'}
class RevokeCredentialsRequest(Request):
"""
Revokes (and deletes) a set (key, secret) of credentials for
the authenticated user.
:param access_key: Credentials key
:type access_key: str
"""
_service = "auth"
_action = "revoke_credentials"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'access_key': {
'description': 'Credentials key',
'type': ['string', 'null'],
},
},
'required': ['key_id'],
'type': 'object',
}
def __init__(
self, access_key=None, **kwargs):
super(RevokeCredentialsRequest, self).__init__(**kwargs)
self.access_key = access_key
@schema_property('access_key')
def access_key(self):
return self._property_access_key
@access_key.setter
def access_key(self, value):
if value is None:
self._property_access_key = None
return
self.assert_isinstance(value, "access_key", six.string_types)
self._property_access_key = value
class RevokeCredentialsResponse(Response):
"""
Response of auth.revoke_credentials endpoint.
:param revoked: Number of credentials revoked
:type revoked: int
"""
_service = "auth"
_action = "revoke_credentials"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'revoked': {
'description': 'Number of credentials revoked',
'enum': [0, 1],
'type': ['integer', 'null'],
},
},
'type': 'object',
}
def __init__(
self, revoked=None, **kwargs):
super(RevokeCredentialsResponse, self).__init__(**kwargs)
self.revoked = revoked
@schema_property('revoked')
def revoked(self):
return self._property_revoked
@revoked.setter
def revoked(self, value):
if value is None:
self._property_revoked = None
return
if isinstance(value, float) and value.is_integer():
value = int(value)
self.assert_isinstance(value, "revoked", six.integer_types)
self._property_revoked = value
response_mapping = {
LoginRequest: LoginResponse,
LogoutRequest: LogoutResponse,
CreateCredentialsRequest: CreateCredentialsResponse,
GetCredentialsRequest: GetCredentialsResponse,
RevokeCredentialsRequest: RevokeCredentialsResponse,
EditUserRequest: EditUserResponse,
}

View File

@@ -0,0 +1,194 @@
"""
debug service
Debugging utilities
"""
import six
import types
from datetime import datetime
import enum
from dateutil.parser import parse as parse_datetime
from ....backend_api.session import Request, BatchRequest, Response, DataModel, NonStrictDataModel, CompoundRequest, schema_property, StringEnum
class ApiexRequest(Request):
"""
"""
_service = "debug"
_action = "apiex"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'required': [], 'type': 'object'}
class ApiexResponse(Response):
"""
Response of debug.apiex endpoint.
"""
_service = "debug"
_action = "apiex"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
class EchoRequest(Request):
"""
Return request data
"""
_service = "debug"
_action = "echo"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
class EchoResponse(Response):
"""
Response of debug.echo endpoint.
"""
_service = "debug"
_action = "echo"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
class ExRequest(Request):
"""
"""
_service = "debug"
_action = "ex"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'required': [], 'type': 'object'}
class ExResponse(Response):
"""
Response of debug.ex endpoint.
"""
_service = "debug"
_action = "ex"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
class PingRequest(Request):
"""
Return a message. Does not require authorization.
"""
_service = "debug"
_action = "ping"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
class PingResponse(Response):
"""
Response of debug.ping endpoint.
:param msg: A friendly message
:type msg: str
"""
_service = "debug"
_action = "ping"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'msg': {
'description': 'A friendly message',
'type': ['string', 'null'],
},
},
'type': 'object',
}
def __init__(
self, msg=None, **kwargs):
super(PingResponse, self).__init__(**kwargs)
self.msg = msg
@schema_property('msg')
def msg(self):
return self._property_msg
@msg.setter
def msg(self, value):
if value is None:
self._property_msg = None
return
self.assert_isinstance(value, "msg", six.string_types)
self._property_msg = value
class PingAuthRequest(Request):
"""
Return a message. Requires authorization.
"""
_service = "debug"
_action = "ping_auth"
_version = "2.5"
_schema = {'definitions': {}, 'properties': {}, 'type': 'object'}
class PingAuthResponse(Response):
"""
Response of debug.ping_auth endpoint.
:param msg: A friendly message
:type msg: str
"""
_service = "debug"
_action = "ping_auth"
_version = "2.5"
_schema = {
'definitions': {},
'properties': {
'msg': {
'description': 'A friendly message',
'type': ['string', 'null'],
},
},
'type': 'object',
}
def __init__(
self, msg=None, **kwargs):
super(PingAuthResponse, self).__init__(**kwargs)
self.msg = msg
@schema_property('msg')
def msg(self):
return self._property_msg
@msg.setter
def msg(self, value):
if value is None:
self._property_msg = None
return
self.assert_isinstance(value, "msg", six.string_types)
self._property_msg = value
response_mapping = {
EchoRequest: EchoResponse,
PingRequest: PingResponse,
PingAuthRequest: PingAuthResponse,
ApiexRequest: ApiexResponse,
ExRequest: ExResponse,
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -16,6 +16,7 @@ from .request import Request, BatchRequest
from .token_manager import TokenManager
from ..config import load
from ..utils import get_http_session_with_retry, urllib_log_warning_setup
from ...backend_config.environment import backward_compatibility_support
from ...version import __version__
@@ -86,6 +87,8 @@ class Session(TokenManager):
config=None,
**kwargs
):
# add backward compatibility support for old environment variables
backward_compatibility_support()
if config is not None:
self.config = config

View File

@@ -23,3 +23,31 @@ class EnvEntry(Entry):
def error(self, message):
print("Environment configuration: {}".format(message))
def backward_compatibility_support():
from ..definitions import ENVIRONMENT_CONFIG, ENVIRONMENT_SDK_PARAMS, ENVIRONMENT_BACKWARD_COMPATIBLE
if not ENVIRONMENT_BACKWARD_COMPATIBLE.get():
return
# Add ALG_ prefix on every TRAINS_ os environment we support
for k, v in ENVIRONMENT_CONFIG.items():
try:
trains_vars = [var for var in v.vars if var.startswith('TRAINS_')]
if not trains_vars:
continue
alg_var = trains_vars[0].replace('TRAINS_', 'ALG_', 1)
if alg_var not in v.vars:
v.vars = tuple(list(v.vars) + [alg_var])
except:
continue
for k, v in ENVIRONMENT_SDK_PARAMS.items():
try:
trains_vars = [var for var in v if var.startswith('TRAINS_')]
if not trains_vars:
continue
alg_var = trains_vars[0].replace('TRAINS_', 'ALG_', 1)
if alg_var not in v:
ENVIRONMENT_SDK_PARAMS[k] = tuple(list(v) + [alg_var])
except:
continue
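Given the aliasing loop above, the practical effect is that every supported `TRAINS_`-prefixed variable also answers to an `ALG_`-prefixed spelling. A hedged illustration (the variable name is an example, not taken from this diff):
```bash
# both spellings configure the same setting once backward_compatibility_support() has run
export TRAINS_API_HOST=http://localhost:8008   # current name
export ALG_API_HOST=http://localhost:8008      # legacy alias, still honored
```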

View File

@@ -94,9 +94,20 @@ class ServiceCommandSection(BaseCommandSection):
def __init__(self, *args, **kwargs):
super(ServiceCommandSection, self).__init__()
kwargs = self._verify_command_states(kwargs)
self._session = self._get_session(*args, **kwargs)
self._list_formatter = ListFormatter(self.service)
@classmethod
def _verify_command_states(cls, kwargs):
"""
Conform and enforce command arguments.
This is where you can automatically turn on/off switches based on different states.
:param kwargs:
:return: kwargs
"""
return kwargs
@staticmethod
def _get_session(*args, **kwargs):
return Session(*args, **kwargs)

View File

@@ -44,7 +44,7 @@ def main():
sentinel = ''
parse_input = '\n'.join(iter(input, sentinel))
credentials = None
api_host = None
api_server = None
web_server = None
# noinspection PyBroadException
try:
@@ -52,11 +52,11 @@ def main():
if parsed:
# Take the credentials in raw form or from api section
credentials = get_parsed_field(parsed, ["credentials"])
api_host = get_parsed_field(parsed, ["api_server", "host"])
api_server = get_parsed_field(parsed, ["api_server", "host"])
web_server = get_parsed_field(parsed, ["web_server"])
except Exception:
credentials = credentials or None
api_host = api_host or None
api_server = api_server or None
web_server = web_server or None
while not credentials or set(credentials) != {"access_key", "secret_key"}:
@@ -65,63 +65,25 @@ def main():
print('Detected credentials key=\"{}\" secret=\"{}\"'.format(credentials['access_key'],
credentials['secret_key'][0:4] + "***"))
if api_host:
api_host = input_url('API Host', api_host)
web_input = True
if web_server:
host = input_url('WEB Host', web_server)
elif api_server:
web_input = False
host = input_url('API Host', api_server)
else:
print(host_description)
api_host = input_url('API Host', '')
parsed_host = verify_url(api_host)
host = input_url('WEB Host', '')
if parsed_host.netloc.startswith('demoapp.'):
# this is our demo server
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapp.', 'demoapi.', 1) + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapp.', 'demofiles.', 1) + parsed_host.path
elif parsed_host.netloc.startswith('app.'):
# this is our application server
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('app.', 'api.', 1) + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('app.', 'files.', 1) + parsed_host.path
elif parsed_host.netloc.startswith('demoapi.'):
print('{} is the api server, we need the web server. Replacing \'demoapi.\' with \'demoapp.\''.format(
parsed_host.netloc))
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapi.', 'demoapp.', 1) + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapi.', 'demofiles.', 1) + parsed_host.path
elif parsed_host.netloc.startswith('api.'):
print('{} is the api server, we need the web server. Replacing \'api.\' with \'app.\''.format(
parsed_host.netloc))
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('api.', 'app.', 1) + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('api.', 'files.', 1) + parsed_host.path
elif parsed_host.port == 8008:
print('Port 8008 is the api port. Replacing 8080 with 8008 for Web application')
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8080', 1) + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8081', 1) + parsed_host.path
elif parsed_host.port == 8080:
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8008', 1) + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8081', 1) + parsed_host.path
parsed_host = verify_url(host)
api_host, files_host, web_host = parse_host(parsed_host, allow_input=True)
# one of these two was already configured
if not web_input:
web_host = input_url('Web Application Host', web_host)
else:
api_host = ''
web_host = ''
files_host = ''
if not parsed_host.port:
print('Host port not detected, do you wish to use the default 8080 port n/[y]? ', end='')
replace_port = input().lower()
if not replace_port or replace_port == 'y' or replace_port == 'yes':
api_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8008' + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8080' + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8081' + parsed_host.path
elif not replace_port or replace_port.lower() == 'n' or replace_port.lower() == 'no':
web_host = input_host_port("Web", parsed_host)
api_host = input_host_port("API", parsed_host)
files_host = input_host_port("Files", parsed_host)
if not api_host:
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
api_host = input_url('API Host', api_host)
web_host = input_url('Web Application Host', web_server if web_server else web_host)
files_host = input_url('File Store Host', files_host)
print('\nTRAINS Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
@@ -208,6 +170,63 @@ def main():
print('TRAINS-AGENT setup completed successfully.')
def parse_host(parsed_host, allow_input=True):
if parsed_host.netloc.startswith('demoapp.'):
# this is our demo server
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapp.', 'demoapi.', 1) + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapp.', 'demofiles.',
1) + parsed_host.path
elif parsed_host.netloc.startswith('app.'):
# this is our application server
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('app.', 'api.', 1) + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('app.', 'files.', 1) + parsed_host.path
elif parsed_host.netloc.startswith('demoapi.'):
print('{} is the api server, we need the web server. Replacing \'demoapi.\' with \'demoapp.\''.format(
parsed_host.netloc))
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapi.', 'demoapp.', 1) + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapi.', 'demofiles.',
1) + parsed_host.path
elif parsed_host.netloc.startswith('api.'):
print('{} is the api server, we need the web server. Replacing \'api.\' with \'app.\''.format(
parsed_host.netloc))
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('api.', 'app.', 1) + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('api.', 'files.', 1) + parsed_host.path
elif parsed_host.port == 8008:
print('Port 8008 is the api port. Replacing 8080 with 8008 for Web application')
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8080', 1) + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8081', 1) + parsed_host.path
elif parsed_host.port == 8080:
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8008', 1) + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8081', 1) + parsed_host.path
elif allow_input:
api_host = ''
web_host = ''
files_host = ''
if not parsed_host.port:
print('Host port not detected, do you wish to use the default 8080 port n/[y]? ', end='')
replace_port = input().lower()
if not replace_port or replace_port == 'y' or replace_port == 'yes':
api_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8008' + parsed_host.path
web_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8080' + parsed_host.path
files_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8081' + parsed_host.path
elif not replace_port or replace_port.lower() == 'n' or replace_port.lower() == 'no':
web_host = input_host_port("Web", parsed_host)
api_host = input_host_port("API", parsed_host)
files_host = input_host_port("Files", parsed_host)
if not api_host:
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
else:
raise ValueError("Could not parse host name")
return api_host, files_host, web_host
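For example, a sketch of the 'app.' branch above (host names are illustrative):
# Hypothetical example
# parsed = verify_url('http://app.example.com')
# api_host, files_host, web_host = parse_host(parsed, allow_input=False)
# -> ('http://api.example.com', 'http://files.example.com', 'http://app.example.com')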
def verify_credentials(api_host, credentials):
"""check if the credentials are valid"""
# noinspection PyBroadException

File diff suppressed because it is too large

View File

@@ -55,35 +55,41 @@ class EnvironmentConfig(object):
ENVIRONMENT_CONFIG = {
"api.api_server": EnvironmentConfig("TRAINS_API_HOST", "ALG_API_HOST"),
"api.api_server": EnvironmentConfig("TRAINS_API_HOST", ),
"api.credentials.access_key": EnvironmentConfig(
"TRAINS_API_ACCESS_KEY", "ALG_API_ACCESS_KEY"
"TRAINS_API_ACCESS_KEY",
),
"api.credentials.secret_key": EnvironmentConfig(
"TRAINS_API_SECRET_KEY", "ALG_API_SECRET_KEY"
"TRAINS_API_SECRET_KEY",
),
"agent.worker_name": EnvironmentConfig("TRAINS_WORKER_NAME", "ALG_WORKER_NAME"),
"agent.worker_id": EnvironmentConfig("TRAINS_WORKER_ID", "ALG_WORKER_ID"),
"agent.worker_name": EnvironmentConfig("TRAINS_WORKER_NAME", ),
"agent.worker_id": EnvironmentConfig("TRAINS_WORKER_ID", ),
"agent.cuda_version": EnvironmentConfig(
"TRAINS_CUDA_VERSION", "ALG_CUDA_VERSION", "CUDA_VERSION"
"TRAINS_CUDA_VERSION", "CUDA_VERSION"
),
"agent.cudnn_version": EnvironmentConfig(
"TRAINS_CUDNN_VERSION", "ALG_CUDNN_VERSION", "CUDNN_VERSION"
"TRAINS_CUDNN_VERSION", "CUDNN_VERSION"
),
"agent.cpu_only": EnvironmentConfig(
"TRAINS_CPU_ONLY", "ALG_CPU_ONLY", "CPU_ONLY", type=bool
"TRAINS_CPU_ONLY", "CPU_ONLY", type=bool
),
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
"sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
"sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")},
"sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
}
CONFIG_FILE_ENV = EnvironmentConfig("ALG_CONFIG_FILE")
ENVIRONMENT_SDK_PARAMS = {
"task_id": ("TRAINS_TASK_ID", "ALG_TASK_ID"),
"config_file": ("TRAINS_CONFIG_FILE", "ALG_CONFIG_FILE", "TRAINS_CONFIG_FILE"),
"log_level": ("TRAINS_LOG_LEVEL", "ALG_LOG_LEVEL"),
"log_to_backend": ("TRAINS_LOG_TASK_TO_BACKEND", "ALG_LOG_TASK_TO_BACKEND"),
"task_id": ("TRAINS_TASK_ID", ),
"config_file": ("TRAINS_CONFIG_FILE", ),
"log_level": ("TRAINS_LOG_LEVEL", ),
"log_to_backend": ("TRAINS_LOG_TASK_TO_BACKEND", ),
}
ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig("TRAINS_AGENT_ALG_ENV", type=bool)
VIRTUAL_ENVIRONMENT_PATH = {
"python2": normalize_path(CONFIG_DIR, "py2venv"),
"python3": normalize_path(CONFIG_DIR, "py3venv"),
@@ -107,13 +113,18 @@ HTTP_HEADERS = {
METADATA_EXTENSION = ".json"
DEFAULT_VENV_UPDATE_URL = (
"https://raw.githubusercontent.com/Yelp/venv-update/v3.2.2/venv_update.py"
"https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
)
WORKING_REPOSITORY_DIR = "task_repository"
DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
PIP_EXTRA_INDICES = [
]
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
ENV_AGENT_GIT_USER = EnvironmentConfig('TRAINS_AGENT_GIT_USER')
ENV_AGENT_GIT_PASS = EnvironmentConfig('TRAINS_AGENT_GIT_PASS')
ENV_TASK_EXECUTE_AS_USER = 'TRAINS_AGENT_EXEC_USER'
ENV_TASK_EXTRA_PYTHON_PATH = 'TRAINS_AGENT_EXTRA_PYTHON_PATH'
ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
class FileBuffering(IntEnum):

View File

@@ -0,0 +1 @@

trains_agent/glue/k8s.py Normal file
View File

@@ -0,0 +1,169 @@
from __future__ import print_function, division, unicode_literals
import logging
import os
import subprocess
from time import sleep
from typing import Text, List
from pyhocon import HOCONConverter
from trains_agent.commands.events import Events
from trains_agent.commands.worker import Worker
from trains_agent.helper.process import get_bash_output
from trains_agent.helper.resource_monitor import ResourceMonitor
class K8sIntegration(Worker):
K8S_PENDING_QUEUE = "k8s_scheduler"
KUBECTL_RUN_CMD = "kubectl run trains_id_{task_id} " \
"--image {docker_image} " \
"--restart=Never --replicas=1 " \
"--generator=run-pod/v1"
KUBECTL_DELETE_CMD = "kubectl delete pods " \
"--selector=TRAINS=agent " \
"--field-selector=status.phase!=Pending,status.phase!=Running"
CONTAINER_BASH_SCRIPT = "apt-get install -y git python-pip && " \
"pip install trains-agent && " \
"python -u -m trains_agent execute --full-monitoring --require-queue --id {}"
def __init__(self, k8s_pending_queue_name=None, kubectl_cmd=None, container_bash_script=None, debug=False):
"""
Initialize the k8s integration glue layer daemon
:param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
:param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
example: "task={task_id} image={docker_image} queue_id={queue_id}"
or a callable function: kubectl_cmd(task_id, docker_image, queue_id, task_data)
:param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
:param bool debug: Switch logging on
"""
super(K8sIntegration, self).__init__()
self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
# Always use system packages, since we will be running inside a docker
self._session.config.put("agent.package_manager.system_site_packages", True)
# Add debug logging
if debug:
self.log.logger.disabled = False
self.log.logger.setLevel(logging.INFO)
def run_one_task(self, queue: Text, task_id: Text, worker_args=None):
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
# push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
try:
self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
status_reason='k8s pending scheduler')
except Exception as e:
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
task_id, self.k8s_pending_queue_name, e))
return
if task_data.execution.docker_cmd:
docker_image = task_data.execution.docker_cmd
else:
docker_image = str(os.environ.get("TRAINS_DOCKER_IMAGE") or
self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
# take the first part, this is the docker image name (not arguments)
docker_image = docker_image.split()[0]
create_trains_conf = "echo '{}' >> ~/trains.conf && ".format(
HOCONConverter.to_hocon(self._session.config._config))
if callable(self.kubectl_cmd):
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
else:
kubectl_cmd = self.kubectl_cmd.format(task_id=task_id, docker_image=docker_image, queue_id=queue)
# make sure we have a list
if isinstance(kubectl_cmd, str):
kubectl_cmd = kubectl_cmd.split()
kubectl_cmd += ["--labels=TRAINS=agent", "--command", "--", "/bin/sh", "-c",
create_trains_conf + self.container_bash_script.format(task_id)]
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate()
self.log.info("K8s scheduling experiment task id={}".format(task_id))
if error:
self.log.error("Running kubectl encountered an error: {}".format(
error if isinstance(error, str) else error.decode()))
def run_tasks_loop(self, queues: List[Text], worker_params):
"""
:summary: Pull and run tasks from queues.
:description: 1. Go through ``queues`` by order.
2. Try getting the next task for each and run the first one that returns.
3. Go to step 1
:param queues: IDs of queues to pull tasks from
:type queues: list of ``Text``
:param worker_params: Worker command line arguments
:type worker_params: ``trains_agent.helper.process.WorkerParams``
"""
events_service = self.get_service(Events)
# make sure we have a k8s pending queue
try:
self._session.api_client.queues.create(self.k8s_pending_queue_name)
except Exception:
pass
# get queue id
self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")
_last_machine_update_ts = 0
while True:
# iterate over queues (priority style, queues[0] is highest)
for queue in queues:
# delete old completed/failed pods
get_bash_output(self.KUBECTL_DELETE_CMD)
# get next task in queue
try:
response = self._session.api_client.queues.get_next_task(queue=queue)
except Exception as e:
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
continue
else:
try:
task_id = response.entry.task
except AttributeError:
print("No tasks in queue {}".format(queue))
continue
events_service.send_log_events(
self.worker_id,
task_id=task_id,
lines="task {} pulled from {} by worker {}".format(
task_id, queue, self.worker_id
),
level="INFO",
)
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
self.run_one_task(queue, task_id, worker_params)
self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
break
else:
# sleep and retry polling
print("No tasks in Queues, sleeping for {:.1f} seconds".format(self._polling_interval))
sleep(self._polling_interval)
if self._session.config["agent.reload_config"]:
self.reload_config()
def k8s_daemon(self, queues):
"""
Start the k8s Glue service.
This service will be pulling tasks from *queues* and scheduling them for execution using kubectl.
Notice all scheduled tasks are pushed back into K8S_PENDING_QUEUE,
and popped when execution actually starts. This creates full visibility into the k8s scheduler.
Manually popping a task from the K8S_PENDING_QUEUE will cause the k8s
scheduler to skip the execution once the scheduled task needs to be executed
:param list(str) queues: List of queue names to pull from
"""
return self.daemon(queues=queues, log_level=logging.INFO, foreground=True, docker=False)
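A minimal launch sketch (the queue name is illustrative):
# Hypothetical usage sketch
# glue = K8sIntegration(debug=True)
# glue.k8s_daemon(queues=['default'])   # pull tasks and schedule them via kubectl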

View File

@@ -199,6 +199,20 @@ def get_python_path(script_dir, entry_point, package_api):
return None
def add_python_path(base_path, extra_path):
try:
if not extra_path:
return base_path
python_path_sep = ';' if is_windows_platform() else ':'
base_path = base_path or ''
if not base_path.endswith(python_path_sep):
base_path += python_path_sep
base_path += extra_path.replace(':', python_path_sep)
except:
pass
return base_path
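A quick sketch of the expected behavior on a POSIX platform (paths are illustrative):
# Hypothetical example
# add_python_path('/opt/project', '/extra/one:/extra/two')
# -> '/opt/project:/extra/one:/extra/two'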
class Singleton(ABCMeta):
_instances = {}
@@ -463,6 +477,17 @@ def rm_tree(root): # type: (Union[Path, Text]) -> None
return shutil.rmtree(os.path.expanduser(os.path.expandvars(Text(root))), onerror=on_error)
def rm_file(filename): # type: (Union[Path, Text]) -> None
"""
A version of os.unlink that will not raise an error
"""
try:
os.unlink(os.path.expanduser(os.path.expandvars(Text(filename))))
except:
return False
return True
def is_conda(config):
return config['agent.package_manager.type'].lower() == 'conda'

View File

@@ -4,7 +4,7 @@ from time import sleep
import requests
import json
from threading import Thread
from packaging import version as packaging_version
from .package.requirements import SimpleVersion
from ..version import __version__
__check_update_thread = None
@@ -30,11 +30,11 @@ def _check_new_version_available():
return None
trains_answer = update_server_releases.get("trains-agent", {})
latest_version = trains_answer.get("version")
cur_version = packaging_version.parse(cur_version)
latest_version = packaging_version.parse(latest_version or '')
if cur_version >= latest_version:
cur_version = cur_version
latest_version = latest_version or ''
if SimpleVersion.compare_versions(cur_version, '>=', latest_version):
return None
patch_upgrade = latest_version.major == cur_version.major and latest_version.minor == cur_version.minor
patch_upgrade = True # latest_version.major == cur_version.major and latest_version.minor == cur_version.minor
return str(latest_version), patch_upgrade, trains_answer.get("description").split("\r\n")

View File

@@ -22,6 +22,18 @@ def print_text(text, newline=True):
sys.stdout.write(data)
def decode_binary_lines(binary_lines, encoding='utf-8'):
# decode each line; if decoding fails, insert an empty line instead
lines = []
for b in binary_lines:
try:
l = b.decode(encoding=encoding, errors='replace').replace('\r', '\n')
except:
l = ''
lines.append(l + '\n' if l and l[-1] != '\n' else l)
return lines
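For instance (inputs are illustrative):
# Hypothetical example
# decode_binary_lines([b'hello\n', b'world'])
# -> ['hello\n', 'world\n']   # a trailing newline is appended when missing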
def ensure_text(s, encoding='utf-8', errors='strict'):
"""Coerce *s* to six.text_type.
For Python 2:

View File

@@ -0,0 +1,74 @@
import os
def daemonize_process(redirect_fd=None):
"""
Detach a process from the controlling terminal and run it in the background as a daemon.
"""
assert redirect_fd is None or isinstance(redirect_fd, int)
# re-spawn in the same directory
WORKDIR = os.getcwd()
# The standard I/O file descriptors are redirected to /dev/null by default.
if hasattr(os, "devnull"):
devnull = os.devnull
else:
devnull = "/dev/null"
try:
# Fork a child process so the parent can exit. This returns control to
# the command-line or shell. It also guarantees that the child will not
# be a process group leader, since the child receives a new process ID
# and inherits the parent's process group ID. This step is required
# to ensure that the next call to os.setsid is successful.
pid = os.fork()
except OSError as e:
raise Exception("%s [%d]" % (e.strerror, e.errno))
if pid == 0: # The first child.
# To become the session leader of this new session and the process group
# leader of the new process group, we call os.setsid().
# The process is also guaranteed not to have a controlling terminal.
os.setsid()
# Is ignoring SIGHUP necessary? (Set handlers for asynchronous events.)
# import signal
# signal.signal(signal.SIGHUP, signal.SIG_IGN)
try:
# Fork a second child and exit immediately to prevent zombies. This
# causes the second child process to be orphaned, making the init
# process responsible for its cleanup.
pid = os.fork() # Fork a second child.
except OSError as e:
raise Exception("%s [%d]" % (e.strerror, e.errno))
if pid == 0: # The second child.
# Since the current working directory may be a mounted filesystem, we
# avoid the issue of not being able to unmount the filesystem at
# shutdown time by changing it to the root directory.
os.chdir(WORKDIR)
# We probably don't want the file mode creation mask inherited from
# the parent, so we give the child complete control over permissions.
os.umask(0)
else:
# Exit parent (the first child) of the second child.
os._exit(0)
else:
# Exit parent of the first child.
os._exit(0)
# notice we count on the fact that we keep all file descriptors open,
# since we opened them in the parent process, but the daemon process will use them
# Redirect the standard I/O file descriptors to the specified file /dev/null.
if redirect_fd is None:
redirect_fd = os.open(devnull, os.O_RDWR)
# Duplicate the redirect descriptor onto the
# standard output (1) and standard error (2) file descriptors.
os.dup2(redirect_fd, 1)
os.dup2(redirect_fd, 2)
return 0
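Only the double-forked grandchild survives the call (both parents call os._exit), so any code after it runs as the detached daemon. A usage sketch:
# Hypothetical usage sketch
# daemonize_process()   # returns 0 in the detached daemon process
# run_worker_loop()     # hypothetical long-running work, now in the background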

View File

@@ -14,13 +14,13 @@ import yaml
from time import time
from attr import attrs, attrib, Factory
from pathlib2 import Path
from packaging import version as packaging_version
from requirements import parse
from requirements.requirement import Requirement
from trains_agent.errors import CommandFailedError
from trains_agent.helper.base import rm_tree, NonStrictAttrs, select_for_platform, is_windows_platform
from trains_agent.helper.process import Argv, Executable, DEVNULL, CommandSequence, PathLike
from trains_agent.helper.package.requirements import SimpleVersion
from trains_agent.session import Session
from .base import PackageManager
from .pip_api.venv import VirtualenvPip
@@ -59,7 +59,7 @@ class CondaAPI(PackageManager):
A programmatic interface for controlling conda
"""
MINIMUM_VERSION = packaging_version.parse("4.3.30")
MINIMUM_VERSION = "4.3.30"
def __init__(self, session, path, python, requirements_manager):
# type: (Session, PathLike, float, RequirementsManager) -> None
@@ -93,7 +93,7 @@ class CondaAPI(PackageManager):
)
)
self.conda_version = self.get_conda_version(output)
if packaging_version.parse(self.conda_version) < self.MINIMUM_VERSION:
if SimpleVersion.compare_versions(self.conda_version, '<', self.MINIMUM_VERSION):
raise CommandFailedError(
"conda version '{}' is smaller than minimum supported conda version '{}'".format(
self.conda_version, self.MINIMUM_VERSION
@@ -112,7 +112,7 @@ class CondaAPI(PackageManager):
return self.pip.bin
def upgrade_pip(self):
return self.pip.upgrade_pip()
return self._install("pip" + self.pip.get_pip_version())
def create(self):
"""
@@ -262,6 +262,7 @@ class CondaAPI(PackageManager):
# this should happen if experiment was executed on non-conda machine or old trains client
conda_supported_req = requirements['pip'] if requirements.get('conda', None) is None else requirements['conda']
conda_supported_req_names = []
pip_requirements = []
for r in conda_supported_req:
try:
marker = list(parse(r))
@@ -271,6 +272,10 @@ class CondaAPI(PackageManager):
continue
m = MarkerRequirement(marker[0])
# conda does not support version control links
if m.vcs:
pip_requirements.append(m)
continue
conda_supported_req_names.append(m.name.lower())
if m.req.name.lower() == 'matplotlib':
has_matplotlib = True
@@ -287,7 +292,6 @@ class CondaAPI(PackageManager):
reqs.append(m)
pip_requirements = []
# if we have a conda list, the rest should be installed with pip,
if requirements.get('conda', None) is not None:
for r in requirements['pip']:
@@ -416,10 +420,14 @@ class CondaAPI(PackageManager):
try:
print('Executing Conda: {}'.format(command.serialize()))
result = command.get_output(stdin=DEVNULL, **kwargs)
if self.session.debug_mode:
print(result)
except Exception as e:
result = e.output if hasattr(e, 'output') else ''
if self.session.debug_mode:
print(result)
if raw:
raise
result = e.output if hasattr(e, 'output') else ''
if raw:
return result

View File

@@ -1,22 +1,24 @@
import sys
from itertools import chain
from typing import Text
from typing import Text, Optional
from trains_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
from trains_agent.helper.package.base import PackageManager
from trains_agent.helper.process import Argv, DEVNULL
from trains_agent.session import Session
class SystemPip(PackageManager):
indices_args = None
def __init__(self, interpreter=None):
# type: (Text) -> ()
def __init__(self, interpreter=None, session=None):
# type: (Optional[Text], Optional[Session]) -> ()
"""
Program interface to the system pip.
"""
self._bin = interpreter or sys.executable
self.session = session
@property
def bin(self):

View File

@@ -15,19 +15,17 @@ class VirtualenvPip(SystemPip, PackageManager):
Program interface to virtualenv pip.
Must be given either path to virtualenv or source command.
Either way, ``self.source`` is exposed.
:param session: a Session object for communication
:param python: interpreter path
:param path: path of virtual environment to create/manipulate
:param python: python version
:param interpreter: path of python interpreter
"""
super(VirtualenvPip, self).__init__(
interpreter
or Path(
path,
select_for_platform(linux="bin/python", windows="scripts/python.exe"),
)
session=session,
interpreter=interpreter or Path(
path, select_for_platform(linux="bin/python", windows="scripts/python.exe"))
)
self.session = session
self.path = path
self.requirements_manager = requirements_manager
self.python = python

View File

@@ -82,9 +82,13 @@ class PoetryConfig:
def initialize(self, cwd=None):
if not self._initialized:
self._initialized = True
self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)
# self._config("repositories.{}".format(self.REPO_NAME), PYTHON_INDEX)
# self._config("http-basic.{}".format(self.REPO_NAME), *PYTHON_INDEX_CREDENTIALS)
try:
self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)
# self._config("repositories.{}".format(self.REPO_NAME), PYTHON_INDEX)
# self._config("http-basic.{}".format(self.REPO_NAME), *PYTHON_INDEX_CREDENTIALS)
except Exception as ex:
print("Exception: {}\nError: Failed configuring Poetry virtualenvs.in-project".format(ex))
raise
def get_api(self, path):
# type: (Path) -> PoetryAPI

View File

@@ -10,11 +10,9 @@ from typing import Text
import attr
import requests
from packaging import version as packaging_version
from packaging.specifiers import SpecifierSet
import six
from .requirements import SimpleSubstitution, FatalSpecsResolutionError
from .requirements import SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion
OS_TO_WHEEL_NAME = {"linux": "linux_x86_64", "windows": "win_amd64"}
@@ -76,6 +74,7 @@ class SimplePytorchRequirement(SimpleSubstitution):
packages = ("torch", "torchvision", "torchaudio")
page_lookup_template = 'https://download.pytorch.org/whl/cu{}/torch_stable.html'
nightly_page_lookup_template = 'https://download.pytorch.org/whl/nightly/cu{}/torch_nightly.html'
torch_page_lookup = {
0: 'https://download.pytorch.org/whl/cpu/torch_stable.html',
80: 'https://download.pytorch.org/whl/cu80/torch_stable.html',
@@ -117,11 +116,23 @@ class SimplePytorchRequirement(SimpleSubstitution):
package_manager.add_extra_install_flags(('-f', extra_url))
@classmethod
def get_torch_page(cls, cuda_version):
def get_torch_page(cls, cuda_version, nightly=False):
try:
cuda = int(cuda_version)
except:
cuda = 0
if nightly:
# then try the nightly builds, it might be there...
torch_url = cls.nightly_page_lookup_template.format(cuda)
try:
if requests.get(torch_url, timeout=10).ok:
cls.torch_page_lookup[cuda] = torch_url
return cls.torch_page_lookup[cuda], cuda
except Exception:
pass
return
# first check if key is valid
if cuda in cls.torch_page_lookup:
return cls.torch_page_lookup[cuda], cuda
@@ -156,8 +167,7 @@ class PytorchRequirement(SimpleSubstitution):
self.os = os_name or self.get_platform()
self.cuda = "cuda{}".format(self.cuda_version).lower()
self.python_version_string = str(self.config["agent.default_python"])
self.python_major_minor_str = '.'.join(packaging_version.parse(
self.python_version_string).base_version.split('.')[:2])
self.python_major_minor_str = '.'.join(self.python_version_string.split('.')[:2])
if '.' not in self.python_major_minor_str:
raise PytorchResolutionError(
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
@@ -183,6 +193,8 @@ class PytorchRequirement(SimpleSubstitution):
except PytorchResolutionError as e:
self.log.warn("will not be able to install pytorch wheels: %s", e.args[0])
self._original_req = []
@property
def is_conda(self):
return self.package_manager == "conda"
@@ -222,7 +234,6 @@ class PytorchRequirement(SimpleSubstitution):
platform_wheel = "win" if self.get_platform() == "windows" else self.get_platform()
py_ver = self.python_major_minor_str.replace('.', '')
url = None
spec = SpecifierSet(req.format_specs())
last_v = None
# search for our package
for l in links_parser.links:
@@ -234,10 +245,11 @@ class PytorchRequirement(SimpleSubstitution):
# version (ignore +cpu +cu92 etc. + is %2B in the file link)
# version ignore .postX suffix (treat as regular version)
try:
v = packaging_version.parse(parts[1].split('%')[0].split('+')[0])
v = str(parts[1].split('%')[0].split('+')[0])
except Exception:
continue
if v not in spec or (last_v and last_v > v):
if not req.compare_version(v) or \
(last_v and SimpleVersion.compare_versions(last_v, '>', v, ignore_sub_versions=False)):
continue
if not parts[2].endswith(py_ver):
continue
@@ -245,6 +257,13 @@ class PytorchRequirement(SimpleSubstitution):
continue
url = '/'.join(torch_url.split('/')[:-1] + l.split('/'))
last_v = v
# if we found an exact match, use it
try:
if req.specs[0][0] == '==' and \
SimpleVersion.compare_versions(req.specs[0][1], '==', v, ignore_sub_versions=False):
break
except:
pass
return url
@@ -254,9 +273,10 @@ class PytorchRequirement(SimpleSubstitution):
if self.config.get("agent.package_manager.system_site_packages"):
from pip._internal.commands.show import search_packages_info
installed_torch = list(search_packages_info([req.name]))
op, version = req.specs[0] if req.specs else (None, None)
# notice the comparison order, the first part will make sure we have a valid installed package
if installed_torch[0]['version'] and (installed_torch[0]['version'] == version or not version):
if installed_torch[0]['version'] and req.compare_version(installed_torch[0]['version']):
print('PyTorch: requested "{}" version {}, using pre-installed version {}'.format(
req.name, req.specs[0] if req.specs else 'unspecified', installed_torch[0]['version']))
# package already installed, do nothing
return str(req), True
except:
@@ -275,6 +295,9 @@ class PytorchRequirement(SimpleSubstitution):
torch_url, torch_url_key = SimplePytorchRequirement.get_torch_page(self.cuda_version)
url = self._get_link_from_torch_page(req, torch_url)
if not url and self.config.get("agent.package_manager.torch_nightly"):
torch_url, torch_url_key = SimplePytorchRequirement.get_torch_page(self.cuda_version, nightly=True)
url = self._get_link_from_torch_page(req, torch_url)
# try one more time, with a lower cuda version (never fallback to CPU):
while not url and torch_url_key > 0:
previous_cuda_key = torch_url_key
@@ -306,20 +329,17 @@ class PytorchRequirement(SimpleSubstitution):
@staticmethod
def match_version(req, options):
versioned_options = sorted(
((packaging_version.parse(fix_version(key)), value) for key, value in options.items()),
((fix_version(key), value) for key, value in options.items()),
key=itemgetter(0),
reverse=True,
)
req.specs = [(op, fix_version(version)) for op, version in req.specs]
if req.specs:
specs = SpecifierSet(req.format_specs())
else:
specs = None
try:
return next(
replacement
for version, replacement in versioned_options
if not specs or version in specs
if req.compare_version(version)
)
except StopIteration:
raise PytorchResolutionError(
@@ -368,7 +388,10 @@ class PytorchRequirement(SimpleSubstitution):
def replace(self, req):
try:
return self._replace(req)
new_req = self._replace(req)
if new_req:
self._original_req.append((req, new_req))
return new_req
except Exception as e:
message = "Exception when trying to resolve python wheel"
self.log.debug(message, exc_info=True)
@@ -383,17 +406,17 @@ class PytorchRequirement(SimpleSubstitution):
except:
pass
try:
result = self._table_lookup(req)
except Exception as e:
exc = e
else:
self.log.debug('Replacing requirement "%s" with %r', req, result)
return result
# try:
# result = self._table_lookup(req)
# except Exception as e:
# exc = e
# else:
# self.log.debug('Replacing requirement "%s" with %r', req, result)
# return result
# self.log.debug(
# "Could not find Pytorch wheel in table, trying manually constructing URL"
# )
self.log.debug(
"Could not find Pytorch wheel in table, trying manually constructing URL"
)
result = ok = None
# try:
# result, ok = self.get_url_for_platform(req)
@@ -404,7 +427,7 @@ class PytorchRequirement(SimpleSubstitution):
if result:
self.log.debug("URL not found: {}".format(result))
exc = PytorchResolutionError(
"Was not able to find pytorch wheel URL: {}".format(exc)
"Could not find pytorch wheel URL for: {} with cuda {} support".format(req, self.cuda_version)
)
# cancel exception chaining
six.raise_from(exc, None)
@@ -412,6 +435,37 @@ class PytorchRequirement(SimpleSubstitution):
self.log.debug('Replacing requirement "%s" with %r', req, result)
return result
def replace_back(self, list_of_requirements): # type: (Dict) -> Dict
"""
:param list_of_requirements: {'pip': ['a==1.0', ]}
:return: {'pip': ['a==1.0', ]}
"""
if not self._original_req:
return list_of_requirements
try:
for k, lines in list_of_requirements.items():
# k is either pip/conda
if k not in ('pip', 'conda'):
continue
for i, line in enumerate(lines):
if not line or line.lstrip().startswith('#'):
continue
parts = [p for p in re.split(r'\s|=|\.|<|>|~|!|@|#', line) if p]
if not parts:
continue
for req, new_req in self._original_req:
if req.req.name == parts[0]:
# support for pip >= 20.1
if '@' in line:
lines[i] = '{} # {}'.format(str(req), str(new_req))
else:
lines[i] = '{} # {}'.format(line, str(new_req))
break
except:
pass
return list_of_requirements
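A sketch of the annotation this produces in the freeze output (requirement objects are illustrative):
# Hypothetical example, given self._original_req holds (<torch==1.5.0>, <wheel-URL requirement>):
# replace_back({'pip': ['torch==1.5.0']})
# -> {'pip': ['torch==1.5.0 # <wheel-URL requirement actually installed>']}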
MAP = {
"windows": {
"cuda100": {

View File

@@ -10,7 +10,6 @@ from operator import itemgetter
from os import path
from typing import Text, List, Type, Optional, Tuple, Dict
from packaging import version as packaging_version
from pathlib2 import Path
from pyhocon import ConfigTree
from requirements import parse
@@ -69,8 +68,19 @@ class MarkerRequirement(object):
def __repr__(self):
return '{self.__class__.__name__}[{self}]'.format(self=self)
def format_specs(self):
return ','.join(starmap(operator.add, self.specs))
def format_specs(self, num_parts=None, max_num_parts=None):
max_num_parts = max_num_parts or num_parts
if max_num_parts is None or not self.specs:
return ','.join(starmap(operator.add, self.specs))
op, version = self.specs[0]
for v in self._sub_versions_pep440:
version = version.replace(v, '.')
if num_parts:
version = (version.strip('.').split('.') + ['0'] * num_parts)[:max_num_parts]
else:
version = version.strip('.').split('.')[:max_num_parts]
return op+'.'.join(version)
def __getattr__(self, item):
return getattr(self.req, item)
@@ -99,6 +109,186 @@ class MarkerRequirement(object):
else:
self.specs = greater + smaller
def compare_version(self, requested_version, op=None, num_parts=3):
"""
Compare the requested version with the one we have in the spec:
if the requested version is 1.2.3, self.spec should be 1.2.3*;
if the requested version is 1.2, self.spec should be 1.2*,
etc.
:param str requested_version:
:param str op: '==', '>', '>=', '<=', '<', '~='
:param int num_parts: number of parts to compare
:return: True if the requested version satisfies the spec
"""
# if we have no specific version, we cannot compare, so assume it's okay
if not self.specs:
return True
version = self.specs[0][1]
op = (op or self.specs[0][0]).strip()
return SimpleVersion.compare_versions(requested_version, op, version)
class SimpleVersion:
_sub_versions_pep440 = ['a', 'b', 'rc', '.post', '.dev', '+', ]
VERSION_PATTERN = r"""
v?
(?:
(?:(?P<epoch>[0-9]+)!)? # epoch
(?P<release>[0-9]+(?:\.[0-9]+)*) # release segment
(?P<pre> # pre-release
[-_\.]?
(?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
[-_\.]?
(?P<pre_n>[0-9]+)?
)?
(?P<post> # post release
(?:-(?P<post_n1>[0-9]+))
|
(?:
[-_\.]?
(?P<post_l>post|rev|r)
[-_\.]?
(?P<post_n2>[0-9]+)?
)
)?
(?P<dev> # dev release
[-_\.]?
(?P<dev_l>dev)
[-_\.]?
(?P<dev_n>[0-9]+)?
)?
)
(?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))? # local version
"""
_local_version_separators = re.compile(r"[\._-]")
_regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
@classmethod
def compare_versions(cls, version_a, op, version_b, ignore_sub_versions=True):
"""
Compare two versions based on the op operator
returns bool(version_a op version_b)
Notice: Ignores a/b/rc/post/dev markers on the version
:param str version_a:
:param str op: '==', '===', '>', '>=', '<=', '<', '~='
:param str version_b:
:param bool ignore_sub_versions: if true compare only major.minor.patch
(ignore a/b/rc/post/dev in the comparison)
:return bool: version_a op version_b
"""
if not version_b:
return True
num_parts = 3
if op == '~=':
num_parts = max(num_parts, 2)
op = '=='
ignore_sub_versions = True
elif op == '===':
op = '=='
try:
version_a_key = cls._get_match_key(cls._regex.search(version_a), num_parts, ignore_sub_versions)
version_b_key = cls._get_match_key(cls._regex.search(version_b), num_parts, ignore_sub_versions)
except:
# revert to string based
for v in cls._sub_versions_pep440:
version_a = version_a.replace(v, '.')
version_b = version_b.replace(v, '.')
version_a = (version_a.strip('.').split('.') + ['0'] * num_parts)[:num_parts]
version_b = (version_b.strip('.').split('.') + ['0'] * num_parts)[:num_parts]
version_a_key = ''
version_b_key = ''
for i in range(num_parts):
pad = '{:0>%d}.' % max([9, 1 + len(version_a[i]), 1 + len(version_b[i])])
version_a_key += pad.format(version_a[i])
version_b_key += pad.format(version_b[i])
if op == '==':
return version_a_key == version_b_key
if op == '<=':
return version_a_key <= version_b_key
if op == '>=':
return version_a_key >= version_b_key
if op == '>':
return version_a_key > version_b_key
if op == '<':
return version_a_key < version_b_key
raise ValueError('Unrecognized comparison operator [{}]'.format(op))
@staticmethod
def _parse_letter_version(
letter, # type: str
number, # type: Union[str, bytes, SupportsInt]
):
# type: (...) -> Optional[Tuple[str, int]]
if letter:
# We consider there to be an implicit 0 in a pre-release if there is
# not a numeral associated with it.
if number is None:
number = 0
# We normalize any letters to their lower case form
letter = letter.lower()
# We consider some words to be alternate spellings of other words and
# in those cases we want to normalize the spellings to our preferred
# spelling.
if letter == "alpha":
letter = "a"
elif letter == "beta":
letter = "b"
elif letter in ["c", "pre", "preview"]:
letter = "rc"
elif letter in ["rev", "r"]:
letter = "post"
return letter, int(number)
if not letter and number:
# We assume if we are given a number, but we are not given a letter
# then this is using the implicit post release syntax (e.g. 1.0-1)
letter = "post"
return letter, int(number)
return ()
@staticmethod
def _get_match_key(match, num_parts, ignore_sub_versions):
if ignore_sub_versions:
return (0, tuple(int(i) for i in match.group("release").split(".")[:num_parts]),
(), (), (), (),)
return (
int(match.group("epoch")) if match.group("epoch") else 0,
tuple(int(i) for i in match.group("release").split(".")[:num_parts]),
SimpleVersion._parse_letter_version(match.group("pre_l"), match.group("pre_n")),
SimpleVersion._parse_letter_version(
match.group("post_l"), match.group("post_n1") or match.group("post_n2")
),
SimpleVersion._parse_letter_version(match.group("dev_l"), match.group("dev_n")),
SimpleVersion._parse_local_version(match.group("local")),
)
@staticmethod
def _parse_local_version(local):
# type: (str) -> Optional[LocalType]
"""
Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
"""
if local is not None:
return tuple(
part.lower() if not part.isdigit() else int(part)
for part in SimpleVersion._local_version_separators.split(local)
)
return ()
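A few comparison sketches (values are illustrative):
# Hypothetical examples
# SimpleVersion.compare_versions('1.5.1', '>=', '1.5')       # -> True
# SimpleVersion.compare_versions('1.5.0rc1', '==', '1.5.0')  # -> True, sub-versions ignored by default
# SimpleVersion.compare_versions('1.5.0rc1', '==', '1.5.0',
#                                ignore_sub_versions=False)  # -> False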
@six.add_metaclass(ABCMeta)
class RequirementSubstitution(object):
@@ -177,7 +367,7 @@ class SimpleSubstitution(RequirementSubstitution):
if req.specs:
_, version_number = req.specs[0]
assert packaging_version.parse(version_number)
# assert packaging_version.parse(version_number)
else:
version_number = self.get_pip_version(self.name)

View File

@@ -22,7 +22,7 @@ class RequirementsTranslator(object):
self.enabled = config["agent.pip_download_cache.enabled"]
Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
self.config = Config()
self.pip = SystemPip(interpreter=interpreter)
self.pip = SystemPip(interpreter=interpreter, session=self._session)
def download(self, url):
self.pip.download_package(url, cache_dir=self.cache_dir)

View File

@@ -83,7 +83,15 @@ def shutdown_docker_process(docker_cmd_contains=None, docker_id=None):
pass
def commit_docker(container_name, docker_cmd_contains=None, docker_id=None):
def commit_docker(container_name, docker_cmd_contains=None, docker_id=None, apply_change=None):
"""
Commit a docker into a new image
:param str container_name: Name for the new image
:param docker_cmd_contains: partial docker command line used to locate the container to commit
:param str docker_id: Id of the container to be committed
:param str apply_change: apply Dockerfile instructions to the image that is created
(see docker commit documentation for '--change').
"""
try:
if not docker_id:
docker_id = get_docker_id(docker_cmd_contains=docker_cmd_contains)
@@ -93,7 +101,8 @@ def commit_docker(container_name, docker_cmd_contains=None, docker_id=None):
if docker_id:
# we found our docker, commit it
output = get_bash_output(cmd='docker commit {} {}'.format(docker_id, container_name))
apply_change = '--change=\'{}\''.format(apply_change) if apply_change else ''
output = get_bash_output(cmd='docker commit {} {} {}'.format(apply_change, docker_id, container_name))
return output
except Exception:
pass
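A usage sketch of the new apply_change option (container id and image name are illustrative):
# Hypothetical usage sketch
# commit_docker('trains-base:latest', docker_id='3f2a9c',
#               apply_change='ENTRYPOINT ["/bin/bash"]')
# -> runs: docker commit --change='ENTRYPOINT ["/bin/bash"]' 3f2a9c trains-base:latest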

View File

@@ -12,6 +12,8 @@ from furl import furl
from pathlib2 import Path
import six
from trains_agent.definitions import ENV_AGENT_GIT_USER, ENV_AGENT_GIT_PASS
from trains_agent.helper.console import ensure_text, ensure_binary
from trains_agent.errors import CommandFailedError
from trains_agent.helper.base import (
@@ -249,8 +251,10 @@ class VCS(object):
return
ssh_agent_variable = "SSH_AUTH_SOCK"
if not getenv(ssh_agent_variable) and (self.session.config.get('agent.git_user', None) and
self.session.config.get('agent.git_pass', None)):
if not getenv(ssh_agent_variable) and (
(ENV_AGENT_GIT_USER.get() or self.session.config.get('agent.git_user', None)) and
(ENV_AGENT_GIT_PASS.get() or self.session.config.get('agent.git_pass', None))
):
new_url = self.resolve_ssh_url(self.url)
if new_url != self.url:
print("Using user/pass credentials - replacing ssh url '{}' with https url '{}'".format(
@@ -395,8 +399,8 @@ class VCS(object):
parsed_url = furl(url)
if parsed_url.scheme in ["", "ssh"] or parsed_url.scheme.startswith("git"):
return parsed_url.url
config_user = config.get("agent.{}_user".format(cls.executable_name), None)
config_pass = config.get("agent.{}_pass".format(cls.executable_name), None)
config_user = ENV_AGENT_GIT_USER.get() or config.get("agent.{}_user".format(cls.executable_name), None)
config_pass = ENV_AGENT_GIT_PASS.get() or config.get("agent.{}_pass".format(cls.executable_name), None)
if (
(not (parsed_url.username and parsed_url.password))
and config_user
@@ -456,7 +460,17 @@ class Git(VCS):
)
def pull(self):
self.call("fetch", "--all", cwd=self.location)
self.call("fetch", "--all", "--recurse-submodules", cwd=self.location)
def checkout(self): # type: () -> None
"""
Checkout repository at specified revision
"""
self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
try:
self.call("submodule", "update", "--recursive", cwd=self.location)
except:
pass
info_commands = dict(
url=Argv(executable_name, "ls-remote", "--get-url", "origin"),
@@ -519,11 +533,16 @@ def clone_repository_cached(session, execution, destination):
clone_folder_name = Path(str(furl(repo_url).path)).name # type: str
clone_folder = Path(destination) / clone_folder_name
cached_repo_path = (
Path(session.config["agent.vcs_cache.path"]).expanduser()
/ "{}.{}".format(clone_folder_name, md5(ensure_binary(repo_url)).hexdigest())
/ clone_folder_name
) # type: Path
standalone_mode = session.config.get("agent.standalone_mode", False)
if standalone_mode:
cached_repo_path = clone_folder
else:
cached_repo_path = (
Path(session.config["agent.vcs_cache.path"]).expanduser()
/ "{}.{}".format(clone_folder_name, md5(ensure_binary(repo_url)).hexdigest())
/ clone_folder_name
) # type: Path
vcs = VcsFactory.create(
session, execution_info=execution, location=cached_repo_path
@@ -531,23 +550,25 @@ def clone_repository_cached(session, execution, destination):
if not find_executable(vcs.executable_name):
raise CommandFailedError(vcs.executable_not_found_error_help())
if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
print('Using cached repository in "{}"'.format(cached_repo_path))
else:
print("cloning: {}".format(no_password_url))
rm_tree(cached_repo_path)
# We clone the entire repository, not a specific branch
vcs.clone() # branch=execution.branch)
if not standalone_mode:
if session.config["agent.vcs_cache.enabled"] and cached_repo_path.exists():
print('Using cached repository in "{}"'.format(cached_repo_path))
vcs.pull()
rm_tree(destination)
shutil.copytree(Text(cached_repo_path), Text(clone_folder))
if not clone_folder.is_dir():
raise CommandFailedError(
"copying of repository failed: from {} to {}".format(
cached_repo_path, clone_folder
else:
print("cloning: {}".format(no_password_url))
rm_tree(cached_repo_path)
# We clone the entire repository, not a specific branch
vcs.clone() # branch=execution.branch)
vcs.pull()
rm_tree(destination)
shutil.copytree(Text(cached_repo_path), Text(clone_folder))
if not clone_folder.is_dir():
raise CommandFailedError(
"copying of repository failed: from {} to {}".format(
cached_repo_path, clone_folder
)
)
)
# checkout in the newly copy destination
vcs.location = Text(clone_folder)

View File

@@ -75,9 +75,15 @@ class ResourceMonitor(object):
self._exit_event = Event()
self._gpustat_fail = 0
self._gpustat = gpustat
if not self._gpustat:
self._active_gpus = None
if os.environ.get('NVIDIA_VISIBLE_DEVICES') == 'none':
# NVIDIA_VISIBLE_DEVICES set to none, marks cpu_only flag
# active_gpus == False means no GPU reporting
self._active_gpus = False
elif not self._gpustat:
log.warning('Trains-Agent Resource Monitor: GPU monitoring is not available')
else:
# None means no filtering, report all gpus
self._active_gpus = None
try:
active_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES', '') or \
@@ -244,8 +250,8 @@ class ResourceMonitor(object):
stats["io_read_mbs"] = BytesSizes.megabytes(io_stats.read_bytes)
stats["io_write_mbs"] = BytesSizes.megabytes(io_stats.write_bytes)
# check if we can access the gpu statistics
if self._gpustat:
# check if we need to monitor gpus and if we can access the gpu statistics
if self._active_gpus is not False and self._gpustat:
try:
gpu_stat = self._gpustat.new_query()
for i, g in enumerate(gpu_stat.gpus):

View File

@@ -4,11 +4,12 @@ from time import sleep
from glob import glob
from tempfile import gettempdir, NamedTemporaryFile
from trains_agent.definitions import ENV_DOCKER_HOST_MOUNT
from trains_agent.helper.base import warning
class Singleton(object):
prefix = 'trainsagent'
prefix = '.trainsagent'
sep = '_'
ext = '.tmp'
worker_id = None
@@ -17,9 +18,27 @@ class Singleton(object):
_pid_file = None
_lock_file_name = sep+prefix+sep+'global.lock'
_lock_timeout = 10
_pid = None
@classmethod
def register_instance(cls, unique_worker_id=None, worker_name=None):
def update_pid_file(cls):
new_pid = str(os.getpid())
if not cls._pid_file or cls._pid == new_pid:
return
old_name = cls._pid_file.name
parts = cls._pid_file.name.split(os.path.sep)
parts[-1] = parts[-1].replace(cls.sep + cls._pid + cls.sep, cls.sep + new_pid + cls.sep)
new_pid_file = os.path.sep.join(parts)
cls._pid = new_pid
cls._pid_file.name = new_pid_file
# we need to rename to match new pid
try:
os.rename(old_name, new_pid_file)
except:
pass
@classmethod
def register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None, allow_double=False):
"""
# Exit the process if another instance of us is using the same worker_id
@@ -28,7 +47,7 @@ class Singleton(object):
:return: (str worker_id, int slot_number) Return None value on instance already running
"""
# try to lock file
lock_file = os.path.join(gettempdir(), cls._lock_file_name)
lock_file = os.path.join(cls._get_temp_folder(), cls._lock_file_name)
timeout = 0
while os.path.exists(lock_file):
if timeout > cls._lock_timeout:
@@ -46,7 +65,9 @@ class Singleton(object):
f.write(bytes(os.getpid()))
f.flush()
try:
ret = cls._register_instance(unique_worker_id=unique_worker_id, worker_name=worker_name)
ret = cls._register_instance(
unique_worker_id=unique_worker_id, worker_name=worker_name,
api_client=api_client, allow_double=allow_double)
except:
ret = None, None
@@ -58,12 +79,12 @@ class Singleton(object):
return ret
@classmethod
def _register_instance(cls, unique_worker_id=None, worker_name=None):
def _register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None, allow_double=False):
if cls.worker_id:
return cls.worker_id, cls.instance_slot
# make sure we have a unique name
instance_num = 0
temp_folder = gettempdir()
temp_folder = cls._get_temp_folder()
files = glob(os.path.join(temp_folder, cls.prefix + cls.sep + '*' + cls.ext))
slots = {}
for file in files:
@@ -73,8 +94,24 @@ class Singleton(object):
except Exception:
# something is wrong, use non existing pid and delete the file
pid = -1
uid, slot = None, None
try:
with open(file, 'r') as f:
uid, slot = str(f.read()).split('\n')
slot = int(slot)
except Exception:
pass
worker = None
if api_client and ENV_DOCKER_HOST_MOUNT.get() and uid:
try:
worker = [w for w in api_client.workers.get_all() if w.id == uid]
except Exception:
worker = None
# count active instances and delete dead files
if not psutil.pid_exists(pid):
if not worker and not psutil.pid_exists(pid):
# delete the file
try:
os.remove(os.path.join(file))
@@ -83,15 +120,15 @@ class Singleton(object):
continue
instance_num += 1
try:
with open(file, 'r') as f:
uid, slot = str(f.read()).split('\n')
slot = int(slot)
except Exception:
if slot is None:
continue
if uid == unique_worker_id:
return None, None
if allow_double:
warning('Instance with the same WORKER_ID [{}] was found on this machine. '
'We are ignoring it, make sure this is not a mistake.'.format(unique_worker_id))
else:
return None, None
slots[slot] = uid
@@ -110,10 +147,21 @@ class Singleton(object):
unique_worker_id = worker_name + cls.worker_name_sep + str(cls.instance_slot)
# create lock
cls._pid_file = NamedTemporaryFile(dir=gettempdir(), prefix=cls.prefix + cls.sep + str(os.getpid()) + cls.sep,
suffix=cls.ext)
cls._pid = str(os.getpid())
cls._pid_file = NamedTemporaryFile(
dir=cls._get_temp_folder(), prefix=cls.prefix + cls.sep + cls._pid + cls.sep, suffix=cls.ext)
cls._pid_file.write(('{}\n{}'.format(unique_worker_id, cls.instance_slot)).encode())
cls._pid_file.flush()
cls.worker_id = unique_worker_id
return cls.worker_id, cls.instance_slot
@classmethod
def _get_temp_folder(cls):
if ENV_DOCKER_HOST_MOUNT.get():
return ENV_DOCKER_HOST_MOUNT.get().split(':')[-1]
return gettempdir()
@classmethod
def get_slot(cls):
return cls.instance_slot or 0

View File

@@ -1,3 +1,4 @@
import itertools
from functools import partial
from importlib import import_module
import argparse
@@ -24,8 +25,17 @@ def get_parser():
from .worker import COMMANDS
subparsers = top_parser.add_subparsers(dest='command')
for c in COMMANDS:
parser = subparsers.add_parser(name=c, help=COMMANDS[c]['help'])
for a in COMMANDS[c].get('args', {}).keys():
parser.add_argument(a, **COMMANDS[c]['args'][a])
parser = subparsers.add_parser(name=c, help=COMMANDS[c]["help"])
groups = itertools.groupby(
sorted(
COMMANDS[c].get("args", {}).items(), key=lambda x: x[1].get("group", "")
),
key=lambda x: x[1].pop("group", ""),
)
for group_name, group in groups:
p = parser if not group_name else parser.add_argument_group(group_name)
for key, value in group:
aliases = value.pop("aliases", [])
p.add_argument(key, *aliases, **value)
return top_parser
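A sketch of how the optional 'group' and 'aliases' keys in a COMMANDS arg spec are consumed by the loop above (the combined entry is illustrative):
# Hypothetical COMMANDS-style entry
# '--detached': {
#     'help': 'Detached mode, run agent in the background',
#     'action': 'store_true',
#     'aliases': ['-d'],          # extra flag names, passed as add_argument positionals
#     'group': 'Docker support',  # becomes an argparse argument group title
# }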

View File

@@ -30,6 +30,17 @@ WORKER_ARGS = {
'type': lambda x: x.upper(),
'default': 'INFO',
},
'--gpus': {
'help': 'Specify active GPUs for the daemon to use (docker / virtual environment), '
'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
'Examples: --gpus 0 or --gpus 0,1,2 or --gpus all',
'group': 'Docker support',
},
'--cpu-only': {
'help': 'Disable GPU access for the daemon, only use CPU in either docker or virtual environment',
'action': 'store_true',
'group': 'Docker support',
},
}
DAEMON_ARGS = dict({
@@ -37,21 +48,18 @@ DAEMON_ARGS = dict({
'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
'action': 'store_true',
},
'--gpus': {
'help': 'Specify active GPUs for the daemon to use (docker / virtual environment), '
'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
'Examples: --gpus 0 or --gpu 0,1,2 or --gpus all',
},
'--cpu-only': {
'help': 'Disable GPU access for the daemon, only use CPU in either docker or virtual environment',
'action': 'store_true',
},
'--docker': {
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments. '
'Use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
'nargs': '*',
'default': False,
'group': 'Docker support',
},
'--force-current-version': {
'help': 'Force trains-agent to use the current trains-agent version when running in the docker',
'action': 'store_true',
'group': 'Docker support',
},
'--queue': {
'help': 'Queue ID(s)/Name(s) to pull tasks from (\'default\' queue)',
@@ -64,6 +72,19 @@ DAEMON_ARGS = dict({
'help': 'Do not use any network connections, assume everything is pre-installed',
'action': 'store_true',
},
'--services-mode': {
'help': 'Launch multiple long-term docker services. Implies docker & cpu-only flags.',
'action': 'store_true',
},
'--create-queue': {
'help': 'Create requested queue if it does not exist already.',
'action': 'store_true',
},
'--detached': {
'help': 'Detached mode, run agent in the background',
'action': 'store_true',
'aliases': ['-d'],
},
}, **WORKER_ARGS)
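
DAEMON_ARGS is still assembled as dict({...daemon-specific flags...}, **WORKER_ARGS), so moving --gpus and --cpu-only into WORKER_ARGS is what makes them available to daemon, execute and build alike. A toy illustration of the merge, with invented keys:

# every flag set built with dict({...}, **WORKER_ARGS) inherits the shared worker options
WORKER_ARGS = {'--gpus': {'help': 'GPUs to use'}, '--cpu-only': {'action': 'store_true'}}
DAEMON_ARGS = dict({'--detached': {'action': 'store_true', 'aliases': ['-d']}}, **WORKER_ARGS)
assert set(DAEMON_ARGS) == {'--detached', '--gpus', '--cpu-only'}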
@@ -97,6 +118,17 @@ COMMANDS = {
'help': 'Do not use any network connections, assume everything is pre-installed',
'action': 'store_true',
},
'--docker': {
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments. '
'Use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit GPU visibility for the docker',
'nargs': '*',
'default': False,
},
'--clone': {
'help': 'Clone the experiment before execution, and execute the cloned experiment',
'action': 'store_true',
},
}, **WORKER_ARGS),
},
'build': {
@@ -114,6 +146,12 @@ COMMANDS = {
'help': 'Where to build the task\'s virtual environment and source code. '
'When used with --docker, target docker image name to create',
},
'--install-globally': {
'help': 'Install required python packages before creating the virtual environment used to execute an '
'experiment, and use the \'agent.package_manager.system_site_packages\' virtual env option. '
'Note: when --docker is used, install-globally is always true',
'action': 'store_true',
},
'--docker': {
'help': 'Build the experiment inside a docker (v19.03 and above). Optional args <image> <arguments> or '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
@@ -121,18 +159,15 @@ COMMANDS = {
'nargs': '*',
'default': False,
},
'--gpus': {
'help': 'Specify active GPUs for the docker to use'
'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
'Examples: --gpus 0 or --gpu 0,1,2 or --gpus all',
},
'--cpu-only': {
'help': 'Disable GPU access (cpu only) for the docker',
'action': 'store_true',
},
'--python-version': {
'help': 'Virtual environment python version to use',
},
'--entry-point': {
'help': 'Run the task in the new docker. There are two options:\nEither add "reuse_task" to run the '
'given task in the docker, or "clone_task" to first clone the given task and then run it in the docker',
'default': False,
'choices': ['reuse_task', 'clone_task'],
}
}, **WORKER_ARGS),
},
'list': {
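
Note how --docker combines nargs='*' with default=False: the flag distinguishes "not given" (False), "given with no values" (an empty list, falling back to the agent.default_docker.* configuration) and "given with an image and arguments". A minimal demonstration of that argparse behavior; "nvidia/cuda" is just an example image:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--docker', nargs='*', default=False)

print(parser.parse_args([]).docker)                           # False -> docker mode off
print(parser.parse_args(['--docker']).docker)                 # []    -> use the configured default image
print(parser.parse_args(['--docker', 'nvidia/cuda']).docker)  # ['nvidia/cuda'] -> explicit image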

@@ -15,7 +15,7 @@ from pyhocon import ConfigFactory, HOCONConverter, ConfigTree
from trains_agent.backend_api.session import Session as _Session, Request
from trains_agent.backend_api.session.client import APIClient
from trains_agent.backend_config.defs import LOCAL_CONFIG_FILE_OVERRIDE_VAR, LOCAL_CONFIG_FILES
from trains_agent.definitions import ENVIRONMENT_CONFIG
from trains_agent.definitions import ENVIRONMENT_CONFIG, ENV_TASK_EXECUTE_AS_USER, ENVIRONMENT_BACKWARD_COMPATIBLE
from trains_agent.errors import APIError
from trains_agent.helper.base import HOCONEncoder
from trains_agent.helper.process import Argv
@@ -63,6 +63,7 @@ def tree(*args):
class Session(_Session):
version = __version__
force_debug = False
def __init__(self, *args, **kwargs):
# make sure we set the environment variable so the parent session opens the correct file
@@ -75,18 +76,29 @@ class Session(_Session):
cpu_only = kwargs.get('cpu_only')
if cpu_only:
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
if kwargs.get('gpus'):
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
if kwargs.get('gpus') and not os.environ.get('KUBERNETES_SERVICE_HOST') \
and not os.environ.get('KUBERNETES_PORT'):
# CUDA_VISIBLE_DEVICES does not support 'all'
if kwargs.get('gpus') == 'all':
os.environ.pop('CUDA_VISIBLE_DEVICES', None)
os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
else:
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
if kwargs.get('only_load_config'):
from trains_agent.backend_api.config import load
self.config = load()
else:
super(Session, self).__init__(*args, **kwargs)
# set force debug mode, if it's on:
if Session.force_debug:
self.config["agent"]["debug"] = True
self.log = self.get_logger(__name__)
self.trace = kwargs.get('trace', False)
self._config_file = kwargs.get('config_file') or \
os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR) or LOCAL_CONFIG_FILES[0]
self.api_client = APIClient(session=self, api_version="2.4")
self.api_client = APIClient(session=self, api_version="2.5")
# HACK: make sure we have a python version to execute;
# if nothing was specified, use the one that runs us
def_python = ConfigValue(self.config, "agent.default_python")
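
The GPU handling above works around two quirks: CUDA_VISIBLE_DEVICES has no 'all' value, so in that case the variable is unset and only NVIDIA_VISIBLE_DEVICES is propagated, and under Kubernetes the device plugin already controls GPU visibility, so the override is skipped entirely. A standalone restatement of that logic (the function name is invented for illustration):

import os

def apply_gpu_visibility(gpus):
    # Kubernetes manages device visibility itself; do not override
    if os.environ.get('KUBERNETES_SERVICE_HOST') or os.environ.get('KUBERNETES_PORT'):
        return
    if gpus == 'all':
        # CUDA_VISIBLE_DEVICES does not accept 'all': unset it and let
        # NVIDIA_VISIBLE_DEVICES expose every GPU
        os.environ.pop('CUDA_VISIBLE_DEVICES', None)
        os.environ['NVIDIA_VISIBLE_DEVICES'] = gpus
    elif gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = gpus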
@@ -94,8 +106,10 @@ class Session(_Session):
def_python.set("{version.major}.{version.minor}".format(version=sys.version_info))
# HACK: backwards compatibility
os.environ['ALG_CONFIG_FILE'] = self._config_file
os.environ['SM_CONFIG_FILE'] = self._config_file
if ENVIRONMENT_BACKWARD_COMPATIBLE.get():
os.environ['ALG_CONFIG_FILE'] = self._config_file
os.environ['SM_CONFIG_FILE'] = self._config_file
if not self.config.get('api.host', None) and self.config.get('api.api_server', None):
self.config['api']['host'] = self.config.get('api.api_server')
@@ -111,6 +125,17 @@ class Session(_Session):
# override with environment variables
# cuda_version & cudnn_version are overridden with os.environ here, and normalized in the next section
for config_key, env_config in ENVIRONMENT_CONFIG.items():
# check if the property is a list:
if config_key.endswith('.0'):
if all(not i.get() for i in env_config.values()):
continue
parent = config_key.partition('.0')[0]
if not self.config[parent]:
self.config.put(parent, [])
self.config.put(parent, self.config[parent] + [ConfigTree((k, v.get()) for k, v in env_config.items())])
continue
value = env_config.get()
if not value:
continue
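
The new '.0' branch lets environment variables populate the first element of a list-valued configuration property: when any of the mapped variables is set, a ConfigTree built from their values is appended to the parent list. A sketch of the mechanics with a hypothetical key (the real key names come from ENVIRONMENT_CONFIG):

from pyhocon import ConfigFactory, ConfigTree

config = ConfigFactory.parse_string('agent { }')
config_key = 'agent.example_list.0'      # invented key for illustration
env_values = {'name': 'from-env'}        # values already read from the environment

parent = config_key.partition('.0')[0]   # -> 'agent.example_list'
if not config.get(parent, None):
    config.put(parent, [])
config.put(parent, config[parent] + [ConfigTree(env_values.items())])
print(config[parent])                    # a one-element list holding the new ConfigTree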
@@ -140,9 +165,16 @@ class Session(_Session):
logger.propagate = True
return TrainsAgentLogger(logger)
@staticmethod
def set_debug_mode(enable):
if enable:
import logging
logging.basicConfig(level=logging.DEBUG)
Session.force_debug = enable
@property
def debug_mode(self):
return self.config.get("agent.debug", False)
return Session.force_debug or self.config.get("agent.debug", False)
@property
def config_file(self):
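
set_debug_mode flips a class-level switch, so enabling debug both raises the root logging level and forces agent.debug on every Session created afterwards; debug_mode then honors either the switch or the configuration file. A self-contained analogue of the pattern (the class name and config shape are invented):

import logging

class DebugAware:
    force_debug = False  # class-level switch shared by all instances

    def __init__(self, config=None):
        self.config = dict(config or {})
        if DebugAware.force_debug:
            self.config['debug'] = True

    @staticmethod
    def set_debug_mode(enable):
        if enable:
            logging.basicConfig(level=logging.DEBUG)
        DebugAware.force_debug = enable

    @property
    def debug_mode(self):
        return DebugAware.force_debug or self.config.get('debug', False)

DebugAware.set_debug_mode(True)
assert DebugAware().debug_mode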
@@ -165,7 +197,11 @@ class Session(_Session):
folder_keys = ('agent.venvs_dir', 'agent.vcs_cache.path',
'agent.pip_download_cache.path',
'agent.docker_pip_cache', 'agent.docker_apt_cache')
singleton_folders = ('agent.venvs_dir', 'agent.vcs_cache.path',)
singleton_folders = ('agent.venvs_dir', 'agent.vcs_cache.path', 'agent.docker_apt_cache')
if os.environ.get(ENV_TASK_EXECUTE_AS_USER):
folder_keys = tuple(list(folder_keys) + ['sdk.storage.cache.default_base_dir'])
singleton_folders = tuple(list(singleton_folders) + ['sdk.storage.cache.default_base_dir'])
for key in folder_keys:
folder_key = ConfigValue(self.config, key)

@@ -1 +1 @@
__version__ = '0.13.2rc2'
__version__ = '0.15.0'