mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Compare commits
95 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ea0ed4807e | ||
|
|
389600b91e | ||
|
|
5fb2550212 | ||
|
|
15e9e6b778 | ||
|
|
aa75b92e46 | ||
|
|
757210d5b3 | ||
|
|
00eb2f10ec | ||
|
|
3393372b9c | ||
|
|
f2d2d702de | ||
|
|
e3d0680d39 | ||
|
|
618c2ac5c4 | ||
|
|
0272c4c79c | ||
|
|
ff8cf63abf | ||
|
|
2c7c7f5b44 | ||
|
|
01f57c1e44 | ||
|
|
47bcd3839a | ||
|
|
0a3a8a1c52 | ||
|
|
231a907cff | ||
|
|
8f95eecf2e | ||
|
|
81008ee00e | ||
|
|
25bc44c0cf | ||
|
|
f838c8fc70 | ||
|
|
596093aac6 | ||
|
|
8f23f3b4c0 | ||
|
|
95d503afdd | ||
|
|
73ee33be99 | ||
|
|
ee3adf625f | ||
|
|
afec38a50e | ||
|
|
f9c60904f4 | ||
|
|
a09dc85c67 | ||
|
|
5d74f4b376 | ||
|
|
d558c66d3c | ||
|
|
714c6a05d0 | ||
|
|
43b2f7f41d | ||
|
|
28d752d568 | ||
|
|
6d091d8e08 | ||
|
|
5c6b3ccc94 | ||
|
|
df10e6ed46 | ||
|
|
8ef78fd058 | ||
|
|
640c83288a | ||
|
|
788c79a66f | ||
|
|
bef87c7744 | ||
|
|
f139891276 | ||
|
|
2afaff1713 | ||
|
|
a57a5b151c | ||
|
|
97f446d523 | ||
|
|
a88262c097 | ||
|
|
284271c654 | ||
|
|
ae2775f7b8 | ||
|
|
eb012f5c24 | ||
|
|
06897f7606 | ||
|
|
599219b02d | ||
|
|
b6e04ab982 | ||
|
|
98fe162878 | ||
|
|
f829d80a49 | ||
|
|
b7e568e299 | ||
|
|
6912846326 | ||
|
|
224868c9a4 | ||
|
|
b1ca90a303 | ||
|
|
dee2475698 | ||
|
|
aeede81474 | ||
|
|
2d91d4cde6 | ||
|
|
7a11c7c165 | ||
|
|
a9f479cfcd | ||
|
|
c1d91b0d6a | ||
|
|
cbfba6acb2 | ||
|
|
f2e2e1f94a | ||
|
|
23668a403a | ||
|
|
facbee0005 | ||
|
|
c486cfd09f | ||
|
|
119ecaa2e3 | ||
|
|
d6cc2be653 | ||
|
|
41d75df40c | ||
|
|
901c4be9ae | ||
|
|
966b14f914 | ||
|
|
847d35cbbb | ||
|
|
4022cb5c63 | ||
|
|
2b239829de | ||
|
|
402856656f | ||
|
|
7b94ff410c | ||
|
|
0a03dced50 | ||
|
|
ffe653afc6 | ||
|
|
8ce621cc44 | ||
|
|
7c0a2c4d50 | ||
|
|
5e063c9195 | ||
|
|
24329a21fe | ||
|
|
3a301b0b6c | ||
|
|
1f0bb4906b | ||
|
|
88f1031e5d | ||
|
|
fc2842c9a2 | ||
|
|
e9d3aab115 | ||
|
|
0ed7b2a0c8 | ||
|
|
bd73be928a | ||
|
|
79babdd149 | ||
|
|
02a21ba826 |
@@ -1,5 +1,5 @@
|
||||
# TRAINS Agent
|
||||
## Deep Learning DevOps For Everyone
|
||||
## Deep Learning DevOps For Everyone - Now supporting all platforms (Linux, macOS, and Windows)
|
||||
|
||||
"All the Deep-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
|
||||
|
||||
@@ -14,7 +14,7 @@ It is a zero configuration fire-and-forget execution agent, which combined with
|
||||
|
||||
**Full AutoML in 5 steps**
|
||||
1. Install the [TRAINS server](https://github.com/allegroai/trains-server) (or use our [open server](https://demoapp.trains.allegro.ai))
|
||||
2. `pip install trains_agent` ([install](#installing-the-trains-agent) the TRAINS agent on any GPU machine: on-premises / cloud / ...)
|
||||
2. `pip install trains-agent` ([install](#installing-the-trains-agent) the TRAINS agent on any GPU machine: on-premises / cloud / ...)
|
||||
3. Add [TRAINS](https://github.com/allegroai/trains) to your code with just 2 lines & run it once (on your machine / laptop)
|
||||
4. Change the [parameters](#using-the-trains-agent) in the UI & schedule for [execution](#using-the-trains-agent) (or automate with an [AutoML pipeline](#automl-and-orchestration-pipelines-))
|
||||
5. :chart_with_downwards_trend: :chart_with_upwards_trend: :eyes: :beer:
|
||||
@@ -133,7 +133,7 @@ Development Machine |
|
||||
### Installing the TRAINS Agent
|
||||
|
||||
```bash
|
||||
pip install trains_agent
|
||||
pip install trains-agent
|
||||
```
|
||||
|
||||
### TRAINS Agent Usage Examples
|
||||
|
||||
@@ -38,8 +38,12 @@ agent {
|
||||
# currently supported pip and conda
|
||||
# poetry is used if pip selected and repository contains poetry.lock file
|
||||
package_manager: {
|
||||
# supported options: pip, conda
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
# pip_version: "<20"
|
||||
|
||||
# virtual environment inherits packages from system
|
||||
system_site_packages: false,
|
||||
# install with --upgrade
|
||||
@@ -83,6 +87,17 @@ agent {
|
||||
# apt cache folder used mapped into docker, for ubuntu package caching
|
||||
docker_apt_cache = ~/.trains/apt-cache
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda"
|
||||
@@ -126,6 +141,9 @@ sdk {
|
||||
quality: 87
|
||||
subsampling: 0
|
||||
}
|
||||
|
||||
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to True, each series should have its own graph)
|
||||
tensorboard_single_series_per_graph: False
|
||||
}
|
||||
|
||||
network {
|
||||
@@ -229,6 +247,9 @@ sdk {
|
||||
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
|
||||
support_stopping: True
|
||||
|
||||
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
|
||||
default_output_uri: ""
|
||||
|
||||
# Development mode worker
|
||||
worker {
|
||||
# Status report period in seconds
|
||||
|
||||
59
examples/archive_experiments.py
Normal file
59
examples/archive_experiments.py
Normal file
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/python3
|
||||
"""
|
||||
An example script that cleans up failed experiments by moving them to the archive
|
||||
"""
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
from trains_agent import APIClient
|
||||
|
||||
# Command line interface: every filter is optional; only --status has a default.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--project", "-P", help="Project ID. Only clean up experiments from this project")
parser.add_argument("--user", "-U", help="User ID. Only clean up experiments assigned to this user")
parser.add_argument(
    "--status", "-S", default="failed",
    help="Experiment status. Only clean up experiments with this status (default %(default)s)"
)
parser.add_argument(
    "--iterations", "-I", type=int,
    help="Number of iterations. Only clean up experiments with less or equal number of iterations"
)
parser.add_argument(
    "--sec-from-start", "-T", type=int,
    help="Seconds from start time. "
         "Only clean up experiments if less or equal number of seconds have elapsed since started"
)

args = parser.parse_args()

client = APIClient()

# Server-side filtering: unset filters are passed as None.
# NOTE(review): the "-archived" system tag presumably excludes experiments
# that are already archived - confirm against the server API.
tasks = client.tasks.get_all(
    project=[args.project] if args.project else None,
    user=[args.user] if args.user else None,
    status=[args.status] if args.status else None,
    system_tags=["-archived"]
)

# Number of experiments successfully moved to the archive
count = 0

for task in tasks:
    # Compare against None (not truthiness) so that an explicit "--iterations 0"
    # is honored as "only experiments with zero iterations" instead of
    # silently disabling the filter.
    if args.iterations is not None and (task.last_iteration or 0) > args.iterations:
        continue

    # Same reasoning as above: "--sec-from-start 0" is a valid, explicit filter.
    if args.sec_from_start is not None:
        # Experiments that never started have no elapsed time to compare
        if not task.started:
            continue
        # Drop tzinfo so the value can be subtracted from the naive utcnow()
        elapsed_sec = (datetime.utcnow() - task.started.replace(tzinfo=None)).total_seconds()
        if elapsed_sec > args.sec_from_start:
            continue

    try:
        # Archive by appending the "archived" system tag to the existing tags
        client.tasks.edit(
            task=task.id,
            system_tags=(task.system_tags or []) + ["archived"],
            force=True
        )
        count += 1
    except Exception as ex:
        # Best effort: report the failure and continue with the remaining experiments
        print("Failed editing experiment: {}".format(ex))

print("Cleaned up {} experiments".format(count))
|
||||
587
examples/dynamic_cloud_cluster.ipynb
Normal file
587
examples/dynamic_cloud_cluster.ipynb
Normal file
@@ -0,0 +1,587 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Auto-Magically Spin AWS EC2 Instances On Demand \n",
|
||||
"# and Create a Dynamic Cluster Running *Trains-Agent*\n",
|
||||
"\n",
|
||||
"### Define your budget and execute the notebook, that's it\n",
|
||||
"### You now have a fully managed cluster on AWS 🎉 🎊 "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**trains-agent**'s main goal is to quickly pull a job from an execution queue, setup the environment (as defined in the experiment, including git cloning, python packages etc.) then execute the experiment and monitor it.\n",
|
||||
"\n",
|
||||
"This notebook defines a cloud budget (currently only AWS is supported, but feel free to expand with PRs), and spins an instance the minute a job is waiting for execution. It will also spin down idle machines, saving you some $$$ :)\n",
|
||||
"\n",
|
||||
"Configuration steps\n",
|
||||
"- Define maximum budget to be used (instance type / number of instances).\n",
|
||||
"- Create new execution *queues* in the **trains-server**.\n",
|
||||
"- Define mapping between the created *queues* and an instance budget.\n",
|
||||
"\n",
|
||||
"**TL;DR - This notebook:**\n",
|
||||
"- Will spin instances if there are jobs in the execution *queues*, until it will hit the budget limit. \n",
|
||||
"- If machines are idle, it will spin them down.\n",
|
||||
"\n",
|
||||
"The controller implementation itself is stateless, meaning you can always re-execute the notebook, if for some reason it stopped.\n",
|
||||
"\n",
|
||||
"It is as simple as it sounds, but extremely powerful\n",
|
||||
"\n",
|
||||
"Enjoy your newly created dynamic cluster :)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Install & import required packages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install trains-agent\n",
|
||||
"!pip install boto3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Define AWS instance types and configuration (Instance Type, EBS, AMI etc.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# AWS EC2 machines types - default AMI - NVIDIA Deep Learning AMI 19.11.3\n",
|
||||
"RESOURCE_CONFIGURATIONS = {\n",
|
||||
" \"amazon_ec2_normal\": {\n",
|
||||
" \"instance_type\": \"g4dn.4xlarge\",\n",
|
||||
" \"is_spot\": False,\n",
|
||||
" \"availability_zone\": \"us-east-1b\",\n",
|
||||
" \"ami_id\": \"ami-07c95cafbb788face\",\n",
|
||||
" \"ebs_device_name\": \"/dev/xvda\",\n",
|
||||
" \"ebs_volume_size\": 100,\n",
|
||||
" \"ebs_volume_type\": \"gp2\",\n",
|
||||
" },\n",
|
||||
" \"amazon_ec2_high\": {\n",
|
||||
" \"instance_type\": \"g4dn.8xlarge\",\n",
|
||||
" \"is_spot\": False,\n",
|
||||
" \"availability_zone\": \"us-east-1b\",\n",
|
||||
" \"ami_id\": \"ami-07c95cafbb788face\",\n",
|
||||
" \"ebs_device_name\": \"/dev/xvda\",\n",
|
||||
" \"ebs_volume_size\": 100,\n",
|
||||
" \"ebs_volume_type\": \"gp2\",\n",
|
||||
" },\n",
|
||||
"}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Define machine budget per execution queue\n",
|
||||
"\n",
|
||||
"Now that we defined our budget, we need to connect it with the **Trains** cluster.\n",
|
||||
"\n",
|
||||
"We map each queue to a resource type (instance type).\n",
|
||||
"\n",
|
||||
"Create two queues in the WebUI:\n",
|
||||
"- Browse to http://your_trains_server_ip:8080/workers-and-queues/queues\n",
|
||||
"- Then click on the \"New Queue\" button and name your queues \"aws_normal\" and \"aws_high\" respectively\n",
|
||||
"\n",
|
||||
"The QUEUES dictionary holds the mapping between the queue name and the type/number of instances to spin up for that specific queue.\n",
|
||||
"```\n",
|
||||
"QUEUES = {\n",
|
||||
" 'queue_name': [(\"instance-type-as-defined-in-RESOURCE_CONFIGURATIONS\", max_number_of_instances), ]\n",
|
||||
"}\n",
|
||||
"```\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Trains-Agent Queues - Machines budget per Queue\n",
|
||||
"# Per queue: list of (machine type as defined in RESOURCE_CONFIGURATIONS,\n",
|
||||
"# max instances for the specific queue). Order machines from most preferred to least.\n",
|
||||
"QUEUES = {\n",
|
||||
" \"aws_normal\": [(\"amazon_ec2_normal\", 2),],\n",
|
||||
" \"aws_high\": [(\"amazon_ec2_high\", 1)],\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Credentials for your AWS account, as well as for your **Trains-Server**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# AWS credentials (leave empty to use credentials set using the aws cli)\n",
|
||||
"CLOUD_CREDENTIALS_KEY = \"\"\n",
|
||||
"CLOUD_CREDENTIALS_SECRET = \"\"\n",
|
||||
"CLOUD_CREDENTIALS_REGION = \"us-east-1\"\n",
|
||||
"\n",
|
||||
"# TRAINS configuration\n",
|
||||
"TRAINS_SERVER_WEB_SERVER = \"http://localhost:8080\"\n",
|
||||
"TRAINS_SERVER_API_SERVER = \"http://localhost:8008\"\n",
|
||||
"TRAINS_SERVER_FILES_SERVER = \"http://localhost:8081\"\n",
|
||||
"# TRAINS credentials\n",
|
||||
"TRAINS_ACCESS_KEY = \"\"\n",
|
||||
"TRAINS_SECRET_KEY = \"\"\n",
|
||||
"# Git User/Pass to be used by trains-agent,\n",
|
||||
"# leave empty if image already contains git ssh-key\n",
|
||||
"TRAINS_GIT_USER = \"\"\n",
|
||||
"TRAINS_GIT_PASS = \"\"\n",
|
||||
"\n",
|
||||
"# Additional fields for trains.conf file created on the remote instance\n",
|
||||
"# for example: 'agent.default_docker.image: \"nvidia/cuda:10.0-cudnn7-runtime\"'\n",
|
||||
"EXTRA_TRAINS_CONF = \"\"\"\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Bash script to run on instances before running trains-agent\n",
|
||||
"# Example: \"\"\"\n",
|
||||
"# echo \"This is the first line\"\n",
|
||||
"# echo \"This is the second line\"\n",
|
||||
"# \"\"\"\n",
|
||||
"EXTRA_BASH_SCRIPT = \"\"\"\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Default docker for trains-agent when running in docker mode (requires docker v19.03 and above). \n",
|
||||
"# Leave empty to run trains-agent in non-docker mode.\n",
|
||||
"DEFAULT_DOCKER_IMAGE = \"nvidia/cuda\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Controller Internal Definitions\n",
|
||||
"\n",
|
||||
"# maximum idle time in minutes, after which the instance will be shutdown\n",
|
||||
"MAX_IDLE_TIME_MIN = 15\n",
|
||||
"# polling interval in minutes\n",
|
||||
"# make sure to increase in case bash commands were added in EXTRA_BASH_SCRIPT\n",
|
||||
"POLLING_INTERVAL_MIN = 5.0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Import Packages and Budget Definition Sanity Check"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import base64\n",
|
||||
"import re\n",
|
||||
"import os\n",
|
||||
"from itertools import chain\n",
|
||||
"from operator import itemgetter\n",
|
||||
"from time import sleep, time\n",
|
||||
"\n",
|
||||
"import boto3\n",
|
||||
"from trains_agent.backend_api.session.client import APIClient"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Sanity Check - Validate Queue Resources\n",
|
||||
"if len(set(map(itemgetter(0), chain(*QUEUES.values())))) != sum(\n",
|
||||
" map(len, QUEUES.values())\n",
|
||||
"):\n",
|
||||
" print(\n",
|
||||
" \"Error: at least one resource name is used in multiple queues. \"\n",
|
||||
" \"A resource name can only appear in a single queue definition.\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# Encode EXTRA_TRAINS_CONF for later bash script usage\n",
|
||||
"EXTRA_TRAINS_CONF_ENCODED = \"\\\\\\\"\".join(EXTRA_TRAINS_CONF.split(\"\\\"\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Cloud specific implementation of spin up/down - currently supports AWS only"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cloud-specific implementation (currently, only AWS EC2 is supported)\n",
|
||||
"def spin_up_worker(resource, worker_id_prefix, queue_name):\n",
|
||||
" \"\"\"\n",
|
||||
" Creates a new worker for trains.\n",
|
||||
" First, create an instance in the cloud and install some required packages.\n",
|
||||
" Then, define trains-agent environment variables and run \n",
|
||||
" trains-agent for the specified queue.\n",
|
||||
" NOTE: - Will wait until instance is running\n",
|
||||
" - This implementation assumes the instance image already has docker installed\n",
|
||||
"\n",
|
||||
" :param str resource: resource name, as defined in RESOURCE_CONFIGURATIONS and QUEUES.\n",
|
||||
" :param str worker_id_prefix: worker name prefix\n",
|
||||
" :param str queue_name: trains queue to listen to\n",
|
||||
" \"\"\"\n",
|
||||
" resource_conf = RESOURCE_CONFIGURATIONS[resource]\n",
|
||||
" # Add worker type and AWS instance type to the worker name.\n",
|
||||
" worker_id = \"{worker_id_prefix}:{worker_type}:{instance_type}\".format(\n",
|
||||
" worker_id_prefix=worker_id_prefix,\n",
|
||||
" worker_type=resource,\n",
|
||||
" instance_type=resource_conf[\"instance_type\"],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # user_data script will automatically run when the instance is started. \n",
|
||||
" # It will install the required packages for trains-agent, configure it using \n",
|
||||
" # environment variables and run trains-agent on the required queue\n",
|
||||
" user_data = \"\"\"#!/bin/bash\n",
|
||||
" sudo apt-get update\n",
|
||||
" sudo apt-get install -y python3-dev\n",
|
||||
" sudo apt-get install -y python3-pip\n",
|
||||
" sudo apt-get install -y gcc\n",
|
||||
" sudo apt-get install -y git\n",
|
||||
" sudo apt-get install -y build-essential\n",
|
||||
" python3 -m pip install -U pip\n",
|
||||
" python3 -m pip install virtualenv\n",
|
||||
" python3 -m virtualenv trains_agent_venv\n",
|
||||
" source trains_agent_venv/bin/activate\n",
|
||||
" python -m pip install trains-agent\n",
|
||||
" echo 'agent.git_user=\\\"{git_user}\\\"' >> /root/trains.conf\n",
|
||||
" echo 'agent.git_pass=\\\"{git_pass}\\\"' >> /root/trains.conf\n",
|
||||
" echo \"{trains_conf}\" >> /root/trains.conf\n",
|
||||
" export TRAINS_API_HOST={api_server}\n",
|
||||
" export TRAINS_WEB_HOST={web_server}\n",
|
||||
" export TRAINS_FILES_HOST={files_server}\n",
|
||||
" export DYNAMIC_INSTANCE_ID=`curl http://169.254.169.254/latest/meta-data/instance-id`\n",
|
||||
" export TRAINS_WORKER_ID={worker_id}:$DYNAMIC_INSTANCE_ID\n",
|
||||
" export TRAINS_API_ACCESS_KEY='{access_key}'\n",
|
||||
" export TRAINS_API_SECRET_KEY='{secret_key}'\n",
|
||||
" {bash_script}\n",
|
||||
" source ~/.bashrc\n",
|
||||
" python -m trains_agent --config-file '/root/trains.conf' daemon --queue '{queue}' {docker}\n",
|
||||
" shutdown\n",
|
||||
" \"\"\".format(\n",
|
||||
" api_server=TRAINS_SERVER_API_SERVER,\n",
|
||||
" web_server=TRAINS_SERVER_WEB_SERVER,\n",
|
||||
" files_server=TRAINS_SERVER_FILES_SERVER,\n",
|
||||
" worker_id=worker_id,\n",
|
||||
" access_key=TRAINS_ACCESS_KEY,\n",
|
||||
" secret_key=TRAINS_SECRET_KEY,\n",
|
||||
" queue=queue_name,\n",
|
||||
" git_user=TRAINS_GIT_USER,\n",
|
||||
" git_pass=TRAINS_GIT_PASS,\n",
|
||||
" trains_conf=EXTRA_TRAINS_CONF_ENCODED,\n",
|
||||
" bash_script=EXTRA_BASH_SCRIPT,\n",
|
||||
" docker=\"--docker '{}'\".format(DEFAULT_DOCKER_IMAGE) if DEFAULT_DOCKER_IMAGE else \"\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ec2 = boto3.client(\n",
|
||||
" \"ec2\",\n",
|
||||
" aws_access_key_id=CLOUD_CREDENTIALS_KEY or None,\n",
|
||||
" aws_secret_access_key=CLOUD_CREDENTIALS_SECRET or None,\n",
|
||||
" region_name=CLOUD_CREDENTIALS_REGION\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if resource_conf[\"is_spot\"]:\n",
|
||||
" # Create a request for a spot instance in AWS\n",
|
||||
" encoded_user_data = base64.b64encode(user_data.encode(\"ascii\")).decode(\"ascii\")\n",
|
||||
" instances = ec2.request_spot_instances(\n",
|
||||
" LaunchSpecification={\n",
|
||||
" \"ImageId\": resource_conf[\"ami_id\"],\n",
|
||||
" \"InstanceType\": resource_conf[\"instance_type\"],\n",
|
||||
" \"Placement\": {\"AvailabilityZone\": resource_conf[\"availability_zone\"]},\n",
|
||||
" \"UserData\": encoded_user_data,\n",
|
||||
" \"BlockDeviceMappings\": [\n",
|
||||
" {\n",
|
||||
" \"DeviceName\": resource_conf[\"ebs_device_name\"],\n",
|
||||
" \"Ebs\": {\n",
|
||||
" \"VolumeSize\": resource_conf[\"ebs_volume_size\"],\n",
|
||||
" \"VolumeType\": resource_conf[\"ebs_volume_type\"],\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Wait until spot request is fulfilled\n",
|
||||
" request_id = instances[\"SpotInstanceRequests\"][0][\"SpotInstanceRequestId\"]\n",
|
||||
" waiter = ec2.get_waiter(\"spot_instance_request_fulfilled\")\n",
|
||||
" waiter.wait(SpotInstanceRequestIds=[request_id])\n",
|
||||
" # Get the instance object for later use\n",
|
||||
" response = ec2.describe_spot_instance_requests(\n",
|
||||
" SpotInstanceRequestIds=[request_id]\n",
|
||||
" )\n",
|
||||
" instance_id = response[\"SpotInstanceRequests\"][0][\"InstanceId\"]\n",
|
||||
"\n",
|
||||
" else:\n",
|
||||
" # Create a new EC2 instance\n",
|
||||
" instances = ec2.run_instances(\n",
|
||||
" ImageId=resource_conf[\"ami_id\"],\n",
|
||||
" MinCount=1,\n",
|
||||
" MaxCount=1,\n",
|
||||
" InstanceType=resource_conf[\"instance_type\"],\n",
|
||||
" UserData=user_data,\n",
|
||||
" InstanceInitiatedShutdownBehavior='terminate',\n",
|
||||
" BlockDeviceMappings=[\n",
|
||||
" {\n",
|
||||
" \"DeviceName\": resource_conf[\"ebs_device_name\"],\n",
|
||||
" \"Ebs\": {\n",
|
||||
" \"VolumeSize\": resource_conf[\"ebs_volume_size\"],\n",
|
||||
" \"VolumeType\": resource_conf[\"ebs_volume_type\"],\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get the instance object for later use\n",
|
||||
" instance_id = instances[\"Instances\"][0][\"InstanceId\"]\n",
|
||||
"\n",
|
||||
" instance = boto3.resource(\n",
|
||||
" \"ec2\",\n",
|
||||
" aws_access_key_id=CLOUD_CREDENTIALS_KEY or None,\n",
|
||||
" aws_secret_access_key=CLOUD_CREDENTIALS_SECRET or None,\n",
|
||||
" region_name=CLOUD_CREDENTIALS_REGION\n",
|
||||
" ).Instance(instance_id)\n",
|
||||
"\n",
|
||||
" # Wait until instance is in running state\n",
|
||||
" instance.wait_until_running()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Cloud-specific implementation (currently, only AWS EC2 is supported)\n",
|
||||
"def spin_down_worker(instance_id):\n",
|
||||
" \"\"\"\n",
|
||||
" Destroys the cloud instance.\n",
|
||||
"\n",
|
||||
" :param str instance_id: Cloud instance ID to be destroyed \n",
|
||||
" (currently, only AWS EC2 is supported)\n",
|
||||
" \"\"\"\n",
|
||||
" try:\n",
|
||||
" boto3.resource(\n",
|
||||
" \"ec2\",\n",
|
||||
" aws_access_key_id=CLOUD_CREDENTIALS_KEY or None,\n",
|
||||
" aws_secret_access_key=CLOUD_CREDENTIALS_SECRET or None,\n",
|
||||
" region_name=CLOUD_CREDENTIALS_REGION\n",
|
||||
" ).instances.filter(InstanceIds=[instance_id]).terminate()\n",
|
||||
" except Exception as ex:\n",
|
||||
" raise ex"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"###### Controller Implementation and Logic"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def supervisor():\n",
|
||||
" \"\"\"\n",
|
||||
" Spin up or down resources as necessary.\n",
|
||||
" - For every queue in QUEUES do the following:\n",
|
||||
" 1. Check if there are tasks waiting in the queue.\n",
|
||||
" 2. Check if there are enough idle workers available for those tasks.\n",
|
||||
" 3. In case more instances are required, and we haven't reached max instances allowed,\n",
|
||||
" create the required instances with regards to the maximum number defined in QUEUES\n",
|
||||
" Choose which instance to create according to their order in QUEUES. Won't create \n",
|
||||
" more instances if maximum number defined has already been reached.\n",
|
||||
" - spin down instances according to their idle time. instance which is idle for \n",
|
||||
" more than MAX_IDLE_TIME_MIN minutes would be removed.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Internal definitions\n",
|
||||
" workers_prefix = \"dynamic_aws\"\n",
|
||||
" # Worker's id in trains would be composed from:\n",
|
||||
" # prefix, name, instance_type and cloud_id separated by ':'\n",
|
||||
" workers_pattern = re.compile(\n",
|
||||
" r\"^(?P<prefix>[^:]+):(?P<name>[^:]+):(?P<instance_type>[^:]+):(?P<cloud_id>[^:]+)\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Set up the environment variables for trains\n",
|
||||
" os.environ[\"TRAINS_API_HOST\"] = TRAINS_SERVER_API_SERVER\n",
|
||||
" os.environ[\"TRAINS_WEB_HOST\"] = TRAINS_SERVER_WEB_SERVER\n",
|
||||
" os.environ[\"TRAINS_FILES_HOST\"] = TRAINS_SERVER_FILES_SERVER\n",
|
||||
" os.environ[\"TRAINS_API_ACCESS_KEY\"] = TRAINS_ACCESS_KEY\n",
|
||||
" os.environ[\"TRAINS_API_SECRET_KEY\"] = TRAINS_SECRET_KEY\n",
|
||||
" api_client = APIClient()\n",
|
||||
"\n",
|
||||
" # Verify the requested queues exist and create those that don't exist\n",
|
||||
" all_queues = [q.name for q in list(api_client.queues.get_all())]\n",
|
||||
" missing_queues = [q for q in QUEUES if q not in all_queues]\n",
|
||||
" for q in missing_queues:\n",
|
||||
" api_client.queues.create(q)\n",
|
||||
"\n",
|
||||
" idle_workers = {}\n",
|
||||
" while True:\n",
|
||||
" queue_name_to_id = {\n",
|
||||
" queue.name: queue.id for queue in api_client.queues.get_all()\n",
|
||||
" }\n",
|
||||
" resource_to_queue = {\n",
|
||||
" item[0]: queue\n",
|
||||
" for queue, resources in QUEUES.items()\n",
|
||||
" for item in resources\n",
|
||||
" }\n",
|
||||
" all_workers = [\n",
|
||||
" worker\n",
|
||||
" for worker in api_client.workers.get_all()\n",
|
||||
" if workers_pattern.match(worker.id)\n",
|
||||
" and workers_pattern.match(worker.id)[\"prefix\"] == workers_prefix\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # Workers without a task, are added to the idle list\n",
|
||||
" for worker in all_workers:\n",
|
||||
" if not hasattr(worker, \"task\") or not worker.task:\n",
|
||||
" if worker.id not in idle_workers:\n",
|
||||
" resource_name = workers_pattern.match(worker.id)[\"instance_type\"]\n",
|
||||
" idle_workers[worker.id] = (time(), resource_name, worker)\n",
|
||||
" elif hasattr(worker, \"task\") and worker.task and worker.id in idle_workers:\n",
|
||||
" idle_workers.pop(worker.id, None)\n",
|
||||
"\n",
|
||||
" required_idle_resources = [] # idle resources we'll need to keep running\n",
|
||||
" allocate_new_resources = [] # resources that will need to be started\n",
|
||||
" # Check if we have tasks waiting on one of the designated queues\n",
|
||||
" for queue in QUEUES:\n",
|
||||
" entries = api_client.queues.get_by_id(queue_name_to_id[queue]).entries\n",
|
||||
" if entries and len(entries) > 0:\n",
|
||||
" queue_resources = QUEUES[queue]\n",
|
||||
"\n",
|
||||
" # If we have an idle worker matching the required resource,\n",
|
||||
" # remove it from the required allocation resources\n",
|
||||
" free_queue_resources = [\n",
|
||||
" resource\n",
|
||||
" for _, resource, _ in idle_workers.values()\n",
|
||||
" if resource in queue_resources\n",
|
||||
" ]\n",
|
||||
" required_idle_resources.extend(free_queue_resources)\n",
|
||||
" spin_up_count = len(entries) - len(free_queue_resources)\n",
|
||||
" spin_up_resources = []\n",
|
||||
"\n",
|
||||
" # Add as many resources as possible to handle this queue's entries\n",
|
||||
" for resource, max_instances in queue_resources:\n",
|
||||
" if len(spin_up_resources) >= spin_up_count:\n",
|
||||
" break\n",
|
||||
" max_allowed = max_instances - len(\n",
|
||||
" [\n",
|
||||
" worker\n",
|
||||
" for worker in all_workers\n",
|
||||
" if workers_pattern.match(worker.id)[\"name\"] == resource\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" spin_up_resources.extend(\n",
|
||||
" [resource] * min(max_allowed, spin_up_count)\n",
|
||||
" )\n",
|
||||
" allocate_new_resources.extend(spin_up_resources)\n",
|
||||
"\n",
|
||||
" # Now we actually spin the new machines\n",
|
||||
" for resource in allocate_new_resources:\n",
|
||||
" spin_up_worker(resource, workers_prefix, resource_to_queue[resource])\n",
|
||||
"\n",
|
||||
" # Go over the idle workers list, and spin down idle workers\n",
|
||||
" for timestamp, resources, worker in idle_workers.values():\n",
|
||||
" # skip resource types that might be needed\n",
|
||||
" if resources in required_idle_resources:\n",
|
||||
" continue\n",
|
||||
" # Remove from both aws and trains all instances that are \n",
|
||||
" # idle for longer than MAX_IDLE_TIME_MIN\n",
|
||||
" if time() - timestamp > MAX_IDLE_TIME_MIN * 60.0:\n",
|
||||
" cloud_id = workers_pattern.match(worker.id)[\"cloud_id\"]\n",
|
||||
" spin_down_worker(cloud_id)\n",
|
||||
" worker.unregister()\n",
|
||||
"\n",
|
||||
" # Nothing else to do\n",
|
||||
" sleep(POLLING_INTERVAL_MIN * 60.0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Execute Forever* (the controller is stateless, so you can always re-execute the notebook)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Loop forever, it is okay we are stateless\n",
|
||||
"while True:\n",
|
||||
" try:\n",
|
||||
" supervisor()\n",
|
||||
" except Exception as ex:\n",
|
||||
" print(\"Warning! exception occurred: {ex}\\nRetry in 15 seconds\".format(ex=ex))\n",
|
||||
" sleep(15)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
},
|
||||
"pycharm": {
|
||||
"stem_cell": {
|
||||
"cell_type": "raw",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -5,6 +5,7 @@ future>=0.16.0
|
||||
humanfriendly>=2.1
|
||||
jsonmodels>=2.2
|
||||
jsonschema>=2.6.0
|
||||
packaging>=16.0
|
||||
pathlib2>=2.3.0
|
||||
psutil>=3.4.2
|
||||
pyhocon>=0.3.38
|
||||
@@ -15,9 +16,8 @@ PyYAML>=3.12
|
||||
requests-file>=1.4.2
|
||||
requests>=2.20.0
|
||||
requirements_parser>=0.2.0
|
||||
semantic_version>=2.6.0
|
||||
six>=1.11.0
|
||||
tqdm>=4.19.5
|
||||
typing>=3.6.4
|
||||
urllib3>=1.21.1
|
||||
virtualenv>=16
|
||||
virtualenv>=16,<20
|
||||
|
||||
@@ -35,7 +35,7 @@ def trains_agentyaml(tmpdir):
|
||||
def _method(template_file):
|
||||
file = tmpdir.join("trains_agent.yaml")
|
||||
with (PROJECT_ROOT / "tests/templates" / template_file).open() as f:
|
||||
code = yaml.load(f)
|
||||
code = yaml.load(f, Loader=yaml.SafeLoader)
|
||||
yield Namespace(code=code, file=file.strpath)
|
||||
file.write(yaml.dump(code))
|
||||
return _method
|
||||
|
||||
@@ -1 +1 @@
|
||||
|
||||
from .backend_api.session.client import APIClient
|
||||
|
||||
@@ -22,9 +22,12 @@
|
||||
# currently supported pip and conda
|
||||
# poetry is used if pip selected and repository contains poetry.lock file
|
||||
package_manager: {
|
||||
# supported options: pip, conda
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
pip_version: "<20",
|
||||
|
||||
# virtual environment inherits packages from system
|
||||
system_site_packages: false,
|
||||
|
||||
@@ -33,7 +36,6 @@
|
||||
|
||||
# additional artifact repositories to use when installing python packages
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]
|
||||
extra_index_url: []
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["defaults", "conda-forge", "pytorch", ]
|
||||
@@ -69,6 +71,17 @@
|
||||
# apt cache folder used mapped into docker, for ubuntu package caching
|
||||
docker_apt_cache = ~/.trains/apt-cache
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda"
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
from .v2_4 import auth
|
||||
from .v2_4 import debug
|
||||
from .v2_4 import queues
|
||||
from .v2_4 import tasks
|
||||
from .v2_4 import workers
|
||||
from .v2_5 import auth
|
||||
from .v2_5 import debug
|
||||
from .v2_5 import queues
|
||||
from .v2_5 import tasks
|
||||
from .v2_5 import workers
|
||||
from .v2_5 import events
|
||||
from .v2_5 import models
|
||||
|
||||
__all__ = [
|
||||
'auth',
|
||||
@@ -10,4 +12,6 @@ __all__ = [
|
||||
'queues',
|
||||
'tasks',
|
||||
'workers',
|
||||
'events',
|
||||
'models',
|
||||
]
|
||||
|
||||
2977
trains_agent/backend_api/services/v2_4/events.py
Normal file
2977
trains_agent/backend_api/services/v2_4/events.py
Normal file
File diff suppressed because it is too large
Load Diff
2850
trains_agent/backend_api/services/v2_4/models.py
Normal file
2850
trains_agent/backend_api/services/v2_4/models.py
Normal file
File diff suppressed because it is too large
Load Diff
0
trains_agent/backend_api/services/v2_5/__init__.py
Normal file
0
trains_agent/backend_api/services/v2_5/__init__.py
Normal file
623
trains_agent/backend_api/services/v2_5/auth.py
Normal file
623
trains_agent/backend_api/services/v2_5/auth.py
Normal file
@@ -0,0 +1,623 @@
|
||||
"""
|
||||
auth service
|
||||
|
||||
This service provides authentication management and authorization
|
||||
validation for the entire system.
|
||||
"""
|
||||
import six
|
||||
import types
|
||||
from datetime import datetime
|
||||
import enum
|
||||
|
||||
from dateutil.parser import parse as parse_datetime
|
||||
|
||||
from ....backend_api.session import Request, BatchRequest, Response, DataModel, NonStrictDataModel, CompoundRequest, schema_property, StringEnum
|
||||
|
||||
|
||||
class Credentials(NonStrictDataModel):
    """
    Access-key / secret-key pair.

    Both fields are optional; assigning ``None`` clears a field, any other
    value must be a string.

    :param access_key: Credentials access key
    :type access_key: str
    :param secret_key: Credentials secret key
    :type secret_key: str
    """
    _schema = {
        "properties": {
            "access_key": {
                "description": "Credentials access key",
                "type": ["string", "null"],
            },
            "secret_key": {
                "description": "Credentials secret key",
                "type": ["string", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, access_key=None, secret_key=None, **kwargs):
        super(Credentials, self).__init__(**kwargs)
        self.access_key = access_key
        self.secret_key = secret_key

    @schema_property("access_key")
    def access_key(self):
        return self._property_access_key

    @access_key.setter
    def access_key(self, value):
        # None is stored as-is; everything else is type-checked first.
        if value is not None:
            self.assert_isinstance(value, "access_key", six.string_types)
        self._property_access_key = value

    @schema_property("secret_key")
    def secret_key(self):
        return self._property_secret_key

    @secret_key.setter
    def secret_key(self, value):
        # None is stored as-is; everything else is type-checked first.
        if value is not None:
            self.assert_isinstance(value, "secret_key", six.string_types)
        self._property_secret_key = value
|
||||
|
||||
|
||||
class CredentialKey(NonStrictDataModel):
    """
    Credential key entry: the access key plus last-usage information.

    :param access_key:
    :type access_key: str
    :param last_used:
    :type last_used: datetime.datetime
    :param last_used_from:
    :type last_used_from: str
    """
    _schema = {
        "properties": {
            "access_key": {"description": "", "type": ["string", "null"]},
            "last_used": {
                "description": "",
                "format": "date-time",
                "type": ["string", "null"],
            },
            "last_used_from": {"description": "", "type": ["string", "null"]},
        },
        "type": "object",
    }

    def __init__(self, access_key=None, last_used=None, last_used_from=None, **kwargs):
        super(CredentialKey, self).__init__(**kwargs)
        self.access_key = access_key
        self.last_used = last_used
        self.last_used_from = last_used_from

    @schema_property("access_key")
    def access_key(self):
        return self._property_access_key

    @access_key.setter
    def access_key(self, value):
        if value is not None:
            self.assert_isinstance(value, "access_key", six.string_types)
        self._property_access_key = value

    @schema_property("last_used")
    def last_used(self):
        return self._property_last_used

    @last_used.setter
    def last_used(self, value):
        if value is not None:
            self.assert_isinstance(value, "last_used", six.string_types + (datetime,))
            # Strings are parsed so the stored value is always a datetime.
            if not isinstance(value, datetime):
                value = parse_datetime(value)
        self._property_last_used = value

    @schema_property("last_used_from")
    def last_used_from(self):
        return self._property_last_used_from

    @last_used_from.setter
    def last_used_from(self, value):
        if value is not None:
            self.assert_isinstance(value, "last_used_from", six.string_types)
        self._property_last_used_from = value
|
||||
|
||||
|
||||
|
||||
|
||||
class CreateCredentialsRequest(Request):
    """
    Creates a new set of credentials for the authenticated user.
    New key/secret is returned.
    Note: Secret will never be returned in any other API call.
    If a secret is lost or compromised, the key should be revoked
    and a new set of credentials can be created.

    The request takes no parameters (the schema declares no properties and
    forbids additional ones).
    """

    _service = "auth"
    _action = "create_credentials"
    _version = "2.1"
    _schema = {
        "additionalProperties": False,
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class CreateCredentialsResponse(Response):
    """
    Response of auth.create_credentials endpoint.

    :param credentials: Created credentials
    :type credentials: Credentials
    """
    _service = "auth"
    _action = "create_credentials"
    _version = "2.1"

    _schema = {
        "definitions": {
            "credentials": {
                "properties": {
                    "access_key": {
                        "description": "Credentials access key",
                        "type": ["string", "null"],
                    },
                    "secret_key": {
                        "description": "Credentials secret key",
                        "type": ["string", "null"],
                    },
                },
                "type": "object",
            },
        },
        "properties": {
            "credentials": {
                "description": "Created credentials",
                "oneOf": [{"$ref": "#/definitions/credentials"}, {"type": "null"}],
            },
        },
        "type": "object",
    }

    def __init__(self, credentials=None, **kwargs):
        super(CreateCredentialsResponse, self).__init__(**kwargs)
        self.credentials = credentials

    @schema_property("credentials")
    def credentials(self):
        return self._property_credentials

    @credentials.setter
    def credentials(self, value):
        # A plain dict is converted to a Credentials object; None is stored
        # untouched; any other value must already be a Credentials instance.
        if isinstance(value, dict):
            value = Credentials.from_dict(value)
        elif value is not None:
            self.assert_isinstance(value, "credentials", Credentials)
        self._property_credentials = value
|
||||
|
||||
|
||||
|
||||
|
||||
class EditUserRequest(Request):
    """
    Edit a user's auth data properties

    :param user: User ID
    :type user: str
    :param role: The new user's role within the company
    :type role: str
    """

    _service = "auth"
    _action = "edit_user"
    _version = "2.1"
    _schema = {
        "definitions": {},
        "properties": {
            "role": {
                "description": "The new user's role within the company",
                "enum": ["admin", "superuser", "user", "annotator"],
                "type": ["string", "null"],
            },
            "user": {"description": "User ID", "type": ["string", "null"]},
        },
        "type": "object",
    }

    def __init__(self, user=None, role=None, **kwargs):
        super(EditUserRequest, self).__init__(**kwargs)
        self.user = user
        self.role = role

    @schema_property("user")
    def user(self):
        return self._property_user

    @user.setter
    def user(self, value):
        if value is not None:
            self.assert_isinstance(value, "user", six.string_types)
        self._property_user = value

    @schema_property("role")
    def role(self):
        return self._property_role

    @role.setter
    def role(self, value):
        # Only the type is checked here; the allowed role names live in the
        # schema's enum.
        if value is not None:
            self.assert_isinstance(value, "role", six.string_types)
        self._property_role = value
|
||||
|
||||
|
||||
class EditUserResponse(Response):
    """
    Response of auth.edit_user endpoint.

    :param updated: Number of users updated (0 or 1)
    :type updated: float
    :param fields: Updated fields names and values
    :type fields: dict
    """
    _service = "auth"
    _action = "edit_user"
    _version = "2.1"

    _schema = {
        "definitions": {},
        "properties": {
            "fields": {
                "additionalProperties": True,
                "description": "Updated fields names and values",
                "type": ["object", "null"],
            },
            "updated": {
                "description": "Number of users updated (0 or 1)",
                "enum": [0, 1],
                "type": ["number", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, updated=None, fields=None, **kwargs):
        super(EditUserResponse, self).__init__(**kwargs)
        self.updated = updated
        self.fields = fields

    @schema_property("updated")
    def updated(self):
        return self._property_updated

    @updated.setter
    def updated(self, value):
        # Accepts any integer type as well as float.
        if value is not None:
            self.assert_isinstance(value, "updated", six.integer_types + (float,))
        self._property_updated = value

    @schema_property("fields")
    def fields(self):
        return self._property_fields

    @fields.setter
    def fields(self, value):
        if value is not None:
            self.assert_isinstance(value, "fields", (dict,))
        self._property_fields = value
|
||||
|
||||
|
||||
class GetCredentialsRequest(Request):
    """
    Returns all existing credential keys for the authenticated user.
    Note: Only credential keys are returned.

    The request takes no parameters (the schema declares no properties and
    forbids additional ones).
    """

    _service = "auth"
    _action = "get_credentials"
    _version = "2.1"
    _schema = {
        "additionalProperties": False,
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class GetCredentialsResponse(Response):
    """
    Response of auth.get_credentials endpoint.

    :param credentials: List of credentials, each with an empty secret field.
    :type credentials: Sequence[CredentialKey]
    """
    _service = "auth"
    _action = "get_credentials"
    _version = "2.1"

    _schema = {
        "definitions": {
            "credential_key": {
                "properties": {
                    "access_key": {"description": "", "type": ["string", "null"]},
                    "last_used": {
                        "description": "",
                        "format": "date-time",
                        "type": ["string", "null"],
                    },
                    "last_used_from": {
                        "description": "",
                        "type": ["string", "null"],
                    },
                },
                "type": "object",
            },
        },
        "properties": {
            "credentials": {
                "description": "List of credentials, each with an empty secret field.",
                "items": {"$ref": "#/definitions/credential_key"},
                "type": ["array", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, credentials=None, **kwargs):
        super(GetCredentialsResponse, self).__init__(**kwargs)
        self.credentials = credentials

    @schema_property("credentials")
    def credentials(self):
        return self._property_credentials

    @credentials.setter
    def credentials(self, value):
        if value is not None:
            self.assert_isinstance(value, "credentials", (list, tuple))
            if any(isinstance(entry, dict) for entry in value):
                # At least one raw dict present: convert the dict entries and
                # pass the remaining entries through unchanged.
                value = [
                    CredentialKey.from_dict(entry) if isinstance(entry, dict) else entry
                    for entry in value
                ]
            else:
                self.assert_isinstance(value, "credentials", CredentialKey, is_array=True)
        self._property_credentials = value
|
||||
|
||||
|
||||
|
||||
|
||||
class LoginRequest(Request):
    """
    Get a token based on supplied credentials (key/secret).
    Intended for use by users with key/secret credentials that wish to obtain a token
    for use with other services. Token will be limited by the same permissions that
    exist for the credentials used in this call.

    :param expiration_sec: Requested token expiration time in seconds. Not
        guaranteed, might be overridden by the service
    :type expiration_sec: int
    """

    _service = "auth"
    _action = "login"
    _version = "2.1"
    _schema = {
        "definitions": {},
        "properties": {
            "expiration_sec": {
                "description": "Requested token expiration time in seconds. \n Not guaranteed, might be overridden by the service",
                "type": ["integer", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, expiration_sec=None, **kwargs):
        super(LoginRequest, self).__init__(**kwargs)
        self.expiration_sec = expiration_sec

    @schema_property("expiration_sec")
    def expiration_sec(self):
        return self._property_expiration_sec

    @expiration_sec.setter
    def expiration_sec(self, value):
        if value is not None:
            # A float holding a whole number (e.g. 60.0) is narrowed to int
            # before the integer type check.
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            self.assert_isinstance(value, "expiration_sec", six.integer_types)
        self._property_expiration_sec = value
|
||||
|
||||
|
||||
class LoginResponse(Response):
    """
    Response of auth.login endpoint.

    :param token: Token string
    :type token: str
    """
    _service = "auth"
    _action = "login"
    _version = "2.1"

    _schema = {
        "definitions": {},
        "properties": {
            "token": {"description": "Token string", "type": ["string", "null"]},
        },
        "type": "object",
    }

    def __init__(self, token=None, **kwargs):
        super(LoginResponse, self).__init__(**kwargs)
        self.token = token

    @schema_property("token")
    def token(self):
        return self._property_token

    @token.setter
    def token(self, value):
        if value is not None:
            self.assert_isinstance(value, "token", six.string_types)
        self._property_token = value
|
||||
|
||||
|
||||
class LogoutRequest(Request):
    """
    Removes the authentication cookie from the current session

    The request takes no parameters (the schema forbids additional
    properties and declares none).
    """

    _service = "auth"
    _action = "logout"
    _version = "2.2"
    _schema = {
        "additionalProperties": False,
        "definitions": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class LogoutResponse(Response):
    """
    Response of auth.logout endpoint.

    Carries no data fields.
    """
    _service = "auth"
    _action = "logout"
    _version = "2.2"

    _schema = {
        "additionalProperties": False,
        "definitions": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class RevokeCredentialsRequest(Request):
    """
    Revokes (and deletes) a set (key, secret) of credentials for
    the authenticated user.

    :param access_key: Credentials key
    :type access_key: str
    """

    _service = "auth"
    _action = "revoke_credentials"
    _version = "2.1"
    _schema = {
        'definitions': {},
        'properties': {
            'access_key': {
                'description': 'Credentials key',
                'type': ['string', 'null'],
            },
        },
        # The required entry must name a declared property. It previously
        # listed 'key_id', which this request never carries, so validating a
        # well-formed request against the schema could not succeed.
        'required': ['access_key'],
        'type': 'object',
    }

    def __init__(self, access_key=None, **kwargs):
        super(RevokeCredentialsRequest, self).__init__(**kwargs)
        self.access_key = access_key

    @schema_property('access_key')
    def access_key(self):
        return self._property_access_key

    @access_key.setter
    def access_key(self, value):
        # None clears the field; any other value must be a string.
        if value is None:
            self._property_access_key = None
            return

        self.assert_isinstance(value, "access_key", six.string_types)
        self._property_access_key = value
|
||||
|
||||
|
||||
class RevokeCredentialsResponse(Response):
    """
    Response of auth.revoke_credentials endpoint.

    :param revoked: Number of credentials revoked
    :type revoked: int
    """
    _service = "auth"
    _action = "revoke_credentials"
    _version = "2.1"

    _schema = {
        "definitions": {},
        "properties": {
            "revoked": {
                "description": "Number of credentials revoked",
                "enum": [0, 1],
                "type": ["integer", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, revoked=None, **kwargs):
        super(RevokeCredentialsResponse, self).__init__(**kwargs)
        self.revoked = revoked

    @schema_property("revoked")
    def revoked(self):
        return self._property_revoked

    @revoked.setter
    def revoked(self, value):
        if value is not None:
            # A float holding a whole number is narrowed to int before the
            # integer type check.
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            self.assert_isinstance(value, "revoked", six.integer_types)
        self._property_revoked = value
|
||||
|
||||
|
||||
|
||||
|
||||
# Pairs every auth request class with the response class of the same endpoint.
response_mapping = dict((
    (LoginRequest, LoginResponse),
    (LogoutRequest, LogoutResponse),
    (CreateCredentialsRequest, CreateCredentialsResponse),
    (GetCredentialsRequest, GetCredentialsResponse),
    (RevokeCredentialsRequest, RevokeCredentialsResponse),
    (EditUserRequest, EditUserResponse),
))
|
||||
194
trains_agent/backend_api/services/v2_5/debug.py
Normal file
194
trains_agent/backend_api/services/v2_5/debug.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
debug service
|
||||
|
||||
Debugging utilities
|
||||
"""
|
||||
import six
|
||||
import types
|
||||
from datetime import datetime
|
||||
import enum
|
||||
|
||||
from dateutil.parser import parse as parse_datetime
|
||||
|
||||
from ....backend_api.session import Request, BatchRequest, Response, DataModel, NonStrictDataModel, CompoundRequest, schema_property, StringEnum
|
||||
|
||||
|
||||
class ApiexRequest(Request):
    """
    Request for the debug.apiex endpoint.

    The schema declares no properties and no required fields.
    """

    _service = "debug"
    _action = "apiex"
    _version = "1.5"
    _schema = {
        "definitions": {},
        "properties": {},
        "required": [],
        "type": "object",
    }
|
||||
|
||||
|
||||
class ApiexResponse(Response):
    """
    Response of debug.apiex endpoint.

    The schema declares no properties.
    """
    _service = "debug"
    _action = "apiex"
    _version = "1.5"

    _schema = {
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class EchoRequest(Request):
    """
    Return request data

    The schema declares no properties.
    """

    _service = "debug"
    _action = "echo"
    _version = "1.5"
    _schema = {
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class EchoResponse(Response):
    """
    Response of debug.echo endpoint.

    The schema declares no properties.
    """
    _service = "debug"
    _action = "echo"
    _version = "1.5"

    _schema = {
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class ExRequest(Request):
    """
    Request for the debug.ex endpoint.

    The schema declares no properties and no required fields.
    """

    _service = "debug"
    _action = "ex"
    _version = "1.5"
    _schema = {
        "definitions": {},
        "properties": {},
        "required": [],
        "type": "object",
    }
|
||||
|
||||
|
||||
class ExResponse(Response):
    """
    Response of debug.ex endpoint.

    The schema declares no properties.
    """
    _service = "debug"
    _action = "ex"
    _version = "1.5"

    _schema = {
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class PingRequest(Request):
    """
    Return a message. Does not require authorization.

    The schema declares no properties.
    """

    _service = "debug"
    _action = "ping"
    _version = "1.5"
    _schema = {
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class PingResponse(Response):
    """
    Response of debug.ping endpoint.

    :param msg: A friendly message
    :type msg: str
    """
    _service = "debug"
    _action = "ping"
    _version = "1.5"

    _schema = {
        "definitions": {},
        "properties": {
            "msg": {
                "description": "A friendly message",
                "type": ["string", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, msg=None, **kwargs):
        super(PingResponse, self).__init__(**kwargs)
        self.msg = msg

    @schema_property("msg")
    def msg(self):
        return self._property_msg

    @msg.setter
    def msg(self, value):
        # None clears the field; any other value must be a string.
        if value is not None:
            self.assert_isinstance(value, "msg", six.string_types)
        self._property_msg = value
|
||||
|
||||
|
||||
class PingAuthRequest(Request):
    """
    Return a message. Requires authorization.

    The schema declares no properties.
    """

    _service = "debug"
    _action = "ping_auth"
    _version = "1.5"
    _schema = {
        "definitions": {},
        "properties": {},
        "type": "object",
    }
|
||||
|
||||
|
||||
class PingAuthResponse(Response):
    """
    Response of debug.ping_auth endpoint.

    :param msg: A friendly message
    :type msg: str
    """
    _service = "debug"
    _action = "ping_auth"
    _version = "1.5"

    _schema = {
        "definitions": {},
        "properties": {
            "msg": {
                "description": "A friendly message",
                "type": ["string", "null"],
            },
        },
        "type": "object",
    }

    def __init__(self, msg=None, **kwargs):
        super(PingAuthResponse, self).__init__(**kwargs)
        self.msg = msg

    @schema_property("msg")
    def msg(self):
        return self._property_msg

    @msg.setter
    def msg(self, value):
        # None clears the field; any other value must be a string.
        if value is not None:
            self.assert_isinstance(value, "msg", six.string_types)
        self._property_msg = value
|
||||
|
||||
|
||||
# Pairs every debug request class with the response class of the same endpoint.
response_mapping = dict((
    (EchoRequest, EchoResponse),
    (PingRequest, PingResponse),
    (PingAuthRequest, PingAuthResponse),
    (ApiexRequest, ApiexResponse),
    (ExRequest, ExResponse),
))
|
||||
3000
trains_agent/backend_api/services/v2_5/events.py
Normal file
3000
trains_agent/backend_api/services/v2_5/events.py
Normal file
File diff suppressed because it is too large
Load Diff
2850
trains_agent/backend_api/services/v2_5/models.py
Normal file
2850
trains_agent/backend_api/services/v2_5/models.py
Normal file
File diff suppressed because it is too large
Load Diff
2198
trains_agent/backend_api/services/v2_5/queues.py
Normal file
2198
trains_agent/backend_api/services/v2_5/queues.py
Normal file
File diff suppressed because it is too large
Load Diff
7053
trains_agent/backend_api/services/v2_5/tasks.py
Normal file
7053
trains_agent/backend_api/services/v2_5/tasks.py
Normal file
File diff suppressed because it is too large
Load Diff
2368
trains_agent/backend_api/services/v2_5/workers.py
Normal file
2368
trains_agent/backend_api/services/v2_5/workers.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -139,13 +139,25 @@ class Response(object):
|
||||
:param dest: if all of a response's data is contained in one field, use that field
|
||||
:type dest: Text
|
||||
"""
|
||||
self.response = None
|
||||
self._result = result
|
||||
response = getattr(result, "response", result)
|
||||
if getattr(response, "_service") == "events" and \
|
||||
getattr(response, "_action") in ("scalar_metrics_iter_histogram",
|
||||
"multi_task_scalar_metrics_iter_histogram",
|
||||
"vector_metrics_iter_histogram",
|
||||
):
|
||||
# put all the response data under metrics:
|
||||
response.metrics = result.response_data
|
||||
if 'metrics' not in response.__class__._get_data_props():
|
||||
response.__class__._data_props_list['metrics'] = 'metrics'
|
||||
if dest:
|
||||
response = getattr(response, dest)
|
||||
self.response = response
|
||||
|
||||
def __getattr__(self, attr):
|
||||
if self.response is None:
|
||||
return None
|
||||
return getattr(self.response, attr)
|
||||
|
||||
@property
|
||||
@@ -493,6 +505,7 @@ class APIClient(object):
|
||||
queues = None # type: Any
|
||||
tasks = None # type: Any
|
||||
workers = None # type: Any
|
||||
events = None # type: Any
|
||||
|
||||
def __init__(self, session=None, api_version=None):
|
||||
self.session = session or StrictSession()
|
||||
|
||||
@@ -358,7 +358,7 @@ class ServiceCommandSection(BaseCommandSection):
|
||||
**locals())
|
||||
self.exit(message)
|
||||
|
||||
message = 'Could not find {} with name "{}"'.format(service.rstrip('s'), name)
|
||||
message = 'Could not find {} with name/id "{}"'.format(service.rstrip('s'), name)
|
||||
|
||||
if not response:
|
||||
raise NameResolutionError(message)
|
||||
|
||||
@@ -1,19 +1,21 @@
|
||||
from __future__ import print_function
|
||||
|
||||
from six.moves import input
|
||||
from pyhocon import ConfigFactory
|
||||
from pyhocon import ConfigFactory, ConfigMissingException
|
||||
from pathlib2 import Path
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from trains_agent.backend_api.session import Session
|
||||
from trains_agent.backend_api.session.defs import ENV_HOST
|
||||
from trains_agent.backend_config.defs import LOCAL_CONFIG_FILES
|
||||
|
||||
|
||||
description = """
|
||||
Please create new credentials using the web app: {}/profile
|
||||
In the Admin page, press "Create new credentials", then press "Copy to clipboard"
|
||||
Please create new trains credentials through the profile page in your trains web app (e.g. https://demoapp.trains.allegro.ai/profile)
|
||||
In the profile page, press "Create new credentials", then press "Copy to clipboard".
|
||||
|
||||
Paste credentials here: """
|
||||
Paste copied configuration here:
|
||||
"""
|
||||
|
||||
def_host = 'http://localhost:8080'
|
||||
try:
|
||||
@@ -38,20 +40,39 @@ def main():
|
||||
print('Leaving setup, feel free to edit the configuration file.')
|
||||
return
|
||||
|
||||
print(host_description)
|
||||
web_host = input_url('Web Application Host', '')
|
||||
parsed_host = verify_url(web_host)
|
||||
print(description, end='')
|
||||
sentinel = ''
|
||||
parse_input = '\n'.join(iter(input, sentinel))
|
||||
credentials = None
|
||||
api_host = None
|
||||
web_server = None
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
parsed = ConfigFactory.parse_string(parse_input)
|
||||
if parsed:
|
||||
# Take the credentials in raw form or from api section
|
||||
credentials = get_parsed_field(parsed, ["credentials"])
|
||||
api_host = get_parsed_field(parsed, ["api_server", "host"])
|
||||
web_server = get_parsed_field(parsed, ["web_server"])
|
||||
except Exception:
|
||||
credentials = credentials or None
|
||||
api_host = api_host or None
|
||||
web_server = web_server or None
|
||||
|
||||
if parsed_host.port == 8008:
|
||||
print('Port 8008 is the api port. Replacing 8080 with 8008 for Web application')
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8080', 1) + parsed_host.path
|
||||
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8081', 1) + parsed_host.path
|
||||
elif parsed_host.port == 8080:
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8008', 1) + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8081', 1) + parsed_host.path
|
||||
elif parsed_host.netloc.startswith('demoapp.'):
|
||||
while not credentials or set(credentials) != {"access_key", "secret_key"}:
|
||||
print('Could not parse credentials, please try entering them manually.')
|
||||
credentials = read_manual_credentials()
|
||||
|
||||
print('Detected credentials key=\"{}\" secret=\"{}\"'.format(credentials['access_key'],
|
||||
credentials['secret_key'][0:4] + "***"))
|
||||
if api_host:
|
||||
api_host = input_url('API Host', api_host)
|
||||
else:
|
||||
print(host_description)
|
||||
api_host = input_url('API Host', '')
|
||||
parsed_host = verify_url(api_host)
|
||||
|
||||
if parsed_host.netloc.startswith('demoapp.'):
|
||||
# this is our demo server
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('demoapp.', 'demoapi.', 1) + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
@@ -73,61 +94,50 @@ def main():
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('api.', 'app.', 1) + parsed_host.path
|
||||
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace('api.', 'files.', 1) + parsed_host.path
|
||||
elif parsed_host.port == 8008:
|
||||
print('Port 8008 is the api port. Replacing 8080 with 8008 for Web application')
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8080', 1) + parsed_host.path
|
||||
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8008', ':8081', 1) + parsed_host.path
|
||||
elif parsed_host.port == 8080:
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8008', 1) + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
files_host = parsed_host.scheme + "://" + parsed_host.netloc.replace(':8080', ':8081', 1) + parsed_host.path
|
||||
else:
|
||||
api_host = ''
|
||||
web_host = ''
|
||||
files_host = ''
|
||||
if not parsed_host.port:
|
||||
print('Host port not detected, do you wish to use the default 8008 port n/[y]? ', end='')
|
||||
print('Host port not detected, do you wish to use the default 8080 port n/[y]? ', end='')
|
||||
replace_port = input().lower()
|
||||
if not replace_port or replace_port == 'y' or replace_port == 'yes':
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8008' + parsed_host.path
|
||||
web_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8080' + parsed_host.path
|
||||
files_host = parsed_host.scheme + "://" + parsed_host.netloc + ':8081' + parsed_host.path
|
||||
elif not replace_port or replace_port.lower() == 'n' or replace_port.lower() == 'no':
|
||||
web_host = input_host_port("Web", parsed_host)
|
||||
api_host = input_host_port("API", parsed_host)
|
||||
files_host = input_host_port("Files", parsed_host)
|
||||
if not api_host:
|
||||
api_host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
|
||||
api_host = input_url('API Host', api_host)
|
||||
web_host = input_url('Web Application Host', web_server if web_server else web_host)
|
||||
files_host = input_url('File Store Host', files_host)
|
||||
|
||||
print('\nTRAINS Hosts configuration:\nAPI: {}\nWeb App: {}\nFile Store: {}\n'.format(
|
||||
api_host, web_host, files_host))
|
||||
print('\nTRAINS Hosts configuration:\nWeb App: {}\nAPI: {}\nFile Store: {}\n'.format(
|
||||
web_host, api_host, files_host))
|
||||
|
||||
while True:
|
||||
print(description.format(web_host), end='')
|
||||
parse_input = input()
|
||||
# check if these are valid credentials
|
||||
credentials = None
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
parsed = ConfigFactory.parse_string(parse_input)
|
||||
if parsed:
|
||||
credentials = parsed.get("credentials", None)
|
||||
except Exception:
|
||||
credentials = None
|
||||
|
||||
if not credentials or set(credentials) != {"access_key", "secret_key"}:
|
||||
print('Could not parse user credentials, try again one after the other.')
|
||||
credentials = {}
|
||||
# parse individual
|
||||
print('Enter user access key: ', end='')
|
||||
credentials['access_key'] = input()
|
||||
print('Enter user secret: ', end='')
|
||||
credentials['secret_key'] = input()
|
||||
|
||||
print('Detected credentials key=\"{}\" secret=\"{}\"'.format(credentials['access_key'],
|
||||
credentials['secret_key'], ))
|
||||
|
||||
from trains_agent.backend_api.session import Session
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
print('Verifying credentials ...')
|
||||
Session(api_key=credentials['access_key'], secret_key=credentials['secret_key'], host=api_host)
|
||||
print('Credentials verified!')
|
||||
retry = 1
|
||||
max_retries = 2
|
||||
while retry <= max_retries: # Up to 2 tries by the user
|
||||
if verify_credentials(api_host, credentials):
|
||||
break
|
||||
except Exception:
|
||||
print('Error: could not verify credentials: host={} access={} secret={}'.format(
|
||||
api_host, credentials['access_key'], credentials['secret_key']))
|
||||
retry += 1
|
||||
if retry < max_retries + 1:
|
||||
credentials = read_manual_credentials()
|
||||
else:
|
||||
print('Exiting setup without creating configuration file')
|
||||
return
|
||||
|
||||
# get GIT User/Pass for cloning
|
||||
print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
|
||||
@@ -140,6 +150,18 @@ def main():
|
||||
git_user = None
|
||||
git_pass = None
|
||||
|
||||
# get extra-index-url for pip installations
|
||||
extra_index_urls = []
|
||||
print('\nEnter additional artifact repository (extra-index-url) to use when installing python packages '
|
||||
'(leave blank if not required):', end='')
|
||||
index_url = input().strip()
|
||||
while index_url:
|
||||
extra_index_urls.append(index_url)
|
||||
print('Another artifact repository? (enter another url or leave blank if done):', end='')
|
||||
index_url = input().strip()
|
||||
if len(extra_index_urls):
|
||||
print("The following artifact repositories will be added:\n\t- {}".format("\n\t- ".join(extra_index_urls)))
|
||||
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
conf_folder = Path(__file__).parent.absolute() / '..' / 'backend_api' / 'config' / 'default'
|
||||
@@ -173,6 +195,10 @@ def main():
|
||||
'agent.git_pass=\"{}\"\n' \
|
||||
'\n'.format(git_user or '', git_pass or '')
|
||||
f.write(git_credentials)
|
||||
extra_index_str = '# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]\n' \
|
||||
'agent.package_manager.extra_index_url= ' \
|
||||
'[\n{}\n]\n\n'.format("\n".join(map("\"{}\"".format, extra_index_urls)))
|
||||
f.write(extra_index_str)
|
||||
f.write(default_conf)
|
||||
except Exception:
|
||||
print('Error! Could not write configuration file at: {}'.format(str(conf_file)))
|
||||
@@ -182,18 +208,72 @@ def main():
|
||||
print('TRAINS-AGENT setup completed successfully.')
|
||||
|
||||
|
||||
def verify_credentials(api_host, credentials):
    """
    Check if the credentials are valid by opening a Session against the API host.

    :param api_host: API server URL; when empty, no verification is attempted
    :type api_host: str
    :param credentials: mapping holding the 'access_key' and 'secret_key' entries
    :type credentials: dict
    :return: True if a Session was created with the credentials, False otherwise
        (including when no api_host was given)
    """
    # noinspection PyBroadException
    try:
        print('Verifying credentials ...')
        if not api_host:
            print("Can't verify credentials")
            return False
        Session(api_key=credentials['access_key'], secret_key=credentials['secret_key'], host=api_host)
        print('Credentials verified!')
        return True
    except Exception:
        # Do not echo the full secret back to the terminal: show only a short prefix,
        # the same way the "Detected credentials" message masks it.
        secret = credentials.get('secret_key')
        masked_secret = (str(secret)[0:4] + "***") if secret else secret
        print('Error: could not verify credentials: key={} secret={}'.format(
            credentials.get('access_key'), masked_secret))
        return False
|
||||
|
||||
|
||||
def get_parsed_field(parsed_config, fields):
    """
    Parse a value from the web profile page, 'copy to clipboard' option

    The value is first looked up inside the "api" section, using fields[0].
    If that lookup raises ConfigMissingException, the layout of older web versions is tried:
    a top-level key named fields[0] when one name was given, or fields[1] when two were given.

    :param parsed_config: The parsed value from the web ui
    :type parsed_config: Config object
    :param fields: list of values to parse, will parse by the list order
    :type fields: List[str]
    :return: parsed value if found, None else
    """
    try:
        return parsed_config.get("api").get(fields[0])
    except ConfigMissingException:  # fallback - try to parse the field like it was in web older version
        if len(fields) not in (1, 2):
            return None
        # fields[-1] is fields[0] for a single name and fields[1] for a pair.
        # An explicit default makes a missing key return None (as documented) instead of raising.
        return parsed_config.get(fields[-1], None)
|
||||
|
||||
|
||||
def read_manual_credentials():
    """
    Interactively ask the user for the access key and then for the secret.

    :return: dict with the typed values stored under 'access_key' and 'secret_key'
    """
    prompts = (
        ('Enter user access key: ', 'access_key'),
        ('Enter user secret: ', 'secret_key'),
    )
    entered = {}
    for prompt, key in prompts:
        print(prompt, end='')
        entered[key] = input()
    return entered
|
||||
|
||||
|
||||
def input_url(host_type, host=None):
|
||||
while True:
|
||||
print('{} configured to: [{}] '.format(host_type, host), end='')
|
||||
parse_input = input()
|
||||
if host and (not parse_input or parse_input.lower() == 'yes' or parse_input.lower() == 'y'):
|
||||
break
|
||||
if parse_input and verify_url(parse_input):
|
||||
host = parse_input
|
||||
parsed_host = verify_url(parse_input) if parse_input else None
|
||||
if parse_input and parsed_host:
|
||||
host = parsed_host.scheme + "://" + parsed_host.netloc + parsed_host.path
|
||||
break
|
||||
return host
|
||||
|
||||
|
||||
def input_host_port(host_type, parsed_host):
    """
    Ask the user for the port of one host type and build that host's URL.

    :param host_type: label shown in the prompt (the callers pass "Web", "API" and "Files")
    :param parsed_host: parsed URL object providing the scheme, netloc and path attributes
    :return: scheme://netloc[:port]path, the port part is added only if something was typed
    """
    print('Enter port for {} host '.format(host_type), end='')
    # strip surrounding whitespace so a stray space cannot end up inside the URL
    replace_port = input().strip().lower()
    port_suffix = ':{}'.format(replace_port) if replace_port else ''
    return parsed_host.scheme + "://" + parsed_host.netloc + port_suffix + parsed_host.path
|
||||
|
||||
|
||||
def verify_url(parse_input):
|
||||
try:
|
||||
if not parse_input.startswith('http://') and not parse_input.startswith('https://'):
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -73,6 +73,12 @@ ENVIRONMENT_CONFIG = {
|
||||
"agent.cpu_only": EnvironmentConfig(
|
||||
"TRAINS_CPU_ONLY", "ALG_CPU_ONLY", "CPU_ONLY", type=bool
|
||||
),
|
||||
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
|
||||
"sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
|
||||
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
|
||||
"sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
|
||||
'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")},
|
||||
"sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
|
||||
}
|
||||
|
||||
CONFIG_FILE_ENV = EnvironmentConfig("ALG_CONFIG_FILE")
|
||||
@@ -114,6 +120,8 @@ DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
|
||||
PIP_EXTRA_INDICES = [
|
||||
]
|
||||
DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
|
||||
ENV_TASK_EXECUTE_AS_USER = 'TRAINS_AGENT_EXEC_USER'
|
||||
ENV_K8S_HOST_MOUNT = 'TRAINS_AGENT_K8S_HOST_MOUNT'
|
||||
|
||||
|
||||
class FileBuffering(IntEnum):
|
||||
|
||||
169
trains_agent/glue/k8s.py
Normal file
169
trains_agent/glue/k8s.py
Normal file
@@ -0,0 +1,169 @@
|
||||
from __future__ import print_function, division, unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from time import sleep
|
||||
from typing import Text, List
|
||||
|
||||
from pyhocon import HOCONConverter
|
||||
|
||||
from trains_agent.commands.events import Events
|
||||
from trains_agent.commands.worker import Worker
|
||||
from trains_agent.helper.process import get_bash_output
|
||||
from trains_agent.helper.resource_monitor import ResourceMonitor
|
||||
|
||||
|
||||
class K8sIntegration(Worker):
    """
    Worker that pulls tasks from queues and schedules each of them for execution through kubectl.

    Every pulled task is first pushed into a dedicated pending queue (K8S_PENDING_QUEUE by default),
    so tasks handed over to the k8s scheduler stay visible.
    """

    K8S_PENDING_QUEUE = "k8s_scheduler"

    # The pod name is built with '-' only: kubernetes rejects object names containing '_'
    KUBECTL_RUN_CMD = "kubectl run trains-id-{task_id} " \
                      "--image {docker_image} " \
                      "--restart=Never --replicas=1 " \
                      "--generator=run-pod/v1"

    # Removes pods labeled TRAINS=agent that are neither Pending nor Running
    KUBECTL_DELETE_CMD = "kubectl delete pods " \
                         "--selector=TRAINS=agent " \
                         "--field-selector=status.phase!=Pending,status.phase!=Running"

    # Script executed inside the container, formatted with the task id
    CONTAINER_BASH_SCRIPT = "apt-get install -y git python-pip && " \
                            "pip install trains-agent && " \
                            "python -u -m trains_agent execute --full-monitoring --require-queue --id {}"

    def __init__(self, k8s_pending_queue_name=None, kubectl_cmd=None, container_bash_script=None, debug=False):
        """
        Initialize the k8s integration glue layer daemon

        :param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
        :param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
            example: "task={task_id} image={docker_image} queue_id={queue_id}"
            or a callable function: kubectl_cmd(task_id, docker_image, queue_id, task_data)
        :param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
        :param bool debug: Switch logging on
        """
        super(K8sIntegration, self).__init__()
        self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
        self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
        self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
        # Always use system packages, because we will be running inside a docker
        self._session.config.put("agent.package_manager.system_site_packages", True)
        # Add debug logging
        if debug:
            self.log.logger.disabled = False
            self.log.logger.setLevel(logging.INFO)

    def run_one_task(self, queue: Text, task_id: Text, worker_args=None):
        """
        Schedule a single task: push it to the pending queue, then launch it with kubectl.

        If the task cannot be pushed to the pending queue the error is logged and nothing is launched.

        :param queue: queue the task was pulled from (available to the kubectl command as queue_id)
        :param task_id: ID of the task to schedule
        :param worker_args: not used by this implementation
        """
        # local import, only needed here to safely embed the configuration in a shell command line
        from shlex import quote

        task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]

        # push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
        try:
            self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
                                                   status_reason='k8s pending scheduler')
        except Exception as e:
            self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
                task_id, self.k8s_pending_queue_name, e))
            return

        if task_data.execution.docker_cmd:
            docker_image = task_data.execution.docker_cmd
        else:
            docker_image = str(os.environ.get("TRAINS_DOCKER_IMAGE") or
                               self._session.config.get("agent.default_docker.image", "nvidia/cuda"))

        # take the first part, this is the docker image name (not arguments)
        docker_image = docker_image.split()[0]

        # shell-quote the configuration text, a single quote inside it must not terminate the echo argument
        create_trains_conf = "echo {} >> ~/trains.conf && ".format(
            quote(HOCONConverter.to_hocon(self._session.config._config)))

        if callable(self.kubectl_cmd):
            kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
        else:
            kubectl_cmd = self.kubectl_cmd.format(task_id=task_id, docker_image=docker_image, queue_id=queue)

        # make sure we have a list
        if isinstance(kubectl_cmd, str):
            kubectl_cmd = kubectl_cmd.split()

        # build a new list instead of extending in place, the sequence may be owned by a user callable
        kubectl_cmd = list(kubectl_cmd) + [
            "--labels=TRAINS=agent", "--command", "--", "/bin/sh", "-c",
            create_trains_conf + self.container_bash_script.format(task_id)]
        process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _, error = process.communicate()
        self.log.info("K8s scheduling experiment task id={}".format(task_id))
        if error:
            # never let an undecodable byte in kubectl's stderr turn into a second failure
            self.log.error("Running kubectl encountered an error: {}".format(
                error if isinstance(error, str) else error.decode(errors='replace')))

    def run_tasks_loop(self, queues: List[Text], worker_params):
        """
        :summary: Pull and run tasks from queues.
        :description: 1. Go through ``queues`` by order.
                      2. Try getting the next task for each and run the first one that returns.
                      3. Go to step 1
        :param queues: IDs of queues to pull tasks from
        :type queues: list of ``Text``
        :param worker_params: Worker command line arguments
        :type worker_params: ``trains_agent.helper.process.WorkerParams``
        """
        events_service = self.get_service(Events)

        # make sure we have a k8s pending queue
        try:
            self._session.api_client.queues.create(self.k8s_pending_queue_name)
        except Exception:
            # deliberate best effort (presumably the queue already exists), the name is resolved right below
            pass
        # get queue id
        self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")

        while True:
            # iterate over queues (priority style, queues[0] is highest)
            for queue in queues:
                # delete old completed / failed pods
                get_bash_output(self.KUBECTL_DELETE_CMD)

                # get next task in queue
                try:
                    response = self._session.api_client.queues.get_next_task(queue=queue)
                except Exception as e:
                    print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
                    continue
                else:
                    try:
                        task_id = response.entry.task
                    except AttributeError:
                        print("No tasks in queue {}".format(queue))
                        continue
                    events_service.send_log_events(
                        self.worker_id,
                        task_id=task_id,
                        lines="task {} pulled from {} by worker {}".format(
                            task_id, queue, self.worker_id
                        ),
                        level="INFO",
                    )

                self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
                self.run_one_task(queue, task_id, worker_params)
                self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
                # a task was scheduled, restart the scan from the highest priority queue
                break
            else:
                # sleep and retry polling
                print("No tasks in Queues, sleeping for {:.1f} seconds".format(self._polling_interval))
                sleep(self._polling_interval)

            if self._session.config["agent.reload_config"]:
                self.reload_config()

    def k8s_daemon(self, queues):
        """
        Start the k8s Glue service.
        This service will be pulling tasks from *queues* and scheduling them for execution using kubectl.
        Notice all scheduled tasks are pushed back into K8S_PENDING_QUEUE,
        and popped when execution actually starts. This creates full visibility into the k8s scheduler.
        Manually popping a task from the K8S_PENDING_QUEUE,
        will cause the k8s scheduler to skip the execution once the scheduled tasks needs to be executed

        :param list(str) queues: List of queue names to pull from
        """
        return self.daemon(queues=queues, log_level=logging.INFO, foreground=True, docker=False)
|
||||
@@ -157,6 +157,10 @@ def is_windows_platform():
|
||||
return any(platform.win32_ver())
|
||||
|
||||
|
||||
def is_linux_platform():
    """Return True when the name reported by platform.system() contains 'linux' (case-insensitive)."""
    system_name = platform.system().lower()
    return system_name.find('linux') >= 0
|
||||
|
||||
|
||||
def normalize_path(*paths):
|
||||
"""
|
||||
normalize_path
|
||||
@@ -176,6 +180,25 @@ def safe_remove_file(filename, error_message=None):
|
||||
print(error_message)
|
||||
|
||||
|
||||
def get_python_path(script_dir, entry_point, package_api):
    """
    Build a python path string made of the script directories followed by the interpreter's sys.path.

    The interpreter command is obtained from ``package_api.get_python_command`` and executed with
    ``script_dir`` as working directory; its output is appended as-is after the two leading entries.

    :param script_dir: directory of the script, first entry of the result
    :param entry_point: script path relative to script_dir, its parent folder is the second entry
    :param package_api: object providing get_python_command(), used to query sys.path
    :return: the path string (';' separated on Windows, ':' otherwise), or None on any failure
    """
    try:
        on_windows = is_windows_platform()
        separator = ';' if on_windows else ':'
        print_sys_path = "import sys; print('{}'.join(sys.path))".format(separator)
        interpreter_path = package_api.get_python_command(["-c", print_sys_path]).get_output(cwd=script_dir)
        # Add path of the script directory and executable directory
        script_root = Path(script_dir)
        leading_dirs = (
            script_root.absolute().as_posix(),
            (script_root / Path(entry_point)).parent.absolute().as_posix(),
        )
        prefix = ''.join(folder + separator for folder in leading_dirs)
        if on_windows:
            # only the two leading entries are converted to backslashes
            prefix = prefix.replace('/', '\\')
        return prefix + interpreter_path
    except Exception:
        return None
|
||||
|
||||
|
||||
class Singleton(ABCMeta):
|
||||
_instances = {}
|
||||
|
||||
@@ -440,6 +463,17 @@ def rm_tree(root): # type: (Union[Path, Text]) -> None
|
||||
return shutil.rmtree(os.path.expanduser(os.path.expandvars(Text(root))), onerror=on_error)
|
||||
|
||||
|
||||
def rm_file(filename):  # type: (Union[Path, Text]) -> bool
    """
    A version of os.unlink that will not raise error

    '~' and environment variables in the path are expanded before the file is removed.

    :param filename: path of the file to remove
    :return: True if the file was removed, False if removing it failed
    """
    try:
        os.unlink(os.path.expanduser(os.path.expandvars(Text(filename))))
    except Exception:
        # best effort by design, but KeyboardInterrupt / SystemExit are no longer swallowed
        return False
    return True
|
||||
|
||||
|
||||
def is_conda(config):
    """Tell whether the 'agent.package_manager.type' entry of *config* equals 'conda', ignoring case."""
    manager_type = config['agent.package_manager.type']
    return 'conda' == manager_type.lower()
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ from time import sleep
|
||||
import requests
|
||||
import json
|
||||
from threading import Thread
|
||||
from semantic_version import Version
|
||||
from packaging import version as packaging_version
|
||||
from ..version import __version__
|
||||
|
||||
__check_update_thread = None
|
||||
@@ -30,8 +30,8 @@ def _check_new_version_available():
|
||||
return None
|
||||
trains_answer = update_server_releases.get("trains-agent", {})
|
||||
latest_version = trains_answer.get("version")
|
||||
cur_version = Version(cur_version)
|
||||
latest_version = Version(latest_version)
|
||||
cur_version = packaging_version.parse(cur_version)
|
||||
latest_version = packaging_version.parse(latest_version or '')
|
||||
if cur_version >= latest_version:
|
||||
return None
|
||||
patch_upgrade = latest_version.major == cur_version.major and latest_version.minor == cur_version.minor
|
||||
|
||||
@@ -22,6 +22,18 @@ def print_text(text, newline=True):
|
||||
sys.stdout.write(data)
|
||||
|
||||
|
||||
def decode_binary_lines(binary_lines, encoding='utf-8'):
    """
    Decode binary lines into text lines, one output entry per input item.

    Undecodable bytes are substituted (errors='replace'), every carriage return is turned into a
    newline, and a newline is appended to any non-empty line that does not already end with one.
    An item that fails to decode altogether contributes an empty string (it is not dropped).

    :param binary_lines: iterable of binary strings
    :param encoding: encoding used for decoding (default: 'utf-8')
    :return: list of decoded text lines
    """
    lines = []
    for raw in binary_lines:
        try:
            text = raw.decode(encoding=encoding, errors='replace').replace('\r', '\n')
        except Exception:
            # decoding failed, keep an empty placeholder (KeyboardInterrupt / SystemExit still propagate)
            text = ''
        lines.append(text + '\n' if text and text[-1] != '\n' else text)
    return lines
|
||||
|
||||
|
||||
def ensure_text(s, encoding='utf-8', errors='strict'):
|
||||
"""Coerce *s* to six.text_type.
|
||||
For Python 2:
|
||||
|
||||
@@ -16,6 +16,8 @@ class PackageManager(object):
|
||||
"""
|
||||
|
||||
_selected_manager = None
|
||||
_cwd = None
|
||||
_pip_version = None
|
||||
|
||||
@abc.abstractproperty
|
||||
def bin(self):
|
||||
@@ -64,7 +66,7 @@ class PackageManager(object):
|
||||
pass
|
||||
|
||||
def upgrade_pip(self):
|
||||
return self._install("pip", "--upgrade")
|
||||
return self._install("pip"+self.get_pip_version(), "--upgrade")
|
||||
|
||||
def get_python_command(self, extra=()):
|
||||
# type: (...) -> Executable
|
||||
@@ -97,11 +99,42 @@ class PackageManager(object):
|
||||
# this is helpful when we want out of context requirement installations
|
||||
PackageManager._selected_manager = self
|
||||
|
||||
@property
def cwd(self):
    """Return the value stored in ``_cwd`` (None until one is assigned)."""
    return self._cwd

@cwd.setter
def cwd(self, value):
    """Store *value* in ``_cwd``, no validation is applied."""
    self._cwd = value
|
||||
|
||||
@classmethod
|
||||
def out_of_scope_install_package(cls, package_name):
|
||||
def out_of_scope_install_package(cls, package_name, *args):
|
||||
if PackageManager._selected_manager is not None:
|
||||
try:
|
||||
return PackageManager._selected_manager._install(package_name)
|
||||
return PackageManager._selected_manager._install(package_name, *args)
|
||||
except Exception:
|
||||
pass
|
||||
return
|
||||
return
|
||||
|
||||
@classmethod
def out_of_scope_freeze(cls):
    """
    Run freeze() on the package manager last recorded in PackageManager._selected_manager.

    :return: whatever the selected manager's freeze() returns; an empty list when no manager
        was recorded or when freeze() raised
    """
    manager = PackageManager._selected_manager
    if manager is None:
        return []
    try:
        return manager.freeze()
    except Exception:
        return []
|
||||
|
||||
@classmethod
def set_pip_version(cls, version):
    """
    Record the pip version specifier at class level.

    Spaces are removed from *version*. If it already contains one of '=', '~', '<' or '>' it is
    stored as given, otherwise it is stored as an exact pin ("==<version>").
    An empty / None *version* leaves the stored value untouched.

    :param version: pip version or version specifier string
    """
    if not version:
        return
    compact = version.replace(' ', '')
    has_operator = any(symbol in compact for symbol in ('=', '~', '<', '>'))
    cls._pip_version = compact if has_operator else "==" + compact
|
||||
|
||||
@classmethod
def get_pip_version(cls):
    """Return the stored pip version specifier, or an empty string when none is stored."""
    stored = cls._pip_version
    return stored if stored else ''
|
||||
|
||||
@@ -14,7 +14,7 @@ import yaml
|
||||
from time import time
|
||||
from attr import attrs, attrib, Factory
|
||||
from pathlib2 import Path
|
||||
from semantic_version import Version
|
||||
from packaging import version as packaging_version
|
||||
from requirements import parse
|
||||
from requirements.requirement import Requirement
|
||||
|
||||
@@ -59,7 +59,7 @@ class CondaAPI(PackageManager):
|
||||
A programmatic interface for controlling conda
|
||||
"""
|
||||
|
||||
MINIMUM_VERSION = Version("4.3.30", partial=True)
|
||||
MINIMUM_VERSION = packaging_version.parse("4.3.30")
|
||||
|
||||
def __init__(self, session, path, python, requirements_manager):
|
||||
# type: (Session, PathLike, float, RequirementsManager) -> None
|
||||
@@ -93,7 +93,7 @@ class CondaAPI(PackageManager):
|
||||
)
|
||||
)
|
||||
self.conda_version = self.get_conda_version(output)
|
||||
if Version(self.conda_version, partial=True) < self.MINIMUM_VERSION:
|
||||
if packaging_version.parse(self.conda_version) < self.MINIMUM_VERSION:
|
||||
raise CommandFailedError(
|
||||
"conda version '{}' is smaller than minimum supported conda version '{}'".format(
|
||||
self.conda_version, self.MINIMUM_VERSION
|
||||
@@ -112,7 +112,7 @@ class CondaAPI(PackageManager):
|
||||
return self.pip.bin
|
||||
|
||||
def upgrade_pip(self):
|
||||
return self.pip.upgrade_pip()
|
||||
return self._install("pip" + self.pip.get_pip_version())
|
||||
|
||||
def create(self):
|
||||
"""
|
||||
@@ -227,20 +227,20 @@ class CondaAPI(PackageManager):
|
||||
self.pip.install_from_file(reqs)
|
||||
|
||||
def freeze(self):
|
||||
# result = yaml.load(
|
||||
# self._run_command((self.conda, "env", "export", "-p", self.path), raw=True)
|
||||
# )
|
||||
# for key in "name", "prefix":
|
||||
# result.pop(key, None)
|
||||
# freeze = {"conda": result}
|
||||
# try:
|
||||
# freeze["pip"] = result["dependencies"][-1]["pip"]
|
||||
# except (TypeError, KeyError):
|
||||
# freeze["pip"] = []
|
||||
# else:
|
||||
# del result["dependencies"][-1]
|
||||
# return freeze
|
||||
return self.pip.freeze()
|
||||
requirements = self.pip.freeze()
|
||||
try:
|
||||
conda_packages = json.loads(self._run_command((self.conda, "list", "--json", "-p", self.path), raw=True))
|
||||
conda_packages_txt = []
|
||||
requirements_pip = [r.split('==')[0].strip().lower() for r in requirements['pip']]
|
||||
for pkg in conda_packages:
|
||||
# skip if this is a pypi package or it is not a python package at all
|
||||
if pkg['channel'] == 'pypi' or pkg['name'].lower() not in requirements_pip:
|
||||
continue
|
||||
conda_packages_txt.append('{0}{1}{2}'.format(pkg['name'], '==', pkg['version']))
|
||||
requirements['conda'] = conda_packages_txt
|
||||
except:
|
||||
pass
|
||||
return requirements
|
||||
|
||||
def load_requirements(self, requirements):
|
||||
# create new environment file
|
||||
@@ -249,6 +249,8 @@ class CondaAPI(PackageManager):
|
||||
reqs = []
|
||||
if isinstance(requirements['pip'], six.string_types):
|
||||
requirements['pip'] = requirements['pip'].split('\n')
|
||||
if isinstance(requirements.get('conda'), six.string_types):
|
||||
requirements['conda'] = requirements['conda'].split('\n')
|
||||
has_torch = False
|
||||
has_matplotlib = False
|
||||
try:
|
||||
@@ -256,35 +258,86 @@ class CondaAPI(PackageManager):
|
||||
except:
|
||||
cuda_version = 0
|
||||
|
||||
for r in requirements['pip']:
|
||||
marker = list(parse(r))
|
||||
if marker:
|
||||
m = MarkerRequirement(marker[0])
|
||||
if m.req.name.lower() == 'matplotlib':
|
||||
has_matplotlib = True
|
||||
elif m.req.name.lower().startswith('torch'):
|
||||
has_torch = True
|
||||
# notice 'conda' entry with empty string is a valid conda requirements list, it means pip only
|
||||
# this should happen if experiment was executed on non-conda machine or old trains client
|
||||
conda_supported_req = requirements['pip'] if requirements.get('conda', None) is None else requirements['conda']
|
||||
conda_supported_req_names = []
|
||||
for r in conda_supported_req:
|
||||
try:
|
||||
marker = list(parse(r))
|
||||
except:
|
||||
marker = None
|
||||
if not marker:
|
||||
continue
|
||||
|
||||
if m.req.name.lower() in ('torch', 'pytorch'):
|
||||
has_torch = True
|
||||
m.req.name = 'pytorch'
|
||||
m = MarkerRequirement(marker[0])
|
||||
conda_supported_req_names.append(m.name.lower())
|
||||
if m.req.name.lower() == 'matplotlib':
|
||||
has_matplotlib = True
|
||||
elif m.req.name.lower().startswith('torch'):
|
||||
has_torch = True
|
||||
|
||||
if m.req.name.lower() in ('tensorflow_gpu', 'tensorflow-gpu', 'tensorflow'):
|
||||
has_torch = True
|
||||
m.req.name = 'tensorflow-gpu' if cuda_version > 0 else 'tensorflow'
|
||||
if m.req.name.lower() in ('torch', 'pytorch'):
|
||||
has_torch = True
|
||||
m.req.name = 'pytorch'
|
||||
|
||||
if m.req.name.lower() in ('tensorflow_gpu', 'tensorflow-gpu', 'tensorflow'):
|
||||
has_torch = True
|
||||
m.req.name = 'tensorflow-gpu' if cuda_version > 0 else 'tensorflow'
|
||||
|
||||
reqs.append(m)
|
||||
|
||||
reqs.append(m)
|
||||
pip_requirements = []
|
||||
# if we have a conda list, the rest should be installed with pip,
|
||||
if requirements.get('conda', None) is not None:
|
||||
for r in requirements['pip']:
|
||||
try:
|
||||
marker = list(parse(r))
|
||||
except:
|
||||
marker = None
|
||||
if not marker:
|
||||
continue
|
||||
|
||||
m = MarkerRequirement(marker[0])
|
||||
m_name = m.name.lower()
|
||||
if m_name in conda_supported_req_names:
|
||||
# this package is in the conda list,
|
||||
# make sure that if we changed version and we match it in conda
|
||||
conda_supported_req_names.remove(m_name)
|
||||
for cr in reqs:
|
||||
if m_name == cr.name.lower():
|
||||
# match versions
|
||||
cr.specs = m.specs
|
||||
break
|
||||
else:
|
||||
# not in conda, it is a pip package
|
||||
pip_requirements.append(m)
|
||||
if m_name == 'matplotlib':
|
||||
has_matplotlib = True
|
||||
|
||||
# remove any leftover conda packages (they were removed from the pip list)
|
||||
if conda_supported_req_names:
|
||||
reqs = [r for r in reqs if r.name.lower() not in conda_supported_req_names]
|
||||
|
||||
# Conda requirements Hacks:
|
||||
if has_matplotlib:
|
||||
reqs.append(MarkerRequirement(Requirement.parse('graphviz')))
|
||||
reqs.append(MarkerRequirement(Requirement.parse('python-graphviz')))
|
||||
reqs.append(MarkerRequirement(Requirement.parse('kiwisolver')))
|
||||
if has_torch and cuda_version == 0:
|
||||
reqs.append(MarkerRequirement(Requirement.parse('cpuonly')))
|
||||
|
||||
# conform conda packages (version/name)
|
||||
for r in reqs:
|
||||
# remove .post from version numbers, it fails ~= version, and change == to ~=
|
||||
if r.specs and r.specs[0]:
|
||||
r.specs = [(r.specs[0][0].replace('==', '~='), r.specs[0][1].split('.post')[0])]
|
||||
# conda always likes "-" not "_"
|
||||
r.req.name = r.req.name.replace('_', '-')
|
||||
|
||||
while reqs:
|
||||
conda_env['dependencies'] = [r.tostr().replace('==', '=') for r in reqs]
|
||||
# notice, we give conda more freedom in version selection, to help it choose best combination
|
||||
conda_env['dependencies'] = [r.tostr() for r in reqs]
|
||||
with self.temp_file("conda_env", yaml.dump(conda_env), suffix=".yml") as name:
|
||||
print('Conda: Trying to install requirements:\n{}'.format(conda_env['dependencies']))
|
||||
result = self._run_command(
|
||||
@@ -297,7 +350,7 @@ class CondaAPI(PackageManager):
|
||||
|
||||
solved = False
|
||||
for bad_r in bad_req:
|
||||
name = bad_r.split('[')[0].split('=')[0]
|
||||
name = bad_r.split('[')[0].split('=')[0].split('~')[0].split('<')[0].split('>')[0]
|
||||
# look for name in requirements
|
||||
for r in reqs:
|
||||
if r.name.lower() == name.lower():
|
||||
@@ -338,7 +391,7 @@ class CondaAPI(PackageManager):
|
||||
if len(empty_lines) >= 2:
|
||||
deps = error_lines[empty_lines[0]+1:empty_lines[1]]
|
||||
try:
|
||||
return yaml.load('\n'.join(deps))
|
||||
return yaml.load('\n'.join(deps), Loader=yaml.SafeLoader)
|
||||
except:
|
||||
return None
|
||||
return None
|
||||
@@ -412,4 +465,4 @@ class PackageNotFoundError(CondaException):
|
||||
as a singleton YAML list.
|
||||
"""
|
||||
|
||||
pkg = attrib(default="", converter=lambda val: yaml.load(val)[0].replace(" ", ""))
|
||||
pkg = attrib(default="", converter=lambda val: yaml.load(val, Loader=yaml.SafeLoader)[0].replace(" ", ""))
|
||||
|
||||
@@ -6,14 +6,14 @@ from .requirements import SimpleSubstitution
|
||||
|
||||
class CythonRequirement(SimpleSubstitution):
|
||||
|
||||
name = "cython"
|
||||
name = ("cython", "numpy", )
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(CythonRequirement, self).__init__(*args, **kwargs)
|
||||
|
||||
def match(self, req):
|
||||
# match both Cython & cython
|
||||
return self.name == req.name.lower()
|
||||
return req.name and req.name.lower() in self.name
|
||||
|
||||
def replace(self, req):
|
||||
"""
|
||||
|
||||
60
trains_agent/helper/package/external_req.py
Normal file
60
trains_agent/helper/package/external_req.py
Normal file
@@ -0,0 +1,60 @@
|
||||
from collections import OrderedDict
|
||||
from typing import Text
|
||||
|
||||
from .base import PackageManager
|
||||
from .requirements import SimpleSubstitution
|
||||
|
||||
|
||||
class ExternalRequirements(SimpleSubstitution):
    """
    Requirement handler for "external" requirement lines: entries that have no
    package name, or that are editable / VCS references.

    Matched requirements are removed from the regular requirements list
    (``replace`` returns an empty string) and are installed later, one by one,
    in ``post_install``. The freeze entry each installation produced is mapped
    to the original requirement line, so ``replace_back`` can put the original
    line back into a frozen requirements listing.
    """

    name = "external_link"

    def __init__(self, *args, **kwargs):
        super(ExternalRequirements, self).__init__(*args, **kwargs)
        # requirements intercepted by replace(), waiting for post_install()
        self.post_install_req = []
        # freeze entry of an installed package -> requirement line it came from
        self.post_install_req_lookup = OrderedDict()

    def match(self, req):
        """
        Check whether a requirement should be handled by this class.

        :param req: requirement object exposing ``name`` and ``req``
            (``req.req`` exposes ``editable``, ``vcs`` and ``line``)
        :return: True for an unnamed, editable or VCS requirement whose
            original line is non-empty and is not a comment, False otherwise
        """
        parsed = req.req
        # match both editable or code or unparsed
        is_external = (not req.name) or (parsed and (parsed.editable or parsed.vcs))
        if not is_external:
            return False
        # we need an actual requirement line to install from; skip blanks and comments
        line = parsed.line.strip() if parsed and parsed.line else ''
        return bool(line) and not line.startswith('#')

    @staticmethod
    def _freeze_pip_entries():
        """
        Best-effort snapshot of the 'pip' section of the environment freeze.

        :return: set of freeze entries, or None if the freeze failed or did not
            contain a usable 'pip' section
        """
        try:
            return set(PackageManager.out_of_scope_freeze()['pip'])
        except Exception:
            # freeze is only used for bookkeeping, never fail the installation over it
            return None

    def post_install(self):
        """
        Install every requirement collected by ``replace``.

        Each requirement is installed twice: first with ``--no-deps`` and then
        with ``--ignore-installed``. The environment is frozen before and after
        the first installation; an entry that appears only in the second freeze
        is recorded as the freeze line belonging to this requirement.
        NOTE(review): the second installation presumably pulls in the
        requirement's own dependencies - confirm the intent.
        """
        # take ownership of the pending list, so a repeated call will not reinstall
        pending = self.post_install_req
        self.post_install_req = []
        for req in pending:
            requirement_line = req.tostr(markers=False)

            before = self._freeze_pip_entries()
            PackageManager.out_of_scope_install_package(requirement_line, "--no-deps")
            # without a baseline there is nothing to compare against
            after = self._freeze_pip_entries() if before is not None else None

            if before is not None and after is not None:
                # sorted() makes the choice deterministic if more than one entry is new
                new_entries = sorted(after - before)
                original_line = getattr(req.req, 'line', None)
                if new_entries and original_line and new_entries[0] not in self.post_install_req_lookup:
                    self.post_install_req_lookup[new_entries[0]] = original_line

            PackageManager.out_of_scope_install_package(requirement_line, "--ignore-installed")

    def replace(self, req):
        """
        Defer the installation of a matched requirement.

        The requirement is stored for ``post_install`` and removed from the
        regular requirements list.

        :param req: requirement previously accepted by ``match``
        :return: empty text, marking the requirement line as skipped
        """
        # Store in post req install, and return nothing
        self.post_install_req.append(req)
        # mark skip package, we will install it in post install hook
        return Text('')

    def replace_back(self, list_of_requirements):
        """
        Restore the original requirement lines in a frozen requirements listing.

        Entries of ``list_of_requirements['pip']`` that were recorded by
        ``post_install`` are removed, and the requirement lines they were
        installed from are appended instead (in installation order).

        :param list_of_requirements: {'pip': ['a==1.0', ]}, modified in place
        :return: the same ``list_of_requirements`` object
        """
        if 'pip' in list_of_requirements:
            original_requirements = list_of_requirements['pip']
            list_of_requirements['pip'] = [
                entry for entry in original_requirements
                if entry not in self.post_install_req_lookup
            ]
            list_of_requirements['pip'] += [
                source_line for entry, source_line in self.post_install_req_lookup.items()
                if entry in original_requirements
            ]
        return list_of_requirements
|
||||
@@ -14,7 +14,7 @@ class HorovodRequirement(SimpleSubstitution):
|
||||
|
||||
def match(self, req):
|
||||
# match both horovod
|
||||
return self.name == req.name.lower()
|
||||
return req.name and self.name == req.name.lower()
|
||||
|
||||
def post_install(self):
|
||||
if self.post_install_req:
|
||||
|
||||
@@ -29,13 +29,13 @@ class SystemPip(PackageManager):
|
||||
pass
|
||||
|
||||
def install_from_file(self, path):
|
||||
self.run_with_env(('install', '-r', path) + self.install_flags())
|
||||
self.run_with_env(('install', '-r', path) + self.install_flags(), cwd=self.cwd)
|
||||
|
||||
def install_packages(self, *packages):
|
||||
self._install(*(packages + self.install_flags()))
|
||||
|
||||
def _install(self, *args):
|
||||
self.run_with_env(('install',) + args)
|
||||
self.run_with_env(('install',) + args, cwd=self.cwd)
|
||||
|
||||
def uninstall_packages(self, *packages):
|
||||
self.run_with_env(('uninstall', '-y') + packages)
|
||||
@@ -82,7 +82,7 @@ class SystemPip(PackageManager):
|
||||
return (command.get_output if output else command.check_call)(stdin=DEVNULL, **kwargs)
|
||||
|
||||
def _make_command(self, command):
|
||||
return Argv(self.bin, '-m', 'pip', *command)
|
||||
return Argv(self.bin, '-m', 'pip', '--disable-pip-version-check', *command)
|
||||
|
||||
def install_flags(self):
|
||||
if self.indices_args is None:
|
||||
|
||||
@@ -33,7 +33,7 @@ class VirtualenvPip(SystemPip, PackageManager):
|
||||
self.python = python
|
||||
|
||||
def _make_command(self, command):
|
||||
return self.session.command(self.bin, "-m", "pip", *command)
|
||||
return self.session.command(self.bin, "-m", "pip", "--disable-pip-version-check", *command)
|
||||
|
||||
def load_requirements(self, requirements):
|
||||
if isinstance(requirements, dict) and requirements.get("pip"):
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
from copy import deepcopy
|
||||
from functools import wraps
|
||||
|
||||
import attr
|
||||
import sys
|
||||
import os
|
||||
from pathlib2 import Path
|
||||
from trains_agent.helper.process import Argv, DEVNULL
|
||||
from trains_agent.helper.process import Argv, DEVNULL, check_if_command_exists
|
||||
from trains_agent.session import Session, POETRY
|
||||
|
||||
|
||||
@@ -35,10 +38,12 @@ def prop_guard(prop, log_prop=None):
|
||||
|
||||
class PoetryConfig:
|
||||
|
||||
def __init__(self, session):
|
||||
# type: (Session) -> ()
|
||||
def __init__(self, session, interpreter=None):
|
||||
# type: (Session, str) -> ()
|
||||
self.session = session
|
||||
self._log = session.get_logger(__name__)
|
||||
self._python = interpreter or sys.executable
|
||||
self._initialized = False
|
||||
|
||||
@property
|
||||
def log(self):
|
||||
@@ -53,7 +58,20 @@ class PoetryConfig:
|
||||
def run(self, *args, **kwargs):
|
||||
func = kwargs.pop("func", Argv.get_output)
|
||||
kwargs.setdefault("stdin", DEVNULL)
|
||||
argv = Argv("poetry", "-n", *args)
|
||||
kwargs['env'] = deepcopy(os.environ)
|
||||
if 'VIRTUAL_ENV' in kwargs['env'] or 'CONDA_PREFIX' in kwargs['env']:
|
||||
kwargs['env'].pop('VIRTUAL_ENV', None)
|
||||
kwargs['env'].pop('CONDA_PREFIX', None)
|
||||
kwargs['env'].pop('PYTHONPATH', None)
|
||||
if hasattr(sys, "real_prefix") and hasattr(sys, "base_prefix"):
|
||||
path = ':'+kwargs['env']['PATH']
|
||||
path = path.replace(':'+sys.base_prefix, ':'+sys.real_prefix, 1)
|
||||
kwargs['env']['PATH'] = path
|
||||
|
||||
if check_if_command_exists("poetry"):
|
||||
argv = Argv("poetry", *args)
|
||||
else:
|
||||
argv = Argv(self._python, "-m", "poetry", *args)
|
||||
self.log.debug("running: %s", argv)
|
||||
return func(argv, **kwargs)
|
||||
|
||||
@@ -61,10 +79,16 @@ class PoetryConfig:
|
||||
return self.run("config", *args, **kwargs)
|
||||
|
||||
@_guard_enabled
|
||||
def initialize(self):
|
||||
self._config("settings.virtualenvs.in-project", "true")
|
||||
# self._config("repositories.{}".format(self.REPO_NAME), PYTHON_INDEX)
|
||||
# self._config("http-basic.{}".format(self.REPO_NAME), *PYTHON_INDEX_CREDENTIALS)
|
||||
def initialize(self, cwd=None):
|
||||
if not self._initialized:
|
||||
self._initialized = True
|
||||
try:
|
||||
self._config("--local", "virtualenvs.in-project", "true", cwd=cwd)
|
||||
# self._config("repositories.{}".format(self.REPO_NAME), PYTHON_INDEX)
|
||||
# self._config("http-basic.{}".format(self.REPO_NAME), *PYTHON_INDEX_CREDENTIALS)
|
||||
except Exception as ex:
|
||||
print("Exception: {}\nError: Failed configuring Poetry virtualenvs.in-project".format(ex))
|
||||
raise
|
||||
|
||||
def get_api(self, path):
|
||||
# type: (Path) -> PoetryAPI
|
||||
@@ -81,7 +105,7 @@ class PoetryAPI(object):
|
||||
def install(self):
|
||||
# type: () -> bool
|
||||
if self.enabled:
|
||||
self.config.run("install", cwd=str(self.path), func=Argv.check_call)
|
||||
self.config.run("install", "-n", cwd=str(self.path), func=Argv.check_call)
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -92,7 +116,24 @@ class PoetryAPI(object):
|
||||
)
|
||||
|
||||
def freeze(self):
|
||||
return {"poetry": self.config.run("show", cwd=str(self.path)).splitlines()}
|
||||
lines = self.config.run("show", cwd=str(self.path)).splitlines()
|
||||
lines = [[p for p in line.split(' ') if p] for line in lines]
|
||||
return {"pip": [parts[0]+'=='+parts[1]+' # '+' '.join(parts[2:]) for parts in lines]}
|
||||
|
||||
def get_python_command(self, extra):
|
||||
return Argv("poetry", "run", "python", *extra)
|
||||
if check_if_command_exists("poetry"):
|
||||
return Argv("poetry", "run", "python", *extra)
|
||||
else:
|
||||
return Argv(self.config._python, "-m", "poetry", "run", "python", *extra)
|
||||
|
||||
def upgrade_pip(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def set_selected_package_manager(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def out_of_scope_install_package(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def install_from_file(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
@@ -10,7 +10,8 @@ from typing import Text
|
||||
|
||||
import attr
|
||||
import requests
|
||||
from semantic_version import Version, Spec
|
||||
from packaging import version as packaging_version
|
||||
from packaging.specifiers import SpecifierSet
|
||||
|
||||
import six
|
||||
from .requirements import SimpleSubstitution, FatalSpecsResolutionError
|
||||
@@ -155,10 +156,16 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
self.os = os_name or self.get_platform()
|
||||
self.cuda = "cuda{}".format(self.cuda_version).lower()
|
||||
self.python_version_string = str(self.config["agent.default_python"])
|
||||
self.python_semantic_version = Version.coerce(
|
||||
self.python_version_string, partial=True
|
||||
)
|
||||
self.python = "python{}.{}".format(self.python_semantic_version.major, self.python_semantic_version.minor)
|
||||
self.python_major_minor_str = '.'.join(packaging_version.parse(
|
||||
self.python_version_string).base_version.split('.')[:2])
|
||||
if '.' not in self.python_major_minor_str:
|
||||
raise PytorchResolutionError(
|
||||
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
|
||||
"must have both major and minor parts of the version (for example: '3.7')".format(
|
||||
self.python_version_string
|
||||
)
|
||||
)
|
||||
self.python = "python{}".format(self.python_major_minor_str)
|
||||
|
||||
self.exceptions = [
|
||||
PytorchResolutionError(message)
|
||||
@@ -188,9 +195,7 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
"""
|
||||
Make sure python version has both major and minor versions as required for choosing pytorch wheel
|
||||
"""
|
||||
if self.is_pip and not (
|
||||
self.python_semantic_version.major and self.python_semantic_version.minor
|
||||
):
|
||||
if self.is_pip and not self.python_major_minor_str:
|
||||
raise PytorchResolutionError(
|
||||
"invalid python version {!r} defined in configuration file, key 'agent.default_python': "
|
||||
"must have both major and minor parts of the version (for example: '3.7')".format(
|
||||
@@ -215,8 +220,10 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
links_parser = LinksHTMLParser()
|
||||
links_parser.feed(requests.get(torch_url, timeout=10).text)
|
||||
platform_wheel = "win" if self.get_platform() == "windows" else self.get_platform()
|
||||
py_ver = "{0.major}{0.minor}".format(self.python_semantic_version)
|
||||
py_ver = self.python_major_minor_str.replace('.', '')
|
||||
url = None
|
||||
spec = SpecifierSet(req.format_specs())
|
||||
last_v = None
|
||||
# search for our package
|
||||
for l in links_parser.links:
|
||||
parts = l.split('/')[-1].split('-')
|
||||
@@ -225,21 +232,40 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
if parts[0] != req.name:
|
||||
continue
|
||||
# version (ignore +cpu +cu92 etc. + is %2B in the file link)
|
||||
if parts[1].split('%')[0].split('+')[0] != req.specs[0][1]:
|
||||
# version ignore .postX suffix (treat as regular version)
|
||||
try:
|
||||
v = packaging_version.parse(parts[1].split('%')[0].split('+')[0])
|
||||
except Exception:
|
||||
continue
|
||||
if v not in spec or (last_v and last_v > v):
|
||||
continue
|
||||
if not parts[2].endswith(py_ver):
|
||||
continue
|
||||
if platform_wheel not in parts[4]:
|
||||
continue
|
||||
url = '/'.join(torch_url.split('/')[:-1] + l.split('/'))
|
||||
break
|
||||
last_v = v
|
||||
|
||||
return url
|
||||
|
||||
def get_url_for_platform(self, req):
|
||||
assert self.package_manager == "pip"
|
||||
assert self.os != "mac"
|
||||
assert req.specs
|
||||
# check if package is already installed with system packages
|
||||
try:
|
||||
if self.config.get("agent.package_manager.system_site_packages"):
|
||||
from pip._internal.commands.show import search_packages_info
|
||||
installed_torch = list(search_packages_info([req.name]))
|
||||
op, version = req.specs[0] if req.specs else (None, None)
|
||||
# notice the comparision order, the first part will make sure we have a valid installed package
|
||||
if installed_torch[0]['version'] and (installed_torch[0]['version'] == version or not version):
|
||||
# package already installed, do nothing
|
||||
return str(req), True
|
||||
except:
|
||||
pass
|
||||
|
||||
# make sure we have a specific version to retrieve
|
||||
if not req.specs:
|
||||
req.specs = [('>', '0')]
|
||||
|
||||
try:
|
||||
req.specs[0] = (req.specs[0][0], req.specs[0][1].split('+')[0])
|
||||
except:
|
||||
@@ -266,7 +292,7 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
if not url:
|
||||
url = PytorchWheel(
|
||||
torch_version=fix_version(version),
|
||||
python="{0.major}{0.minor}".format(self.python_semantic_version),
|
||||
python=self.python_major_minor_str.replace('.', ''),
|
||||
os_name=self.os,
|
||||
cuda_version=self.cuda_version,
|
||||
).make_url()
|
||||
@@ -280,13 +306,13 @@ class PytorchRequirement(SimpleSubstitution):
|
||||
@staticmethod
|
||||
def match_version(req, options):
|
||||
versioned_options = sorted(
|
||||
((Version(fix_version(key)), value) for key, value in options.items()),
|
||||
((packaging_version.parse(fix_version(key)), value) for key, value in options.items()),
|
||||
key=itemgetter(0),
|
||||
reverse=True,
|
||||
)
|
||||
req.specs = [(op, fix_version(version)) for op, version in req.specs]
|
||||
if req.specs:
|
||||
specs = Spec(req.format_specs())
|
||||
specs = SpecifierSet(req.format_specs())
|
||||
else:
|
||||
specs = None
|
||||
try:
|
||||
|
||||
@@ -8,9 +8,9 @@ from copy import deepcopy
|
||||
from itertools import chain, starmap
|
||||
from operator import itemgetter
|
||||
from os import path
|
||||
from typing import Text, List, Type, Optional, Tuple
|
||||
from typing import Text, List, Type, Optional, Tuple, Dict
|
||||
|
||||
import semantic_version
|
||||
from packaging import version as packaging_version
|
||||
from pathlib2 import Path
|
||||
from pyhocon import ConfigTree
|
||||
from requirements import parse
|
||||
@@ -48,7 +48,7 @@ class MarkerRequirement(object):
|
||||
|
||||
def tostr(self, markers=True):
|
||||
if not self.uri:
|
||||
parts = [self.name]
|
||||
parts = [self.name or self.line]
|
||||
|
||||
if self.extras:
|
||||
parts.append('[{0}]'.format(','.join(sorted(self.extras))))
|
||||
@@ -177,13 +177,20 @@ class SimpleSubstitution(RequirementSubstitution):
|
||||
|
||||
if req.specs:
|
||||
_, version_number = req.specs[0]
|
||||
assert semantic_version.Version(version_number, partial=True)
|
||||
assert packaging_version.parse(version_number)
|
||||
else:
|
||||
version_number = self.get_pip_version(self.name)
|
||||
|
||||
req.specs = [('==', version_number + self.suffix)]
|
||||
return Text(req)
|
||||
|
||||
def replace_back(self, list_of_requirements): # type: (Dict) -> Dict
|
||||
"""
|
||||
:param list_of_requirements: {'pip': ['a==1.0', ]}
|
||||
:return: {'pip': ['a==1.0', ]}
|
||||
"""
|
||||
return list_of_requirements
|
||||
|
||||
|
||||
@six.add_metaclass(ABCMeta)
|
||||
class CudaSensitiveSubstitution(SimpleSubstitution):
|
||||
@@ -235,15 +242,17 @@ class RequirementsManager(object):
|
||||
return None
|
||||
|
||||
def replace(self, requirements): # type: (Text) -> Text
|
||||
def safe_parse(req_str):
|
||||
try:
|
||||
return next(parse(req_str))
|
||||
except Exception as ex:
|
||||
return Requirement(req_str)
|
||||
|
||||
parsed_requirements = tuple(
|
||||
map(
|
||||
MarkerRequirement,
|
||||
filter(
|
||||
None,
|
||||
parse(requirements)
|
||||
if isinstance(requirements, six.text_type)
|
||||
else (next(parse(line), None) for line in requirements)
|
||||
)
|
||||
[safe_parse(line) for line in (requirements.splitlines()
|
||||
if isinstance(requirements, six.text_type) else requirements)]
|
||||
)
|
||||
)
|
||||
if not parsed_requirements:
|
||||
@@ -258,7 +267,7 @@ class RequirementsManager(object):
|
||||
warning('could not resolve python wheel replacement for {}'.format(req))
|
||||
raise
|
||||
except Exception:
|
||||
warning('could not resolve python wheel replacement for {}, '
|
||||
warning('could not resolve python wheel replacement for \"{}\", '
|
||||
'using original requirements line: {}'.format(req, i))
|
||||
return None
|
||||
|
||||
@@ -280,6 +289,14 @@ class RequirementsManager(object):
|
||||
except Exception as ex:
|
||||
print('RequirementsManager handler {} raised exception: {}'.format(h, ex))
|
||||
|
||||
def replace_back(self, requirements):
|
||||
for h in self.handlers:
|
||||
try:
|
||||
requirements = h.replace_back(requirements)
|
||||
except Exception:
|
||||
pass
|
||||
return requirements
|
||||
|
||||
@staticmethod
|
||||
def get_cuda_version(config): # type: (ConfigTree) -> (Text, Text)
|
||||
# we assume os.environ already updated the config['agent.cuda_version'] & config['agent.cudnn_version']
|
||||
|
||||
@@ -59,17 +59,47 @@ def kill_all_child_processes(pid=None):
|
||||
parent.kill()
|
||||
|
||||
|
||||
def shutdown_docker_process(docker_cmd_ending):
|
||||
def get_docker_id(docker_cmd_contains):
|
||||
try:
|
||||
containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
|
||||
for docker_line in containers_running.split('\n'):
|
||||
parts = docker_line.split(':')
|
||||
if parts[-1].endswith(docker_cmd_ending):
|
||||
# we found our docker, stop it
|
||||
get_bash_output(cmd='docker stop -t 1 {}'.format(parts[0]))
|
||||
return
|
||||
if docker_cmd_contains in parts[-1]:
|
||||
# we found our docker, return it
|
||||
return parts[0]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def shutdown_docker_process(docker_cmd_contains=None, docker_id=None):
|
||||
try:
|
||||
if not docker_id:
|
||||
docker_id = get_docker_id(docker_cmd_contains=docker_cmd_contains)
|
||||
if docker_id:
|
||||
# we found our docker, stop it
|
||||
get_bash_output(cmd='docker stop -t 1 {}'.format(docker_id))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def commit_docker(container_name, docker_cmd_contains=None, docker_id=None):
|
||||
try:
|
||||
if not docker_id:
|
||||
docker_id = get_docker_id(docker_cmd_contains=docker_cmd_contains)
|
||||
if not docker_id:
|
||||
print("Failed locating requested docker")
|
||||
return False
|
||||
|
||||
if docker_id:
|
||||
# we found our docker, stop it
|
||||
output = get_bash_output(cmd='docker commit {} {}'.format(docker_id, container_name))
|
||||
return output
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
print("Failed storing requested docker")
|
||||
return False
|
||||
|
||||
|
||||
def check_if_command_exists(cmd):
|
||||
|
||||
@@ -42,7 +42,9 @@ class VcsFactory(object):
|
||||
:param location: (desired) clone location
|
||||
"""
|
||||
url = execution_info.repository
|
||||
is_git = url.endswith(cls.GIT_SUFFIX)
|
||||
# We only support git, hg is deprecated
|
||||
is_git = True
|
||||
# is_git = url.endswith(cls.GIT_SUFFIX)
|
||||
vcs_cls = Git if is_git else Hg
|
||||
revision = (
|
||||
execution_info.version_num
|
||||
@@ -263,8 +265,9 @@ class VCS(object):
|
||||
"""
|
||||
self._set_ssh_url()
|
||||
clone_command = ("clone", self.url_with_auth, self.location) + self.clone_flags
|
||||
if branch:
|
||||
clone_command += ("-b", branch)
|
||||
# clone all branches regardless of when we want to later checkout
|
||||
# if branch:
|
||||
# clone_command += ("-b", branch)
|
||||
if self.session.debug_mode:
|
||||
self.call(*clone_command)
|
||||
return
|
||||
@@ -453,13 +456,23 @@ class Git(VCS):
|
||||
)
|
||||
|
||||
def pull(self):
|
||||
self.call("fetch", "origin", cwd=self.location)
|
||||
self.call("fetch", "--all", "--recurse-submodules", cwd=self.location)
|
||||
|
||||
def checkout(self): # type: () -> None
|
||||
"""
|
||||
Checkout repository at specified revision
|
||||
"""
|
||||
self.call("checkout", self.revision, *self.checkout_flags, cwd=self.location)
|
||||
try:
|
||||
self.call("submodule", "update", "--recursive", cwd=self.location)
|
||||
except:
|
||||
pass
|
||||
|
||||
info_commands = dict(
|
||||
url=Argv("git", "remote", "get-url", "origin"),
|
||||
branch=Argv("git", "rev-parse", "--abbrev-ref", "HEAD"),
|
||||
commit=Argv("git", "rev-parse", "HEAD"),
|
||||
root=Argv("git", "rev-parse", "--show-toplevel"),
|
||||
url=Argv(executable_name, "ls-remote", "--get-url", "origin"),
|
||||
branch=Argv(executable_name, "rev-parse", "--abbrev-ref", "HEAD"),
|
||||
commit=Argv(executable_name, "rev-parse", "HEAD"),
|
||||
root=Argv(executable_name, "rev-parse", "--show-toplevel"),
|
||||
)
|
||||
|
||||
patch_base = ("apply",)
|
||||
@@ -493,10 +506,10 @@ class Hg(VCS):
|
||||
)
|
||||
|
||||
info_commands = dict(
|
||||
url=Argv("hg", "paths", "--verbose"),
|
||||
branch=Argv("hg", "--debug", "id", "-b"),
|
||||
commit=Argv("hg", "--debug", "id", "-i"),
|
||||
root=Argv("hg", "root"),
|
||||
url=Argv(executable_name, "paths", "--verbose"),
|
||||
branch=Argv(executable_name, "--debug", "id", "-b"),
|
||||
commit=Argv(executable_name, "--debug", "id", "-i"),
|
||||
root=Argv(executable_name, "root"),
|
||||
)
|
||||
|
||||
|
||||
@@ -537,8 +550,6 @@ def clone_repository_cached(session, execution, destination):
|
||||
vcs.clone() # branch=execution.branch)
|
||||
|
||||
vcs.pull()
|
||||
vcs.checkout()
|
||||
|
||||
rm_tree(destination)
|
||||
shutil.copytree(Text(cached_repo_path), Text(clone_folder))
|
||||
if not clone_folder.is_dir():
|
||||
@@ -548,6 +559,10 @@ def clone_repository_cached(session, execution, destination):
|
||||
)
|
||||
)
|
||||
|
||||
# checkout in the newly copy destination
|
||||
vcs.location = Text(clone_folder)
|
||||
vcs.checkout()
|
||||
|
||||
repo_info = vcs.get_repository_copy_info(clone_folder)
|
||||
|
||||
# make sure we have no user/pass in the returned repository structure
|
||||
|
||||
@@ -4,11 +4,12 @@ from time import sleep
|
||||
from glob import glob
|
||||
from tempfile import gettempdir, NamedTemporaryFile
|
||||
|
||||
from trains_agent.definitions import ENV_K8S_HOST_MOUNT
|
||||
from trains_agent.helper.base import warning
|
||||
|
||||
|
||||
class Singleton(object):
|
||||
prefix = 'trainsagent'
|
||||
prefix = '.trainsagent'
|
||||
sep = '_'
|
||||
ext = '.tmp'
|
||||
worker_id = None
|
||||
@@ -19,7 +20,7 @@ class Singleton(object):
|
||||
_lock_timeout = 10
|
||||
|
||||
@classmethod
|
||||
def register_instance(cls, unique_worker_id=None, worker_name=None):
|
||||
def register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None):
|
||||
"""
|
||||
# Exit the process if another instance of us is using the same worker_id
|
||||
|
||||
@@ -28,7 +29,7 @@ class Singleton(object):
|
||||
:return: (str worker_id, int slot_number) Return None value on instance already running
|
||||
"""
|
||||
# try to lock file
|
||||
lock_file = os.path.join(gettempdir(), cls._lock_file_name)
|
||||
lock_file = os.path.join(cls._get_temp_folder(), cls._lock_file_name)
|
||||
timeout = 0
|
||||
while os.path.exists(lock_file):
|
||||
if timeout > cls._lock_timeout:
|
||||
@@ -46,7 +47,8 @@ class Singleton(object):
|
||||
f.write(bytes(os.getpid()))
|
||||
f.flush()
|
||||
try:
|
||||
ret = cls._register_instance(unique_worker_id=unique_worker_id, worker_name=worker_name)
|
||||
ret = cls._register_instance(unique_worker_id=unique_worker_id, worker_name=worker_name,
|
||||
api_client=api_client)
|
||||
except:
|
||||
ret = None, None
|
||||
|
||||
@@ -58,12 +60,12 @@ class Singleton(object):
|
||||
return ret
|
||||
|
||||
@classmethod
|
||||
def _register_instance(cls, unique_worker_id=None, worker_name=None):
|
||||
def _register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None):
|
||||
if cls.worker_id:
|
||||
return cls.worker_id, cls.instance_slot
|
||||
# make sure we have a unique name
|
||||
instance_num = 0
|
||||
temp_folder = gettempdir()
|
||||
temp_folder = cls._get_temp_folder()
|
||||
files = glob(os.path.join(temp_folder, cls.prefix + cls.sep + '*' + cls.ext))
|
||||
slots = {}
|
||||
for file in files:
|
||||
@@ -73,8 +75,24 @@ class Singleton(object):
|
||||
except Exception:
|
||||
# something is wrong, use non existing pid and delete the file
|
||||
pid = -1
|
||||
|
||||
uid, slot = None, None
|
||||
try:
|
||||
with open(file, 'r') as f:
|
||||
uid, slot = str(f.read()).split('\n')
|
||||
slot = int(slot)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
worker = None
|
||||
if api_client and os.environ.get(ENV_K8S_HOST_MOUNT) and uid:
|
||||
try:
|
||||
worker = [w for w in api_client.workers.get_all() if w.id == uid]
|
||||
except Exception:
|
||||
worker = None
|
||||
|
||||
# count active instances and delete dead files
|
||||
if not psutil.pid_exists(pid):
|
||||
if not worker and not psutil.pid_exists(pid):
|
||||
# delete the file
|
||||
try:
|
||||
os.remove(os.path.join(file))
|
||||
@@ -83,11 +101,7 @@ class Singleton(object):
|
||||
continue
|
||||
|
||||
instance_num += 1
|
||||
try:
|
||||
with open(file, 'r') as f:
|
||||
uid, slot = str(f.read()).split('\n')
|
||||
slot = int(slot)
|
||||
except Exception:
|
||||
if slot is None:
|
||||
continue
|
||||
|
||||
if uid == unique_worker_id:
|
||||
@@ -110,10 +124,16 @@ class Singleton(object):
|
||||
unique_worker_id = worker_name + cls.worker_name_sep + str(cls.instance_slot)
|
||||
|
||||
# create lock
|
||||
cls._pid_file = NamedTemporaryFile(dir=gettempdir(), prefix=cls.prefix + cls.sep + str(os.getpid()) + cls.sep,
|
||||
suffix=cls.ext)
|
||||
cls._pid_file = NamedTemporaryFile(dir=cls._get_temp_folder(),
|
||||
prefix=cls.prefix + cls.sep + str(os.getpid()) + cls.sep, suffix=cls.ext)
|
||||
cls._pid_file.write(('{}\n{}'.format(unique_worker_id, cls.instance_slot)).encode())
|
||||
cls._pid_file.flush()
|
||||
cls.worker_id = unique_worker_id
|
||||
|
||||
return cls.worker_id, cls.instance_slot
|
||||
|
||||
@classmethod
|
||||
def _get_temp_folder(cls):
|
||||
if os.environ.get(ENV_K8S_HOST_MOUNT):
|
||||
return os.environ.get(ENV_K8S_HOST_MOUNT).split(':')[-1]
|
||||
return gettempdir()
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import itertools
|
||||
from functools import partial
|
||||
from importlib import import_module
|
||||
import argparse
|
||||
@@ -24,8 +25,16 @@ def get_parser():
|
||||
from .worker import COMMANDS
|
||||
subparsers = top_parser.add_subparsers(dest='command')
|
||||
for c in COMMANDS:
|
||||
parser = subparsers.add_parser(name=c, help=COMMANDS[c]['help'])
|
||||
for a in COMMANDS[c].get('args', {}).keys():
|
||||
parser.add_argument(a, **COMMANDS[c]['args'][a])
|
||||
parser = subparsers.add_parser(name=c, help=COMMANDS[c]["help"])
|
||||
groups = itertools.groupby(
|
||||
sorted(
|
||||
COMMANDS[c].get("args", {}).items(), key=lambda x: x[1].get("group", "")
|
||||
),
|
||||
key=lambda x: x[1].pop("group", ""),
|
||||
)
|
||||
for group_name, group in groups:
|
||||
p = parser if not group_name else parser.add_argument_group(group_name)
|
||||
for key, value in group:
|
||||
p.add_argument(key, **value)
|
||||
|
||||
return top_parser
|
||||
|
||||
@@ -37,21 +37,29 @@ DAEMON_ARGS = dict({
|
||||
'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
'group': 'Docker support',
|
||||
},
|
||||
'--gpus': {
|
||||
'help': 'Specify active GPUs for the daemon to use (docker / virtual environment), '
|
||||
'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
|
||||
'Examples: --gpus 0 or --gpu 0,1,2 or --gpus all',
|
||||
'group': 'Docker support',
|
||||
},
|
||||
'--cpu-only': {
|
||||
'help': 'Disable GPU access for the daemon, only use CPU in either docker or virtual environment',
|
||||
'action': 'store_true',
|
||||
'group': 'Docker support',
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'set NVIDIA_VISIBLE_DEVICES to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
'--force-current-version': {
|
||||
'help': 'Force trains-agent to use the current trains-agent version when running in the docker',
|
||||
'action': 'store_true',
|
||||
'group': 'Docker support',
|
||||
},
|
||||
'--queue': {
|
||||
'help': 'Queue ID(s)/Name(s) to pull tasks from (\'default\' queue)',
|
||||
@@ -60,6 +68,11 @@ DAEMON_ARGS = dict({
|
||||
'dest': 'queues',
|
||||
'type': foreign_object_id('queues'),
|
||||
},
|
||||
'--standalone-mode': {
|
||||
'help': 'Do not use any network connects, assume everything is pre-installed',
|
||||
'action': 'store_true',
|
||||
},
|
||||
|
||||
}, **WORKER_ARGS)
|
||||
|
||||
COMMANDS = {
|
||||
@@ -83,6 +96,26 @@ COMMANDS = {
|
||||
'help': 'Full environment setup log & task logging & monitoring (stdout is still visible)',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--require-queue': {
|
||||
'help': 'If the specified task is not queued (in any Queue), the execution will fail. '
|
||||
'(Used for 3rd party scheduler integration, e.g. K8s, SLURM, etc.)',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--standalone-mode': {
|
||||
'help': 'Do not use any network connects, assume everything is pre-installed',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
},
|
||||
'--clone': {
|
||||
'help': 'Clone the experiment before execution, and execute the cloned experiment',
|
||||
'action': 'store_true',
|
||||
},
|
||||
}, **WORKER_ARGS),
|
||||
},
|
||||
'build': {
|
||||
@@ -96,8 +129,25 @@ COMMANDS = {
|
||||
'dest': 'task_id',
|
||||
'type': foreign_object_id('tasks'),
|
||||
},
|
||||
'--target-folder': {
|
||||
'help': 'Where to build the task\'s virtual environment and source code',
|
||||
'--target': {
|
||||
'help': 'Where to build the task\'s virtual environment and source code. '
|
||||
'When used with --docker, target docker image name to create',
|
||||
},
|
||||
'--docker': {
|
||||
'help': 'Build the experiment inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||
'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
|
||||
'nargs': '*',
|
||||
'default': False,
|
||||
},
|
||||
'--gpus': {
|
||||
'help': 'Specify active GPUs for the docker to use'
|
||||
'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
|
||||
'Examples: --gpus 0 or --gpu 0,1,2 or --gpus all',
|
||||
},
|
||||
'--cpu-only': {
|
||||
'help': 'Disable GPU access (cpu only) for the docker',
|
||||
'action': 'store_true',
|
||||
},
|
||||
'--python-version': {
|
||||
'help': 'Virtual environment python version to use',
|
||||
|
||||
@@ -15,7 +15,7 @@ from pyhocon import ConfigFactory, HOCONConverter, ConfigTree
|
||||
from trains_agent.backend_api.session import Session as _Session, Request
|
||||
from trains_agent.backend_api.session.client import APIClient
|
||||
from trains_agent.backend_config.defs import LOCAL_CONFIG_FILE_OVERRIDE_VAR, LOCAL_CONFIG_FILES
|
||||
from trains_agent.definitions import ENVIRONMENT_CONFIG
|
||||
from trains_agent.definitions import ENVIRONMENT_CONFIG, ENV_TASK_EXECUTE_AS_USER
|
||||
from trains_agent.errors import APIError
|
||||
from trains_agent.helper.base import HOCONEncoder
|
||||
from trains_agent.helper.process import Argv
|
||||
@@ -75,7 +75,8 @@ class Session(_Session):
|
||||
cpu_only = kwargs.get('cpu_only')
|
||||
if cpu_only:
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
|
||||
if kwargs.get('gpus'):
|
||||
if kwargs.get('gpus') and not os.environ.get('KUBERNETES_SERVICE_HOST') \
|
||||
and not os.environ.get('KUBERNETES_PORT'):
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
|
||||
if kwargs.get('only_load_config'):
|
||||
from trains_agent.backend_api.config import load
|
||||
@@ -86,7 +87,7 @@ class Session(_Session):
|
||||
self.trace = kwargs.get('trace', False)
|
||||
self._config_file = kwargs.get('config_file') or \
|
||||
os.environ.get(LOCAL_CONFIG_FILE_OVERRIDE_VAR) or LOCAL_CONFIG_FILES[0]
|
||||
self.api_client = APIClient(session=self, api_version="2.4")
|
||||
self.api_client = APIClient(session=self, api_version="2.5")
|
||||
# HACK make sure we have python version to execute,
|
||||
# if nothing was specified, use the one that runs us
|
||||
def_python = ConfigValue(self.config, "agent.default_python")
|
||||
@@ -99,16 +100,29 @@ class Session(_Session):
|
||||
if not self.config.get('api.host', None) and self.config.get('api.api_server', None):
|
||||
self.config['api']['host'] = self.config.get('api.api_server')
|
||||
|
||||
# initialize nvidia visibility variable
|
||||
# initialize nvidia visibility variables
|
||||
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
|
||||
if os.environ.get('NVIDIA_VISIBLE_DEVICES') and not os.environ.get('CUDA_VISIBLE_DEVICES'):
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ.get('NVIDIA_VISIBLE_DEVICES')
|
||||
# do not create CUDA_VISIBLE_DEVICES if it doesn't exist, it breaks TF/PyTorch CUDA detection
|
||||
# os.environ['CUDA_VISIBLE_DEVICES'] = os.environ.get('NVIDIA_VISIBLE_DEVICES')
|
||||
pass
|
||||
elif os.environ.get('CUDA_VISIBLE_DEVICES') and not os.environ.get('NVIDIA_VISIBLE_DEVICES'):
|
||||
os.environ['NVIDIA_VISIBLE_DEVICES'] = os.environ.get('CUDA_VISIBLE_DEVICES')
|
||||
|
||||
# override with environment variables
|
||||
# cuda_version & cudnn_version are overridden with os.environ here, and normalized in the next section
|
||||
for config_key, env_config in ENVIRONMENT_CONFIG.items():
|
||||
# check if the property is a list:
|
||||
if config_key.endswith('.0'):
|
||||
if all(not i.get() for i in env_config.values()):
|
||||
continue
|
||||
parent = config_key.partition('.0')[0]
|
||||
if not self.config[parent]:
|
||||
self.config.put(parent, [])
|
||||
|
||||
self.config.put(parent, self.config[parent] + [ConfigTree((k, v.get()) for k, v in env_config.items())])
|
||||
continue
|
||||
|
||||
value = env_config.get()
|
||||
if not value:
|
||||
continue
|
||||
@@ -163,7 +177,11 @@ class Session(_Session):
|
||||
folder_keys = ('agent.venvs_dir', 'agent.vcs_cache.path',
|
||||
'agent.pip_download_cache.path',
|
||||
'agent.docker_pip_cache', 'agent.docker_apt_cache')
|
||||
singleton_folders = ('agent.venvs_dir', 'agent.vcs_cache.path',)
|
||||
singleton_folders = ('agent.venvs_dir', 'agent.vcs_cache.path', 'agent.docker_apt_cache')
|
||||
|
||||
if os.environ.get(ENV_TASK_EXECUTE_AS_USER):
|
||||
folder_keys = tuple(list(folder_keys) + ['sdk.storage.cache.default_base_dir'])
|
||||
singleton_folders = tuple(list(singleton_folders) + ['sdk.storage.cache.default_base_dir'])
|
||||
|
||||
for key in folder_keys:
|
||||
folder_key = ConfigValue(self.config, key)
|
||||
@@ -204,6 +222,15 @@ class Session(_Session):
|
||||
config.pop('env', None)
|
||||
if remove_secret_keys:
|
||||
recursive_remove_secrets(config, secret_keys=remove_secret_keys)
|
||||
# remove logging.loggers.urllib3.level from the print
|
||||
try:
|
||||
config['logging']['loggers']['urllib3'].pop('level', None)
|
||||
except (KeyError, TypeError, AttributeError):
|
||||
pass
|
||||
try:
|
||||
config['logging'].pop('version', None)
|
||||
except (KeyError, TypeError, AttributeError):
|
||||
pass
|
||||
config = ConfigFactory.from_dict(config)
|
||||
self.log.debug("Run by interpreter: %s", sys.executable)
|
||||
print(
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = '0.12.1'
|
||||
__version__ = '0.14.0'
|
||||
|
||||
Reference in New Issue
Block a user