mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Rename trains-agent -> clearml-agent
This commit is contained in:
16
clearml_agent/backend_api/config/__init__.py
Normal file
16
clearml_agent/backend_api/config/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from ...backend_config import Config
|
||||
from pathlib2 import Path
|
||||
|
||||
|
||||
def load(*additional_module_paths):
|
||||
# type: (str) -> Config
|
||||
"""
|
||||
Load configuration with the API defaults, using the additional module path provided
|
||||
:param additional_module_paths: Additional config paths for modules who'se default
|
||||
configurations should be loaded as well
|
||||
:return: Config object
|
||||
"""
|
||||
config = Config(verbose=False)
|
||||
this_module_path = Path(__file__).parent
|
||||
config.load_relative_to(this_module_path, *additional_module_paths)
|
||||
return config
|
||||
165
clearml_agent/backend_api/config/default/agent.conf
Normal file
165
clearml_agent/backend_api/config/default/agent.conf
Normal file
@@ -0,0 +1,165 @@
|
||||
{
|
||||
# unique name of this worker, if None, created based on hostname:process_id
|
||||
# Override with os environment: TRAINS_WORKER_ID
|
||||
# worker_id: "trains-agent-machine1:gpu0"
|
||||
worker_id: ""
|
||||
|
||||
# worker name, replaces the hostname when creating a unique name for this worker
|
||||
# Override with os environment: TRAINS_WORKER_NAME
|
||||
# worker_name: "trains-agent-machine1"
|
||||
worker_name: ""
|
||||
|
||||
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||
# git_user: ""
|
||||
# git_pass: ""
|
||||
# git_host: ""
|
||||
|
||||
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||
force_git_ssh_protocol: false
|
||||
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||
# force_git_ssh_port: 0
|
||||
|
||||
# Set the python version to use when creating the virtual environment and launching the experiment
|
||||
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||
# The default is the python executing the trains_agent
|
||||
python_binary: ""
|
||||
|
||||
# select python package manager:
|
||||
# currently supported pip and conda
|
||||
# poetry is used if pip selected and repository contains poetry.lock file
|
||||
package_manager: {
|
||||
# supported options: pip, conda, poetry
|
||||
type: pip,
|
||||
|
||||
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||
pip_version: "<20.2",
|
||||
|
||||
# virtual environment inheres packages from system
|
||||
system_site_packages: false,
|
||||
|
||||
# install with --upgrade
|
||||
force_upgrade: false,
|
||||
|
||||
# additional artifact repositories to use when installing python packages
|
||||
# extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]
|
||||
|
||||
# additional conda channels to use when installing with conda package manager
|
||||
conda_channels: ["defaults", "conda-forge", "pytorch", ]
|
||||
|
||||
# If set to true, Task's "installed packages" are ignored,
|
||||
# and the repository's "requirements.txt" is used instead
|
||||
# force_repo_requirements_txt: false
|
||||
|
||||
# set the priority packages to be installed before the rest of the required packages
|
||||
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||
|
||||
# set the optional priority packages to be installed before the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# priority_optional_packages: ["pygobject", ]
|
||||
|
||||
# set the post packages to be installed after all the rest of the required packages
|
||||
# post_packages: ["horovod", ]
|
||||
|
||||
# set the optional post packages to be installed after all the rest of the required packages,
|
||||
# In case a package installation fails, the package will be ignored,
|
||||
# and the virtual environment process will continue
|
||||
# post_optional_packages: []
|
||||
|
||||
# set to True to support torch nightly build installation,
|
||||
# notice: torch nightly builds are ephemeral and are deleted from time to time
|
||||
torch_nightly: false,
|
||||
},
|
||||
|
||||
# target folder for virtual environments builds, created when executing experiment
|
||||
venvs_dir = ~/.trains/venvs-builds
|
||||
|
||||
# cached git clone folder
|
||||
vcs_cache: {
|
||||
enabled: true,
|
||||
path: ~/.trains/vcs-cache
|
||||
},
|
||||
|
||||
# use venv-update in order to accelerate python virtual environment building
|
||||
# Still in beta, turned off by default
|
||||
venv_update: {
|
||||
enabled: false,
|
||||
},
|
||||
|
||||
# cached folder for specific python package download (used for pytorch package caching)
|
||||
pip_download_cache {
|
||||
enabled: true,
|
||||
path: ~/.trains/pip-download-cache
|
||||
},
|
||||
|
||||
translate_ssh: true,
|
||||
# reload configuration file every daemon execution
|
||||
reload_config: false,
|
||||
|
||||
# pip cache folder mapped into docker, used for python package caching
|
||||
docker_pip_cache = ~/.trains/pip-cache
|
||||
# apt cache folder mapped into docker, used for ubuntu package caching
|
||||
docker_apt_cache = ~/.trains/apt-cache
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||
# extra_docker_arguments: ["--ipc=host", ]
|
||||
|
||||
# optional shell script to run in docker when started before the experiment is started
|
||||
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||
|
||||
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
|
||||
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will be idle.
|
||||
# Defined using a list of items of the format: "<hours> <days>".
|
||||
# hours - use values 0-23, single values would count as start hour and end at midnight.
|
||||
# days - use days in abbreviated format (SUN-SAT)
|
||||
# use '-' for ranges and ',' to separate singular values.
|
||||
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
|
||||
# uptime: ["17-20 SUN,TUE"]
|
||||
|
||||
# optional downtime configuration, can be used only when uptime is not used.
|
||||
# If downtime is specified, agent will be idle in the time-spans defined here.
|
||||
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
|
||||
# Use the same format as described above for uptime
|
||||
# downtime: []
|
||||
|
||||
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||
# This makes sure the docker image is updated.
|
||||
docker_force_pull: false
|
||||
|
||||
default_docker: {
|
||||
# default docker image to use when running in docker mode
|
||||
image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
|
||||
|
||||
# optional arguments to pass to docker image
|
||||
# arguments: ["--ipc=host", ]
|
||||
}
|
||||
|
||||
# set the initial bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||
# docker_init_bash_script = [
|
||||
# "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
|
||||
# "chown -R root /root/.cache/pip",
|
||||
# "apt-get update",
|
||||
# "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
|
||||
# ]
|
||||
|
||||
# set the preprocessing bash script to execute at the startup of any docker.
|
||||
# all lines will be executed regardless of their exit code.
|
||||
# docker_preprocess_bash_script = [
|
||||
# "echo \"starting docker\"",
|
||||
#]
|
||||
|
||||
# If False replace \r with \n and display full console output
|
||||
# default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
|
||||
# suppress_carriage_return: true
|
||||
|
||||
# cuda versions used for solving pytorch wheel packages
|
||||
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||
# cuda_version: 10.1
|
||||
# cudnn_version: 7.6
|
||||
}
|
||||
37
clearml_agent/backend_api/config/default/api.conf
Normal file
37
clearml_agent/backend_api/config/default/api.conf
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
version: 1.5
|
||||
# verify host ssl certificate, set to False only if you have a very good reason
|
||||
verify_certificate: True
|
||||
|
||||
# default version assigned to requests with no specific version. this is not expected to change
|
||||
# as it keeps us backwards compatible.
|
||||
default_version: 1.5
|
||||
|
||||
http {
|
||||
max_req_size = 15728640 # request size limit (smaller than that configured in api server)
|
||||
|
||||
retries {
|
||||
# retry values (int, 0 means fail on first retry)
|
||||
total: 240 # Total number of retries to allow. Takes precedence over other counts.
|
||||
connect: 240 # How many times to retry on connection-related errors (never reached server)
|
||||
read: 240 # How many times to retry on read errors (waiting for server)
|
||||
redirect: 240 # How many redirects to perform (HTTP response with a status code 301, 302, 303, 307 or 308)
|
||||
status: 240 # How many times to retry on bad status codes
|
||||
|
||||
# backoff parameters
|
||||
# timeout between retries is min({backoff_max}, {backoff factor} * (2 ^ ({number of total retries} - 1))
|
||||
backoff_factor: 1.0
|
||||
backoff_max: 120.0
|
||||
}
|
||||
|
||||
wait_on_maintenance_forever: true
|
||||
|
||||
pool_maxsize: 512
|
||||
pool_connections: 512
|
||||
}
|
||||
|
||||
auth {
|
||||
# When creating a request, if token will expire in less than this value, try to refresh the token
|
||||
token_expiration_threshold_sec = 360
|
||||
}
|
||||
}
|
||||
8
clearml_agent/backend_api/config/default/logging.conf
Normal file
8
clearml_agent/backend_api/config/default/logging.conf
Normal file
@@ -0,0 +1,8 @@
|
||||
{
|
||||
version: 1
|
||||
loggers {
|
||||
urllib3 {
|
||||
level: ERROR
|
||||
}
|
||||
}
|
||||
}
|
||||
181
clearml_agent/backend_api/config/default/sdk.conf
Normal file
181
clearml_agent/backend_api/config/default/sdk.conf
Normal file
@@ -0,0 +1,181 @@
|
||||
{
|
||||
# TRAINS - default SDK configuration
|
||||
|
||||
storage {
|
||||
cache {
|
||||
# Defaults to system temp folder / cache
|
||||
default_base_dir: "~/.trains/cache"
|
||||
size {
|
||||
# max_used_bytes = -1
|
||||
min_free_bytes = 10GB
|
||||
# cleanup_margin_percent = 5%
|
||||
}
|
||||
}
|
||||
|
||||
direct_access: [
|
||||
# Objects matching are considered to be available for direct access, i.e. they will not be downloaded
|
||||
# or cached, and any download request will return a direct reference.
|
||||
# Objects are specified in glob format, available for url and content_type.
|
||||
{ url: "file://*" } # file-urls are always directly referenced
|
||||
]
|
||||
}
|
||||
|
||||
metrics {
|
||||
# History size for debug files per metric/variant. For each metric/variant combination with an attached file
|
||||
# (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
|
||||
# X files are stored in the upload destination for each metric/variant combination.
|
||||
file_history_size: 100
|
||||
|
||||
# Max history size for matplotlib imshow files per plot title.
|
||||
# File names for the uploaded images will be recycled in such a way that no more than
|
||||
# X images are stored in the upload destination for each matplotlib plot title.
|
||||
matplotlib_untitled_history_size: 100
|
||||
|
||||
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
|
||||
# plot_max_num_digits: 5
|
||||
|
||||
# Settings for generated debug images
|
||||
images {
|
||||
format: JPEG
|
||||
quality: 87
|
||||
subsampling: 0
|
||||
}
|
||||
|
||||
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
|
||||
tensorboard_single_series_per_graph: false
|
||||
}
|
||||
|
||||
network {
|
||||
metrics {
|
||||
# Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
|
||||
# a specific iteration
|
||||
file_upload_threads: 4
|
||||
|
||||
# Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
|
||||
# being sent for upload
|
||||
file_upload_starvation_warning_sec: 120
|
||||
}
|
||||
|
||||
iteration {
|
||||
# Max number of retries when getting frames if the server returned an error (http code 500)
|
||||
max_retries_on_server_error: 5
|
||||
# Backoff factory for consecutive retry attempts.
|
||||
# SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
|
||||
retry_backoff_factor_sec: 10
|
||||
}
|
||||
}
|
||||
aws {
|
||||
s3 {
|
||||
# S3 credentials, used for read/write access by various SDK elements
|
||||
|
||||
# default, used for any bucket not specified below
|
||||
key: ""
|
||||
secret: ""
|
||||
region: ""
|
||||
|
||||
credentials: [
|
||||
# specifies key/secret credentials to use when handling s3 urls (read or write)
|
||||
# {
|
||||
# bucket: "my-bucket-name"
|
||||
# key: "my-access-key"
|
||||
# secret: "my-secret-key"
|
||||
# },
|
||||
# {
|
||||
# # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
|
||||
# host: "my-minio-host:9000"
|
||||
# key: "12345678"
|
||||
# secret: "12345678"
|
||||
# multipart: false
|
||||
# secure: false
|
||||
# }
|
||||
]
|
||||
}
|
||||
boto3 {
|
||||
pool_connections: 512
|
||||
max_multipart_concurrency: 16
|
||||
}
|
||||
}
|
||||
google.storage {
|
||||
# # Default project and credentials file
|
||||
# # Will be used when no bucket configuration is found
|
||||
# project: "trains"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
|
||||
# # Specific credentials per bucket and sub directory
|
||||
# credentials = [
|
||||
# {
|
||||
# bucket: "my-bucket"
|
||||
# subdir: "path/in/bucket" # Not required
|
||||
# project: "trains"
|
||||
# credentials_json: "/path/to/credentials.json"
|
||||
# },
|
||||
# ]
|
||||
}
|
||||
azure.storage {
|
||||
# containers: [
|
||||
# {
|
||||
# account_name: "trains"
|
||||
# account_key: "secret"
|
||||
# # container_name:
|
||||
# }
|
||||
# ]
|
||||
}
|
||||
|
||||
log {
|
||||
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
|
||||
null_log_propagate: false
|
||||
task_log_buffer_capacity: 66
|
||||
|
||||
# disable urllib info and lower levels
|
||||
disable_urllib3_info: true
|
||||
}
|
||||
|
||||
development {
|
||||
# Development-mode options
|
||||
|
||||
# dev task reuse window
|
||||
task_reuse_time_window_in_hours: 72.0
|
||||
|
||||
# Run VCS repository detection asynchronously
|
||||
vcs_repo_detect_async: true
|
||||
|
||||
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
|
||||
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
|
||||
store_uncommitted_code_diff: true
|
||||
|
||||
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
|
||||
support_stopping: true
|
||||
|
||||
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
|
||||
default_output_uri: ""
|
||||
|
||||
# Default auto generated requirements optimize for smaller requirements
|
||||
# If True, analyze the entire repository regardless of the entry point.
|
||||
# If False, first analyze the entry point script, if it does not contain other to local files,
|
||||
# do not analyze the entire repository.
|
||||
force_analyze_entire_repo: false
|
||||
|
||||
# If set to true, *trains* update message will not be printed to the console
|
||||
# this value can be overwritten with os environment variable TRAINS_SUPPRESS_UPDATE_MESSAGE=1
|
||||
suppress_update_message: false
|
||||
|
||||
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
|
||||
detect_with_pip_freeze: false
|
||||
|
||||
# Development mode worker
|
||||
worker {
|
||||
# Status report period in seconds
|
||||
report_period_sec: 2
|
||||
|
||||
# ping to the server - check connectivity
|
||||
ping_period_sec: 30
|
||||
|
||||
# Log all stdout & stderr
|
||||
log_stdout: true
|
||||
|
||||
# compatibility feature, report memory usage for the entire machine
|
||||
# default (false), report only on the running process and its sub-processes
|
||||
report_global_mem_used: false
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user