clearml-agent/docs/trains.conf

304 lines
12 KiB
Plaintext
Raw Normal View History

2020-12-22 21:00:57 +00:00
# CLEARML-AGENT configuration file - Please use ~/clearml.conf
2019-10-25 19:28:44 +00:00
api {
    # ClearML server endpoints (API, web app and files server)
    api_server: https://demoapi.demo.clear.ml
    web_server: https://demoapp.demo.clear.ml
    files_server: https://demofiles.demo.clear.ml

    # Credentials are generated in the webapp, https://demoapp.demo.clear.ml/profile
    # Overridden with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
    # NOTE(review): the key pair below is committed in plain text and the endpoints above point at a demo host;
    # presumably these are shared demo credentials - replace them with your own before real use
    credentials {"access_key": "EGRTCO8JMSIGI6S39GTP43NFWXDQOW", "secret_key": "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"}

    # verify host ssl certificate, set to False only if you have a very good reason
    verify_certificate: True
}
agent {
    # Agent (worker) settings: git access, worker identity, python environment setup, caches and docker defaults

    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
    git_user=""
    git_pass=""
    # Limit credentials to a single domain, for example: github.com,
    # all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
    git_host=""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
    force_git_ssh_protocol: false
    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
    # force_git_ssh_port: ""

    # unique name of this worker, if None, created based on hostname:process_id
    # Overridden with os environment: CLEARML_WORKER_ID
    # worker_id: "clearml-agent-machine1:gpu0"
    worker_id: ""

    # worker name, replaces the hostname when creating a unique name for this worker
    # Overridden with os environment: CLEARML_WORKER_NAME
    # worker_name: "clearml-agent-machine1"
    worker_name: ""

    # Set the python version to use when creating the virtual environment and launching the experiment
    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
    # The default is the python executing the clearml_agent
    python_binary: ""

    # select python package manager:
    # currently supported: pip and conda
    # poetry is used if pip selected and repository contains poetry.lock file
    package_manager: {
        # supported options: pip, conda, poetry
        type: pip,

        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
        # pip_version: "<20"

        # virtual environment inherits packages from system
        system_site_packages: false,
        # install with --upgrade
        force_upgrade: false,

        # additional artifact repositories to use when installing python packages
        # extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
        extra_index_url: []

        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", ]
        # conda_full_env_update: false
        # conda_env_as_base_docker: false

        # set the priority packages to be installed before the rest of the required packages
        # priority_packages: ["cython", "numpy", "setuptools", ]

        # set the optional priority packages to be installed before the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # priority_optional_packages: ["pygobject", ]

        # set the post packages to be installed after all the rest of the required packages
        # post_packages: ["horovod", ]

        # set the optional post packages to be installed after all the rest of the required packages,
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # post_optional_packages: []

        # set to True to support torch nightly build installation,
        # notice: torch nightly builds are ephemeral and are deleted from time to time
        torch_nightly: false,
    },

    # target folder for virtual environments builds, created when executing experiment
    venvs_dir = ~/.clearml/venvs-builds

    # cached git clone folder
    vcs_cache: {
        enabled: true,
        path: ~/.clearml/vcs-cache
    },

    # use venv-update in order to accelerate python virtual environment building
    # Still in beta, turned off by default
    venv_update: {
        enabled: false,
    },

    # cached folder for specific python package download (mostly pytorch versions)
    pip_download_cache {
        enabled: true,
        path: ~/.clearml/pip-download-cache
    },

    translate_ssh: true,
    # reload configuration file every daemon execution
    reload_config: false,

    # pip cache folder mapped into docker, used for python package caching
    docker_pip_cache = ~/.clearml/pip-cache
    # apt cache folder mapped into docker, used for ubuntu package caching
    docker_apt_cache = ~/.clearml/apt-cache

    # optional arguments to pass to docker image
    # these are local for this agent and will not be updated in the experiment's docker_cmd section
    # extra_docker_arguments: ["--ipc=host", ]

    # optional shell script to run in docker when started before the experiment is started
    # extra_docker_shell_script: ["apt-get install -y bindfs", ]

    # set to true in order to force "docker pull" before running an experiment using a docker image.
    # This makes sure the docker image is updated.
    docker_force_pull: false

    default_docker: {
        # default docker image to use when running in docker mode
        image: "nvidia/cuda:10.1-runtime-ubuntu18.04"

        # optional arguments to pass to docker image
        # arguments: ["--ipc=host"]
    }

    # CUDA versions used for Conda setup & solving PyTorch wheel packages
    # They should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # cuda_version: 10.1
    # cudnn_version: 7.6
}
sdk {
    # CLEARML - default SDK configuration

    storage {
        cache {
            # Defaults to <system_temp_folder>/clearml_cache
            default_base_dir: "~/.clearml/cache"
        }

        direct_access: [
            # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
            # or cached, and any download request will return a direct reference.
            # Objects are specified in glob format, available for url and content_type.
            { url: "file://*" }  # file-urls are always directly referenced
        ]
    }

    metrics {
        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
        # X files are stored in the upload destination for each metric/variant combination.
        file_history_size: 100

        # Max history size for matplotlib imshow files per plot title.
        # File names for the uploaded images will be recycled in such a way that no more than
        # X images are stored in the upload destination for each matplotlib plot title.
        matplotlib_untitled_history_size: 100

        # Limit the number of digits after the dot in plot reporting (reducing plot report size)
        # plot_max_num_digits: 5

        # Settings for generated debug images
        images {
            format: JPEG
            quality: 87
            subsampling: 0
        }

        # Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to True, each series should have its own graph)
        tensorboard_single_series_per_graph: False
    }

    network {
        metrics {
            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
            # a specific iteration
            file_upload_threads: 4

            # Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
            # being sent for upload
            file_upload_starvation_warning_sec: 120
        }

        iteration {
            # Max number of retries when getting frames if the server returned an error (http code 500)
            max_retries_on_server_error: 5
            # Backoff factor for consecutive retry attempts.
            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
            retry_backoff_factor_sec: 10
        }
    }

    aws {
        s3 {
            # S3 credentials, used for read/write access by various SDK elements

            # default, used for any bucket not specified below
            key: ""
            secret: ""
            region: ""

            credentials: [
                # specifies key/secret credentials to use when handling s3 urls (read or write)
                # {
                #     bucket: "my-bucket-name"
                #     key: "my-access-key"
                #     secret: "my-secret-key"
                # },
                # {
                #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
                #     host: "my-minio-host:9000"
                #     key: "12345678"
                #     secret: "12345678"
                #     multipart: false
                #     secure: false
                # }
            ]
        }

        boto3 {
            pool_connections: 512
            max_multipart_concurrency: 16
        }
    }

    google.storage {
        # # Default project and credentials file
        # # Will be used when no bucket configuration is found
        # project: "clearml"
        # credentials_json: "/path/to/credentials.json"

        # # Specific credentials per bucket and sub directory
        # credentials = [
        #     {
        #         bucket: "my-bucket"
        #         subdir: "path/in/bucket" # Not required
        #         project: "clearml"
        #         credentials_json: "/path/to/credentials.json"
        #     },
        # ]
    }

    azure.storage {
        # containers: [
        #     {
        #         account_name: "clearml"
        #         account_key: "secret"
        #         # container_name:
        #     }
        # ]
    }

    log {
        # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
        null_log_propagate: False
        task_log_buffer_capacity: 66

        # disable urllib3 info and lower levels
        disable_urllib3_info: True
    }

    development {
        # Development-mode options

        # dev task reuse window
        task_reuse_time_window_in_hours: 72.0

        # Run VCS repository detection asynchronously
        vcs_repo_detect_async: True

        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
        store_uncommitted_code_diff_on_train: True

        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
        support_stopping: True

        # Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
        default_output_uri: ""

        # Development mode worker
        worker {
            # Status report period in seconds
            report_period_sec: 2

            # ping to the server - check connectivity
            ping_period_sec: 30

            # Log all stdout & stderr
            log_stdout: True
        }
    }
}