# TRAINS-AGENT configuration file
api {
    api_server: https://demoapi.trainsai.io
    web_server: https://demoapp.trainsai.io
    files_server: https://demofiles.trainsai.io

    # Credentials are generated in the webapp, https://demoapp.trainsai.io/profile
    # Overridden with os environment: TRAINS_API_ACCESS_KEY / TRAINS_API_SECRET_KEY
    credentials {"access_key": "EGRTCO8JMSIGI6S39GTP43NFWXDQOW", "secret_key": "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"}

    # verify host ssl certificate, set to False only if you have a very good reason
    verify_certificate: True
}

agent {
    # Set GIT user/pass credentials
    # leave blank for GIT SSH credentials
    git_user=""
    git_pass=""

    # unique name of this worker, if None, created based on hostname:process_id
    # Overridden with os environment: TRAINS_WORKER_ID
    # worker_id: "trains-agent-machine1:gpu0"
    worker_id: ""

    # worker name, replaces the hostname when creating a unique name for this worker
    # Overridden with os environment: TRAINS_WORKER_NAME
    # worker_name: "trains-agent-machine1"
    worker_name: ""

    # select python package manager:
    # currently supported: pip and conda
    # poetry is used if pip is selected and the repository contains a poetry.lock file
    package_manager: {
        # supported options: pip, conda
        type: pip,

        # virtual environment inherits packages from the system
        system_site_packages: false,

        # install with --upgrade
        force_upgrade: false,

        # additional artifact repositories to use when installing python packages
        # extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]
        extra_index_url: []

        # additional conda channels to use when installing with the conda package manager
        conda_channels: ["pytorch", "conda-forge", ]
    },

    # target folder for virtual environment builds, created when executing an experiment
    venvs_dir = ~/.trains/venvs-builds

    # cached git clone folder
    vcs_cache: {
        enabled: true,
        path: ~/.trains/vcs-cache
    },

    # use venv-update in order to accelerate python virtual environment building
    # Still in beta, turned off by default
    venv_update: {
        enabled: false,
    },

    # cache folder for specific python package downloads (mostly pytorch versions)
    pip_download_cache {
        enabled: true,
        path: ~/.trains/pip-download-cache
    },

    # replace ssh git urls with https urls when git_user/git_pass credentials are provided
    translate_ssh: true,

    # reload configuration file on every daemon execution
    reload_config: false,

    # pip cache folder, mapped into docker, used for python package caching
    docker_pip_cache = ~/.trains/pip-cache
    # apt cache folder, mapped into docker, used for ubuntu package caching
    docker_apt_cache = ~/.trains/apt-cache

    default_docker: {
        # default docker image to use when running in docker mode
        image: "nvidia/cuda"

        # optional arguments to pass to docker image
        # arguments: ["--ipc=host"]
    }
}
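# Usage example (a minimal sketch, not part of the configuration itself): the agent
# section above is consumed by the trains-agent daemon. Assuming a queue named
# "default" exists on your server (the queue name is an illustrative assumption):
#
#   trains-agent daemon --queue default            # execute experiments in a virtualenv
#   trains-agent daemon --queue default --docker   # execute inside default_docker.image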
sdk {
    # TRAINS - default SDK configuration

    storage {
        cache {
            # Defaults to system temp folder / cache
            default_base_dir: "~/.trains/cache"
        }

        direct_access: [
            # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
            # or cached, and any download request will return a direct reference.
            # Objects are specified in glob format, available for url and content_type.
            { url: "file://*" }  # file-urls are always directly referenced
        ]
    }

    metrics {
        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
        # X files are stored in the upload destination for each metric/variant combination.
        file_history_size: 100

        # Settings for generated debug images
        images {
            format: JPEG
            quality: 87
            subsampling: 0
        }
    }

    network {
        metrics {
            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
            # a specific iteration
            file_upload_threads: 4

            # Warn about upload starvation if no uploads were made in the specified period while file-bearing events
            # keep being sent for upload
            file_upload_starvation_warning_sec: 120
        }

        iteration {
            # Max number of retries for a request if the server returned an error (http code 500)
            max_retries_on_server_error: 5

            # Backoff factor for consecutive retry attempts.
            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) seconds between retries.
            retry_backoff_factor_sec: 10
        }
    }

    aws {
        s3 {
            # S3 credentials, used for read/write access by various SDK elements

            # default, used for any bucket not specified below
            key: ""
            secret: ""
            region: ""

            credentials: [
                # specifies key/secret credentials to use when handling s3 urls (read or write)
                # {
                #     bucket: "my-bucket-name"
                #     key: "my-access-key"
                #     secret: "my-secret-key"
                # },
                # {
                #     # This will apply to all buckets in this host (unless key/secret is specifically provided for a given bucket)
                #     host: "my-minio-host:9000"
                #     key: "12345678"
                #     secret: "12345678"
                #     multipart: false
                #     secure: false
                # }
            ]
        }
        boto3 {
            pool_connections: 512
            max_multipart_concurrency: 16
        }
    }

    google.storage {
        # # Default project and credentials file
        # # Will be used when no bucket configuration is found
        # project: "trains"
        # credentials_json: "/path/to/credentials.json"

        # # Specific credentials per bucket and sub directory
        # credentials = [
        #     {
        #         bucket: "my-bucket"
        #         subdir: "path/in/bucket"  # Not required
        #         project: "trains"
        #         credentials_json: "/path/to/credentials.json"
        #     },
        # ]
    }

    azure.storage {
        # containers: [
        #     {
        #         account_name: "trains"
        #         account_key: "secret"
        #         # container_name:
        #     }
        # ]
    }

    log {
        # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
        null_log_propagate: False
        task_log_buffer_capacity: 66

        # disable urllib info and lower levels
        disable_urllib3_info: True
    }

    development {
        # Development-mode options

        # dev task reuse window
        task_reuse_time_window_in_hours: 72.0

        # Run VCS repository detection asynchronously
        vcs_repo_detect_async: True

        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
        store_uncommitted_code_diff_on_train: True

        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
        support_stopping: True

        # Development mode worker
        worker {
            # Status report period in seconds
            report_period_sec: 2

            # ping to the server - check connectivity
            ping_period_sec: 30

            # Log all stdout & stderr
            log_stdout: True
        }
    }
}
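# Usage example (a minimal sketch, not part of the configuration itself): the
# sdk.aws.s3 credentials above are used when a task uploads artifacts or models to
# an s3:// destination. Assuming "my-bucket-name" matches a credentials entry above
# (bucket, project and task names are illustrative assumptions):
#
#   from trains import Task
#   task = Task.init(project_name="examples", task_name="s3 demo",
#                    output_uri="s3://my-bucket-name/experiments")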