Rename trains-agent -> clearml-agent

2025-06-26 18:16:15 +00:00 · 2020-12-22 21:21:29 +02:00
parent 090327234a
commit 6e1f74402e
112 changed files with 0 additions and 0 deletions
--- a/clearml_agent/backend_api/config/init.py
+++ b/clearml_agent/backend_api/config/init.py
@@ -0,0 +1,16 @@
+from ...backend_config import Config
+from pathlib2 import Path
+
+
+def load(*additional_module_paths):
+    # type: (str) -> Config
+    """
+    Load configuration with the API defaults, using the additional module path provided
+    :param additional_module_paths: Additional config paths for modules who'se default
+    configurations should be loaded as well
+    :return: Config object
+    """
+    config = Config(verbose=False)
+    this_module_path = Path(__file__).parent
+    config.load_relative_to(this_module_path, *additional_module_paths)
+    return config
--- a/clearml_agent/backend_api/config/default/agent.conf
+++ b/clearml_agent/backend_api/config/default/agent.conf
@@ -0,0 +1,165 @@
+{
+    # unique name of this worker, if None, created based on hostname:process_id
+    # Override with os environment: TRAINS_WORKER_ID
+    # worker_id: "trains-agent-machine1:gpu0"
+    worker_id: ""
+
+    # worker name, replaces the hostname when creating a unique name for this worker
+    # Override with os environment: TRAINS_WORKER_NAME
+    # worker_name: "trains-agent-machine1"
+    worker_name: ""
+
+    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
+    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
+    # git_user: ""
+    # git_pass: ""
+    # git_host: ""
+
+    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
+    force_git_ssh_protocol: false
+    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
+    # force_git_ssh_port: 0
+
+    # Set the python version to use when creating the virtual environment and launching the experiment
+    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
+    # The default is the python executing the trains_agent
+    python_binary: ""
+
+    # select python package manager:
+    # currently supported pip and conda
+    # poetry is used if pip selected and repository contains poetry.lock file
+    package_manager: {
+        # supported options: pip, conda, poetry
+        type: pip,
+
+        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
+        pip_version: "<20.2",
+
+        # virtual environment inheres packages from system
+        system_site_packages: false,
+
+        # install with --upgrade
+        force_upgrade: false,
+
+        # additional artifact repositories to use when installing python packages
+        # extra_index_url: ["https://allegroai.jfrog.io/trainsai/api/pypi/public/simple"]
+
+        # additional conda channels to use when installing with conda package manager
+        conda_channels: ["defaults", "conda-forge", "pytorch", ]
+
+        # If set to true, Task's "installed packages" are ignored,
+        # and the repository's "requirements.txt" is used instead
+        # force_repo_requirements_txt: false
+
+        # set the priority packages to be installed before the rest of the required packages
+        # priority_packages: ["cython", "numpy", "setuptools", ]
+
+        # set the optional priority packages to be installed before the rest of the required packages,
+        # In case a package installation fails, the package will be ignored,
+        # and the virtual environment process will continue
+        # priority_optional_packages: ["pygobject", ]
+
+        # set the post packages to be installed after all the rest of the required packages
+        # post_packages: ["horovod", ]
+
+        # set the optional post packages to be installed after all the rest of the required packages,
+        # In case a package installation fails, the package will be ignored,
+        # and the virtual environment process will continue
+        # post_optional_packages: []
+
+        # set to True to support torch nightly build installation,
+        # notice: torch nightly builds are ephemeral and are deleted from time to time
+        torch_nightly: false,
+    },
+
+    # target folder for virtual environments builds, created when executing experiment
+    venvs_dir = ~/.trains/venvs-builds
+
+    # cached git clone folder
+    vcs_cache: {
+        enabled: true,
+        path: ~/.trains/vcs-cache
+    },
+
+    # use venv-update in order to accelerate python virtual environment building
+    # Still in beta, turned off by default
+    venv_update: {
+        enabled: false,
+    },
+
+    # cached folder for specific python package download (used for pytorch package caching)
+    pip_download_cache {
+        enabled: true,
+        path: ~/.trains/pip-download-cache
+    },
+
+    translate_ssh: true,
+    # reload configuration file every daemon execution
+    reload_config: false,
+
+    # pip cache folder mapped into docker, used for python package caching
+    docker_pip_cache = ~/.trains/pip-cache
+    # apt cache folder mapped into docker, used for ubuntu package caching
+    docker_apt_cache = ~/.trains/apt-cache
+
+    # optional arguments to pass to docker image
+    # these are local for this agent and will not be updated in the experiment's docker_cmd section
+    # extra_docker_arguments: ["--ipc=host", ]
+
+    # optional shell script to run in docker when started before the experiment is started
+    # extra_docker_shell_script: ["apt-get install -y bindfs", ]
+
+    # optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
+    # If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
+    # Outside of the specified time-spans, the agent will be idle.
+    # Defined using a list of items of the format: "<hours> <days>".
+    # hours - use values 0-23, single values would count as start hour and end at midnight.
+    # days - use days in abbreviated format (SUN-SAT)
+    # use '-' for ranges and ',' to separate singular values.
+    # for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
+    # uptime: ["17-20 SUN,TUE"]
+
+    # optional downtime configuration, can be used only when uptime is not used.
+    # If downtime is specified, agent will be idle in the time-spans defined here.
+    # Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
+    # Use the same format as described above for uptime
+    # downtime: []
+
+    # set to true in order to force "docker pull" before running an experiment using a docker image.
+    # This makes sure the docker image is updated.
+    docker_force_pull: false
+
+    default_docker: {
+        # default docker image to use when running in docker mode
+        image: "nvidia/cuda:10.1-runtime-ubuntu18.04"
+
+        # optional arguments to pass to docker image
+        # arguments: ["--ipc=host", ]
+    }
+
+    # set the initial bash script to execute at the startup of any docker.
+    # all lines will be executed regardless of their exit code.
+    # {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
+    # docker_init_bash_script = [
+    #     "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
+    #     "chown -R root /root/.cache/pip",
+    #     "apt-get update",
+    #     "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
+    #     "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
+    # ]
+
+    # set the preprocessing bash script to execute at the startup of any docker.
+    # all lines will be executed regardless of their exit code.
+    # docker_preprocess_bash_script = [
+    #     "echo \"starting docker\"",
+    #]
+
+    # If False replace \r with \n and display full console output
+    # default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
+    # suppress_carriage_return: true
+
+    # cuda versions used for solving pytorch wheel packages
+    # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
+    # cuda_version: 10.1
+    # cudnn_version: 7.6
+}
--- a/clearml_agent/backend_api/config/default/api.conf
+++ b/clearml_agent/backend_api/config/default/api.conf
@@ -0,0 +1,37 @@
+{
+    version: 1.5
+    # verify host ssl certificate, set to False only if you have a very good reason
+    verify_certificate: True
+
+    # default version assigned to requests with no specific version. this is not expected to change
+    # as it keeps us backwards compatible.
+    default_version: 1.5
+
+    http {
+        max_req_size = 15728640  # request size limit (smaller than that configured in api server)
+
+        retries {
+            # retry values (int, 0 means fail on first retry)
+            total: 240     # Total number of retries to allow. Takes precedence over other counts.
+            connect: 240   # How many times to retry on connection-related errors (never reached server)
+            read: 240      # How many times to retry on read errors (waiting for server)
+            redirect: 240  # How many redirects to perform (HTTP response with a status code 301, 302, 303, 307 or 308)
+            status: 240    # How many times to retry on bad status codes
+
+            # backoff parameters
+            # timeout between retries is min({backoff_max}, {backoff factor} * (2 ^ ({number of total retries} - 1))
+            backoff_factor: 1.0
+            backoff_max: 120.0
+        }
+
+        wait_on_maintenance_forever: true
+
+        pool_maxsize: 512
+        pool_connections: 512
+    }
+
+    auth {
+        # When creating a request, if token will expire in less than this value, try to refresh the token
+        token_expiration_threshold_sec = 360
+    }
+}
--- a/clearml_agent/backend_api/config/default/logging.conf
+++ b/clearml_agent/backend_api/config/default/logging.conf
@@ -0,0 +1,8 @@
+{
+    version: 1
+    loggers {
+        urllib3 {
+            level: ERROR
+        }
+    }
+}
--- a/clearml_agent/backend_api/config/default/sdk.conf
+++ b/clearml_agent/backend_api/config/default/sdk.conf
@@ -0,0 +1,181 @@
+{
+    # TRAINS - default SDK configuration
+
+    storage {
+        cache {
+            # Defaults to system temp folder / cache
+            default_base_dir: "~/.trains/cache"
+            size {
+                # max_used_bytes = -1
+                min_free_bytes = 10GB
+                # cleanup_margin_percent = 5%
+            }
+        }
+
+        direct_access: [
+            # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
+            # or cached, and any download request will return a direct reference.
+            # Objects are specified in glob format, available for url and content_type.
+            { url: "file://*" }  # file-urls are always directly referenced
+        ]
+    }
+
+    metrics {
+        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
+        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
+        # X files are stored in the upload destination for each metric/variant combination.
+        file_history_size: 100
+
+        # Max history size for matplotlib imshow files per plot title.
+        # File names for the uploaded images will be recycled in such a way that no more than
+        # X images are stored in the upload destination for each matplotlib plot title.
+        matplotlib_untitled_history_size: 100
+
+        # Limit the number of digits after the dot in plot reporting (reducing plot report size)
+        # plot_max_num_digits: 5
+
+        # Settings for generated debug images
+        images {
+            format: JPEG
+            quality: 87
+            subsampling: 0
+        }
+
+        # Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
+        tensorboard_single_series_per_graph: false
+    }
+
+    network {
+        metrics {
+            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
+            # a specific iteration
+            file_upload_threads: 4
+
+            # Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
+            # being sent for upload
+            file_upload_starvation_warning_sec: 120
+        }
+
+        iteration {
+            # Max number of retries when getting frames if the server returned an error (http code 500)
+            max_retries_on_server_error: 5
+            # Backoff factory for consecutive retry attempts.
+            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
+            retry_backoff_factor_sec: 10
+        }
+    }
+    aws {
+        s3 {
+            # S3 credentials, used for read/write access by various SDK elements
+
+            # default, used for any bucket not specified below
+            key: ""
+            secret: ""
+            region: ""
+
+            credentials: [
+                # specifies key/secret credentials to use when handling s3 urls (read or write)
+                # {
+                #     bucket: "my-bucket-name"
+                #     key: "my-access-key"
+                #     secret: "my-secret-key"
+                # },
+                # {
+                #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
+                #     host: "my-minio-host:9000"
+                #     key: "12345678"
+                #     secret: "12345678"
+                #     multipart: false
+                #     secure: false
+                # }
+            ]
+        }
+        boto3 {
+            pool_connections: 512
+            max_multipart_concurrency: 16
+        }
+    }
+    google.storage {
+        # # Default project and credentials file
+        # # Will be used when no bucket configuration is found
+        # project: "trains"
+        # credentials_json: "/path/to/credentials.json"
+
+        # # Specific credentials per bucket and sub directory
+        # credentials = [
+        #     {
+        #         bucket: "my-bucket"
+        #         subdir: "path/in/bucket" # Not required
+        #         project: "trains"
+        #         credentials_json: "/path/to/credentials.json"
+        #     },
+        # ]
+    }
+    azure.storage {
+        # containers: [
+        #     {
+        #         account_name: "trains"
+        #         account_key: "secret"
+        #         # container_name:
+        #     }
+        # ]
+    }
+
+    log {
+        # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
+        null_log_propagate: false
+        task_log_buffer_capacity: 66
+
+        # disable urllib info and lower levels
+        disable_urllib3_info: true
+    }
+
+    development {
+        # Development-mode options
+
+        # dev task reuse window
+        task_reuse_time_window_in_hours: 72.0
+
+        # Run VCS repository detection asynchronously
+        vcs_repo_detect_async: true
+
+        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
+        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
+        store_uncommitted_code_diff: true
+
+        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
+        support_stopping: true
+
+        # Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
+        default_output_uri: ""
+
+        # Default auto generated requirements optimize for smaller requirements
+        # If True, analyze the entire repository regardless of the entry point.
+        # If False, first analyze the entry point script, if it does not contain other to local files,
+        # do not analyze the entire repository.
+        force_analyze_entire_repo: false
+
+        # If set to true, *trains* update message will not be printed to the console
+        # this value can be overwritten with os environment variable TRAINS_SUPPRESS_UPDATE_MESSAGE=1
+        suppress_update_message: false
+
+        # If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
+        detect_with_pip_freeze: false
+
+        # Development mode worker
+        worker {
+            # Status report period in seconds
+            report_period_sec: 2
+
+            # ping to the server - check connectivity
+            ping_period_sec: 30
+
+            # Log all stdout & stderr
+            log_stdout: true
+
+            # compatibility feature, report memory usage for the entire machine
+            # default (false), report only on the running process and its sub-processes
+            report_global_mem_used: false
+        }
+    }
+}