diff --git a/clearml/backend_api/config/default/api.conf b/clearml/backend_api/config/default/api.conf index 10145862..277ea559 100644 --- a/clearml/backend_api/config/default/api.conf +++ b/clearml/backend_api/config/default/api.conf @@ -23,6 +23,11 @@ http { max_req_size = 15728640 # request size limit (smaller than that configured in api server) + # extra HTTP status codes on which the session will automatically retry the request (using the retries settings below). + # these will be added to the default retry list which currently includes bad gateway (502), + # service unavailable (503), bandwidth limit exceeded (509) and too many requests (429) + extra_retry_codes: [] + retries { # retry values (int, 0 means fail on first retry) total: 240 # Total number of retries to allow. Takes precedence over other counts. diff --git a/clearml/backend_api/session/defs.py b/clearml/backend_api/session/defs.py index b553f838..e34da8ca 100644 --- a/clearml/backend_api/session/defs.py +++ b/clearml/backend_api/session/defs.py @@ -26,3 +26,11 @@ POST requests can be used instead. However this has not been vigorously tested and may have unintended consequences. """ ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD") + +""" +Experimental option to make the SDK retry on additional error codes. +Use a comma-separated list of integer return codes. 
def _get_retry_codes(self):
    # type: () -> List[int]
    """
    Build the list of HTTP status codes on which the HTTP session retries a request.

    Starts from the codes in ``self._retry_codes`` and adds extra codes taken from the
    ``ENV_API_EXTRA_RETRY_CODES`` environment entry (comma-separated list) or, when that
    entry is empty, from the ``api.http.extra_retry_codes`` configuration value.
    Entries that cannot be converted to an integer are reported with a warning and skipped.

    :return: List of unique status codes (default codes plus the valid extra codes)
    """
    default_codes = set(self._retry_codes)

    # read the environment entry once; when set, it overrides the configuration value
    env_value = ENV_API_EXTRA_RETRY_CODES.get()
    extra = env_value or self.config.get("api.http.extra_retry_codes", [])

    if isinstance(extra, six.string_types):
        # a plain string (environment value or a quoted configuration value) is a
        # comma-separated list; iterating it directly would yield single characters
        extra = [part.strip() for part in extra.split(",") if part.strip()]
    elif isinstance(extra, int):
        # a single bare code (e.g. `extra_retry_codes: 500`) is not iterable
        extra = [extra]

    retry_codes = set(default_codes)
    for code in extra or []:
        try:
            retry_codes.add(int(code))
        except (ValueError, TypeError):
            print("Warning: invalid extra HTTP retry code detected: {}".format(code))

    # report only the codes that were actually added on top of the defaults
    added_codes = retry_codes - default_codes
    if added_codes:
        print("Using extra HTTP retry codes {}".format(sorted(added_codes)))

    return list(retry_codes)