Add support for extra HTTP retry codes

This commit is contained in:
allegroai 2022-02-07 13:30:20 +02:00
parent 9744d63796
commit ff91ee9ade
3 changed files with 36 additions and 1 deletions

View File

@ -23,6 +23,11 @@
http {
max_req_size = 15728640 # request size limit (smaller than that configured in api server)
# extra HTTP status codes on which the session will automatically retry the request (using the retry settings below).
# these are added to the default retry list, which currently includes bad gateway (502),
# service unavailable (503), bandwidth limit exceeded (509) and too many requests (429)
extra_retry_codes: []
retries {
# retry values (int, 0 means fail on first retry)
total: 240 # Total number of retries to allow. Takes precedence over other counts.

View File

@ -26,3 +26,11 @@ POST requests can be used instead.
However this has not been vigorously tested and may have unintended consequences.
"""
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD")
"""
Experimental option to make the SDK retry on additional error codes.
Use a comma-separated list of integer return codes.
NOTE: this changes behavior and might cause the experiment to wait
for a very long time for a non-responding or mis-configured server
"""
# NOTE: when this entry holds a non-empty value, the codes parsed from it are used instead of
# (not in addition to) the "api.http.extra_retry_codes" configuration list.
ENV_API_EXTRA_RETRY_CODES = EnvEntry("CLEARML_API_EXTRA_RETRY_CODES")

View File

@ -9,6 +9,7 @@ import requests
import six
from requests.auth import HTTPBasicAuth
from six.moves.urllib.parse import urlparse, urlunparse
from typing import List
from .callresult import CallResult
from .defs import (
@ -25,6 +26,7 @@ from .defs import (
ENV_ENABLE_ENV_CONFIG_SECTION,
ENV_ENABLE_FILES_CONFIG_SECTION,
ENV_API_DEFAULT_REQ_METHOD,
ENV_API_EXTRA_RETRY_CODES,
)
from .request import Request, BatchRequest # noqa: F401
from .token_manager import TokenManager
@ -183,7 +185,8 @@ class Session(TokenManager):
self.__host = host.strip("/")
http_retries_config = http_retries_config or self.config.get(
"api.http.retries", ConfigTree()).as_plain_ordered_dict()
http_retries_config["status_forcelist"] = self._retry_codes
http_retries_config["status_forcelist"] = self._get_retry_codes()
self.__http_session = get_http_session_with_retry(**http_retries_config)
self.__http_session.write_timeout = self._write_session_timeout
self.__http_session.request_size_threshold = self._write_session_data_size
@ -233,6 +236,25 @@ class Session(TokenManager):
self._apply_config_sections(local_logger)
def _get_retry_codes(self):
    # type: () -> List[int]
    """
    Build the list of HTTP status codes on which requests are retried.

    Starts from the default codes in ``self._retry_codes`` and adds the extra codes read from the
    ``api.http.extra_retry_codes`` configuration entry. When ``ENV_API_EXTRA_RETRY_CODES`` holds a
    non-empty value, its comma-separated codes are used instead of the configured ones.
    Entries that cannot be converted to an integer are reported with a warning and skipped.

    :return: Sorted list of unique retry status codes (the defaults plus every valid extra code)
    """
    default_codes = set(self._retry_codes)

    extra = self.config.get("api.http.extra_retry_codes", [])
    # Read the environment entry only once; a non-empty value overrides the configuration
    env_codes = ENV_API_EXTRA_RETRY_CODES.get()
    if env_codes:
        extra = env_codes

    if not extra:
        extra = []
    elif hasattr(extra, "split"):
        # A comma-separated string (the environment value, or a configuration entry written as a
        # string instead of a list): iterating it directly would yield single characters
        extra = [s.strip() for s in extra.split(",") if s.strip()]
    elif isinstance(extra, (int, float)):
        # A single bare code in the configuration is not iterable, treat it as a one-item list
        extra = [extra]

    retry_codes = set(default_codes)
    for code in extra:
        try:
            retry_codes.add(int(code))
        except (ValueError, TypeError):
            print("Warning: invalid extra HTTP retry code detected: {}".format(code))

    added_codes = sorted(retry_codes - default_codes)
    if added_codes:
        print("Using extra HTTP retry codes {}".format(added_codes))

    # Sorted so the resulting list is deterministic across runs
    return sorted(retry_codes)
def _load_vaults(self):
if not self.check_min_api_version("2.15") or self.feature_set == "basic":
return