Add support for extra HTTP retry codes

This commit is contained in:
allegroai 2022-02-07 13:30:20 +02:00
parent 9744d63796
commit ff91ee9ade
3 changed files with 36 additions and 1 deletions

View File

@ -23,6 +23,11 @@
http { http {
max_req_size = 15728640 # request size limit (smaller than that configured in api server) max_req_size = 15728640 # request size limit (smaller than that configured in api server)
# extra HTTP status codes on which the session will automatically retry the request (using the retries settings below).
# these codes are added to the default retry list, which currently includes bad gateway (502),
# service unavailable (503), bandwidth limit exceeded (509) and too many requests (429)
extra_retry_codes: []
retries { retries {
# retry values (int, 0 means fail on first retry) # retry values (int, 0 means fail on first retry)
total: 240 # Total number of retries to allow. Takes precedence over other counts. total: 240 # Total number of retries to allow. Takes precedence over other counts.

View File

@ -26,3 +26,11 @@ POST requests can be used instead.
However this has not been rigorously tested and may have unintended consequences. However this has not been rigorously tested and may have unintended consequences.
""" """
ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD") ENV_API_DEFAULT_REQ_METHOD = EnvEntry("CLEARML_API_DEFAULT_REQ_METHOD")
"""
Experimental option to make the SDK retry on additional error codes.
Use a comma-separated list of integer return codes.
NOTE: this changes behavior and might cause the experiment to wait
for a very long time for a non-responding or mis-configured server
"""
ENV_API_EXTRA_RETRY_CODES = EnvEntry("CLEARML_API_EXTRA_RETRY_CODES")

View File

@ -9,6 +9,7 @@ import requests
import six import six
from requests.auth import HTTPBasicAuth from requests.auth import HTTPBasicAuth
from six.moves.urllib.parse import urlparse, urlunparse from six.moves.urllib.parse import urlparse, urlunparse
from typing import List
from .callresult import CallResult from .callresult import CallResult
from .defs import ( from .defs import (
@ -25,6 +26,7 @@ from .defs import (
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_ENV_CONFIG_SECTION,
ENV_ENABLE_FILES_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
ENV_API_DEFAULT_REQ_METHOD, ENV_API_DEFAULT_REQ_METHOD,
ENV_API_EXTRA_RETRY_CODES,
) )
from .request import Request, BatchRequest # noqa: F401 from .request import Request, BatchRequest # noqa: F401
from .token_manager import TokenManager from .token_manager import TokenManager
@ -183,7 +185,8 @@ class Session(TokenManager):
self.__host = host.strip("/") self.__host = host.strip("/")
http_retries_config = http_retries_config or self.config.get( http_retries_config = http_retries_config or self.config.get(
"api.http.retries", ConfigTree()).as_plain_ordered_dict() "api.http.retries", ConfigTree()).as_plain_ordered_dict()
http_retries_config["status_forcelist"] = self._retry_codes
http_retries_config["status_forcelist"] = self._get_retry_codes()
self.__http_session = get_http_session_with_retry(**http_retries_config) self.__http_session = get_http_session_with_retry(**http_retries_config)
self.__http_session.write_timeout = self._write_session_timeout self.__http_session.write_timeout = self._write_session_timeout
self.__http_session.request_size_threshold = self._write_session_data_size self.__http_session.request_size_threshold = self._write_session_data_size
@ -233,6 +236,25 @@ class Session(TokenManager):
self._apply_config_sections(local_logger) self._apply_config_sections(local_logger)
def _get_retry_codes(self):
    # type: () -> List[int]
    """
    Build the list of HTTP status codes on which requests are retried.

    The result starts from the default codes in ``self._retry_codes`` and is extended with
    extra codes read from the ``api.http.extra_retry_codes`` configuration entry. When the
    ``ENV_API_EXTRA_RETRY_CODES`` environment entry holds a non-empty value, that value is
    used instead of the configuration entry.

    The extra codes may be provided as a comma-separated string (e.g. ``"500, 504"``),
    as a single scalar (e.g. ``500``) or as any iterable of values convertible to ``int``.
    Values that cannot be converted are reported with a printed warning and skipped.

    :return: De-duplicated list containing the default retry codes and all valid extra codes
    """
    default_codes = set(self._retry_codes)

    extra = self.config.get("api.http.extra_retry_codes", [])
    # Read the environment entry only once; a non-empty value overrides the configuration entry
    env_value = ENV_API_EXTRA_RETRY_CODES.get()
    if env_value:
        extra = env_value

    if not extra:
        candidates = []
    elif isinstance(extra, six.string_types):
        # A string must be split explicitly, otherwise iterating it yields single characters
        # (e.g. "500" would be read as the codes 5, 0 and 0)
        candidates = [item.strip() for item in extra.split(",") if item.strip()]
    else:
        try:
            candidates = list(extra)
        except TypeError:
            # A single non-iterable value (e.g. `extra_retry_codes: 500`) is treated as one code
            candidates = [extra]

    extra_codes = set()
    for code in candidates:
        try:
            extra_codes.add(int(code))
        except (ValueError, TypeError):
            print("Warning: invalid extra HTTP retry code detected: {}".format(code))

    # Report only the codes that are not already part of the default retry list
    added_codes = sorted(extra_codes - default_codes)
    if added_codes:
        print("Using extra HTTP retry codes {}".format(added_codes))

    return list(default_codes | extra_codes)
def _load_vaults(self): def _load_vaults(self):
if not self.check_min_api_version("2.15") or self.feature_set == "basic": if not self.check_min_api_version("2.15") or self.feature_set == "basic":
return return