diff --git a/doc/progress.rst b/doc/progress.rst index 5b3aae784..05b4b64c4 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -8,6 +8,7 @@ Changelog 0.12.2 ~~~~~~ +* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number of times to attempt to retry server requests. * ADD #1075: A docker image is now automatically built on a push to develop. It can be used to build docs or run tests in an isolated environment. * DOC: Fixes a few broken links in the documentation. * MAINT/DOC: Automatically check for broken external links when building the documentation. diff --git a/doc/usage.rst b/doc/usage.rst index fd7d5fbec..4b40decc8 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -52,9 +52,14 @@ which are separated by newlines. The following keys are defined: * if set to ``True``, when ``run_flow_on_task`` or similar methods are called a lookup is performed to see if there already exists such a run on the server. If so, download those results instead. * if not given, will default to ``True``. +* retry_policy: + * Defines how to react when the server is unavailable or experiencing high load. It determines both how often to attempt to reconnect and how quickly to do so. Please don't use ``human`` in an automated script that you run more than one instance of, as it might increase the time to complete your jobs and those of others. + * human (default): For people running openml in an interactive fashion. Try only a few times, but in quick succession. + * robot: For people using openml in an automated fashion. Keep trying to reconnect for a longer time, quickly increasing the time between retries. + * connection_n_retries: - * number of connection retries. - * default: 2. Maximum number of retries: 20. 
+ * number of connection retries + * default depends on retry_policy (5 for ``human``, 50 for ``robot``) * verbosity: * 0: normal output diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 624b0da45..b5ed976bc 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -3,7 +3,9 @@ import time import hashlib import logging +import math import pathlib +import random import requests import urllib.parse import xml @@ -217,7 +219,7 @@ def __is_checksum_equal(downloaded_file, md5_checksum=None): def _send_request(request_method, url, data, files=None, md5_checksum=None): - n_retries = max(1, min(config.connection_n_retries, config.max_retries)) + n_retries = max(1, config.connection_n_retries) response = None with requests.Session() as session: @@ -261,7 +263,17 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None): if retry_counter >= n_retries: raise else: - time.sleep(retry_counter) + + def robot(n: int) -> float: + wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 + variation = random.gauss(0, wait / 10) + return max(1.0, wait + variation) + + def human(n: int) -> float: + return max(1.0, n) + + delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter) + time.sleep(delay) if response is None: raise ValueError("This should never happen!") return response diff --git a/openml/cli.py b/openml/cli.py index b26e67d2e..15654cfc6 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -149,11 +149,9 @@ def check_cache_dir(path: str) -> str: def configure_connection_n_retries(value: str) -> None: def valid_connection_retries(n: str) -> str: if not n.isdigit(): - return f"Must be an integer number (smaller than {config.max_retries})." - if int(n) > config.max_retries: - return f"connection_n_retries may not exceed {config.max_retries}." - if int(n) == 0: - return "connection_n_retries must be non-zero." + return f"'{n}' is not a valid positive integer." + if int(n) <= 0: + return "connection_n_retries must be positive." 
return "" configure_field( @@ -161,7 +159,7 @@ def valid_connection_retries(n: str) -> str: value=value, check_with_message=valid_connection_retries, intro_message="Configuring the number of times to attempt to connect to the OpenML Server", - input_message=f"Enter an integer between 0 and {config.max_retries}: ", + input_message="Enter a positive integer: ", ) @@ -217,6 +215,35 @@ def is_zero_through_two(verbosity: str) -> str: ) +def configure_retry_policy(value: str) -> None: + def is_known_policy(policy: str) -> str: + if policy in ["human", "robot"]: + return "" + return "Must be 'human' or 'robot'." + + def autocomplete_policy(policy: str) -> str: + for option in ["human", "robot"]: + if option.startswith(policy.lower()): + return option + return policy + + intro_message = ( + "Set the retry policy which determines how to react if the server is unresponsive. " + "We recommend 'human' for interactive usage and 'robot' for scripts. " + "'human': try a few times in quick succession, less reliable but quicker response. " + "'robot': try many times with increasing intervals, more reliable but slower response." 
+ ) + + configure_field( + field="retry_policy", + value=value, + check_with_message=is_known_policy, + intro_message=intro_message, + input_message="Enter 'human' or 'robot': ", + sanitize=autocomplete_policy, + ) + + def configure_field( field: str, value: Union[None, str], @@ -272,6 +299,7 @@ def configure(args: argparse.Namespace): "apikey": configure_apikey, "server": configure_server, "cachedir": configure_cachedir, + "retry_policy": configure_retry_policy, "connection_n_retries": configure_connection_n_retries, "avoid_duplicate_runs": configure_avoid_duplicate_runs, "verbosity": configure_verbosity, diff --git a/openml/config.py b/openml/config.py index f2264dc2a..8593ad484 100644 --- a/openml/config.py +++ b/openml/config.py @@ -9,7 +9,7 @@ import os from pathlib import Path import platform -from typing import Tuple, cast, Any +from typing import Tuple, cast, Any, Optional import warnings from io import StringIO @@ -95,11 +95,10 @@ def set_file_log_level(file_output_level: int): else os.path.join("~", ".openml") ), "avoid_duplicate_runs": "True", - "connection_n_retries": "10", - "max_retries": "20", + "retry_policy": "human", + "connection_n_retries": "5", } - # Default values are actually added here in the _setup() function which is # called at the end of this module server = str(_defaults["server"]) # so mypy knows it is a string @@ -122,9 +121,26 @@ def get_server_base_url() -> str: cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False -# Number of retries if the connection breaks +retry_policy = _defaults["retry_policy"] connection_n_retries = int(_defaults["connection_n_retries"]) -max_retries = int(_defaults["max_retries"]) + + +def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None: + global retry_policy + global connection_n_retries + default_retries_by_policy = dict(human=5, robot=50) + + if value not in 
default_retries_by_policy: + raise ValueError( + f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}" + ) + if n_retries is not None and not isinstance(n_retries, int): + raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.") + if isinstance(n_retries, int) and n_retries < 1: + raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.") + + retry_policy = value + connection_n_retries = default_retries_by_policy[value] if n_retries is None else n_retries class ConfigurationForExamples: @@ -205,8 +221,6 @@ def _setup(config=None): global server global cache_directory global avoid_duplicate_runs - global connection_n_retries - global max_retries config_file = determine_config_file_path() config_dir = config_file.parent @@ -238,8 +252,12 @@ def _get(config, key): apikey = _get(config, "apikey") server = _get(config, "server") short_cache_dir = _get(config, "cachedir") - connection_n_retries = int(_get(config, "connection_n_retries")) - max_retries = int(_get(config, "max_retries")) + + n_retries = _get(config, "connection_n_retries") + if n_retries is not None: + n_retries = int(n_retries) + + set_retry_policy(_get(config, "retry_policy"), n_retries) cache_directory = os.path.expanduser(short_cache_dir) # create the cache subdirectory @@ -261,12 +279,6 @@ def _get(config, key): "not working properly." % config_dir ) - if connection_n_retries > max_retries: - raise ValueError( - "A higher number of retries than {} is not allowed to keep the " - "server load reasonable".format(max_retries) - ) - def set_field_in_config_file(field: str, value: Any): """ Overwrites the `field` in the configuration file with the new `value`. 
""" @@ -317,7 +329,7 @@ def get_config_as_dict(): config["cachedir"] = cache_directory config["avoid_duplicate_runs"] = avoid_duplicate_runs config["connection_n_retries"] = connection_n_retries - config["max_retries"] = max_retries + config["retry_policy"] = retry_policy return config diff --git a/openml/testing.py b/openml/testing.py index f8e22bb4c..922d373b2 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -94,8 +94,9 @@ def setUp(self, n_levels: int = 1): openml.config.cache_directory = self.workdir # Increase the number of retries to avoid spurious server failures + self.retry_policy = openml.config.retry_policy self.connection_n_retries = openml.config.connection_n_retries - openml.config.connection_n_retries = 10 + openml.config.set_retry_policy("robot", n_retries=20) def tearDown(self): os.chdir(self.cwd) @@ -109,6 +110,7 @@ def tearDown(self): raise openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries + openml.config.retry_policy = self.retry_policy @classmethod def _mark_entity_for_removal(self, entity_type, entity_id): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index ec9dd6c53..9d67ee177 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -506,6 +506,9 @@ def test__getarff_md5_issue(self): "oml:md5_checksum": "abc", "oml:url": "https://www.openml.org/data/download/61", } + n = openml.config.connection_n_retries + openml.config.connection_n_retries = 1 + self.assertRaisesRegex( OpenMLHashException, "Checksum of downloaded file is unequal to the expected checksum abc when downloading " @@ -514,6 +517,8 @@ def test__getarff_md5_issue(self): description, ) + openml.config.connection_n_retries = n + def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) 
self.assertIsInstance(features_file, str) diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 459a0cdf5..16bdbc7df 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -29,4 +29,4 @@ def test_retry_on_database_error(self, Session_class_mock, _): ): openml._api_calls._send_request("get", "/abc", {}) - self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10) + self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 2e2c609db..638f02420 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -44,8 +44,8 @@ def test_get_config_as_dict(self): _config["server"] = "https://test.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False - _config["connection_n_retries"] = 10 - _config["max_retries"] = 20 + _config["connection_n_retries"] = 20 + _config["retry_policy"] = "robot" self.assertIsInstance(config, dict) self.assertEqual(len(config), 6) self.assertDictEqual(config, _config) @@ -57,8 +57,8 @@ def test_setup_with_config(self): _config["server"] = "https://www.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = True + _config["retry_policy"] = "human" _config["connection_n_retries"] = 100 - _config["max_retries"] = 1000 orig_config = openml.config.get_config_as_dict() openml.config._setup(_config) updated_config = openml.config.get_config_as_dict()