From 5e0d214e8b5c0d41176ecec6333c3d7493223e3d Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 28 Apr 2021 16:26:26 +0200 Subject: [PATCH 1/5] Create a non-linear retry policy. --- openml/_api_calls.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index aee67d8c6..a9c3c98af 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -3,7 +3,9 @@ import time import hashlib import logging +import math import pathlib +import random import requests import urllib.parse import xml @@ -262,7 +264,10 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None): if retry_counter >= n_retries: raise else: - time.sleep(retry_counter) + wait = retry_counter + (1 / (1 + math.exp(-(retry_counter - 6)))) * 20 + variation = random.gauss(0, wait / 10) + wait_time = max(1.0, wait + variation) + time.sleep(wait_time) if response is None: raise ValueError("This should never happen!") return response From 761ad9d7f76e4eef78ab0d01ac946bb7a396b3c1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 3 May 2021 10:21:09 +0200 Subject: [PATCH 2/5] Add retry_policy documentation --- doc/progress.rst | 1 + doc/usage.rst | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/progress.rst b/doc/progress.rst index 8d3f4ec1d..22728f0a7 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -9,6 +9,7 @@ Changelog 0.12.2 ~~~~~~ +* ADD #1065: Add a ``retry_policy`` configuration option that determines the frequency and number of times to attempt to retry server requests. * DOC: Fixes a few broken links in the documentation. * MAINT/DOC: Automatically check for broken external links when building the documentation. * MAINT/DOC: Fail documentation building on warnings. This will make the documentation building diff --git a/doc/usage.rst b/doc/usage.rst index 7bf247f4d..10928fac0 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -52,9 +52,14 @@ which are separated by newlines. The following keys are defined: * if set to ``True``, when ``run_flow_on_task`` or similar methods are called a lookup is performed to see if there already exists such a run on the server. If so, download those results instead. * if not given, will default to ``True``. +* retry_policy: + * Defines how to react when the server is unavailable or experiencing high load. It determines both how often to attempt to reconnect and how quickly to do so. Please don't use ``human`` in an automated script that you run more than one instance of, it might increase the time to complete your jobs and that of others. + * human: For people running openml in interactive fashion. Try only a few times, but in quick succession. + * robot: For people using openml in an automated fashion. Keep trying to reconnect for a longer time, quickly increasing the time between retries. + * connection_n_retries: - * number of connection retries. - * default: 2. Maximum number of retries: 20. + * number of connection retries + * default depends on retry_policy (5 for ``human``, 50 for ``robot``) * verbosity: * 0: normal output From 40e349390fcca958eed531f62e1ca633e59b1383 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 3 May 2021 11:04:30 +0200 Subject: [PATCH 3/5] Remove max_retries, add retry_policy to config Retry_policy is not yet used to affect retry frequency. Max_retries is removed because it's mostly useless, server overload is caused by many users (or parallel scripts) not a single user reconnecting. The retry policies will have acceptable times between tries either way. --- doc/usage.rst | 2 +- openml/_api_calls.py | 2 +- openml/cli.py | 10 +++---- openml/config.py | 46 ++++++++++++++++++++------------ tests/test_openml/test_config.py | 4 +-- 5 files changed, 37 insertions(+), 27 deletions(-) diff --git a/doc/usage.rst b/doc/usage.rst index 10928fac0..d9212099d 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -54,7 +54,7 @@ which are separated by newlines. The following keys are defined: * retry_policy: * Defines how to react when the server is unavailable or experiencing high load. It determines both how often to attempt to reconnect and how quickly to do so. Please don't use ``human`` in an automated script that you run more than one instance of, it might increase the time to complete your jobs and that of others. - * human: For people running openml in interactive fashion. Try only a few times, but in quick succession. + * human (default): For people running openml in interactive fashion. Try only a few times, but in quick succession. * robot: For people using openml in an automated fashion. Keep trying to reconnect for a longer time, quickly increasing the time between retries. * connection_n_retries: diff --git a/openml/_api_calls.py b/openml/_api_calls.py index a9c3c98af..5526cdd97 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -219,7 +219,7 @@ def __is_checksum_equal(downloaded_file, md5_checksum=None): def _send_request(request_method, url, data, files=None, md5_checksum=None): - n_retries = max(1, min(config.connection_n_retries, config.max_retries)) + n_retries = max(1, config.connection_n_retries) response = None with requests.Session() as session: diff --git a/openml/cli.py b/openml/cli.py index b26e67d2e..ded1be54c 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -149,11 +149,9 @@ def check_cache_dir(path: str) -> str: def configure_connection_n_retries(value: str) -> None: def valid_connection_retries(n: str) -> str: if not n.isdigit(): - return f"Must be an integer number (smaller than {config.max_retries})." - if int(n) > config.max_retries: - return f"connection_n_retries may not exceed {config.max_retries}." - if int(n) == 0: - return "connection_n_retries must be non-zero." + return f"'{n}' is not a valid positive integer." + if int(n) <= 0: + return "connection_n_retries must be positive." return "" configure_field( @@ -161,7 +159,7 @@ def valid_connection_retries(n: str) -> str: value=value, check_with_message=valid_connection_retries, intro_message="Configuring the number of times to attempt to connect to the OpenML Server", - input_message=f"Enter an integer between 0 and {config.max_retries}: ", + input_message="Enter a positive integer: ", ) diff --git a/openml/config.py b/openml/config.py index 7295ea82e..50e635761 100644 --- a/openml/config.py +++ b/openml/config.py @@ -9,7 +9,7 @@ import os from pathlib import Path import platform -from typing import Tuple, cast, Any +from typing import Tuple, cast, Any, Optional from io import StringIO import configparser @@ -94,11 +94,10 @@ def set_file_log_level(file_output_level: int): else os.path.join("~", ".openml") ), "avoid_duplicate_runs": "True", - "connection_n_retries": "10", - "max_retries": "20", + "retry_policy": "human", + "connection_n_retries": "5", } - # Default values are actually added here in the _setup() function which is # called at the end of this module server = str(_defaults["server"]) # so mypy knows it is a string @@ -121,9 +120,26 @@ def get_server_base_url() -> str: cache_directory = str(_defaults["cachedir"]) # so mypy knows it is a string avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False -# Number of retries if the connection breaks +retry_policy = _defaults["retry_policy"] connection_n_retries = int(_defaults["connection_n_retries"]) -max_retries = int(_defaults["max_retries"]) + + +def set_retry_policy(value: str, n_retries: Optional[int] = None) -> None: + global retry_policy + global connection_n_retries + default_retries_by_policy = dict(human=5, robot=50) + + if value not in default_retries_by_policy: + raise ValueError( + f"Detected retry_policy '{value}' but must be one of {default_retries_by_policy}" + ) + if n_retries is not None and not isinstance(n_retries, int): + raise TypeError(f"`n_retries` must be of type `int` or `None` but is `{type(n_retries)}`.") + if isinstance(n_retries, int) and n_retries < 1: + raise ValueError(f"`n_retries` is '{n_retries}' but must be positive.") + + retry_policy = value + connection_n_retries = default_retries_by_policy[value] if n_retries is None else n_retries class ConfigurationForExamples: @@ -200,8 +216,6 @@ def _setup(config=None): global server global cache_directory global avoid_duplicate_runs - global connection_n_retries - global max_retries config_file = determine_config_file_path() config_dir = config_file.parent @@ -233,8 +247,12 @@ def _get(config, key): apikey = _get(config, "apikey") server = _get(config, "server") short_cache_dir = _get(config, "cachedir") - connection_n_retries = int(_get(config, "connection_n_retries")) - max_retries = int(_get(config, "max_retries")) + + n_retries = _get(config, "connection_n_retries") + if n_retries is not None: + n_retries = int(n_retries) + + set_retry_policy(_get(config, "retry_policy"), n_retries) cache_directory = os.path.expanduser(short_cache_dir) # create the cache subdirectory @@ -256,12 +274,6 @@ def _get(config, key): "not working properly." % config_dir ) - if connection_n_retries > max_retries: - raise ValueError( - "A higher number of retries than {} is not allowed to keep the " - "server load reasonable".format(max_retries) - ) - def set_field_in_config_file(field: str, value: Any): """ Overwrites the `field` in the configuration file with the new `value`. """ @@ -312,7 +324,7 @@ def get_config_as_dict(): config["cachedir"] = cache_directory config["avoid_duplicate_runs"] = avoid_duplicate_runs config["connection_n_retries"] = connection_n_retries - config["max_retries"] = max_retries + config["retry_policy"] = retry_policy return config diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 2e2c609db..415dd5a2e 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -45,7 +45,7 @@ def test_get_config_as_dict(self): _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False _config["connection_n_retries"] = 10 - _config["max_retries"] = 20 + _config["retry_policy"] = "human" self.assertIsInstance(config, dict) self.assertEqual(len(config), 6) self.assertDictEqual(config, _config) @@ -57,8 +57,8 @@ def test_setup_with_config(self): _config["server"] = "https://www.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = True + _config["retry_policy"] = "human" _config["connection_n_retries"] = 100 - _config["max_retries"] = 1000 orig_config = openml.config.get_config_as_dict() openml.config._setup(_config) updated_config = openml.config.get_config_as_dict() From 9f522f34ce20cbc29bff41dcafa5ecd681f951d8 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 3 May 2021 11:18:19 +0200 Subject: [PATCH 4/5] Add retry_policy so cli config --- openml/cli.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/openml/cli.py b/openml/cli.py index ded1be54c..15654cfc6 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -215,6 +215,35 @@ def is_zero_through_two(verbosity: str) -> str: ) +def configure_retry_policy(value: str) -> None: + def is_known_policy(policy: str) -> str: + if policy in ["human", "robot"]: + return "" + return "Must be 'human' or 'robot'." + + def autocomplete_policy(policy: str) -> str: + for option in ["human", "robot"]: + if option.startswith(policy.lower()): + return option + return policy + + intro_message = ( + "Set the retry policy which determines how to react if the server is unresponsive." + "We recommend 'human' for interactive usage and 'robot' for scripts." + "'human': try a few times in quick succession, less reliable but quicker response." + "'robot': try many times with increasing intervals, more reliable but slower response." + ) + + configure_field( + field="retry_policy", + value=value, + check_with_message=is_known_policy, + intro_message=intro_message, + input_message="Enter 'human' or 'robot': ", + sanitize=autocomplete_policy, + ) + + def configure_field( field: str, value: Union[None, str], @@ -270,6 +299,7 @@ def configure(args: argparse.Namespace): "apikey": configure_apikey, "server": configure_server, "cachedir": configure_cachedir, + "retry_policy": configure_retry_policy, "connection_n_retries": configure_connection_n_retries, "avoid_duplicate_runs": configure_avoid_duplicate_runs, "verbosity": configure_verbosity, From 94927bca19ddededf975f36ae0ebd994ab658bf0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 6 May 2021 11:38:01 +0200 Subject: [PATCH 5/5] Use policy to determine wait time Some unit tests needed to be updated. In particular for the hash exception unit test, because we expect a hash exception, we don't want to use the ``robot`` policy as that will make it query and wait unnecessarily. --- openml/_api_calls.py | 15 +++++++++++---- openml/testing.py | 4 +++- tests/test_datasets/test_dataset_functions.py | 5 +++++ tests/test_openml/test_api_calls.py | 2 +- tests/test_openml/test_config.py | 4 ++-- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 2a7532cfd..b5ed976bc 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -263,10 +263,17 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None): if retry_counter >= n_retries: raise else: - wait = retry_counter + (1 / (1 + math.exp(-(retry_counter - 6)))) * 20 - variation = random.gauss(0, wait / 10) - wait_time = max(1.0, wait + variation) - time.sleep(wait_time) + + def robot(n: int) -> float: + wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 + variation = random.gauss(0, wait / 10) + return max(1.0, wait + variation) + + def human(n: int) -> float: + return max(1.0, n) + + delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter) + time.sleep(delay) if response is None: raise ValueError("This should never happen!") return response diff --git a/openml/testing.py b/openml/testing.py index f8e22bb4c..922d373b2 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -94,8 +94,9 @@ def setUp(self, n_levels: int = 1): openml.config.cache_directory = self.workdir # Increase the number of retries to avoid spurious server failures + self.retry_policy = openml.config.retry_policy self.connection_n_retries = openml.config.connection_n_retries - openml.config.connection_n_retries = 10 + openml.config.set_retry_policy("robot", n_retries=20) def tearDown(self): os.chdir(self.cwd) @@ -109,6 +110,7 @@ def tearDown(self): raise openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries + openml.config.retry_policy = self.retry_policy @classmethod def _mark_entity_for_removal(self, entity_type, entity_id): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index ec9dd6c53..9d67ee177 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -506,6 +506,9 @@ def test__getarff_md5_issue(self): "oml:md5_checksum": "abc", "oml:url": "https://www.openml.org/data/download/61", } + n = openml.config.connection_n_retries + openml.config.connection_n_retries = 1 + self.assertRaisesRegex( OpenMLHashException, "Checksum of downloaded file is unequal to the expected checksum abc when downloading " @@ -514,6 +517,8 @@ def test__getarff_md5_issue(self): description, ) + openml.config.connection_n_retries = n + def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) self.assertIsInstance(features_file, str) diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 459a0cdf5..16bdbc7df 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -29,4 +29,4 @@ def test_retry_on_database_error(self, Session_class_mock, _): ): openml._api_calls._send_request("get", "/abc", {}) - self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10) + self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 20) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 415dd5a2e..638f02420 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -44,8 +44,8 @@ def test_get_config_as_dict(self): _config["server"] = "https://test.openml.org/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False - _config["connection_n_retries"] = 10 - _config["retry_policy"] = "human" + _config["connection_n_retries"] = 20 + _config["retry_policy"] = "robot" self.assertIsInstance(config, dict) self.assertEqual(len(config), 6) self.assertDictEqual(config, _config)