From 529f4674264b2a32053bca6674f404bda0233790 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 12:16:59 -0400 Subject: [PATCH 1/7] several bugfixes for listing fn --- openml/utils.py | 47 ++++++++++++++++------------- tests/test_utils/test_utils.py | 54 ++++++++++++++++++++++++++++++---- 2 files changed, 75 insertions(+), 26 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index afe83f141..0bc8b681f 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -100,9 +100,6 @@ def list_all(listing_call, *args, **filters): Example usage: ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)`` - - Note: I wanted to make this a generator, but this is not possible since all - listing calls return dicts Parameters ---------- @@ -112,29 +109,37 @@ def list_all(listing_call, *args, **filters): Any required arguments for the listing call. **filters : Arbitrary keyword arguments Any filters that can be applied to the listing function. - + additionally, the batch_size can be specified. This is + useful for testing purposes. Returns ------- dict """ - # default batch size per paging. - batch_size = 10000 # eliminate filters that have a None value active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 result = {} + + # default batch size per paging. This one can be set in filters (batch_size), + # but should not be changed afterwards. the derived batch_size can be changed. + BATCH_SIZE_ORIG = 10000 + if 'batch_size' in active_filters: + BATCH_SIZE_ORIG = active_filters['batch_size'] + del active_filters['batch_size'] + batch_size = BATCH_SIZE_ORIG + # max number of results to be shown - limit = None + LIMIT = None offset = 0 cycle = True if 'size' in active_filters: - limit = active_filters['size'] + LIMIT = active_filters['size'] del active_filters['size'] # check if the batch size is greater than the number of results that need to be returned. 
- if limit is not None: - if batch_size > limit: - batch_size = limit + if LIMIT is not None: + if BATCH_SIZE_ORIG > LIMIT: + batch_size = LIMIT if 'offset' in active_filters: offset = active_filters['offset'] del active_filters['offset'] @@ -143,24 +148,26 @@ def list_all(listing_call, *args, **filters): new_batch = listing_call( *args, limit=batch_size, - offset=offset + batch_size * page, + offset=offset + BATCH_SIZE_ORIG * page, **active_filters ) except OpenMLServerException as e: - if page == 0 and e.args[0] == 'No results': - raise e - else: + if page > 0 and e.args[0] == 'No results': + # exceptional case, as it can happen that we request a new page, + # already got results but there are no more results to obtain break + else: + raise e result.update(new_batch) page += 1 - if limit is not None: - limit -= batch_size + if LIMIT is not None: # check if the number of required results has been achieved - if limit == 0: + # always do a 'bigger than' check, in case of bugs to prevent infinite loops + if len(result) >= LIMIT: break # check if there are enough results to fulfill a batch - if limit < batch_size: - batch_size = limit + if BATCH_SIZE_ORIG > LIMIT - len(result): + batch_size = LIMIT - len(result) return result diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 9c5274810..a482fddcc 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -4,15 +4,57 @@ class OpenMLTaskTest(TestBase): _multiprocess_can_split_ = True + _batch_size = 25 def test_list_all(self): - list_datasets = openml.datasets.functions._list_datasets - datasets = openml.utils.list_all(list_datasets) + required_size = 127 # default test server reset value + datasets = openml.utils.list_all(openml.datasets._list_datasets, + batch_size=self._batch_size, size=required_size) - self.assertGreaterEqual(len(datasets), 100) + self.assertEquals(len(datasets), required_size) for did in datasets: self._check_dataset(datasets[did]) - # TODO 
implement these tests - # datasets = openml.utils.list_all(list_datasets, limit=50) - # self.assertEqual(len(datasets), 50) \ No newline at end of file + def test_list_all_for_datasets(self): + required_size = 127 # default test server reset value + datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size) + + self.assertEquals(len(datasets), required_size) + for did in datasets: + self._check_dataset(datasets[did]) + + def test_list_all_for_tasks(self): + required_size = 1068 # default test server reset value + tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size) + + self.assertEquals(len(tasks), required_size) + + def test_list_all_for_flows(self): + required_size = 15 # default test server reset value + flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size) + + self.assertEquals(len(flows), required_size) + + def test_list_all_for_setups(self): + required_size = 50 + # TODO apparently list_setups function does not support kwargs + setups = openml.setups.list_setups(size=required_size) + + # might not be on test server after reset, please rerun test at least once if fails + self.assertEquals(len(setups), required_size) + + def test_list_all_for_runs(self): + required_size = 48 + runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size) + + # might not be on test server after reset, please rerun test at least once if fails + self.assertEquals(len(runs), required_size) + + def test_list_all_for_evaluations(self): + required_size = 57 + # TODO apparently list_evaluations function does not support kwargs + evaluations = openml.evaluations.list_evaluations(function='predictive_accuracy', + size=required_size) + + # might not be on test server after reset, please rerun test at least once if fails + self.assertEquals(len(evaluations), required_size) From 7cb8ffdfa459ad1670d963ebc2b0af5daddb3533 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 12:20:39 
-0400 Subject: [PATCH 2/7] refactored list all fn name to be protected --- openml/datasets/functions.py | 2 +- openml/evaluations/functions.py | 4 ++-- openml/flows/functions.py | 2 +- openml/runs/functions.py | 4 ++-- openml/setups/functions.py | 4 ++-- openml/tasks/functions.py | 2 +- openml/utils.py | 3 ++- tests/test_utils/test_utils.py | 4 ++-- 8 files changed, 13 insertions(+), 12 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index b447c671d..6a820e82a 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -190,7 +190,7 @@ def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs): these are also returned. """ - return openml.utils.list_all(_list_datasets, offset=offset, size=size, status=status, tag=tag, **kwargs) + return openml.utils._list_all(_list_datasets, offset=offset, size=size, status=status, tag=tag, **kwargs) def _list_datasets(**kwargs): diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 115455a12..9d98e0470 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -38,8 +38,8 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None, dict """ - return openml.utils.list_all(_list_evaluations, function, offset=offset, size=size, - id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag) + return openml.utils._list_all(_list_evaluations, function, offset=offset, size=size, + id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag) def _list_evaluations(function, id=None, task=None, diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 35bbcfd1a..cf29fd143 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -62,7 +62,7 @@ def list_flows(offset=None, size=None, tag=None, **kwargs): - external version - uploader """ - return openml.utils.list_all(_list_flows, offset=offset, size=size, tag=tag, **kwargs) + return 
openml.utils._list_all(_list_flows, offset=offset, size=size, tag=tag, **kwargs) def _list_flows(**kwargs): diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 9e9697480..5f041bc2b 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -936,8 +936,8 @@ def list_runs(offset=None, size=None, id=None, task=None, setup=None, List of found runs. """ - return openml.utils.list_all(_list_runs, offset=offset, size=size, id=id, task=task, setup=setup, - flow=flow, uploader=uploader, tag=tag, display_errors=display_errors, **kwargs) + return openml.utils._list_all(_list_runs, offset=offset, size=size, id=id, task=task, setup=setup, + flow=flow, uploader=uploader, tag=tag, display_errors=display_errors, **kwargs) def _list_runs(id=None, task=None, setup=None, diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 745da5a1e..24e711107 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -124,8 +124,8 @@ def list_setups(offset=None, size=None, flow=None, tag=None, setup=None): dict """ - return openml.utils.list_all(_list_setups, offset=offset, size=size, - flow=flow, tag=tag, setup=setup) + return openml.utils._list_all(_list_setups, offset=offset, size=size, + flow=flow, tag=tag, setup=setup) def _list_setups(setup=None, **kwargs): diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 0fbdc9b21..87d9ebea8 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -132,7 +132,7 @@ def list_tasks(task_type_id=None, offset=None, size=None, tag=None, **kwargs): task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned. 
""" - return openml.utils.list_all(_list_tasks, task_type_id=task_type_id, offset=offset, size=size, tag=tag, **kwargs) + return openml.utils._list_all(_list_tasks, task_type_id=task_type_id, offset=offset, size=size, tag=tag, **kwargs) def _list_tasks(task_type_id=None, **kwargs): diff --git a/openml/utils.py b/openml/utils.py index 0bc8b681f..8a0b8acad 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -46,6 +46,7 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): raise ValueError("Could not find tag '%s' in node '%s'" % (xml_tag_name, str(node))) + def _tag_entity(entity_type, entity_id, tag, untag=False): """Function that tags or untags a given entity on OpenML. As the OpenML API tag functions all consist of the same format, this function covers @@ -94,7 +95,7 @@ def _tag_entity(entity_type, entity_id, tag, untag=False): return [] -def list_all(listing_call, *args, **filters): +def _list_all(listing_call, *args, **filters): """Helper to handle paged listing requests. Example usage: diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index a482fddcc..183d93505 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -8,8 +8,8 @@ class OpenMLTaskTest(TestBase): def test_list_all(self): required_size = 127 # default test server reset value - datasets = openml.utils.list_all(openml.datasets._list_datasets, - batch_size=self._batch_size, size=required_size) + datasets = openml.utils._list_all(openml.datasets.functions._list_datasets, + batch_size=self._batch_size, size=required_size) self.assertEquals(len(datasets), required_size) for did in datasets: From 5db107b6e7c7759fdd315723d92bbc9fd54052da Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 12:49:27 -0400 Subject: [PATCH 3/7] changed catched exception --- openml/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index 8a0b8acad..055953067 100644 --- a/openml/utils.py 
+++ b/openml/utils.py @@ -5,7 +5,6 @@ import openml._api_calls from . import config -from openml.exceptions import OpenMLServerException def extract_xml_tags(xml_tag_name, node, allow_none=True): @@ -82,7 +81,6 @@ def _tag_entity(entity_type, entity_id, tag, untag=False): uri = '%s/untag' %entity_type main_tag = 'oml:%s_untag' %entity_type - post_variables = {'%s_id'%entity_type: entity_id, 'tag': tag} result_xml = openml._api_calls._perform_api_call(uri, post_variables) @@ -152,8 +150,8 @@ def _list_all(listing_call, *args, **filters): offset=offset + BATCH_SIZE_ORIG * page, **active_filters ) - except OpenMLServerException as e: - if page > 0 and e.args[0] == 'No results': + except openml.exceptions.OpenMLServerNoResult as e: + if page > 0: # exceptional case, as it can happen that we request a new page, # already got results but there are no more results to obtain break From 1b1ed8b7f17d1fa62d8b9f7f3b0a17171d30d057 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 13:09:34 -0400 Subject: [PATCH 4/7] fixed unit test and _list_all --- openml/utils.py | 10 +++------- tests/test_runs/test_run_functions.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index 055953067..d3e7fc1f5 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -150,13 +150,9 @@ def _list_all(listing_call, *args, **filters): offset=offset + BATCH_SIZE_ORIG * page, **active_filters ) - except openml.exceptions.OpenMLServerNoResult as e: - if page > 0: - # exceptional case, as it can happen that we request a new page, - # already got results but there are no more results to obtain - break - else: - raise e + except openml.exceptions.OpenMLServerNoResult: + # we want to return an empty dict in this case + break result.update(new_batch) page += 1 if LIMIT is not None: diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 341900190..bfb259f78 100644 --- 
a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -837,7 +837,7 @@ def test_get_runs_list(self): self._check_run(runs[rid]) def test_list_runs_empty(self): - runs = openml.runs.list_runs(task=[-1]) + runs = openml.runs.list_runs(task=[0]) if len(runs) > 0: raise ValueError('UnitTest Outdated, got somehow results') From 6f6b46eed2bcf6434c985f1034ac46dcee5b08b3 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 13:56:35 -0400 Subject: [PATCH 5/7] batch size --- openml/setups/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 24e711107..51a10f905 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -125,7 +125,7 @@ def list_setups(offset=None, size=None, flow=None, tag=None, setup=None): """ return openml.utils._list_all(_list_setups, offset=offset, size=size, - flow=flow, tag=tag, setup=setup) + flow=flow, tag=tag, setup=setup, batch_size=1000) #batch size for setups is lower def _list_setups(setup=None, **kwargs): From 4a936cbb10f4426ba61516c68d7e2db345748e90 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 14:56:53 -0400 Subject: [PATCH 6/7] changes suggested by @mfeurer --- openml/utils.py | 3 +-- tests/test_utils/test_utils.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/openml/utils.py b/openml/utils.py index d3e7fc1f5..39013d835 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -131,7 +131,6 @@ def _list_all(listing_call, *args, **filters): # max number of results to be shown LIMIT = None offset = 0 - cycle = True if 'size' in active_filters: LIMIT = active_filters['size'] del active_filters['size'] @@ -142,7 +141,7 @@ def _list_all(listing_call, *args, **filters): if 'offset' in active_filters: offset = active_filters['offset'] del active_filters['offset'] - while cycle: + while True: try: new_batch = listing_call( *args, diff --git 
a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 183d93505..e0c914acf 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,4 +1,5 @@ from openml.testing import TestBase +import numpy as np import openml @@ -7,13 +8,7 @@ class OpenMLTaskTest(TestBase): _batch_size = 25 def test_list_all(self): - required_size = 127 # default test server reset value - datasets = openml.utils._list_all(openml.datasets.functions._list_datasets, - batch_size=self._batch_size, size=required_size) - - self.assertEquals(len(datasets), required_size) - for did in datasets: - self._check_dataset(datasets[did]) + openml.utils._list_all(openml.tasks.functions._list_tasks) def test_list_all_for_datasets(self): required_size = 127 # default test server reset value @@ -23,6 +18,12 @@ def test_list_all_for_datasets(self): for did in datasets: self._check_dataset(datasets[did]) + def test_list_datasets_with_high_size_parameter(self): + datasets_a = openml.datasets.list_datasets() + datasets_b = openml.datasets.list_datasets(size=np.inf) + + self.assertEquals(len(datasets_a), len(datasets_b)) + def test_list_all_for_tasks(self): required_size = 1068 # default test server reset value tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size) From 1f9c46758fd46ef776dac34bdbb93a114b130713 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 3 May 2018 14:59:52 -0400 Subject: [PATCH 7/7] added to change log --- doc/progress.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/progress.rst b/doc/progress.rst index 6681f51b3..1cfbe31ba 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -9,6 +9,12 @@ Progress Changelog ========= +0.8.0 +~~~~~ +* Added serialize run / deserialize run function (for saving runs on disk before uploading) +* FIX: fixed bug related to listing functions (returns correct listing size) +* Made openml.utils.list_all a hidden function, openml.utils._list_all (should be accessed only by the respective listing
functions) + 0.3.0 ~~~~~