From 529f4674264b2a32053bca6674f404bda0233790 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 12:16:59 -0400
Subject: [PATCH 1/7] several bugfixes for listing fn

---
 openml/utils.py                | 47 ++++++++++++++++-------------
 tests/test_utils/test_utils.py | 54 ++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 26 deletions(-)

diff --git a/openml/utils.py b/openml/utils.py
index afe83f141..0bc8b681f 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -100,9 +100,6 @@ def list_all(listing_call, *args, **filters):
     Example usage:
 
     ``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)``
-
-    Note: I wanted to make this a generator, but this is not possible since all
-    listing calls return dicts
     
     Parameters
     ----------
@@ -112,29 +109,37 @@ def list_all(listing_call, *args, **filters):
         Any required arguments for the listing call.
     **filters : Arbitrary keyword arguments
         Any filters that can be applied to the listing function.
-        
+        Additionally, ``batch_size`` can be passed to set the number of
+        results requested per page; this is mainly useful for testing.
     Returns
     -------
     dict
     """
 
-    # default batch size per paging.
-    batch_size = 10000
     # eliminate filters that have a None value
     active_filters = {key: value for key, value in filters.items() if value is not None}
     page = 0
     result = {}
+
+    # Default number of results per page. It can be overridden through the
+    # 'batch_size' filter; BATCH_SIZE_ORIG then stays fixed, only batch_size may shrink.
+    BATCH_SIZE_ORIG = 10000
+    if 'batch_size' in active_filters:
+        BATCH_SIZE_ORIG = active_filters['batch_size']
+        del active_filters['batch_size']
+    batch_size = BATCH_SIZE_ORIG
+
     # max number of results to be shown
-    limit = None
+    LIMIT = None
     offset = 0
     cycle = True
     if 'size' in active_filters:
-        limit = active_filters['size']
+        LIMIT = active_filters['size']
         del active_filters['size']
     # check if the batch size is greater than the number of results that need to be returned.
-    if limit is not None:
-        if batch_size > limit:
-            batch_size = limit
+    if LIMIT is not None:
+        if BATCH_SIZE_ORIG > LIMIT:
+            batch_size = LIMIT
     if 'offset' in active_filters:
         offset = active_filters['offset']
         del active_filters['offset']
@@ -143,24 +148,26 @@ def list_all(listing_call, *args, **filters):
             new_batch = listing_call(
                 *args,
                 limit=batch_size,
-                offset=offset + batch_size * page,
+                offset=offset + BATCH_SIZE_ORIG * page,
                 **active_filters
             )
         except OpenMLServerException as e:
-            if page == 0 and e.args[0] == 'No results':
-                raise e
-            else:
+            if page > 0 and e.args[0] == 'No results':
+                # exceptional case, as it can happen that we request a new page,
+                # already got results but there are no more results to obtain
                 break
+            else:
+                raise e
         result.update(new_batch)
         page += 1
-        if limit is not None:
-            limit -= batch_size
+        if LIMIT is not None:
             # check if the number of required results has been achieved
-            if limit == 0:
+            # use '>=' instead of '==' so that overshooting LIMIT cannot cause an infinite loop
+            if len(result) >= LIMIT:
                 break
             # check if there are enough results to fulfill a batch
-            if limit < batch_size:
-                batch_size = limit
+            if BATCH_SIZE_ORIG > LIMIT - len(result):
+                batch_size = LIMIT - len(result)
 
     return result
 
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 9c5274810..a482fddcc 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -4,15 +4,57 @@
 
 class OpenMLTaskTest(TestBase):
     _multiprocess_can_split_ = True
+    _batch_size = 25
 
     def test_list_all(self):
-        list_datasets = openml.datasets.functions._list_datasets
-        datasets = openml.utils.list_all(list_datasets)
+        required_size = 127  # default test server reset value
+        datasets = openml.utils.list_all(openml.datasets._list_datasets,
+                                         batch_size=self._batch_size, size=required_size)
 
-        self.assertGreaterEqual(len(datasets), 100)
+        self.assertEquals(len(datasets), required_size)
         for did in datasets:
             self._check_dataset(datasets[did])
 
-        # TODO implement these tests
-        # datasets = openml.utils.list_all(list_datasets, limit=50)
-        # self.assertEqual(len(datasets), 50)
\ No newline at end of file
+    def test_list_all_for_datasets(self):
+        required_size = 127  # default test server reset value
+        datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size)
+
+        self.assertEqual(len(datasets), required_size)
+        for did in datasets:
+            self._check_dataset(datasets[did])
+
+    def test_list_all_for_tasks(self):
+        required_size = 1068  # default test server reset value
+        tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size)
+
+        self.assertEqual(len(tasks), required_size)
+
+    def test_list_all_for_flows(self):
+        required_size = 15  # default test server reset value
+        flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size)
+
+        self.assertEqual(len(flows), required_size)
+
+    def test_list_all_for_setups(self):
+        required_size = 50
+        # TODO: list_setups does not accept **kwargs, so batch_size cannot be passed here
+        setups = openml.setups.list_setups(size=required_size)
+
+        # may not be available right after a test server reset; rerun the test at least once if it fails
+        self.assertEqual(len(setups), required_size)
+
+    def test_list_all_for_runs(self):
+        required_size = 48
+        runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size)
+
+        # may not be available right after a test server reset; rerun the test at least once if it fails
+        self.assertEqual(len(runs), required_size)
+
+    def test_list_all_for_evaluations(self):
+        required_size = 57
+        # TODO: apparently list_evaluations does not support kwargs, so batch_size is not passed
+        evaluations = openml.evaluations.list_evaluations(function='predictive_accuracy',
+                                                          size=required_size)
+
+        # may not be available right after a test server reset; rerun the test at least once if it fails
+        self.assertEqual(len(evaluations), required_size)

From 7cb8ffdfa459ad1670d963ebc2b0af5daddb3533 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 12:20:39 -0400
Subject: [PATCH 2/7] refactored list all fn name to be protected

---
 openml/datasets/functions.py    | 2 +-
 openml/evaluations/functions.py | 4 ++--
 openml/flows/functions.py       | 2 +-
 openml/runs/functions.py        | 4 ++--
 openml/setups/functions.py      | 4 ++--
 openml/tasks/functions.py       | 2 +-
 openml/utils.py                 | 3 ++-
 tests/test_utils/test_utils.py  | 4 ++--
 8 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index b447c671d..6a820e82a 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -190,7 +190,7 @@ def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs):
         these are also returned.
     """
 
-    return openml.utils.list_all(_list_datasets, offset=offset, size=size, status=status, tag=tag, **kwargs)
+    return openml.utils._list_all(_list_datasets, offset=offset, size=size, status=status, tag=tag, **kwargs)
 
 
 def _list_datasets(**kwargs):
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 115455a12..9d98e0470 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -38,8 +38,8 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None,
     dict
     """
 
-    return openml.utils.list_all(_list_evaluations, function, offset=offset, size=size,
-                                 id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag)
+    return openml.utils._list_all(_list_evaluations, function, offset=offset, size=size,
+                                  id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag)
 
 
 def _list_evaluations(function, id=None, task=None,
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index 35bbcfd1a..cf29fd143 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -62,7 +62,7 @@ def list_flows(offset=None, size=None, tag=None, **kwargs):
         - external version
         - uploader
     """
-    return openml.utils.list_all(_list_flows, offset=offset, size=size, tag=tag, **kwargs)
+    return openml.utils._list_all(_list_flows, offset=offset, size=size, tag=tag, **kwargs)
 
 
 def _list_flows(**kwargs):
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 9e9697480..5f041bc2b 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -936,8 +936,8 @@ def list_runs(offset=None, size=None, id=None, task=None, setup=None,
         List of found runs.
     """
 
-    return openml.utils.list_all(_list_runs, offset=offset, size=size, id=id, task=task, setup=setup,
-                                 flow=flow, uploader=uploader, tag=tag, display_errors=display_errors, **kwargs)
+    return openml.utils._list_all(_list_runs, offset=offset, size=size, id=id, task=task, setup=setup,
+                                  flow=flow, uploader=uploader, tag=tag, display_errors=display_errors, **kwargs)
 
 
 def _list_runs(id=None, task=None, setup=None,
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 745da5a1e..24e711107 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -124,8 +124,8 @@ def list_setups(offset=None, size=None, flow=None, tag=None, setup=None):
     dict
         """
 
-    return openml.utils.list_all(_list_setups, offset=offset, size=size,
-                                 flow=flow, tag=tag, setup=setup)
+    return openml.utils._list_all(_list_setups, offset=offset, size=size,
+                                  flow=flow, tag=tag, setup=setup)
 
 
 def _list_setups(setup=None, **kwargs):
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 0fbdc9b21..87d9ebea8 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -132,7 +132,7 @@ def list_tasks(task_type_id=None, offset=None, size=None, tag=None, **kwargs):
         task id, dataset id, task_type and status. If qualities are calculated
         for the associated dataset, some of these are also returned.
     """
-    return openml.utils.list_all(_list_tasks, task_type_id=task_type_id, offset=offset, size=size, tag=tag, **kwargs)
+    return openml.utils._list_all(_list_tasks, task_type_id=task_type_id, offset=offset, size=size, tag=tag, **kwargs)
 
 
 def _list_tasks(task_type_id=None, **kwargs):
diff --git a/openml/utils.py b/openml/utils.py
index 0bc8b681f..8a0b8acad 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -46,6 +46,7 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True):
             raise ValueError("Could not find tag '%s' in node '%s'" %
                              (xml_tag_name, str(node)))
 
+
 def _tag_entity(entity_type, entity_id, tag, untag=False):
     """Function that tags or untags a given entity on OpenML. As the OpenML
        API tag functions all consist of the same format, this function covers
@@ -94,7 +95,7 @@ def _tag_entity(entity_type, entity_id, tag, untag=False):
         return []
 
 
-def list_all(listing_call, *args, **filters):
+def _list_all(listing_call, *args, **filters):
     """Helper to handle paged listing requests.
 
     Example usage:
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index a482fddcc..183d93505 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -8,8 +8,8 @@ class OpenMLTaskTest(TestBase):
 
     def test_list_all(self):
         required_size = 127  # default test server reset value
-        datasets = openml.utils.list_all(openml.datasets._list_datasets,
-                                         batch_size=self._batch_size, size=required_size)
+        datasets = openml.utils._list_all(openml.datasets.functions._list_datasets,
+                                          batch_size=self._batch_size, size=required_size)
 
         self.assertEquals(len(datasets), required_size)
         for did in datasets:

From 5db107b6e7c7759fdd315723d92bbc9fd54052da Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 12:49:27 -0400
Subject: [PATCH 3/7] changed caught exception

---
 openml/utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/openml/utils.py b/openml/utils.py
index 8a0b8acad..055953067 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -5,7 +5,6 @@
 
 import openml._api_calls
 from . import config
-from openml.exceptions import OpenMLServerException
 
 
 def extract_xml_tags(xml_tag_name, node, allow_none=True):
@@ -82,7 +81,6 @@ def _tag_entity(entity_type, entity_id, tag, untag=False):
         uri = '%s/untag' %entity_type
         main_tag = 'oml:%s_untag' %entity_type
 
-
     post_variables = {'%s_id'%entity_type: entity_id, 'tag': tag}
     result_xml = openml._api_calls._perform_api_call(uri, post_variables)
 
@@ -152,8 +150,8 @@ def _list_all(listing_call, *args, **filters):
                 offset=offset + BATCH_SIZE_ORIG * page,
                 **active_filters
             )
-        except OpenMLServerException as e:
-            if page > 0 and e.args[0] == 'No results':
+        except openml.exceptions.OpenMLServerNoResult as e:
+            if page > 0:
                 # exceptional case, as it can happen that we request a new page,
                 # already got results but there are no more results to obtain
                 break

From 1b1ed8b7f17d1fa62d8b9f7f3b0a17171d30d057 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 13:09:34 -0400
Subject: [PATCH 4/7] fixed unit test and _list_all

---
 openml/utils.py                       | 10 +++-------
 tests/test_runs/test_run_functions.py |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/openml/utils.py b/openml/utils.py
index 055953067..d3e7fc1f5 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -150,13 +150,9 @@ def _list_all(listing_call, *args, **filters):
                 offset=offset + BATCH_SIZE_ORIG * page,
                 **active_filters
             )
-        except openml.exceptions.OpenMLServerNoResult as e:
-            if page > 0:
-                # exceptional case, as it can happen that we request a new page,
-                # already got results but there are no more results to obtain
-                break
-            else:
-                raise e
+        except openml.exceptions.OpenMLServerNoResult:
+            # we want to return an empty dict in this case
+            break
         result.update(new_batch)
         page += 1
         if LIMIT is not None:
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 341900190..bfb259f78 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -837,7 +837,7 @@ def test_get_runs_list(self):
             self._check_run(runs[rid])
 
     def test_list_runs_empty(self):
-        runs = openml.runs.list_runs(task=[-1])
+        runs = openml.runs.list_runs(task=[0])
         if len(runs) > 0:
             raise ValueError('UnitTest Outdated, got somehow results')
 

From 6f6b46eed2bcf6434c985f1034ac46dcee5b08b3 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 13:56:35 -0400
Subject: [PATCH 5/7] batch size

---
 openml/setups/functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 24e711107..51a10f905 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -125,7 +125,7 @@ def list_setups(offset=None, size=None, flow=None, tag=None, setup=None):
         """
 
     return openml.utils._list_all(_list_setups, offset=offset, size=size,
-                                  flow=flow, tag=tag, setup=setup)
+                                  flow=flow, tag=tag, setup=setup, batch_size=1000)  # setups use a smaller batch size than the default
 
 
 def _list_setups(setup=None, **kwargs):

From 4a936cbb10f4426ba61516c68d7e2db345748e90 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 14:56:53 -0400
Subject: [PATCH 6/7] changes suggested by @mfeurer

---
 openml/utils.py                |  3 +--
 tests/test_utils/test_utils.py | 15 ++++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/openml/utils.py b/openml/utils.py
index d3e7fc1f5..39013d835 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -131,7 +131,6 @@ def _list_all(listing_call, *args, **filters):
     # max number of results to be shown
     LIMIT = None
     offset = 0
-    cycle = True
     if 'size' in active_filters:
         LIMIT = active_filters['size']
         del active_filters['size']
@@ -142,7 +141,7 @@ def _list_all(listing_call, *args, **filters):
     if 'offset' in active_filters:
         offset = active_filters['offset']
         del active_filters['offset']
-    while cycle:
+    while True:
         try:
             new_batch = listing_call(
                 *args,
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 183d93505..e0c914acf 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -1,4 +1,5 @@
 from openml.testing import TestBase
+import numpy as np
 import openml
 
 
@@ -7,13 +8,7 @@ class OpenMLTaskTest(TestBase):
     _batch_size = 25
 
     def test_list_all(self):
-        required_size = 127  # default test server reset value
-        datasets = openml.utils._list_all(openml.datasets.functions._list_datasets,
-                                          batch_size=self._batch_size, size=required_size)
-
-        self.assertEquals(len(datasets), required_size)
-        for did in datasets:
-            self._check_dataset(datasets[did])
+        openml.utils._list_all(openml.tasks.functions._list_tasks)
 
     def test_list_all_for_datasets(self):
         required_size = 127  # default test server reset value
@@ -23,6 +18,12 @@ def test_list_all_for_datasets(self):
         for did in datasets:
             self._check_dataset(datasets[did])
 
+    def test_list_datasets_with_high_size_parameter(self):
+        datasets_a = openml.datasets.list_datasets()
+        datasets_b = openml.datasets.list_datasets(size=np.inf)
+
+        self.assertEqual(len(datasets_a), len(datasets_b))
+
     def test_list_all_for_tasks(self):
         required_size = 1068  # default test server reset value
         tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size)

From 1f9c46758fd46ef776dac34bdbb93a114b130713 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Thu, 3 May 2018 14:59:52 -0400
Subject: [PATCH 7/7] added to change log

---
 doc/progress.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/progress.rst b/doc/progress.rst
index 6681f51b3..1cfbe31ba 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -9,6 +9,12 @@ Progress
 Changelog
 =========
 
+0.8.0
+~~~~~
+* Added serialize run / deserialize run function (for saving runs on disk before uploading)
+* FIX: listing functions now return the requested number of results (the ``size`` argument is respected)
+* Renamed openml.utils.list_all to openml.utils._list_all; it is now private and should only be accessed through the respective listing functions
+
 0.3.0
 ~~~~~