Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ Progress
Changelog
=========

0.8.0
~~~~~
* Added functions to serialize and deserialize runs (for saving runs to disk before uploading)
* FIX: fixed a bug in the listing functions (they now return the correct number of results)
* Made openml.utils.list_all a hidden function, openml.utils._list_all (it should be accessed only through the respective listing functions)

0.3.0
~~~~~

Expand Down
2 changes: 1 addition & 1 deletion openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs):
these are also returned.
"""

return openml.utils.list_all(_list_datasets, offset=offset, size=size, status=status, tag=tag, **kwargs)
return openml.utils._list_all(_list_datasets, offset=offset, size=size, status=status, tag=tag, **kwargs)


def _list_datasets(**kwargs):
Expand Down
4 changes: 2 additions & 2 deletions openml/evaluations/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None,
dict
"""

return openml.utils.list_all(_list_evaluations, function, offset=offset, size=size,
id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag)
return openml.utils._list_all(_list_evaluations, function, offset=offset, size=size,
id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag)


def _list_evaluations(function, id=None, task=None,
Expand Down
2 changes: 1 addition & 1 deletion openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def list_flows(offset=None, size=None, tag=None, **kwargs):
- external version
- uploader
"""
return openml.utils.list_all(_list_flows, offset=offset, size=size, tag=tag, **kwargs)
return openml.utils._list_all(_list_flows, offset=offset, size=size, tag=tag, **kwargs)


def _list_flows(**kwargs):
Expand Down
4 changes: 2 additions & 2 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,8 +936,8 @@ def list_runs(offset=None, size=None, id=None, task=None, setup=None,
List of found runs.
"""

return openml.utils.list_all(_list_runs, offset=offset, size=size, id=id, task=task, setup=setup,
flow=flow, uploader=uploader, tag=tag, display_errors=display_errors, **kwargs)
return openml.utils._list_all(_list_runs, offset=offset, size=size, id=id, task=task, setup=setup,
flow=flow, uploader=uploader, tag=tag, display_errors=display_errors, **kwargs)


def _list_runs(id=None, task=None, setup=None,
Expand Down
4 changes: 2 additions & 2 deletions openml/setups/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ def list_setups(offset=None, size=None, flow=None, tag=None, setup=None):
dict
"""

return openml.utils.list_all(_list_setups, offset=offset, size=size,
flow=flow, tag=tag, setup=setup)
return openml.utils._list_all(_list_setups, offset=offset, size=size,
flow=flow, tag=tag, setup=setup, batch_size=1000) #batch size for setups is lower


def _list_setups(setup=None, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def list_tasks(task_type_id=None, offset=None, size=None, tag=None, **kwargs):
task id, dataset id, task_type and status. If qualities are calculated
for the associated dataset, some of these are also returned.
"""
return openml.utils.list_all(_list_tasks, task_type_id=task_type_id, offset=offset, size=size, tag=tag, **kwargs)
return openml.utils._list_all(_list_tasks, task_type_id=task_type_id, offset=offset, size=size, tag=tag, **kwargs)


def _list_tasks(task_type_id=None, **kwargs):
Expand Down
55 changes: 28 additions & 27 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import openml._api_calls
from . import config
from openml.exceptions import OpenMLServerException


def extract_xml_tags(xml_tag_name, node, allow_none=True):
Expand Down Expand Up @@ -46,6 +45,7 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True):
raise ValueError("Could not find tag '%s' in node '%s'" %
(xml_tag_name, str(node)))


def _tag_entity(entity_type, entity_id, tag, untag=False):
"""Function that tags or untags a given entity on OpenML. As the OpenML
API tag functions all consist of the same format, this function covers
Expand Down Expand Up @@ -81,7 +81,6 @@ def _tag_entity(entity_type, entity_id, tag, untag=False):
uri = '%s/untag' %entity_type
main_tag = 'oml:%s_untag' %entity_type


post_variables = {'%s_id'%entity_type: entity_id, 'tag': tag}
result_xml = openml._api_calls._perform_api_call(uri, post_variables)

Expand All @@ -94,15 +93,12 @@ def _tag_entity(entity_type, entity_id, tag, untag=False):
return []


def list_all(listing_call, *args, **filters):
def _list_all(listing_call, *args, **filters):
"""Helper to handle paged listing requests.

Example usage:

``evaluations = list_all(list_evaluations, "predictive_accuracy", task=mytask)``

Note: I wanted to make this a generator, but this is not possible since all
listing calls return dicts

Parameters
----------
Expand All @@ -112,55 +108,60 @@ def list_all(listing_call, *args, **filters):
Any required arguments for the listing call.
**filters : Arbitrary keyword arguments
Any filters that can be applied to the listing function.

additionally, the batch_size can be specified. This is
useful for testing purposes.
Returns
-------
dict
"""

# default batch size per paging.
batch_size = 10000
# eliminate filters that have a None value
active_filters = {key: value for key, value in filters.items() if value is not None}
page = 0
result = {}

# default batch size per paging. This one can be set in filters (batch_size),
# but should not be changed afterwards. the derived batch_size can be changed.
BATCH_SIZE_ORIG = 10000
if 'batch_size' in active_filters:
BATCH_SIZE_ORIG = active_filters['batch_size']
del active_filters['batch_size']
batch_size = BATCH_SIZE_ORIG

# max number of results to be shown
limit = None
LIMIT = None
offset = 0
cycle = True
if 'size' in active_filters:
limit = active_filters['size']
LIMIT = active_filters['size']
del active_filters['size']
# check if the batch size is greater than the number of results that need to be returned.
if limit is not None:
if batch_size > limit:
batch_size = limit
if LIMIT is not None:
if BATCH_SIZE_ORIG > LIMIT:
batch_size = LIMIT
if 'offset' in active_filters:
offset = active_filters['offset']
del active_filters['offset']
while cycle:
while True:
try:
new_batch = listing_call(
*args,
limit=batch_size,
offset=offset + batch_size * page,
offset=offset + BATCH_SIZE_ORIG * page,
**active_filters
)
except OpenMLServerException as e:
if page == 0 and e.args[0] == 'No results':
raise e
else:
break
except openml.exceptions.OpenMLServerNoResult:
# we want to return an empty dict in this case
break
result.update(new_batch)
page += 1
if limit is not None:
limit -= batch_size
if LIMIT is not None:
# check if the number of required results has been achieved
if limit == 0:
# always do a 'bigger than' check, in case of bugs to prevent infinite loops
if len(result) >= LIMIT:
break
# check if there are enough results to fulfill a batch
if limit < batch_size:
batch_size = limit
if BATCH_SIZE_ORIG > LIMIT - len(result):
batch_size = LIMIT - len(result)

return result

Expand Down
2 changes: 1 addition & 1 deletion tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,7 @@ def test_get_runs_list(self):
self._check_run(runs[rid])

def test_list_runs_empty(self):
runs = openml.runs.list_runs(task=[-1])
runs = openml.runs.list_runs(task=[0])
if len(runs) > 0:
raise ValueError('UnitTest Outdated, got somehow results')

Expand Down
55 changes: 49 additions & 6 deletions tests/test_utils/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,61 @@
from openml.testing import TestBase
import numpy as np
import openml


class OpenMLTaskTest(TestBase):
_multiprocess_can_split_ = True
_batch_size = 25

def test_list_all(self):
list_datasets = openml.datasets.functions._list_datasets
datasets = openml.utils.list_all(list_datasets)
openml.utils._list_all(openml.tasks.functions._list_tasks)

self.assertGreaterEqual(len(datasets), 100)
def test_list_all_for_datasets(self):
required_size = 127 # default test server reset value
datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size)

self.assertEquals(len(datasets), required_size)
for did in datasets:
self._check_dataset(datasets[did])

# TODO implement these tests
# datasets = openml.utils.list_all(list_datasets, limit=50)
# self.assertEqual(len(datasets), 50)
def test_list_datasets_with_high_size_parameter(self):
datasets_a = openml.datasets.list_datasets()
datasets_b = openml.datasets.list_datasets(size=np.inf)

self.assertEquals(len(datasets_a), len(datasets_b))

def test_list_all_for_tasks(self):
required_size = 1068 # default test server reset value
tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size)

self.assertEquals(len(tasks), required_size)

def test_list_all_for_flows(self):
required_size = 15 # default test server reset value
flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size)

self.assertEquals(len(flows), required_size)

def test_list_all_for_setups(self):
required_size = 50
# TODO apparently list_setups function does not support kwargs
setups = openml.setups.list_setups(size=required_size)

# might not be on test server after reset, please rerun test at least once if fails
self.assertEquals(len(setups), required_size)

def test_list_all_for_runs(self):
required_size = 48
runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size)

# might not be on test server after reset, please rerun test at least once if fails
self.assertEquals(len(runs), required_size)

def test_list_all_for_evaluations(self):
required_size = 57
# TODO apparently list_evaluations function does not support kwargs
evaluations = openml.evaluations.list_evaluations(function='predictive_accuracy',
size=required_size)

# might not be on test server after reset, please rerun test at least once if fails
self.assertEquals(len(evaluations), required_size)