diff --git a/.travis.yml b/.travis.yml index f0cecf80d..07e5f80fd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,11 @@ env: - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" +# Travis issue +# https://github.com/travis-ci/travis-ci/issues/8920 +before_install: + - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)" + install: source ci_scripts/install.sh script: bash ci_scripts/test.sh after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result" diff --git a/ci_scripts/flake8_diff.sh b/ci_scripts/flake8_diff.sh index 90d7923ad..9207163bb 100644 --- a/ci_scripts/flake8_diff.sh +++ b/ci_scripts/flake8_diff.sh @@ -125,7 +125,7 @@ check_files() { if [ -n "$files" ]; then # Conservative approach: diff without context (--unified=0) so that code # that was not changed does not create failures - git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options + git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --ignore E402 --diff --show-source $options fi } diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 962c9b98e..d68100648 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -5,41 +5,74 @@ A tutorial on how to create and upload a dataset to OpenML. """ import numpy as np -import openml import sklearn.datasets +from scipy.sparse import coo_matrix + +import openml +from openml.datasets.functions import create_dataset ############################################################################ -# For this example we will upload to the test server to not pollute the live server with countless copies of the same dataset. +# For this tutorial we will upload to the test server to not pollute the live +# server with countless copies of the same dataset. openml.config.server = 'https://test.openml.org/api/v1/xml' ############################################################################ -# Prepare the data -# ^^^^^^^^^^^^^^^^ -# Load an example dataset from scikit-learn which we will upload to OpenML.org via the API. -breast_cancer = sklearn.datasets.load_breast_cancer() -name = 'BreastCancer(scikit-learn)' -X = breast_cancer.data -y = breast_cancer.target -attribute_names = breast_cancer.feature_names -targets = breast_cancer.target_names -description = breast_cancer.DESCR +# Below we will cover the following cases of the +# dataset object: +# +# * A numpy array +# * A list +# * A sparse matrix ############################################################################ -# OpenML does not distinguish between the attributes and targets on the data level and stores all data in a -# single matrix. The target feature is indicated as meta-data of the dataset (and tasks on that data). +# Dataset is a numpy array +# ======================== +# A numpy array can contain lists in the case of dense data +# or it can contain OrderedDicts in the case of sparse data. +# +# Prepare dataset +# ^^^^^^^^^^^^^^^ +# Load an example dataset from scikit-learn which we +# will upload to OpenML.org via the API. + +diabetes = sklearn.datasets.load_diabetes() +name = 'Diabetes(scikit-learn)' +X = diabetes.data +y = diabetes.target +attribute_names = diabetes.feature_names +description = diabetes.DESCR + +############################################################################ +# OpenML does not distinguish between the attributes and +# targets on the data level and stores all data in a single matrix. +# +# The target feature is indicated as meta-data of the +# dataset (and tasks on that data). + data = np.concatenate((X, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names -] + [('class', 'REAL')] +] + [('class', 'INTEGER')] +citation = ( + "Bradley Efron, Trevor Hastie, Iain Johnstone and " + "Robert Tibshirani (2004) (Least Angle Regression) " + "Annals of Statistics (with discussion), 407-499" +) +paper_url = ( + 'http://web.stanford.edu/~hastie/Papers/' + 'LARS/LeastAngle_2002.pdf' +) ############################################################################ # Create the dataset object # ^^^^^^^^^^^^^^^^^^^^^^^^^ -# The definition of all fields can be found in the XSD files describing the expected format: +# The definition of all fields can be found in the +# XSD files describing the expected format: # # https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd -dataset = openml.datasets.functions.create_dataset( + +diabetes_dataset = create_dataset( # The name of the dataset (needs to be unique). # Must not be longer than 128 characters and only contain # a-z, A-Z, 0-9 and the following special characters: _\-\.(), @@ -47,11 +80,12 @@ # Textual description of the dataset. description=description, # The person who created the dataset. - creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian', + creator="Bradley Efron, Trevor Hastie, " + "Iain Johnstone and Robert Tibshirani", # People who contributed to the current version of the dataset. contributor=None, # The date the data was originally collected, given by the uploader. - collection_date='01-11-1995', + collection_date='09-01-2012', # Language in which the data is represented. # Starts with 1 upper case letter, rest lower case, e.g. 'English'. language='English', @@ -64,26 +98,129 @@ # Attributes that should be excluded in modelling, such as identifiers and indexes. ignore_attribute=None, # How to cite the paper. - citation=( - "W.N. Street, W.H. Wolberg and O.L. Mangasarian. " - "Nuclear feature extraction for breast tumor diagnosis. " - "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, " - "volume 1905, pages 861-870, San Jose, CA, 1993." - ), + citation=citation, # Attributes of the data attributes=attributes, data=data, - # Format of the dataset. Only 'arff' for now. - format='arff', # A version label which is provided by the user. version_label='test', - original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)', - paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1' + original_data_url=( + 'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html' + ), + paper_url=paper_url, ) ############################################################################ -try: - upload_id = dataset.publish() - print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id)) -except openml.exceptions.PyOpenMLError as err: - print("OpenML: {0}".format(err)) + +upload_did = diabetes_dataset.publish() +print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) + +############################################################################ +# Dataset is a list +# ================= +# A list can contain lists in the case of dense data +# or it can contain OrderedDicts in the case of sparse data. +# +# Weather dataset: +# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html + +data = [ + ['sunny', 85, 85, 'FALSE', 'no'], + ['sunny', 80, 90, 'TRUE', 'no'], + ['overcast', 83, 86, 'FALSE', 'yes'], + ['rainy', 70, 96, 'FALSE', 'yes'], + ['rainy', 68, 80, 'FALSE', 'yes'], + ['rainy', 65, 70, 'TRUE', 'no'], + ['overcast', 64, 65, 'TRUE', 'yes'], + ['sunny', 72, 95, 'FALSE', 'no'], + ['sunny', 69, 70, 'FALSE', 'yes'], + ['rainy', 75, 80, 'FALSE', 'yes'], + ['sunny', 75, 70, 'TRUE', 'yes'], + ['overcast', 72, 90, 'TRUE', 'yes'], + ['overcast', 81, 75, 'FALSE', 'yes'], + ['rainy', 71, 91, 'TRUE', 'no'], +] + +attribute_names = [ + ('outlook', ['sunny', 'overcast', 'rainy']), + ('temperature', 'REAL'), + ('humidity', 'REAL'), + ('windy', ['TRUE', 'FALSE']), + ('play', ['yes', 'no']), +] + +description = ( + 'The weather problem is a tiny dataset that we will use repeatedly' + ' to illustrate machine learning methods. Entirely fictitious, it ' + 'supposedly concerns the conditions that are suitable for playing ' + 'some unspecified game. In general, instances in a dataset are ' + 'characterized by the values of features, or attributes, that measure ' + 'different aspects of the instance. In this case there are four ' + 'attributes: outlook, temperature, humidity, and windy. ' + 'The outcome is whether to play or not.' +) + +citation = ( + 'I. H. Witten, E. Frank, M. A. Hall, and ITPro,' + 'Data mining practical machine learning tools and techniques, ' + 'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011' +) + +weather_dataset = create_dataset( + name="Weather", + description=description, + creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro', + contributor=None, + collection_date='01-01-2011', + language='English', + licence=None, + default_target_attribute='play', + row_id_attribute=None, + ignore_attribute=None, + citation=citation, + attributes=attribute_names, + data=data, + version_label='example', +) + +############################################################################ + +upload_did = weather_dataset.publish() +print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) + +############################################################################ +# Dataset is a sparse matrix +# ========================== + +sparse_data = coo_matrix(( + [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]), +)) + +column_names = [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), +] + +xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='example', +) + +############################################################################ + +upload_did = xor_dataset.publish() +print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index d4aa2690b..c0ce3676e 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -1,8 +1,21 @@ -from .functions import (list_datasets, check_datasets_active, - get_datasets, get_dataset, status_update) +from .functions import ( + check_datasets_active, + create_dataset, + get_dataset, + get_datasets, + list_datasets, + status_update, +) from .dataset import OpenMLDataset from .data_feature import OpenMLDataFeature -__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets', - 'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets', - 'status_update'] +__all__ = [ + 'check_datasets_active', + 'create_dataset', + 'get_dataset', + 'get_datasets', + 'list_datasets', + 'OpenMLDataset', + 'OpenMLDataFeature', + 'status_update', +] diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index fe05fa29f..b4213e91a 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1,20 +1,21 @@ -from collections import OrderedDict import gzip import io import logging import os -import six +from collections import OrderedDict import arff - import numpy as np import scipy.sparse -from six.moves import cPickle as pickle import xmltodict +import six +from six.moves import cPickle as pickle +from warnings import warn +import openml._api_calls from .data_feature import OpenMLDataFeature from ..exceptions import PyOpenMLError -import openml._api_calls + logger = logging.getLogger(__name__) @@ -31,7 +32,7 @@ class OpenMLDataset(object): description : str Description of the dataset. format : str - Format of the dataset. Only 'arff' for now. + Format of the dataset which can be either 'arff' or 'sparse_arff'. dataset_id : int, optional Id autogenerated by the server. version : int, optional @@ -86,23 +87,31 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, format, dataset_id=None, - version=None, creator=None, contributor=None, - collection_date=None, upload_date=None, language=None, - licence=None, url=None, default_target_attribute=None, + def __init__(self, name, description, format=None, + data_format='arff', dataset_id=None, version=None, + creator=None, contributor=None, collection_date=None, + upload_date=None, language=None, licence=None, + url=None, default_target_attribute=None, row_id_attribute=None, ignore_attribute=None, - version_label=None, citation=None, tag=None, visibility=None, - original_data_url=None, paper_url=None, update_comment=None, - md5_checksum=None, data_file=None, features=None, qualities=None, - dataset=None): - # TODO add function to check if the name is casual_string128 + version_label=None, citation=None, tag=None, + visibility=None, original_data_url=None, + paper_url=None, update_comment=None, + md5_checksum=None, data_file=None, features=None, + qualities=None, dataset=None): + # TODO add function to check if the name is casual_string128 # Attributes received by querying the RESTful API self.dataset_id = int(dataset_id) if dataset_id is not None else None self.name = name self.version = int(version) if version is not None else None self.description = description - self.format = format + if format is None: + self.format = data_format + else: + warn("The format parameter in the init will be deprecated " + "in the future." + "Please use data_format instead", DeprecationWarning) + self.format = format self.creator = creator self.contributor = contributor self.collection_date = collection_date @@ -128,7 +137,7 @@ def __init__(self, name, description, format, dataset_id=None, self.original_data_url = original_data_url self.paper_url = paper_url self.update_comment = update_comment - self.md5_cheksum = md5_checksum + self.md5_checksum = md5_checksum self.data_file = data_file self.features = None self.qualities = None @@ -169,13 +178,13 @@ def __init__(self, name, description, format, dataset_id=None, for name, type_ in data['attributes']] attribute_names = [name for name, type_ in data['attributes']] - if format.lower() == 'sparse_arff': + if self.format.lower() == 'sparse_arff': X = data['data'] X_shape = (max(X[1]) + 1, max(X[2]) + 1) X = scipy.sparse.coo_matrix( (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) X = X.tocsr() - elif format.lower() == 'arff': + elif self.format.lower() == 'arff': X = np.array(data['data'], dtype=np.float32) else: raise Exception() @@ -208,16 +217,33 @@ def remove_tag(self, tag): openml._api_calls._perform_api_call("/data/untag", data=data) def __eq__(self, other): + if type(other) != OpenMLDataset: return False - elif ( - self.dataset_id == other.dataset_id - or (self.name == other._name and self.version == other._version) - ): - return True - else: + + server_fields = { + 'dataset_id', + 'version', + 'upload_date', + 'url', + 'dataset', + 'data_file', + } + + # check that the keys are identical + self_keys = set(self.__dict__.keys()) - server_fields + other_keys = set(other.__dict__.keys()) - server_fields + if self_keys != other_keys: return False + # check that values of the common keys are identical + return all(self.__dict__[key] == other.__dict__[key] + for key in self_keys) + + def __ne__(self, other): + """Only needed for python 2, unnecessary in Python 3""" + return not self.__eq__(other) + def _get_arff(self, format): """Read ARFF file and return decoded arff. @@ -524,8 +550,6 @@ def _to_xml(self): xml_dataset : str XML description of the data. """ - xml_dataset = ('\n') props = ['id', 'name', 'version', 'description', 'format', 'creator', 'contributor', 'collection_date', 'upload_date', 'language', 'licence', 'url', 'default_target_attribute', diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index ef80f48b5..343429a84 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,20 +1,26 @@ -from collections import OrderedDict import hashlib import io import os import re -import shutil + +import numpy as np import six import arff - -from oslo_concurrency import lockutils import xmltodict +from scipy.sparse import coo_matrix +from oslo_concurrency import lockutils +from collections import OrderedDict +from warnings import warn import openml.utils import openml._api_calls from .dataset import OpenMLDataset -from ..exceptions import OpenMLCacheException, OpenMLServerException, \ - OpenMLHashException, PrivateDatasetError +from ..exceptions import ( + OpenMLCacheException, + OpenMLHashException, + OpenMLServerException, + PrivateDatasetError, +) from ..utils import ( _create_cache_directory, _remove_cache_dir_for_id, @@ -353,11 +359,13 @@ def get_dataset(dataset_id): return dataset -def create_dataset(name, description, creator, contributor, collection_date, - language, licence, attributes, data, default_target_attribute, - row_id_attribute, ignore_attribute, citation, format="arff", - original_data_url=None, paper_url=None, update_comment=None, - version_label=None): +def create_dataset(name, description, creator, contributor, + collection_date, language, + licence, attributes, data, + default_target_attribute, row_id_attribute, + ignore_attribute, citation, format=None, + original_data_url=None, paper_url=None, + update_comment=None, version_label=None): """Create a dataset. This function creates an OpenMLDataset object. @@ -370,6 +378,11 @@ def create_dataset(name, description, creator, contributor, collection_date, Name of the dataset. description : str Description of the dataset. + format : str, optional + Format of the dataset which can be either 'arff' or 'sparse_arff'. + By default, the format is automatically inferred. + .. deprecated: 0.8 + ``format`` is deprecated in 0.8 and will be removed in 0.10. creator : str The person who created the dataset. contributor : str @@ -383,7 +396,7 @@ def create_dataset(name, description, creator, contributor, collection_date, License of the data. attributes : list A list of tuples. Each tuple consists of the attribute name and type. - data : numpy.ndarray + data : numpy.ndarray | list | scipy.sparse.coo_matrix An array that contains both the attributes and the targets, with shape=(n_samples, n_features). The target feature is indicated as meta-data of the dataset. @@ -396,8 +409,6 @@ def create_dataset(name, description, creator, contributor, collection_date, Attributes that should be excluded in modelling, such as identifiers and indexes. citation : str Reference(s) that should be cited when building on this data. - format : str, optional - Format of the dataset. Only 'arff' for now. version_label : str, optional Version label provided by user, can be a date, hash, or some other type of id. original_data_url : str, optional @@ -411,6 +422,36 @@ def create_dataset(name, description, creator, contributor, collection_date, ------- class:`openml.OpenMLDataset` Dataset description.""" + + if format is not None: + warn("The format parameter will be deprecated in the future," + " the method will determine the format of the ARFF " + "based on the given data.", DeprecationWarning) + d_format = format + + # Determine ARFF format from the dataset + else: + if isinstance(data, list) or isinstance(data, np.ndarray): + if isinstance(data[0], list) or isinstance(data[0], np.ndarray): + d_format = 'arff' + elif isinstance(data[0], dict): + d_format = 'sparse_arff' + else: + raise ValueError( + 'When giving a list or a numpy.ndarray, ' + 'they should contain a list/ numpy.ndarray ' + 'for dense data or a dictionary for sparse ' + 'data. Got {!r} instead.' + .format(data[0]) + ) + elif isinstance(data, coo_matrix): + d_format = 'sparse_arff' + else: + raise ValueError( + 'Invalid data type. The data type can be a list, ' + 'a numpy ndarray or a scipy.sparse.coo_matrix' + ) + arff_object = { 'relation': name, 'description': description, @@ -418,22 +459,39 @@ def create_dataset(name, description, creator, contributor, collection_date, 'data': data } - # serializes the arff dataset object and returns a string + # serializes the ARFF dataset object and returns a string arff_dataset = arff.dumps(arff_object) try: - # check if arff is valid + # check if ARFF is valid decoder = arff.ArffDecoder() - decoder.decode(arff_dataset, encode_nominal=True) + decoder.decode( + arff_dataset, + encode_nominal=True, + return_type=arff.COO if d_format == 'sparse_arff' else arff.DENSE + ) except arff.ArffException: raise ValueError("The arguments you have provided \ - do not construct a valid arff file") - - return OpenMLDataset(name, description, format, creator=creator, - contributor=contributor, collection_date=collection_date, - language=language, licence=licence, default_target_attribute=default_target_attribute, - row_id_attribute=row_id_attribute, ignore_attribute=ignore_attribute, citation=citation, - version_label=version_label, original_data_url=original_data_url, paper_url=paper_url, - update_comment=update_comment, dataset=arff_dataset) + do not construct a valid ARFF file") + + return OpenMLDataset( + name, + description, + data_format=d_format, + creator=creator, + contributor=contributor, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, + ignore_attribute=ignore_attribute, + citation=citation, + version_label=version_label, + original_data_url=original_data_url, + paper_url=paper_url, + update_comment=update_comment, + dataset=arff_dataset, + ) def status_update(data_id, status): @@ -505,7 +563,7 @@ def _get_dataset_description(did_cache_dir, dataset_id): def _get_dataset_arff(did_cache_dir, description): - """Get the filepath to the dataset arff + """Get the filepath to the dataset ARFF Checks if the file is in the cache, if yes, return the path to the file. If not, downloads the file and caches it, then returns the file path. @@ -523,7 +581,7 @@ def _get_dataset_arff(did_cache_dir, description): Returns ------- output_filename : string - Location of arff file. + Location of ARFF file. """ output_file_path = os.path.join(did_cache_dir, "dataset.arff") md5_checksum_fixture = description.get("oml:md5_checksum") @@ -638,40 +696,86 @@ def _create_dataset_from_description(description, features, qualities, arff_file Parameters ---------- description : dict - Description of a dataset in xmlish dict. + Description of a dataset in xml dict. arff_file : string - Path of dataset arff file. + Path of dataset ARFF file. Returns ------- dataset : dataset object - Dataset object from dict and arff. + Dataset object from dict and ARFF. """ dataset = OpenMLDataset( description["oml:name"], description.get("oml:description"), - description["oml:format"], - description["oml:id"], - description["oml:version"], - description.get("oml:creator"), - description.get("oml:contributor"), - description.get("oml:collection_date"), - description.get("oml:upload_date"), - description.get("oml:language"), - description.get("oml:licence"), - description["oml:url"], - description.get("oml:default_target_attribute"), - description.get("oml:row_id_attribute"), - description.get("oml:ignore_attribute"), - description.get("oml:version_label"), - description.get("oml:citation"), - description.get("oml:tag"), - description.get("oml:visibility"), - description.get("oml:original_data_url"), - description.get("oml:paper_url"), - description.get("oml:update_comment"), - description.get("oml:md5_checksum"), + data_format=description["oml:format"], + dataset_id=description["oml:id"], + version=description["oml:version"], + creator=description.get("oml:creator"), + contributor=description.get("oml:contributor"), + collection_date=description.get("oml:collection_date"), + upload_date=description.get("oml:upload_date"), + language=description.get("oml:language"), + licence=description.get("oml:licence"), + url=description["oml:url"], + default_target_attribute=description.get( + "oml:default_target_attribute" + ), + row_id_attribute=description.get("oml:row_id_attribute"), + ignore_attribute=description.get("oml:ignore_attribute"), + version_label=description.get("oml:version_label"), + citation=description.get("oml:citation"), + tag=description.get("oml:tag"), + visibility=description.get("oml:visibility"), + original_data_url=description.get("oml:original_data_url"), + paper_url=description.get("oml:paper_url"), + update_comment=description.get("oml:update_comment"), + md5_checksum=description.get("oml:md5_checksum"), data_file=arff_file, features=features, - qualities=qualities) + qualities=qualities, + ) return dataset + + +def _get_online_dataset_arff(dataset_id): + """Download the ARFF file for a given dataset id + from the OpenML website. + + Parameters + ---------- + dataset_id : int + A dataset id. + + Returns + ------- + str + A string representation of an ARFF file. + """ + dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id) + # build a dict from the xml. + # use the url from the dataset description and return the ARFF string + return openml._api_calls._read_url( + xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url'] + ) + + +def _get_online_dataset_format(dataset_id): + """Get the dataset format for a given dataset id + from the OpenML website. + + Parameters + ---------- + dataset_id : int + A dataset id. + + Returns + ------- + str + Dataset format. + """ + dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id) + # build a dict from the xml and get the format from the dataset description + return xmltodict\ + .parse(dataset_xml)['oml:data_set_description']['oml:format']\ + .lower() diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 5ec6c816b..c2e507350 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,10 +1,12 @@ +from time import time + import numpy as np -from scipy import sparse import six -from time import time +from scipy import sparse +from warnings import filterwarnings, catch_warnings -from openml.testing import TestBase import openml +from openml.testing import TestBase class OpenMLDatasetTest(TestBase): @@ -97,6 +99,18 @@ def test_get_data_with_ignore_attributes(self): self.assertEqual(len(categorical), 38) # TODO test multiple ignore attributes! + def test_dataset_format_constructor(self): + + with catch_warnings(): + filterwarnings('error') + self.assertRaises( + DeprecationWarning, + openml.OpenMLDataset, + 'Test', + 'Test', + format='arff' + ) + class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 367bf0c63..bea0b8317 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,19 +1,18 @@ import unittest import os import sys - +import random if sys.version_info[0] >= 3: from unittest import mock else: import mock -import random +import arff import six - -from oslo_concurrency import lockutils - import numpy as np import scipy.sparse +from oslo_concurrency import lockutils +from warnings import filterwarnings, catch_warnings import openml from openml import OpenMLDataset @@ -21,16 +20,17 @@ OpenMLHashException, PrivateDatasetError from openml.testing import TestBase from openml.utils import _tag_entity, _create_cache_directory_for_id - from openml.datasets.functions import (create_dataset, _get_cached_dataset, _get_cached_dataset_features, _get_cached_dataset_qualities, _get_cached_datasets, - _get_dataset_description, _get_dataset_arff, + _get_dataset_description, _get_dataset_features, _get_dataset_qualities, + _get_online_dataset_arff, + _get_online_dataset_format, DATASETS_CACHE_DIR_NAME) @@ -58,6 +58,24 @@ def _remove_pickle_files(self): except: pass + def _get_empty_param_for_dataset(self): + + return { + 'name': None, + 'description': None, + 'creator': None, + 'contributor': None, + 'collection_date': None, + 'language': None, + 'licence': None, + 'default_target_attribute': None, + 'row_id_attribute': None, + 'ignore_attribute': None, + 'citation': None, + 'attributes': None, + 'data': None + } + def test__list_cached_datasets(self): openml.config.cache_directory = self.static_cache_dir cached_datasets = openml.datasets.functions._list_cached_datasets() @@ -295,7 +313,7 @@ def test__get_dataset_qualities(self): def test_deletion_of_cache_dir(self): # Simple removal - did_cache_dir = openml.utils._create_cache_directory_for_id( + did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, 1, ) self.assertTrue(os.path.exists(did_cache_dir)) @@ -317,12 +335,19 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): self.assertEqual(len(os.listdir(datasets_cache_dir)), 0) def test_publish_dataset(self): + openml.datasets.get_dataset(3) file_path = os.path.join(openml.config.get_cache_directory(), "datasets", "3", "dataset.arff") dataset = OpenMLDataset( - "anneal", "test", "ARFF", - version=1, licence="public", default_target_attribute="class", data_file=file_path) + "anneal", + "test", + data_format="arff", + version=1, + licence="public", + default_target_attribute="class", + data_file=file_path, + ) dataset.publish() self.assertIsInstance(dataset.dataset_id, int) @@ -335,10 +360,14 @@ def test__retrieve_class_labels(self): self.assertEqual(labels, ['C', 'H', 'G']) def test_upload_dataset_with_url(self): + dataset = OpenMLDataset( - "UploadTestWithURL", "test", "ARFF", + "UploadTestWithURL", + "test", + data_format="arff", version=1, - url="https://www.openml.org/data/download/61/dataset_61_iris.arff") + url="https://www.openml.org/data/download/61/dataset_61_iris.arff", + ) dataset.publish() self.assertIsInstance(dataset.dataset_id, int) @@ -377,39 +406,268 @@ def test_data_status(self): self.assertEqual(result[did]['status'], 'active') def test_create_dataset_numpy(self): - data = np.array([[1, 2, 3], - [1.2, 2.5, 3.8], - [2, 5, 8], - [0, 1, 0]]).T + + data = np.array( + [ + [1, 2, 3], + [1.2, 2.5, 3.8], + [2, 5, 8], + [0, 1, 0] + ] + ).T + attributes = [('col_{}'.format(i), 'REAL') for i in range(data.shape[1])] - name = 'NumPy_testing_dataset' - description = 'Synthetic dataset created from a NumPy array' - creator = 'OpenML tester' - collection_date = '01-01-2018' - language = 'English' - licence = 'MIT' - default_target_attribute = 'col_{}'.format(data.shape[1] - 1) - citation = 'None' - original_data_url = 'http://openml.github.io/openml-python' - paper_url = 'http://openml.github.io/openml-python' - dataset = openml.datasets.functions.create_dataset( - name=name, - description=description, - creator=creator, + + dataset = create_dataset( + name='NumPy_testing_dataset', + description='Synthetic dataset created from a NumPy array', + creator='OpenML tester', contributor=None, - collection_date=collection_date, - language=language, - licence=licence, - default_target_attribute=default_target_attribute, + collection_date='01-01-2018', + language='English', + licence='MIT', + default_target_attribute='col_{}'.format(data.shape[1] - 1), row_id_attribute=None, ignore_attribute=None, - citation=citation, + citation='None', attributes=attributes, data=data, - format='arff', version_label='test', - original_data_url=original_data_url, - paper_url=paper_url + original_data_url='http://openml.github.io/openml-python', + paper_url='http://openml.github.io/openml-python' + ) + + upload_did = dataset.publish() + + self.assertEqual( + _get_online_dataset_arff(upload_did), + dataset._dataset, + "Uploaded arff does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'arff', + "Wrong format for dataset" + ) + + def test_create_dataset_list(self): + + data = [ + ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'], + ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'], + ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'], + ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'], + ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'], + ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'], + ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'], + ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'], + ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'], + ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'], + ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'], + ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'], + ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'], + ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'], + ] + + attributes = [ + ('rnd_str', 'STRING'), + ('outlook', ['sunny', 'overcast', 'rainy']), + ('temperature', 'REAL'), + ('humidity', 'REAL'), + ('windy', ['TRUE', 'FALSE']), + ('play', ['yes', 'no']), + ] + + dataset = create_dataset( + name="ModifiedWeather", + description=( + 'Testing dataset upload when the data is a list of lists' + ), + creator='OpenML test', + contributor=None, + collection_date='21-09-2018', + language='English', + licence='MIT', + default_target_attribute='play', + row_id_attribute=None, + ignore_attribute=None, + citation='None', + attributes=attributes, + data=data, + version_label='test', + original_data_url='http://openml.github.io/openml-python', + paper_url='http://openml.github.io/openml-python' + ) + + upload_did = dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + dataset._dataset, + "Uploaded ARFF does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'arff', + "Wrong format for dataset" + ) + + def test_create_dataset_sparse(self): + + # test the scipy.sparse.coo_matrix + sparse_data = scipy.sparse.coo_matrix(( + [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]) + )) + + column_names = [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), + ] + + xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='test', + ) + + upload_did = xor_dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + xor_dataset._dataset, + "Uploaded ARFF does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'sparse_arff', + "Wrong format for dataset" + ) + + # test the list of dicts sparse representation + sparse_data = [ + {0: 0.0}, + {1: 1.0, 2: 1.0}, + {0: 1.0, 2: 1.0}, + {0: 1.0, 1: 1.0} + ] + + xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='test', + ) + + upload_did = xor_dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + xor_dataset._dataset, + "Uploaded ARFF does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'sparse_arff', + "Wrong format for dataset" + ) + + def test_create_invalid_dataset(self): + + data = [ + 'sunny', + 'overcast', + 'overcast', + 'rainy', + 'rainy', + 'rainy', + 'overcast', + 'sunny', + 'sunny', + 'rainy', + 'sunny', + 'overcast', + 'overcast', + 'rainy', + ] + + param = self._get_empty_param_for_dataset() + param['data'] = data + + self.assertRaises( + ValueError, + create_dataset, + **param + ) + + param['data'] = data[0] + self.assertRaises( + ValueError, + create_dataset, + **param + ) + + def test_create_dataset_warning(self): + + parameters = self._get_empty_param_for_dataset() + parameters['format'] = 'arff' + with catch_warnings(): + filterwarnings('error') + self.assertRaises( + DeprecationWarning, + create_dataset, + **parameters + ) + + def test_get_online_dataset_arff(self): + + # Australian dataset + dataset_id = 100 + dataset = openml.datasets.get_dataset(dataset_id) + decoder = arff.ArffDecoder() + # check if the arff from the dataset is + # the same as the arff from _get_arff function + d_format = (dataset.format).lower() + + self.assertEqual( + dataset._get_arff(d_format), + decoder.decode( + _get_online_dataset_arff(dataset_id), + encode_nominal=True, + return_type=arff.DENSE + if d_format == 'arff' else arff.COO + ), + "ARFF files are not equal" + ) + + def test_get_online_dataset_format(self): + + # Phoneme dataset + dataset_id = 77 + dataset = openml.datasets.get_dataset(dataset_id) + + self.assertEqual( + (dataset.format).lower(), + _get_online_dataset_format(dataset_id), + "The format of the ARFF files is different" ) - dataset.publish()