Merged

45 commits
fda4b90
Add unit test for list of lists dataset upload
ArlindKadra Sep 21, 2018
2c7fd30
Fixing xml pattern typo
ArlindKadra Sep 21, 2018
ed33768
Fix pep8 no newline at the end of file
ArlindKadra Sep 21, 2018
a4ebfa7
Remove format from definitions
ArlindKadra Sep 21, 2018
ebd7113
Restoring format in dataset
ArlindKadra Sep 21, 2018
5d6053e
Fixing a couple of unused imports and fixings bugs with create_datase…
ArlindKadra Sep 21, 2018
fbc1f6b
Adapting unit tests to changes
ArlindKadra Sep 21, 2018
6a3ffb8
Fixing failing unit tests
ArlindKadra Sep 21, 2018
7dc9355
fixing typo
ArlindKadra Sep 21, 2018
7b0fdde
Enforce pep8 style guide, fix doc tutorial trying to invoke create_da…
ArlindKadra Sep 21, 2018
2d7b75c
Workaround for pep8 style guide
ArlindKadra Sep 21, 2018
2919dd6
fix long time typo
ArlindKadra Sep 21, 2018
1c4faff
update pep8 failing statement and bug fix for dataset upload tutorial
ArlindKadra Sep 22, 2018
3602739
fixed problem with arff file
ArlindKadra Sep 24, 2018
46cf1fa
Fix pep8 line too long
ArlindKadra Sep 24, 2018
693c368
Extending the unit test for dataset upload, changing upload tutorial
ArlindKadra Sep 25, 2018
e29cf4d
Workaround for the dataset upload unit test
ArlindKadra Sep 27, 2018
f0d8200
Adding example with weather dataset into the dataset upload tutorial
ArlindKadra Sep 28, 2018
be7791f
Fixing builds failure
ArlindKadra Oct 1, 2018
5011216
Adding support for sparse datasets, implementing corresponding unit t…
ArlindKadra Oct 1, 2018
005649a
fix bug
ArlindKadra Oct 1, 2018
b4103df
More unit tests and bug fix
ArlindKadra Oct 1, 2018
2e898ee
Fixing bugs
ArlindKadra Oct 1, 2018
43c6530
Fix bug and pep8 errors
ArlindKadra Oct 1, 2018
f45adbf
Enforcing pep8 and fixing changing the name of attribute format as it…
ArlindKadra Oct 1, 2018
cfd5767
Implementing change in a better way
ArlindKadra Oct 1, 2018
82c7173
Fixing bugs introduced by changing the format in the constructor
ArlindKadra Oct 1, 2018
61cd547
Another try to tackle the bugs
ArlindKadra Oct 1, 2018
4ec6b23
Small refactor
ArlindKadra Oct 3, 2018
45321d2
Fixing pep8 error
ArlindKadra Oct 3, 2018
654cbd0
Fix python2.7 bug
ArlindKadra Oct 4, 2018
714619f
making changes in accordance with Guillaume's suggestions
ArlindKadra Oct 8, 2018
e858689
Adding unit tests, small refactoring
ArlindKadra Oct 8, 2018
e711267
Enforcing pep8 style
ArlindKadra Oct 8, 2018
a3dbb9a
Following Matthias's suggestions
ArlindKadra Oct 15, 2018
4ae71be
Fixing bug introduced by variable name change
ArlindKadra Oct 15, 2018
f922654
Changing the breast_cancer dataset to diabetes, fixing typo with weat…
ArlindKadra Oct 15, 2018
e84c42d
Further changes
ArlindKadra Oct 15, 2018
0f653a3
Merge branch 'develop' into fix540
ArlindKadra Oct 15, 2018
1d7f8eb
Adding more changes
ArlindKadra Oct 16, 2018
82b4758
Fixing bug
ArlindKadra Oct 16, 2018
fc53ef6
Merge branch 'fix540' of https://github.com/openml/openml-python into…
ArlindKadra Oct 16, 2018
0edea31
Pep8 enforce
ArlindKadra Oct 16, 2018
751f8c9
few changes
ArlindKadra Oct 16, 2018
0c66cfc
Fixing typo in dataset name attributes
ArlindKadra Oct 17, 2018
5 changes: 5 additions & 0 deletions .travis.yml
@@ -25,6 +25,11 @@ env:
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2"

# Travis issue
# https://github.com/travis-ci/travis-ci/issues/8920
before_install:
Collaborator: Could you please add a comment here on why this is necessary as in #560?

Member Author: yes, I missed doing that.

- python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
Contributor: What is it useful for?

Member Author: A workaround, the travis-ci builds started failing not long ago.
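For context on the workaround being discussed: `fcntl.fcntl(1, fcntl.F_SETFL, 0)` clears the status flags on stdout, in particular `O_NONBLOCK`, so that long log writes block instead of failing mid-build. A minimal sketch of the effect (a slightly safer variant that clears only the non-blocking bit rather than zeroing all flags; Unix-only, since `fcntl` does not exist on Windows):

```python
import fcntl
import os

# Read the current status flags of file descriptor 1 (stdout).
flags = fcntl.fcntl(1, fcntl.F_GETFL)

# The Travis one-liner sets the flags to 0; the essential part is
# clearing os.O_NONBLOCK so writes to stdout block instead of
# raising EAGAIN when the pipe buffer is full.
fcntl.fcntl(1, fcntl.F_SETFL, flags & ~os.O_NONBLOCK)

print(fcntl.fcntl(1, fcntl.F_GETFL) & os.O_NONBLOCK)  # 0
```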


install: source ci_scripts/install.sh
script: bash ci_scripts/test.sh
after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result"
2 changes: 1 addition & 1 deletion ci_scripts/flake8_diff.sh
@@ -125,7 +125,7 @@ check_files() {
if [ -n "$files" ]; then
# Conservative approach: diff without context (--unified=0) so that code
# that was not changed does not create failures
-git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options
+git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --ignore E402 --diff --show-source $options
fi
}

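For reference, flake8's E402 ("module level import not at top of file") fires whenever any module-level statement precedes an import; the gallery-style tutorials in this PR interleave commentary sections and code with imports, which is presumably why the check is suppressed in the diff above. A minimal hypothetical file that would trigger it:

```python
"""A file that triggers flake8 E402."""

RELEASE = True  # any module-level statement before an import ...

import os  # ... makes flake8 flag this import line with E402

# The code still runs fine; E402 is purely a style check.
print(os.path.join("a", "b"))
```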
205 changes: 171 additions & 34 deletions examples/create_upload_tutorial.py
@@ -5,53 +5,87 @@
A tutorial on how to create and upload a dataset to OpenML.
"""
 import numpy as np
-import openml
 import sklearn.datasets
+from scipy.sparse import coo_matrix
+
+import openml
+from openml.datasets.functions import create_dataset

############################################################################
-# For this example we will upload to the test server to not pollute the live server with countless copies of the same dataset.
+# For this tutorial we will upload to the test server to not pollute the live
+# server with countless copies of the same dataset.
openml.config.server = 'https://test.openml.org/api/v1/xml'

############################################################################
-# Prepare the data
-# ^^^^^^^^^^^^^^^^
-# Load an example dataset from scikit-learn which we will upload to OpenML.org via the API.
-breast_cancer = sklearn.datasets.load_breast_cancer()
-name = 'BreastCancer(scikit-learn)'
-X = breast_cancer.data
-y = breast_cancer.target
-attribute_names = breast_cancer.feature_names
-targets = breast_cancer.target_names
-description = breast_cancer.DESCR
+# Below we will cover the following cases of the
+# dataset object:
+#
+# * A numpy array
+# * A list
+# * A sparse matrix

############################################################################
-# OpenML does not distinguish between the attributes and targets on the data level and stores all data in a
-# single matrix. The target feature is indicated as meta-data of the dataset (and tasks on that data).
+# Dataset is a numpy array
+# ========================
+# A numpy array can contain lists in the case of dense data
+# or it can contain OrderedDicts in the case of sparse data.
+#
+# Prepare dataset
+# ^^^^^^^^^^^^^^^
+# Load an example dataset from scikit-learn which we
+# will upload to OpenML.org via the API.

diabetes = sklearn.datasets.load_diabetes()
name = 'Diabetes(scikit-learn)'
X = diabetes.data
y = diabetes.target
attribute_names = diabetes.feature_names
description = diabetes.DESCR

############################################################################
# OpenML does not distinguish between the attributes and
# targets on the data level and stores all data in a single matrix.
#
# The target feature is indicated as meta-data of the
# dataset (and tasks on that data).

data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
attribute_names = list(attribute_names)
attributes = [
(attribute_name, 'REAL') for attribute_name in attribute_names
-] + [('class', 'REAL')]
+] + [('class', 'INTEGER')]
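The concatenation above appends the target as the last column of a single matrix, which is the layout OpenML stores. This can be sanity-checked on a toy array (made-up shapes, not the diabetes data itself):

```python
import numpy as np

# Toy stand-ins for X with shape (n_samples, n_features) and y with
# shape (n_samples,), as returned by sklearn.datasets loaders.
X = np.arange(6.0).reshape(3, 2)
y = np.array([10.0, 20.0, 30.0])

# Reshape y into a column vector and append it as the last column,
# mirroring the tutorial's single-matrix layout.
data = np.concatenate((X, y.reshape((-1, 1))), axis=1)

print(data.shape)   # (3, 3)
print(data[:, -1])  # [10. 20. 30.]
```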
citation = (
"Bradley Efron, Trevor Hastie, Iain Johnstone and "
"Robert Tibshirani (2004) (Least Angle Regression) "
"Annals of Statistics (with discussion), 407-499"
)
paper_url = (
'http://web.stanford.edu/~hastie/Papers/'
'LARS/LeastAngle_2002.pdf'
)

############################################################################
# Create the dataset object
# ^^^^^^^^^^^^^^^^^^^^^^^^^
-# The definition of all fields can be found in the XSD files describing the expected format:
+# The definition of all fields can be found in the
+# XSD files describing the expected format:
#
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
-dataset = openml.datasets.functions.create_dataset(
+
+diabetes_dataset = create_dataset(
# The name of the dataset (needs to be unique).
# Must not be longer than 128 characters and only contain
# a-z, A-Z, 0-9 and the following special characters: _\-\.(),
name=name,
# Textual description of the dataset.
description=description,
# The person who created the dataset.
-creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian',
+creator="Bradley Efron, Trevor Hastie, "
+"Iain Johnstone and Robert Tibshirani",
# People who contributed to the current version of the dataset.
contributor=None,
# The date the data was originally collected, given by the uploader.
-collection_date='01-11-1995',
+collection_date='09-01-2012',
# Language in which the data is represented.
# Starts with 1 upper case letter, rest lower case, e.g. 'English'.
language='English',
@@ -64,26 +98,129 @@
# Attributes that should be excluded in modelling, such as identifiers and indexes.
ignore_attribute=None,
# How to cite the paper.
-citation=(
-    "W.N. Street, W.H. Wolberg and O.L. Mangasarian. "
-    "Nuclear feature extraction for breast tumor diagnosis. "
-    "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, "
-    "volume 1905, pages 861-870, San Jose, CA, 1993."
-),
+citation=citation,
# Attributes of the data
attributes=attributes,
data=data,
# Format of the dataset. Only 'arff' for now.
format='arff',
# A version label which is provided by the user.
version_label='test',
-original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)',
-paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1'
+original_data_url=(
+    'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'
+),
+paper_url=paper_url,
)

############################################################################
-try:
-    upload_id = dataset.publish()
-    print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id))
-except openml.exceptions.PyOpenMLError as err:
-    print("OpenML: {0}".format(err))
+upload_did = diabetes_dataset.publish()
+print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))

############################################################################
# Dataset is a list
# =================
# A list can contain lists in the case of dense data
# or it can contain OrderedDicts in the case of sparse data.
#
# Weather dataset:
# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html

data = [
['sunny', 85, 85, 'FALSE', 'no'],
['sunny', 80, 90, 'TRUE', 'no'],
['overcast', 83, 86, 'FALSE', 'yes'],
['rainy', 70, 96, 'FALSE', 'yes'],
['rainy', 68, 80, 'FALSE', 'yes'],
['rainy', 65, 70, 'TRUE', 'no'],
['overcast', 64, 65, 'TRUE', 'yes'],
['sunny', 72, 95, 'FALSE', 'no'],
['sunny', 69, 70, 'FALSE', 'yes'],
['rainy', 75, 80, 'FALSE', 'yes'],
['sunny', 75, 70, 'TRUE', 'yes'],
['overcast', 72, 90, 'TRUE', 'yes'],
['overcast', 81, 75, 'FALSE', 'yes'],
['rainy', 71, 91, 'TRUE', 'no'],
]

attribute_names = [
('outlook', ['sunny', 'overcast', 'rainy']),
('temperature', 'REAL'),
('humidity', 'REAL'),
('windy', ['TRUE', 'FALSE']),
('play', ['yes', 'no']),
]

description = (
'The weather problem is a tiny dataset that we will use repeatedly'
' to illustrate machine learning methods. Entirely fictitious, it '
'supposedly concerns the conditions that are suitable for playing '
'some unspecified game. In general, instances in a dataset are '
'characterized by the values of features, or attributes, that measure '
'different aspects of the instance. In this case there are four '
'attributes: outlook, temperature, humidity, and windy. '
'The outcome is whether to play or not.'
)

citation = (
'I. H. Witten, E. Frank, M. A. Hall, and ITPro, '
'Data mining practical machine learning tools and techniques, '
'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011'
)

weather_dataset = create_dataset(
name="Weather",
description=description,
creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
contributor=None,
collection_date='01-01-2011',
language='English',
licence=None,
default_target_attribute='play',
row_id_attribute=None,
ignore_attribute=None,
citation=citation,
attributes=attribute_names,
data=data,
version_label='example',
)

############################################################################

upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))

############################################################################
# Dataset is a sparse matrix
# ==========================

sparse_data = coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
))
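The COO triplets above are `(values, (row_indices, column_indices))`; they encode the XOR truth table with the target `y` as the last column. Densifying the matrix makes this visible (a quick check, assuming scipy is available):

```python
import numpy as np
from scipy.sparse import coo_matrix

# Same (values, (rows, cols)) triplets as in the tutorial above.
sparse_data = coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
))

# Densifying shows the columns input1, input2 and y = input1 XOR input2.
print(sparse_data.toarray())
# [[0. 0. 0.]
#  [0. 1. 1.]
#  [1. 0. 1.]
#  [1. 1. 0.]]
```

Note that the explicit `0.0` entry at position (0, 0) is what fixes the first row of the truth table; without it the all-zero row would carry no stored values at all.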

column_names = [
('input1', 'REAL'),
('input2', 'REAL'),
('y', 'REAL'),
]

xor_dataset = create_dataset(
name="XOR",
description='Dataset representing the XOR operation',
creator=None,
contributor=None,
collection_date=None,
language='English',
licence=None,
default_target_attribute='y',
row_id_attribute=None,
ignore_attribute=None,
citation=None,
attributes=column_names,
data=sparse_data,
version_label='example',
)

############################################################################

upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
23 changes: 18 additions & 5 deletions openml/datasets/__init__.py
@@ -1,8 +1,21 @@
-from .functions import (list_datasets, check_datasets_active,
-                        get_datasets, get_dataset, status_update)
+from .functions import (
+    check_datasets_active,
+    create_dataset,
+    get_dataset,
+    get_datasets,
+    list_datasets,
+    status_update,
+)
from .dataset import OpenMLDataset
from .data_feature import OpenMLDataFeature

-__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
-           'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets',
-           'status_update']
+__all__ = [
+    'check_datasets_active',
+    'create_dataset',
+    'get_dataset',
+    'get_datasets',
+    'list_datasets',
+    'OpenMLDataset',
+    'OpenMLDataFeature',
+    'status_update',
+]