diff --git a/.travis.yml b/.travis.yml
index f0cecf80d..07e5f80fd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,11 @@ env:
   - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
   - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2"
 
+# Travis issue
+# https://github.com/travis-ci/travis-ci/issues/8920
+before_install:
+ - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
+
 install: source ci_scripts/install.sh
 script: bash ci_scripts/test.sh
 after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result"
diff --git a/ci_scripts/flake8_diff.sh b/ci_scripts/flake8_diff.sh
index 90d7923ad..9207163bb 100644
--- a/ci_scripts/flake8_diff.sh
+++ b/ci_scripts/flake8_diff.sh
@@ -125,7 +125,7 @@ check_files() {
     if [ -n "$files" ]; then
         # Conservative approach: diff without context (--unified=0) so that code
         # that was not changed does not create failures
-        git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options
+        git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --ignore E402 --diff --show-source $options
     fi
 }
 
diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py
index 962c9b98e..d68100648 100644
--- a/examples/create_upload_tutorial.py
+++ b/examples/create_upload_tutorial.py
@@ -5,41 +5,74 @@
 A tutorial on how to create and upload a dataset to OpenML.
 """
 import numpy as np
-import openml
 import sklearn.datasets
+from scipy.sparse import coo_matrix
+
+import openml
+from openml.datasets.functions import create_dataset
 
 ############################################################################
-# For this example we will upload to the test server to not  pollute the live server with countless copies of the same dataset.
+# For this tutorial we will upload to the test server to not pollute the live
+# server with countless copies of the same dataset.
 openml.config.server = 'https://test.openml.org/api/v1/xml'
 
 ############################################################################
-# Prepare the data
-# ^^^^^^^^^^^^^^^^
-# Load an example dataset from scikit-learn which we will upload to OpenML.org via the API.
-breast_cancer = sklearn.datasets.load_breast_cancer()
-name = 'BreastCancer(scikit-learn)'
-X = breast_cancer.data
-y = breast_cancer.target
-attribute_names = breast_cancer.feature_names
-targets = breast_cancer.target_names
-description = breast_cancer.DESCR
+# Below we will cover the following cases of the
+# dataset object:
+#
+# * A numpy array
+# * A list
+# * A sparse matrix
 
 ############################################################################
-# OpenML does not distinguish between the attributes and targets on the data level and stores all data in a
-# single matrix. The target feature is indicated as meta-data of the dataset (and tasks on that data).
+# Dataset is a numpy array
+# ========================
+# A numpy array can contain lists in the case of dense data
+# or it can contain OrderedDicts in the case of sparse data.
+#
+# Prepare dataset
+# ^^^^^^^^^^^^^^^
+# Load an example dataset from scikit-learn which we
+# will upload to OpenML.org via the API.
+
+diabetes = sklearn.datasets.load_diabetes()
+name = 'Diabetes(scikit-learn)'
+X = diabetes.data
+y = diabetes.target
+attribute_names = diabetes.feature_names
+description = diabetes.DESCR
+
+############################################################################
+# OpenML does not distinguish between the attributes and
+# targets on the data level and stores all data in a single matrix.
+#
+# The target feature is indicated as meta-data of the
+# dataset (and tasks on that data).
+
 data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
 attribute_names = list(attribute_names)
 attributes = [
     (attribute_name, 'REAL') for attribute_name in attribute_names
-] + [('class', 'REAL')]
+] + [('class', 'INTEGER')]
+citation = (
+    "Bradley Efron, Trevor Hastie, Iain Johnstone and "
+    "Robert Tibshirani (2004) (Least Angle Regression) "
+    "Annals of Statistics (with discussion), 407-499"
+)
+paper_url = (
+    'http://web.stanford.edu/~hastie/Papers/'
+    'LARS/LeastAngle_2002.pdf'
+)
 
 ############################################################################
 # Create the dataset object
 # ^^^^^^^^^^^^^^^^^^^^^^^^^
-# The definition of all fields can be found in the XSD files describing the expected format:
+# The definition of all fields can be found in the
+# XSD files describing the expected format:
 #
 # https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
-dataset = openml.datasets.functions.create_dataset(
+
+diabetes_dataset = create_dataset(
     # The name of the dataset (needs to be unique). 
     # Must not be longer than 128 characters and only contain
     # a-z, A-Z, 0-9 and the following special characters: _\-\.(),
@@ -47,11 +80,12 @@
     # Textual description of the dataset.
     description=description,
     # The person who created the dataset.
-    creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian',
+    creator="Bradley Efron, Trevor Hastie, "
+            "Iain Johnstone and Robert Tibshirani",
     # People who contributed to the current version of the dataset.
     contributor=None,
     # The date the data was originally collected, given by the uploader.
-    collection_date='01-11-1995',
+    collection_date='09-01-2012',
     # Language in which the data is represented.
     # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
     language='English',
@@ -64,26 +98,129 @@
     # Attributes that should be excluded in modelling, such as identifiers and indexes.
     ignore_attribute=None,
     # How to cite the paper.
-    citation=(
-        "W.N. Street, W.H. Wolberg and O.L. Mangasarian. "
-        "Nuclear feature extraction for breast tumor diagnosis. "
-        "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, "
-        "volume 1905, pages 861-870, San Jose, CA, 1993."
-    ),
+    citation=citation,
     # Attributes of the data
     attributes=attributes,
     data=data,
-    # Format of the dataset. Only 'arff' for now.
-    format='arff',
     # A version label which is provided by the user.
     version_label='test',
-    original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)',
-    paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1'
+    original_data_url=(
+        'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'
+    ),
+    paper_url=paper_url,
 )
 
 ############################################################################
-try:
-    upload_id = dataset.publish()
-    print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id))
-except openml.exceptions.PyOpenMLError as err:
-    print("OpenML: {0}".format(err))
+
+upload_did = diabetes_dataset.publish()
+print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
+
+############################################################################
+# Dataset is a list
+# =================
+# A list can contain lists in the case of dense data
+# or it can contain OrderedDicts in the case of sparse data.
+#
+# Weather dataset:
+# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
+
+data = [
+    ['sunny', 85, 85, 'FALSE', 'no'],
+    ['sunny', 80, 90, 'TRUE', 'no'],
+    ['overcast', 83, 86, 'FALSE', 'yes'],
+    ['rainy', 70, 96, 'FALSE', 'yes'],
+    ['rainy', 68, 80, 'FALSE', 'yes'],
+    ['rainy', 65, 70, 'TRUE', 'no'],
+    ['overcast', 64, 65, 'TRUE', 'yes'],
+    ['sunny', 72, 95, 'FALSE', 'no'],
+    ['sunny', 69, 70, 'FALSE', 'yes'],
+    ['rainy', 75, 80, 'FALSE', 'yes'],
+    ['sunny', 75, 70, 'TRUE', 'yes'],
+    ['overcast', 72, 90, 'TRUE', 'yes'],
+    ['overcast', 81, 75, 'FALSE', 'yes'],
+    ['rainy', 71, 91, 'TRUE', 'no'],
+]
+
+attribute_names = [
+    ('outlook', ['sunny', 'overcast', 'rainy']),
+    ('temperature', 'REAL'),
+    ('humidity', 'REAL'),
+    ('windy', ['TRUE', 'FALSE']),
+    ('play', ['yes', 'no']),
+]
+
+description = (
+    'The weather problem is a tiny dataset that we will use repeatedly'
+    ' to illustrate machine learning methods. Entirely fictitious, it '
+    'supposedly concerns the conditions that are suitable for playing '
+    'some unspecified game. In general, instances in a dataset are '
+    'characterized by the values of features, or attributes, that measure '
+    'different aspects of the instance. In this case there are four '
+    'attributes: outlook, temperature, humidity, and windy. '
+    'The outcome is whether to play or not.'
+)
+
+citation = (
+    'I. H. Witten, E. Frank, M. A. Hall, and ITPro, '
+    'Data mining practical machine learning tools and techniques, '
+    'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011'
+)
+
+weather_dataset = create_dataset(
+    name="Weather",
+    description=description,
+    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
+    contributor=None,
+    collection_date='01-01-2011',
+    language='English',
+    licence=None,
+    default_target_attribute='play',
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=citation,
+    attributes=attribute_names,
+    data=data,
+    version_label='example',
+)
+
+############################################################################
+
+upload_did = weather_dataset.publish()
+print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
+
+############################################################################
+# Dataset is a sparse matrix
+# ==========================
+
+sparse_data = coo_matrix((
+    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
+))
+
+column_names = [
+    ('input1', 'REAL'),
+    ('input2', 'REAL'),
+    ('y', 'REAL'),
+]
+
+xor_dataset = create_dataset(
+    name="XOR",
+    description='Dataset representing the XOR operation',
+    creator=None,
+    contributor=None,
+    collection_date=None,
+    language='English',
+    licence=None,
+    default_target_attribute='y',
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=None,
+    attributes=column_names,
+    data=sparse_data,
+    version_label='example',
+)
+
+############################################################################
+
+upload_did = xor_dataset.publish()
+print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
index d4aa2690b..c0ce3676e 100644
--- a/openml/datasets/__init__.py
+++ b/openml/datasets/__init__.py
@@ -1,8 +1,21 @@
-from .functions import (list_datasets, check_datasets_active,
-                        get_datasets, get_dataset, status_update)
+from .functions import (
+    check_datasets_active,
+    create_dataset,
+    get_dataset,
+    get_datasets,
+    list_datasets,
+    status_update,
+)
 from .dataset import OpenMLDataset
 from .data_feature import OpenMLDataFeature
 
-__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
-           'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets',
-           'status_update']
+__all__ = [
+    'check_datasets_active',
+    'create_dataset',
+    'get_dataset',
+    'get_datasets',
+    'list_datasets',
+    'OpenMLDataset',
+    'OpenMLDataFeature',
+    'status_update',
+]
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index fe05fa29f..b4213e91a 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -1,20 +1,21 @@
-from collections import OrderedDict
 import gzip
 import io
 import logging
 import os
-import six
+from collections import OrderedDict
 
 import arff
-
 import numpy as np
 import scipy.sparse
-from six.moves import cPickle as pickle
 import xmltodict
+import six
+from six.moves import cPickle as pickle
+from warnings import warn
 
+import openml._api_calls
 from .data_feature import OpenMLDataFeature
 from ..exceptions import PyOpenMLError
-import openml._api_calls
+
 
 logger = logging.getLogger(__name__)
 
@@ -31,7 +32,7 @@ class OpenMLDataset(object):
     description : str
         Description of the dataset.
     format : str
-        Format of the dataset. Only 'arff' for now.
+        Format of the dataset which can be either 'arff' or 'sparse_arff'.
     dataset_id : int, optional
         Id autogenerated by the server.
     version : int, optional
@@ -86,23 +87,31 @@ class OpenMLDataset(object):
     dataset: string, optional
         Serialized arff dataset string.
     """
-    def __init__(self, name, description, format, dataset_id=None,
-                 version=None, creator=None, contributor=None,
-                 collection_date=None, upload_date=None, language=None,
-                 licence=None, url=None, default_target_attribute=None,
+    def __init__(self, name, description, format=None,
+                 data_format='arff', dataset_id=None, version=None,
+                 creator=None, contributor=None, collection_date=None,
+                 upload_date=None, language=None, licence=None,
+                 url=None, default_target_attribute=None,
                  row_id_attribute=None, ignore_attribute=None,
-                 version_label=None, citation=None, tag=None, visibility=None,
-                 original_data_url=None, paper_url=None, update_comment=None,
-                 md5_checksum=None, data_file=None, features=None, qualities=None,
-                 dataset=None):
-        # TODO add function to check if the name is casual_string128
+                 version_label=None, citation=None, tag=None,
+                 visibility=None, original_data_url=None,
+                 paper_url=None, update_comment=None,
+                 md5_checksum=None, data_file=None, features=None,
+                 qualities=None, dataset=None):
 
+        # TODO add function to check if the name is casual_string128
         # Attributes received by querying the RESTful API
         self.dataset_id = int(dataset_id) if dataset_id is not None else None
         self.name = name
         self.version = int(version) if version is not None else None
         self.description = description
-        self.format = format
+        if format is None:
+            self.format = data_format
+        else:
+            warn("The format parameter in the init will be deprecated "
+                 "in the future. "
+                 "Please use data_format instead", DeprecationWarning)
+            self.format = format
         self.creator = creator
         self.contributor = contributor
         self.collection_date = collection_date
@@ -128,7 +137,7 @@ def __init__(self, name, description, format, dataset_id=None,
         self.original_data_url = original_data_url
         self.paper_url = paper_url
         self.update_comment = update_comment
-        self.md5_cheksum = md5_checksum
+        self.md5_checksum = md5_checksum
         self.data_file = data_file
         self.features = None
         self.qualities = None
@@ -169,13 +178,13 @@ def __init__(self, name, description, format, dataset_id=None,
                                    for name, type_ in data['attributes']]
                     attribute_names = [name for name, type_ in data['attributes']]
 
-                    if format.lower() == 'sparse_arff':
+                    if self.format.lower() == 'sparse_arff':
                         X = data['data']
                         X_shape = (max(X[1]) + 1, max(X[2]) + 1)
                         X = scipy.sparse.coo_matrix(
                             (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
                         X = X.tocsr()
-                    elif format.lower() == 'arff':
+                    elif self.format.lower() == 'arff':
                         X = np.array(data['data'], dtype=np.float32)
                     else:
                         raise Exception()
@@ -208,16 +217,33 @@ def remove_tag(self, tag):
         openml._api_calls._perform_api_call("/data/untag", data=data)
 
     def __eq__(self, other):
+        """Equal if all attributes outside ``server_fields`` match."""
         if type(other) != OpenMLDataset:
             return False
-        elif (
-            self.dataset_id == other.dataset_id
-            or (self.name == other._name and self.version == other._version)
-        ):
-            return True
-        else:
+
+        server_fields = {
+            'dataset_id',
+            'version',
+            'upload_date',
+            'url',
+            'dataset',
+            'data_file',
+        }
+
+        # check that the keys are identical
+        self_keys = set(self.__dict__.keys()) - server_fields
+        other_keys = set(other.__dict__.keys()) - server_fields
+        if self_keys != other_keys:
             return False
 
+        # check that values of the common keys are identical
+        return all(self.__dict__[key] == other.__dict__[key]
+                   for key in self_keys)
+
+    def __ne__(self, other):
+        """Negate ``__eq__``; required on Python 2, redundant on Python 3."""
+        return not self.__eq__(other)
+
     def _get_arff(self, format):
         """Read ARFF file and return decoded arff.
 
@@ -524,8 +550,6 @@ def _to_xml(self):
         xml_dataset : str
             XML description of the data.
         """
-        xml_dataset = ('<oml:data_set_description '
-                       'xmlns:oml="http://openml.org/openml">\n')
         props = ['id', 'name', 'version', 'description', 'format', 'creator',
                  'contributor', 'collection_date', 'upload_date', 'language',
                  'licence', 'url', 'default_target_attribute',
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index ef80f48b5..343429a84 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -1,20 +1,26 @@
-from collections import OrderedDict
 import hashlib
 import io
 import os
 import re
-import shutil
+
+import numpy as np
 import six
 import arff
-
-from oslo_concurrency import lockutils
 import xmltodict
+from scipy.sparse import coo_matrix
+from oslo_concurrency import lockutils
+from collections import OrderedDict
+from warnings import warn
 
 import openml.utils
 import openml._api_calls
 from .dataset import OpenMLDataset
-from ..exceptions import OpenMLCacheException, OpenMLServerException, \
-    OpenMLHashException, PrivateDatasetError
+from ..exceptions import (
+    OpenMLCacheException,
+    OpenMLHashException,
+    OpenMLServerException,
+    PrivateDatasetError,
+)
 from ..utils import (
     _create_cache_directory,
     _remove_cache_dir_for_id,
@@ -353,11 +359,13 @@ def get_dataset(dataset_id):
     return dataset
 
 
-def create_dataset(name, description, creator, contributor, collection_date,
-                   language, licence, attributes, data, default_target_attribute,
-                   row_id_attribute, ignore_attribute, citation, format="arff",
-                   original_data_url=None, paper_url=None, update_comment=None,
-                   version_label=None):
+def create_dataset(name, description, creator, contributor,
+                   collection_date, language,
+                   licence, attributes, data,
+                   default_target_attribute, row_id_attribute,
+                   ignore_attribute, citation, format=None,
+                   original_data_url=None, paper_url=None,
+                   update_comment=None, version_label=None):
     """Create a dataset.
 
     This function creates an OpenMLDataset object.
@@ -370,6 +378,11 @@ def create_dataset(name, description, creator, contributor, collection_date,
         Name of the dataset.
     description : str
         Description of the dataset.
+    format : str, optional
+        Format of the dataset which can be either 'arff' or 'sparse_arff'.
+        By default, the format is automatically inferred.
+        .. deprecated:: 0.8
+            ``format`` is deprecated in 0.8 and will be removed in 0.10.
     creator : str
         The person who created the dataset.
     contributor : str
@@ -383,7 +396,7 @@ def create_dataset(name, description, creator, contributor, collection_date,
         License of the data.
     attributes : list
         A list of tuples. Each tuple consists of the attribute name and type.
-    data : numpy.ndarray
+    data : numpy.ndarray | list | scipy.sparse.coo_matrix
         An array that contains both the attributes and the targets, with
         shape=(n_samples, n_features).
         The target feature is indicated as meta-data of the dataset.
@@ -396,8 +409,6 @@ def create_dataset(name, description, creator, contributor, collection_date,
         Attributes that should be excluded in modelling, such as identifiers and indexes.
     citation : str
         Reference(s) that should be cited when building on this data.
-    format : str, optional
-        Format of the dataset. Only 'arff' for now.
     version_label : str, optional
         Version label provided by user, can be a date, hash, or some other type of id.
     original_data_url : str, optional
@@ -411,6 +422,36 @@ def create_dataset(name, description, creator, contributor, collection_date,
     -------
     class:`openml.OpenMLDataset`
         Dataset description."""
+
+    if format is not None:
+        warn("The format parameter will be deprecated in the future,"
+             " the method will determine the format of the ARFF "
+             "based on the given data.", DeprecationWarning)
+        d_format = format
+
+    # Determine ARFF format from the dataset
+    else:
+        if isinstance(data, list) or isinstance(data, np.ndarray):
+            if isinstance(data[0], list) or isinstance(data[0], np.ndarray):
+                d_format = 'arff'
+            elif isinstance(data[0], dict):
+                d_format = 'sparse_arff'
+            else:
+                raise ValueError(
+                    'When giving a list or a numpy.ndarray, '
+                    'they should contain a list/ numpy.ndarray '
+                    'for dense data or a dictionary for sparse '
+                    'data. Got {!r} instead.'
+                    .format(data[0])
+                )
+        elif isinstance(data, coo_matrix):
+            d_format = 'sparse_arff'
+        else:
+            raise ValueError(
+                'Invalid data type. The data type can be a list, '
+                'a numpy ndarray or a scipy.sparse.coo_matrix'
+            )
+
     arff_object = {
         'relation': name,
         'description': description,
@@ -418,22 +459,39 @@ def create_dataset(name, description, creator, contributor, collection_date,
         'data': data
     }
 
-    # serializes the arff dataset object and returns a string
+    # serializes the ARFF dataset object and returns a string
     arff_dataset = arff.dumps(arff_object)
     try:
-        # check if arff is valid
+        # check if ARFF is valid
         decoder = arff.ArffDecoder()
-        decoder.decode(arff_dataset, encode_nominal=True)
+        decoder.decode(
+            arff_dataset,
+            encode_nominal=True,
+            return_type=arff.COO if d_format == 'sparse_arff' else arff.DENSE
+        )
     except arff.ArffException:
         raise ValueError("The arguments you have provided \
-                             do not construct a valid arff file")
-
-    return OpenMLDataset(name, description, format, creator=creator,
-                         contributor=contributor, collection_date=collection_date,
-                         language=language, licence=licence, default_target_attribute=default_target_attribute,
-                         row_id_attribute=row_id_attribute, ignore_attribute=ignore_attribute, citation=citation,
-                         version_label=version_label, original_data_url=original_data_url, paper_url=paper_url,
-                         update_comment=update_comment, dataset=arff_dataset)
+                             do not construct a valid ARFF file")
+
+    return OpenMLDataset(
+        name,
+        description,
+        data_format=d_format,
+        creator=creator,
+        contributor=contributor,
+        collection_date=collection_date,
+        language=language,
+        licence=licence,
+        default_target_attribute=default_target_attribute,
+        row_id_attribute=row_id_attribute,
+        ignore_attribute=ignore_attribute,
+        citation=citation,
+        version_label=version_label,
+        original_data_url=original_data_url,
+        paper_url=paper_url,
+        update_comment=update_comment,
+        dataset=arff_dataset,
+    )
 
 
 def status_update(data_id, status):
@@ -505,7 +563,7 @@ def _get_dataset_description(did_cache_dir, dataset_id):
 
 
 def _get_dataset_arff(did_cache_dir, description):
-    """Get the filepath to the dataset arff
+    """Get the filepath to the dataset ARFF
 
     Checks if the file is in the cache, if yes, return the path to the file. If
     not, downloads the file and caches it, then returns the file path.
@@ -523,7 +581,7 @@ def _get_dataset_arff(did_cache_dir, description):
     Returns
     -------
     output_filename : string
-        Location of arff file.
+        Location of ARFF file.
     """
     output_file_path = os.path.join(did_cache_dir, "dataset.arff")
     md5_checksum_fixture = description.get("oml:md5_checksum")
@@ -638,40 +696,86 @@ def _create_dataset_from_description(description, features, qualities, arff_file
     Parameters
     ----------
     description : dict
-        Description of a dataset in xmlish dict.
+        Description of a dataset in xml dict.
     arff_file : string
-        Path of dataset arff file.
+        Path of dataset ARFF file.
 
     Returns
     -------
     dataset : dataset object
-        Dataset object from dict and arff.
+        Dataset object from dict and ARFF.
     """
     dataset = OpenMLDataset(
         description["oml:name"],
         description.get("oml:description"),
-        description["oml:format"],
-        description["oml:id"],
-        description["oml:version"],
-        description.get("oml:creator"),
-        description.get("oml:contributor"),
-        description.get("oml:collection_date"),
-        description.get("oml:upload_date"),
-        description.get("oml:language"),
-        description.get("oml:licence"),
-        description["oml:url"],
-        description.get("oml:default_target_attribute"),
-        description.get("oml:row_id_attribute"),
-        description.get("oml:ignore_attribute"),
-        description.get("oml:version_label"),
-        description.get("oml:citation"),
-        description.get("oml:tag"),
-        description.get("oml:visibility"),
-        description.get("oml:original_data_url"),
-        description.get("oml:paper_url"),
-        description.get("oml:update_comment"),
-        description.get("oml:md5_checksum"),
+        data_format=description["oml:format"],
+        dataset_id=description["oml:id"],
+        version=description["oml:version"],
+        creator=description.get("oml:creator"),
+        contributor=description.get("oml:contributor"),
+        collection_date=description.get("oml:collection_date"),
+        upload_date=description.get("oml:upload_date"),
+        language=description.get("oml:language"),
+        licence=description.get("oml:licence"),
+        url=description["oml:url"],
+        default_target_attribute=description.get(
+            "oml:default_target_attribute"
+        ),
+        row_id_attribute=description.get("oml:row_id_attribute"),
+        ignore_attribute=description.get("oml:ignore_attribute"),
+        version_label=description.get("oml:version_label"),
+        citation=description.get("oml:citation"),
+        tag=description.get("oml:tag"),
+        visibility=description.get("oml:visibility"),
+        original_data_url=description.get("oml:original_data_url"),
+        paper_url=description.get("oml:paper_url"),
+        update_comment=description.get("oml:update_comment"),
+        md5_checksum=description.get("oml:md5_checksum"),
         data_file=arff_file,
         features=features,
-        qualities=qualities)
+        qualities=qualities,
+    )
     return dataset
+
+
+def _get_online_dataset_arff(dataset_id):
+    """Download the ARFF file for a given dataset id
+    from the OpenML website.
+
+    Parameters
+    ----------
+    dataset_id : int
+        A dataset id.
+
+    Returns
+    -------
+    str
+        A string representation of an ARFF file.
+    """
+    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id)
+    # Parse the XML description into a dict and read the content behind its
+    # 'oml:url' entry; that content is returned to the caller unchanged.
+    return openml._api_calls._read_url(
+        xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url']
+    )
+
+
+def _get_online_dataset_format(dataset_id):
+    """Get the dataset format for a given dataset id
+    from the OpenML website.
+
+    Parameters
+    ----------
+    dataset_id : int
+        A dataset id.
+
+    Returns
+    -------
+    str
+        Dataset format, converted to lower case.
+    """
+    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id)
+    # Parse the XML description into a dict and lower-case its 'oml:format'.
+    return xmltodict\
+        .parse(dataset_xml)['oml:data_set_description']['oml:format']\
+        .lower()
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 5ec6c816b..c2e507350 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -1,10 +1,12 @@
+from time import time
+
 import numpy as np
-from scipy import sparse
 import six
-from time import time
+from scipy import sparse
+from warnings import filterwarnings, catch_warnings
 
-from openml.testing import TestBase
 import openml
+from openml.testing import TestBase
 
 
 class OpenMLDatasetTest(TestBase):
@@ -97,6 +99,18 @@ def test_get_data_with_ignore_attributes(self):
         self.assertEqual(len(categorical), 38)
         # TODO test multiple ignore attributes!
 
+    def test_dataset_format_constructor(self):
+        """Passing the deprecated ``format`` argument must raise a warning."""
+        with catch_warnings():
+            filterwarnings('error')
+            self.assertRaises(
+                DeprecationWarning,
+                openml.OpenMLDataset,
+                'Test',
+                'Test',
+                format='arff'
+            )
+
 
 class OpenMLDatasetTestOnTestServer(TestBase):
     def setUp(self):
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 367bf0c63..bea0b8317 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -1,19 +1,18 @@
 import unittest
 import os
 import sys
-
+import random
 if sys.version_info[0] >= 3:
     from unittest import mock
 else:
     import mock
 
-import random
+import arff
 import six
-
-from oslo_concurrency import lockutils
-
 import numpy as np
 import scipy.sparse
+from oslo_concurrency import lockutils
+from warnings import filterwarnings, catch_warnings
 
 import openml
 from openml import OpenMLDataset
@@ -21,16 +20,17 @@
     OpenMLHashException, PrivateDatasetError
 from openml.testing import TestBase
 from openml.utils import _tag_entity, _create_cache_directory_for_id
-
 from openml.datasets.functions import (create_dataset,
                                        _get_cached_dataset,
                                        _get_cached_dataset_features,
                                        _get_cached_dataset_qualities,
                                        _get_cached_datasets,
-                                       _get_dataset_description,
                                        _get_dataset_arff,
+                                       _get_dataset_description,
                                        _get_dataset_features,
                                        _get_dataset_qualities,
+                                       _get_online_dataset_arff,
+                                       _get_online_dataset_format,
                                        DATASETS_CACHE_DIR_NAME)
 
 
@@ -58,6 +58,24 @@ def _remove_pickle_files(self):
                 except:
                     pass
 
+    def _get_empty_param_for_dataset(self):
+        # Keyword arguments for create_dataset, every value set to None.
+        return {
+            'name': None,
+            'description': None,
+            'creator': None,
+            'contributor': None,
+            'collection_date': None,
+            'language': None,
+            'licence': None,
+            'default_target_attribute': None,
+            'row_id_attribute': None,
+            'ignore_attribute': None,
+            'citation': None,
+            'attributes': None,
+            'data': None
+        }
+
     def test__list_cached_datasets(self):
         openml.config.cache_directory = self.static_cache_dir
         cached_datasets = openml.datasets.functions._list_cached_datasets()
@@ -295,7 +313,7 @@ def test__get_dataset_qualities(self):
 
     def test_deletion_of_cache_dir(self):
         # Simple removal
-        did_cache_dir = openml.utils._create_cache_directory_for_id(
+        did_cache_dir = _create_cache_directory_for_id(
             DATASETS_CACHE_DIR_NAME, 1,
         )
         self.assertTrue(os.path.exists(did_cache_dir))
@@ -317,12 +335,19 @@ def test_deletion_of_cache_dir_faulty_download(self, patch):
         self.assertEqual(len(os.listdir(datasets_cache_dir)), 0)
 
     def test_publish_dataset(self):
+        # Publish the cached ARFF of dataset 3 and check an integer id is set.
         openml.datasets.get_dataset(3)
         file_path = os.path.join(openml.config.get_cache_directory(),
                                  "datasets", "3", "dataset.arff")
         dataset = OpenMLDataset(
-            "anneal", "test", "ARFF",
-            version=1, licence="public", default_target_attribute="class", data_file=file_path)
+            "anneal",
+            "test",
+            data_format="arff",
+            version=1,
+            licence="public",
+            default_target_attribute="class",
+            data_file=file_path,
+        )
         dataset.publish()
         self.assertIsInstance(dataset.dataset_id, int)
 
@@ -335,10 +360,14 @@ def test__retrieve_class_labels(self):
         self.assertEqual(labels, ['C', 'H', 'G'])
 
     def test_upload_dataset_with_url(self):
+        # Publish a dataset given by URL and check an integer id is set.
         dataset = OpenMLDataset(
-            "UploadTestWithURL", "test", "ARFF",
+            "UploadTestWithURL",
+            "test",
+            data_format="arff",
             version=1,
-            url="https://www.openml.org/data/download/61/dataset_61_iris.arff")
+            url="https://www.openml.org/data/download/61/dataset_61_iris.arff",
+        )
         dataset.publish()
         self.assertIsInstance(dataset.dataset_id, int)
 
@@ -377,39 +406,268 @@ def test_data_status(self):
         self.assertEqual(result[did]['status'], 'active')
 
     def test_create_dataset_numpy(self):
-        data = np.array([[1, 2, 3],
-                         [1.2, 2.5, 3.8],
-                         [2, 5, 8],
-                         [0, 1, 0]]).T
+        # Create a dataset from a NumPy array and verify the uploaded ARFF.
+        data = np.array(
+            [
+                [1, 2, 3],
+                [1.2, 2.5, 3.8],
+                [2, 5, 8],
+                [0, 1, 0]
+            ]
+        ).T
+
         attributes = [('col_{}'.format(i), 'REAL')
                       for i in range(data.shape[1])]
-        name = 'NumPy_testing_dataset'
-        description = 'Synthetic dataset created from a NumPy array'
-        creator = 'OpenML tester'
-        collection_date = '01-01-2018'
-        language = 'English'
-        licence = 'MIT'
-        default_target_attribute = 'col_{}'.format(data.shape[1] - 1)
-        citation = 'None'
-        original_data_url = 'http://openml.github.io/openml-python'
-        paper_url = 'http://openml.github.io/openml-python'
-        dataset = openml.datasets.functions.create_dataset(
-            name=name,
-            description=description,
-            creator=creator,
+
+        dataset = create_dataset(
+            name='NumPy_testing_dataset',
+            description='Synthetic dataset created from a NumPy array',
+            creator='OpenML tester',
             contributor=None,
-            collection_date=collection_date,
-            language=language,
-            licence=licence,
-            default_target_attribute=default_target_attribute,
+            collection_date='01-01-2018',
+            language='English',
+            licence='MIT',
+            default_target_attribute='col_{}'.format(data.shape[1] - 1),
             row_id_attribute=None,
             ignore_attribute=None,
-            citation=citation,
+            citation='None',
             attributes=attributes,
             data=data,
-            format='arff',
             version_label='test',
-            original_data_url=original_data_url,
-            paper_url=paper_url
+            original_data_url='http://openml.github.io/openml-python',
+            paper_url='http://openml.github.io/openml-python'
+        )
+
+        upload_did = dataset.publish()
+
+        self.assertEqual(
+            _get_online_dataset_arff(upload_did),
+            dataset._dataset,
+            "Uploaded arff does not match original one"
+        )
+        self.assertEqual(
+            _get_online_dataset_format(upload_did),
+            'arff',
+            "Wrong format for dataset"
+        )
+
+    def test_create_dataset_list(self):
+        # Create a dataset from a list of lists and verify the uploaded ARFF.
+        data = [
+            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
+            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
+            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
+            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
+            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
+            ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'],
+            ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'],
+            ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'],
+            ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'],
+            ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'],
+            ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'],
+            ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'],
+            ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'],
+            ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'],
+        ]
+
+        attributes = [
+            ('rnd_str', 'STRING'),
+            ('outlook', ['sunny', 'overcast', 'rainy']),
+            ('temperature', 'REAL'),
+            ('humidity', 'REAL'),
+            ('windy', ['TRUE', 'FALSE']),
+            ('play', ['yes', 'no']),
+        ]
+
+        dataset = create_dataset(
+            name="ModifiedWeather",
+            description=(
+                'Testing dataset upload when the data is a list of lists'
+            ),
+            creator='OpenML test',
+            contributor=None,
+            collection_date='21-09-2018',
+            language='English',
+            licence='MIT',
+            default_target_attribute='play',
+            row_id_attribute=None,
+            ignore_attribute=None,
+            citation='None',
+            attributes=attributes,
+            data=data,
+            version_label='test',
+            original_data_url='http://openml.github.io/openml-python',
+            paper_url='http://openml.github.io/openml-python'
+        )
+
+        upload_did = dataset.publish()
+        self.assertEqual(
+            _get_online_dataset_arff(upload_did),
+            dataset._dataset,
+            "Uploaded ARFF does not match original one"
+        )
+        self.assertEqual(
+            _get_online_dataset_format(upload_did),
+            'arff',
+            "Wrong format for dataset"
+        )
+
+    def test_create_dataset_sparse(self):
+        # Both sparse inputs below must be uploaded with format 'sparse_arff'.
+        # test the scipy.sparse.coo_matrix
+        sparse_data = scipy.sparse.coo_matrix((
+            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
+        ))
+
+        column_names = [
+            ('input1', 'REAL'),
+            ('input2', 'REAL'),
+            ('y', 'REAL'),
+        ]
+
+        xor_dataset = create_dataset(
+            name="XOR",
+            description='Dataset representing the XOR operation',
+            creator=None,
+            contributor=None,
+            collection_date=None,
+            language='English',
+            licence=None,
+            default_target_attribute='y',
+            row_id_attribute=None,
+            ignore_attribute=None,
+            citation=None,
+            attributes=column_names,
+            data=sparse_data,
+            version_label='test',
+        )
+
+        upload_did = xor_dataset.publish()
+        self.assertEqual(
+            _get_online_dataset_arff(upload_did),
+            xor_dataset._dataset,
+            "Uploaded ARFF does not match original one"
+        )
+        self.assertEqual(
+            _get_online_dataset_format(upload_did),
+            'sparse_arff',
+            "Wrong format for dataset"
+        )
+
+        # test the list of dicts sparse representation
+        sparse_data = [
+            {0: 0.0},
+            {1: 1.0, 2: 1.0},
+            {0: 1.0, 2: 1.0},
+            {0: 1.0, 1: 1.0}
+        ]
+
+        xor_dataset = create_dataset(
+            name="XOR",
+            description='Dataset representing the XOR operation',
+            creator=None,
+            contributor=None,
+            collection_date=None,
+            language='English',
+            licence=None,
+            default_target_attribute='y',
+            row_id_attribute=None,
+            ignore_attribute=None,
+            citation=None,
+            attributes=column_names,
+            data=sparse_data,
+            version_label='test',
+        )
+
+        upload_did = xor_dataset.publish()
+        self.assertEqual(
+            _get_online_dataset_arff(upload_did),
+            xor_dataset._dataset,
+            "Uploaded ARFF does not match original one"
+        )
+        self.assertEqual(
+            _get_online_dataset_format(upload_did),
+            'sparse_arff',
+            "Wrong format for dataset"
+        )
+
+    def test_create_invalid_dataset(self):
+        # A flat list of strings and a bare string must both raise ValueError.
+        data = [
+            'sunny',
+            'overcast',
+            'overcast',
+            'rainy',
+            'rainy',
+            'rainy',
+            'overcast',
+            'sunny',
+            'sunny',
+            'rainy',
+            'sunny',
+            'overcast',
+            'overcast',
+            'rainy',
+        ]
+
+        param = self._get_empty_param_for_dataset()
+        param['data'] = data
+
+        self.assertRaises(
+            ValueError,
+            create_dataset,
+            **param
+        )
+
+        param['data'] = data[0]
+        self.assertRaises(
+            ValueError,
+            create_dataset,
+            **param
+        )
+
+    def test_create_dataset_warning(self):
+        # Passing format= to create_dataset must emit a DeprecationWarning.
+        parameters = self._get_empty_param_for_dataset()
+        parameters['format'] = 'arff'
+        with catch_warnings():
+            filterwarnings('error')
+            self.assertRaises(
+                DeprecationWarning,
+                create_dataset,
+                **parameters
+            )
+
+    def test_get_online_dataset_arff(self):
+        # Dataset 100: online ARFF must match the output of dataset._get_arff.
+        # Australian dataset
+        dataset_id = 100
+        dataset = openml.datasets.get_dataset(dataset_id)
+        decoder = arff.ArffDecoder()
+        # check that the arff returned by dataset._get_arff equals
+        # the decoded arff fetched with _get_online_dataset_arff
+        d_format = (dataset.format).lower()
+
+        self.assertEqual(
+            dataset._get_arff(d_format),
+            decoder.decode(
+                _get_online_dataset_arff(dataset_id),
+                encode_nominal=True,
+                return_type=arff.DENSE
+                if d_format == 'arff' else arff.COO
+            ),
+            "ARFF files are not equal"
+        )
+
+    def test_get_online_dataset_format(self):
+        # Online format must equal the format of the downloaded dataset.
+        # Phoneme dataset
+        dataset_id = 77
+        dataset = openml.datasets.get_dataset(dataset_id)
+
+        self.assertEqual(
+            (dataset.format).lower(),
+            _get_online_dataset_format(dataset_id),
+            "The format of the ARFF files is different"
         )
-        dataset.publish()