Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified ci_scripts/flake8_diff.sh
100644 → 100755
Empty file.
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Top-level Classes
:toctree: generated/
:template: function.rst

attributes_arff_from_df
check_datasets_active
create_dataset
get_dataset
Expand Down
4 changes: 2 additions & 2 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@
('Start', 'index'),
('User Guide', 'usage'),
('API', 'api'),
('Changelog', 'progress'),
('Examples', 'examples/index'),
('Contributing', 'contributing'),
('Progress', 'progress'),
('Changelog', 'progress'),
],

# Render the next and previous page links in navbar. (Default: true)
Expand Down
4 changes: 2 additions & 2 deletions doc/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ in python, `scikit-learn <http://scikit-learn.org/stable/index.html>`_.
Thereby it will automatically be compatible with many machine learning
libraries written in Python.

We aim to keep the package as leight-weight as possible and we will try to
We aim to keep the package as light-weight as possible and we will try to
keep the number of potential installation dependencies as low as possible.
Therefore, the connection to other machine learning libraries such as
*pytorch*, *keras* or *tensorflow* should not be done directly inside this
Expand All @@ -43,7 +43,7 @@ Open issues and potential todos

We collect open issues and feature requests in an `issue tracker on github <https://github.com/openml/openml-python/issues>`_.
The issue tracker contains issues marked as *Good first issue*, which shows
issues which are good for beginers. We also maintain a somewhat up-to-date
issues which are good for beginners. We also maintain a somewhat up-to-date
`roadmap <https://github.com/openml/openml-python/issues/410>`_ which
contains longer-term goals.

Expand Down
123 changes: 104 additions & 19 deletions examples/create_upload_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,37 @@
A tutorial on how to create and upload a dataset to OpenML.
"""
import numpy as np
import pandas as pd
import sklearn.datasets
from scipy.sparse import coo_matrix

import openml
from openml.datasets.functions import create_dataset

############################################################################
# For this tutorial we will upload to the test server to not pollute the live
# server with countless copies of the same dataset.
openml.config.server = 'https://test.openml.org/api/v1/xml'

############################################################################
# Below we will cover the following cases of the
# dataset object:
# Below we will cover the following cases of the dataset object:
#
# * A numpy array
# * A list
# * A pandas dataframe
# * A sparse matrix
# * A pandas sparse dataframe

############################################################################
# Dataset is a numpy array
# ========================
# A numpy array can contain lists in the case of dense data
# or it can contain OrderedDicts in the case of sparse data.
# A numpy array can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Prepare dataset
# ^^^^^^^^^^^^^^^
# Load an example dataset from scikit-learn which we
# will upload to OpenML.org via the API.
# Load an example dataset from scikit-learn which we will upload to OpenML.org
# via the API.

diabetes = sklearn.datasets.load_diabetes()
name = 'Diabetes(scikit-learn)'
Expand All @@ -43,11 +45,11 @@
description = diabetes.DESCR

############################################################################
# OpenML does not distinguish between the attributes and
# targets on the data level and stores all data in a single matrix.
# OpenML does not distinguish between the attributes and targets on the data
# level and stores all data in a single matrix.
#
# The target feature is indicated as meta-data of the
# dataset (and tasks on that data).
# The target feature is indicated as meta-data of the dataset (and tasks on
# that data).

data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
attribute_names = list(attribute_names)
Expand All @@ -67,13 +69,13 @@
############################################################################
# Create the dataset object
# ^^^^^^^^^^^^^^^^^^^^^^^^^
# The definition of all fields can be found in the
# XSD files describing the expected format:
# The definition of all fields can be found in the XSD files describing the
# expected format:
#
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd

diabetes_dataset = create_dataset(
    # The name of the dataset (needs to be unique).
# Must not be longer than 128 characters and only contain
# a-z, A-Z, 0-9 and the following special characters: _\-\.(),
name=name,
Expand All @@ -93,9 +95,11 @@
licence='BSD (from scikit-learn)',
# Name of the target. Can also have multiple values (comma-separated).
default_target_attribute='class',
# The attribute that represents the row-id column, if present in the dataset.
# The attribute that represents the row-id column, if present in the
# dataset.
row_id_attribute=None,
# Attributes that should be excluded in modelling, such as identifiers and indexes.
# Attributes that should be excluded in modelling, such as identifiers and
# indexes.
ignore_attribute=None,
# How to cite the paper.
citation=citation,
Expand All @@ -118,8 +122,8 @@
############################################################################
# Dataset is a list
# =================
# A list can contain lists in the case of dense data
# or it can contain OrderedDicts in the case of sparse data.
# A list can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Weather dataset:
# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
Expand Down Expand Up @@ -188,13 +192,59 @@
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))

############################################################################
# Dataset is a pandas DataFrame
# =============================
# It might happen that your dataset is made of heterogeneous data which can
# usually be stored as a pandas DataFrame. A DataFrame has the advantage of
# storing the type of data for each column as well as the attribute names.
# Therefore, when providing a pandas DataFrame, OpenML can infer this
# information without the need to specifically provide it when calling the
# function :func:`create_dataset`. In this regard, you only need to pass
# ``'auto'`` to the ``attributes`` parameter.

# `data` and `attribute_names` come from the weather-dataset section above.
df = pd.DataFrame(data, columns=[col_name for col_name, _ in attribute_names])
# enforce the categorical columns to have a categorical dtype; 'rnd_str'
# is deliberately left as a plain object/string column
df['outlook'] = df['outlook'].astype('category')
df['windy'] = df['windy'].astype('bool')
df['play'] = df['play'].astype('category')
print(df.info())

############################################################################
# We enforce the columns 'outlook', 'windy', and 'play' to be a categorical
# dtype while the column 'rnd_str' is kept as a string column. Then, we can
# call :func:`create_dataset` by passing the dataframe and fixing the
# parameter ``attributes`` to ``'auto'``.

weather_dataset = create_dataset(
    name="Weather",
    description=description,
    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
    contributor=None,
    collection_date='01-01-2011',
    language='English',
    licence=None,
    # 'play' is the classification target of the weather dataset.
    default_target_attribute='play',
    row_id_attribute=None,
    ignore_attribute=None,
    citation=citation,
    # 'auto': attribute names and types are inferred from the dataframe.
    attributes='auto',
    data=df,
    version_label='example',
)

############################################################################

# Publish to the (test) server configured above and report the dataset URL.
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))

############################################################################
# Dataset is a sparse matrix
# ==========================

# XOR-style toy data in COO format: a list of values plus one (rows, cols)
# coordinate tuple. The duplicated coordinate line from the rendered diff is
# removed here — coo_matrix takes exactly one (data, (row, col)) pair. The
# explicit 0.0 entry keeps row 0 present as a stored element.
sparse_data = coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))

column_names = [
Expand Down Expand Up @@ -224,3 +274,38 @@

upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))


############################################################################
# Dataset is a pandas sparse dataframe
# ====================================

# Same XOR toy data as in the previous section, but handed to OpenML as a
# pandas sparse dataframe rather than a raw scipy matrix.
sparse_data = coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
# NOTE(review): pd.SparseDataFrame was deprecated in pandas 0.25 and removed
# in pandas 1.0; on modern pandas the equivalent is
# pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names) —
# confirm which pandas versions this example targets before changing it.
df = pd.SparseDataFrame(sparse_data, columns=column_names)
print(df.info())

xor_dataset = create_dataset(
    name="XOR",
    description='Dataset representing the XOR operation',
    creator=None,
    contributor=None,
    collection_date=None,
    language='English',
    licence=None,
    # 'y' (the third sparse column) is the prediction target.
    default_target_attribute='y',
    row_id_attribute=None,
    ignore_attribute=None,
    citation=None,
    # 'auto': attribute names and types are inferred from the dataframe.
    attributes='auto',
    data=df,
    version_label='example',
)

############################################################################

# Publish to the (test) server configured above and report the dataset URL.
upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
8 changes: 4 additions & 4 deletions openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
'apikey': None,
'server': "https://www.openml.org/api/v1/xml",
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache'),
'cachedir': os.path.expanduser(os.path.join('~', '.openml', 'cache')),
'avoid_duplicate_runs': 'True',
}

config_file = os.path.expanduser('~/.openml/config')
config_file = os.path.expanduser(os.path.join('~', '.openml' 'config'))

# Default values are actually added here in the _setup() function which is
# called at the end of this module
Expand All @@ -48,7 +48,7 @@ def _setup():
global avoid_duplicate_runs
# read config file, create cache directory
try:
os.mkdir(os.path.expanduser('~/.openml'))
os.mkdir(os.path.expanduser(os.path.join('~', '.openml')))
except (IOError, OSError):
# TODO add debug information
pass
Expand Down Expand Up @@ -96,7 +96,7 @@ def get_cache_directory():

"""
url_suffix = urlparse(server).netloc
reversed_url_suffix = '/'.join(url_suffix.split('.')[::-1])
reversed_url_suffix = os.sep.join(url_suffix.split('.')[::-1])
if not cache_directory:
_cachedir = _defaults(cache_directory)
else:
Expand Down
Loading