Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 24 additions & 33 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def create_dataset(name, description, creator, contributor,
licence, attributes, data,
default_target_attribute,
ignore_attribute, citation,
row_id_attribute=None, format=None,
row_id_attribute=None,
original_data_url=None, paper_url=None,
update_comment=None, version_label=None):
"""Create a dataset.
Expand Down Expand Up @@ -473,11 +473,6 @@ def create_dataset(name, description, creator, contributor,
be discarded.
.. versionadded: 0.8
Inference of ``row_id_attribute`` from a dataframe.
format : str, optional
Format of the dataset which can be either 'arff' or 'sparse_arff'.
By default, the format is automatically inferred.
.. deprecated: 0.8
``format`` is deprecated in 0.8 and will be removed in 0.10.
original_data_url : str, optional
For derived data, the url to the original dataset.
paper_url : str, optional
Expand Down Expand Up @@ -536,34 +531,29 @@ def create_dataset(name, description, creator, contributor,
else:
data = data.values

if format is not None:
warn("The format parameter will be deprecated in the future,"
" the method will determine the format of the ARFF "
"based on the given data.", DeprecationWarning)
d_format = format

# Determine ARFF format from the dataset
else:
if isinstance(data, (list, np.ndarray)):
if isinstance(data[0], (list, np.ndarray)):
d_format = 'arff'
elif isinstance(data[0], dict):
d_format = 'sparse_arff'
else:
raise ValueError(
'When giving a list or a numpy.ndarray, '
'they should contain a list/ numpy.ndarray '
'for dense data or a dictionary for sparse '
'data. Got {!r} instead.'
.format(data[0])
)
elif isinstance(data, coo_matrix):
d_format = 'sparse_arff'
if isinstance(data, (list, np.ndarray)):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, don't you need to check for pd.DataFrame and pd.SparseDataFrame, too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True

if isinstance(data[0], (list, np.ndarray)):
data_format = 'arff'
elif isinstance(data[0], dict):
data_format = 'sparse_arff'
else:
raise ValueError(
'Invalid data type. The data type can be a list, '
'a numpy ndarray or a scipy.sparse.coo_matrix'
'When giving a list or a numpy.ndarray, '
'they should contain a list/ numpy.ndarray '
'for dense data or a dictionary for sparse '
'data. Got {!r} instead.'
.format(data[0])
)
elif isinstance(data, coo_matrix):
data_format = 'sparse_arff'
else:
raise ValueError(
'When giving a list or a numpy.ndarray, '
'they should contain a list/ numpy.ndarray '
'for dense data or a dictionary for sparse '
'data. Got {!r} instead.'
.format(data[0])
)

arff_object = {
'relation': name,
Expand All @@ -577,10 +567,11 @@ def create_dataset(name, description, creator, contributor,
try:
# check if ARFF is valid
decoder = arff.ArffDecoder()
return_type = arff.COO if data_format == 'sparse_arff' else arff.DENSE
decoder.decode(
arff_dataset,
encode_nominal=True,
return_type=arff.COO if d_format == 'sparse_arff' else arff.DENSE
return_type=return_type
)
except arff.ArffException:
raise ValueError("The arguments you have provided \
Expand All @@ -589,7 +580,7 @@ def create_dataset(name, description, creator, contributor,
return OpenMLDataset(
name,
description,
data_format=d_format,
data_format=data_format,
creator=creator,
contributor=contributor,
collection_date=collection_date,
Expand Down
18 changes: 0 additions & 18 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,18 +683,6 @@ def test_create_invalid_dataset(self):
**param
)

def test_create_dataset_warning(self):

parameters = self._get_empty_param_for_dataset()
parameters['format'] = 'arff'
with catch_warnings():
filterwarnings('error')
self.assertRaises(
DeprecationWarning,
create_dataset,
**parameters
)

def test_get_online_dataset_arff(self):

# Australian dataset
Expand Down Expand Up @@ -768,7 +756,6 @@ def test_create_dataset_pandas(self):
citation=citation,
attributes='auto',
data=df,
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
Expand Down Expand Up @@ -803,7 +790,6 @@ def test_create_dataset_pandas(self):
citation=citation,
attributes='auto',
data=df,
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
Expand Down Expand Up @@ -840,7 +826,6 @@ def test_create_dataset_pandas(self):
citation=citation,
attributes=attributes,
data=df,
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
Expand Down Expand Up @@ -892,7 +877,6 @@ def test_create_dataset_row_id_attribute_error(self):
attributes='auto',
data=df,
row_id_attribute='unknown_row_id',
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
Expand Down Expand Up @@ -939,7 +923,6 @@ def test_create_dataset_row_id_attribute_inference(self):
attributes='auto',
data=df,
row_id_attribute=row_id,
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
Expand Down Expand Up @@ -986,7 +969,6 @@ def test_create_dataset_attributes_auto_without_df(self):
citation=citation,
attributes=attributes,
data=data,
format=None,
version_label='test',
original_data_url=original_data_url,
paper_url=paper_url
Expand Down