Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ env:
- TEST_DIR=/tmp/test_dir/
- MODULE=openml
matrix:
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0"
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" COVERAGE="true" DOCPUSH="true"
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2"
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2"
# Checks for older scikit-learn versions (which also don't work nicely with
# Python 3.7)
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
Expand Down
8 changes: 7 additions & 1 deletion openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,4 +321,10 @@ def _check_fold_timing_evaluations(
self.assertLessEqual(evaluation, max_val)


__all__ = ['TestBase']
try:
from sklearn.impute import SimpleImputer
except ImportError:
from sklearn.preprocessing import Imputer as SimpleImputer


__all__ = ['TestBase', 'SimpleImputer']
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,15 @@
import sklearn.tree
import sklearn.cluster

if LooseVersion(sklearn.__version__) < "0.20":
from sklearn.preprocessing import Imputer
else:
from sklearn.impute import SimpleImputer as Imputer

import openml
from openml.extensions.sklearn import SklearnExtension
from openml.exceptions import PyOpenMLError
from openml.flows import OpenMLFlow
from openml.flows.functions import assert_flows_equal
from openml.runs.trace import OpenMLRunTrace
from openml.testing import TestBase
from openml.testing import TestBase, SimpleImputer


this_directory = os.path.dirname(os.path.abspath(__file__))
sys.path.append(this_directory)
Expand Down Expand Up @@ -285,11 +282,14 @@ def test_serialize_pipeline(self):
# Comparing the pipeline
# The parameters only hold the names of the base objects (not the whole
# flow) as their values
# memory parameter has been added in 0.19
# memory parameter has been added in 0.19, verbose in 0.21
if LooseVersion(sklearn.__version__) < "0.19":
self.assertEqual(len(serialization.parameters), 1)
else:
elif LooseVersion(sklearn.__version__) < "0.21":
self.assertEqual(len(serialization.parameters), 2)
else:
self.assertEqual(len(serialization.parameters), 3)

# Hard to compare two representations of a dict due to possibly
# different sorting. Making a json makes it easier
self.assertEqual(
Expand Down Expand Up @@ -374,8 +374,10 @@ def test_serialize_pipeline_clustering(self):
# memory parameter has been added in 0.19, verbose in 0.21
if LooseVersion(sklearn.__version__) < "0.19":
self.assertEqual(len(serialization.parameters), 1)
else:
elif LooseVersion(sklearn.__version__) < "0.21":
self.assertEqual(len(serialization.parameters), 2)
else:
self.assertEqual(len(serialization.parameters), 3)
# Hard to compare two representations of a dict due to possibly
# different sorting. Making a json makes it easier
self.assertEqual(
Expand Down Expand Up @@ -624,7 +626,7 @@ def test_serialize_feature_union_switched_names(self):
.format(module_name_encoder))

def test_serialize_complex_flow(self):
ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
ohe = sklearn.preprocessing.OneHotEncoder()
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
boosting = sklearn.ensemble.AdaBoostClassifier(
base_estimator=sklearn.tree.DecisionTreeClassifier())
Expand Down Expand Up @@ -747,25 +749,26 @@ def test_serialize_simple_parameter_grid(self):
# Examples from the scikit-learn documentation
models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
grids = \
[[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
'kernel': ['rbf']}],
{"max_depth": [3, None],
"max_features": [1, 3, 10],
"min_samples_split": [1, 3, 10],
"min_samples_leaf": [1, 3, 10],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}]
[[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]),
OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]),
('kernel', ['rbf'])])],
OrderedDict([("bootstrap", [True, False]),
("criterion", ["gini", "entropy"]),
("max_depth", [3, None]),
("max_features", [1, 3, 10]),
("min_samples_leaf", [1, 3, 10]),
("min_samples_split", [1, 3, 10])
])]

for grid, model in zip(grids, models):
serialized = self.extension.model_to_flow(grid)
deserialized = self.extension.flow_to_model(serialized)

self.assertEqual(deserialized, grid)
self.assertIsNot(deserialized, grid)

# providing error_score because nan != nan
hpo = sklearn.model_selection.GridSearchCV(
param_grid=grid, estimator=model)
param_grid=grid, estimator=model, error_score=-1000)

serialized = self.extension.model_to_flow(hpo)
deserialized = self.extension.flow_to_model(serialized)
Expand Down Expand Up @@ -943,7 +946,7 @@ def test_illegal_parameter_names(self):
def test_illegal_parameter_names_pipeline(self):
# illegal name: steps
steps = [
('Imputer', Imputer(strategy='median')),
('Imputer', SimpleImputer(strategy='median')),
('OneHotEncoder',
sklearn.preprocessing.OneHotEncoder(sparse=False,
handle_unknown='ignore')),
Expand All @@ -956,7 +959,7 @@ def test_illegal_parameter_names_featureunion(self):
# illegal name: transformer_list
transformer_list = [
('transformer_list',
Imputer(strategy='median')),
SimpleImputer(strategy='median')),
('OneHotEncoder',
sklearn.preprocessing.OneHotEncoder(sparse=False,
handle_unknown='ignore'))
Expand Down Expand Up @@ -1015,18 +1018,25 @@ def test_paralizable_check(self):
self.extension._prevent_optimize_n_jobs(model)

def test__get_fn_arguments_with_defaults(self):
if LooseVersion(sklearn.__version__) < "0.19":
sklearn_version = LooseVersion(sklearn.__version__)
if sklearn_version < "0.19":
fns = [
(sklearn.ensemble.RandomForestRegressor.__init__, 15),
(sklearn.tree.DecisionTreeClassifier.__init__, 12),
(sklearn.pipeline.Pipeline.__init__, 0)
]
else:
elif sklearn_version < "0.21":
fns = [
(sklearn.ensemble.RandomForestRegressor.__init__, 16),
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
(sklearn.pipeline.Pipeline.__init__, 1)
]
else:
fns = [
(sklearn.ensemble.RandomForestRegressor.__init__, 16),
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
(sklearn.pipeline.Pipeline.__init__, 2)
]

for fn, num_params_with_defaults in fns:
defaults, defaultless = (
Expand All @@ -1047,7 +1057,7 @@ def test_deserialize_with_defaults(self):
# used the 'initialize_with_defaults' flag of the deserialization
# method to return a flow that contains default hyperparameter
# settings.
steps = [('Imputer', Imputer()),
steps = [('Imputer', SimpleImputer()),
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
('Estimator', sklearn.tree.DecisionTreeClassifier())]
pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
Expand All @@ -1071,7 +1081,7 @@ def test_deserialize_adaboost_with_defaults(self):
# used the 'initialize_with_defaults' flag of the deserialization
# method to return a flow that contains default hyperparameter
# settings.
steps = [('Imputer', Imputer()),
steps = [('Imputer', SimpleImputer()),
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
('Estimator', sklearn.ensemble.AdaBoostClassifier(
sklearn.tree.DecisionTreeClassifier()))]
Expand All @@ -1097,7 +1107,7 @@ def test_deserialize_complex_with_defaults(self):
# method to return a flow that contains default hyperparameter
# settings.
steps = [
('Imputer', Imputer()),
('Imputer', SimpleImputer()),
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
(
'Estimator',
Expand Down Expand Up @@ -1237,7 +1247,7 @@ def test_run_model_on_task(self):
class MyPipe(sklearn.pipeline.Pipeline):
pass
task = openml.tasks.get_task(1)
pipe = MyPipe([('imp', Imputer()),
pipe = MyPipe([('imp', SimpleImputer()),
('dummy', sklearn.dummy.DummyClassifier())])
openml.runs.run_model_on_task(pipe, task)

Expand Down Expand Up @@ -1309,7 +1319,7 @@ def test_run_model_on_fold_classification_1(self):
y_test = y[test_indices]

pipeline = sklearn.pipeline.Pipeline(steps=[
('imp', sklearn.preprocessing.Imputer()),
('imp', SimpleImputer()),
('clf', sklearn.tree.DecisionTreeClassifier()),
])
# TODO add some mocking here to actually test the innards of this function, too!
Expand Down Expand Up @@ -1435,11 +1445,11 @@ def predict_proba(*args, **kwargs):
y_train = y[train_indices]
X_test = X[test_indices]
clf1 = sklearn.pipeline.Pipeline(steps=[
('imputer', sklearn.preprocessing.Imputer()),
('imputer', SimpleImputer()),
('estimator', sklearn.naive_bayes.GaussianNB())
])
clf2 = sklearn.pipeline.Pipeline(steps=[
('imputer', sklearn.preprocessing.Imputer()),
('imputer', SimpleImputer()),
('estimator', HardNaiveBayes())
])

Expand Down Expand Up @@ -1492,7 +1502,7 @@ def test_run_model_on_fold_regression(self):
y_test = y[test_indices]

pipeline = sklearn.pipeline.Pipeline(steps=[
('imp', sklearn.preprocessing.Imputer()),
('imp', SimpleImputer()),
('clf', sklearn.tree.DecisionTreeRegressor()),
])
# TODO add some mocking here to actually test the innards of this function, too!
Expand Down Expand Up @@ -1537,7 +1547,7 @@ def test_run_model_on_fold_clustering(self):
X = task.get_X(dataset_format='array')

pipeline = sklearn.pipeline.Pipeline(steps=[
('imp', sklearn.preprocessing.Imputer()),
('imp', SimpleImputer()),
('clf', sklearn.cluster.KMeans()),
])
# TODO add some mocking here to actually test the innards of this function, too!
Expand Down Expand Up @@ -1626,7 +1636,7 @@ def test_trim_flow_name(self):
long = """sklearn.pipeline.Pipeline(
columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
numeric=sklearn.pipeline.Pipeline(
imputer=sklearn.preprocessing.imputation.Imputer,
SimpleImputer=sklearn.preprocessing.imputation.Imputer,
standardscaler=sklearn.preprocessing.data.StandardScaler),
nominal=sklearn.pipeline.Pipeline(
simpleimputer=sklearn.impute.SimpleImputer,
Expand All @@ -1650,7 +1660,7 @@ def test_trim_flow_name(self):
self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))

long = """sklearn.pipeline.Pipeline(
Imputer=sklearn.preprocessing.imputation.Imputer,
SimpleImputer=sklearn.preprocessing.imputation.Imputer,
VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501
Estimator=sklearn.model_selection._search.RandomizedSearchCV(
estimator=sklearn.tree.tree.DecisionTreeClassifier))"""
Expand All @@ -1660,7 +1670,7 @@ def test_trim_flow_name(self):

long = """sklearn.model_selection._search.RandomizedSearchCV(
estimator=sklearn.pipeline.Pipeline(
Imputer=sklearn.preprocessing.imputation.Imputer,
SimpleImputer=sklearn.preprocessing.imputation.Imputer,
classifier=sklearn.ensemble.forest.RandomForestClassifier))"""
short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))"
long_stripped, _ = re.subn(r'\s', '', long)
Expand Down
13 changes: 4 additions & 9 deletions tests/test_flows/test_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,13 @@
import sklearn.naive_bayes
import sklearn.tree

if LooseVersion(sklearn.__version__) < "0.20":
from sklearn.preprocessing import Imputer
else:
from sklearn.impute import SimpleImputer as Imputer

import xmltodict

import openml
from openml._api_calls import _perform_api_call
import openml.exceptions
import openml.extensions.sklearn
from openml.testing import TestBase
from openml.testing import TestBase, SimpleImputer
import openml.utils


Expand Down Expand Up @@ -318,8 +313,8 @@ def test_illegal_flow(self):
# should throw error as it contains two imputers
illegal = sklearn.pipeline.Pipeline(
steps=[
('imputer1', Imputer()),
('imputer2', Imputer()),
('imputer1', SimpleImputer()),
('imputer2', SimpleImputer()),
('classif', sklearn.tree.DecisionTreeClassifier())
]
)
Expand Down Expand Up @@ -350,7 +345,7 @@ def test_existing_flow_exists(self):
if LooseVersion(sklearn.__version__) >= '0.20':
ohe_params['categories'] = 'auto'
steps = [
('imputation', Imputer(strategy='median')),
('imputation', SimpleImputer(strategy='median')),
('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)),
(
'variencethreshold',
Expand Down
2 changes: 1 addition & 1 deletion tests/test_flows/test_flow_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_get_flow_reinstantiate_model_no_extension(self):
def test_get_flow_reinstantiate_model_wrong_version(self):
# Note that CI does not test against 0.19.1.
openml.config.server = self.production_server
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
flow = 8175
expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.'
self.assertRaisesRegex(ValueError,
Expand Down
11 changes: 5 additions & 6 deletions tests/test_runs/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

from openml.testing import TestBase
from openml.testing import TestBase, SimpleImputer
import openml
import openml.extensions.sklearn

Expand Down Expand Up @@ -106,7 +105,7 @@ def _check_array(array, type_):
def test_to_from_filesystem_vanilla(self):

model = Pipeline([
('imputer', Imputer(strategy='mean')),
('imputer', SimpleImputer(strategy='mean')),
('classifier', DecisionTreeClassifier(max_depth=1)),
])
task = openml.tasks.get_task(119)
Expand Down Expand Up @@ -139,7 +138,7 @@ def test_to_from_filesystem_vanilla(self):
def test_to_from_filesystem_search(self):

model = Pipeline([
('imputer', Imputer(strategy='mean')),
('imputer', SimpleImputer(strategy='mean')),
('classifier', DecisionTreeClassifier(max_depth=1)),
])
model = GridSearchCV(
Expand Down Expand Up @@ -175,7 +174,7 @@ def test_to_from_filesystem_search(self):
def test_to_from_filesystem_no_model(self):

model = Pipeline([
('imputer', Imputer(strategy='mean')),
('imputer', SimpleImputer(strategy='mean')),
('classifier', DummyClassifier()),
])
task = openml.tasks.get_task(119)
Expand Down Expand Up @@ -205,7 +204,7 @@ def test_publish_with_local_loaded_flow(self):
extension = openml.extensions.sklearn.SklearnExtension()

model = Pipeline([
('imputer', Imputer(strategy='mean')),
('imputer', SimpleImputer(strategy='mean')),
('classifier', DummyClassifier()),
])
task = openml.tasks.get_task(119)
Expand Down
Loading