diff --git a/.travis.yml b/.travis.yml index 675186469..beaa3b53e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,11 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" # Checks for older scikit-learn versions (which also don't nicely work with # Python3.7) - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" diff --git a/openml/testing.py b/openml/testing.py index 82302a03d..4841ca4b6 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -321,4 +321,10 @@ def _check_fold_timing_evaluations( self.assertLessEqual(evaluation, max_val) -__all__ = ['TestBase'] +try: + from sklearn.impute import SimpleImputer +except ImportError: + from sklearn.preprocessing import Imputer as SimpleImputer + + +__all__ = ['TestBase', 'SimpleImputer'] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index f731f7388..8bc615516 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -28,10 +28,6 @@ import sklearn.tree import sklearn.cluster -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer import openml from openml.extensions.sklearn import SklearnExtension @@ -39,7 +35,8 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer + this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) @@ -285,11 +282,14 @@ def test_serialize_pipeline(self): # Comparing the pipeline # The parameters only have the name of base objects(not the whole flow) # as value - # memory parameter has been added in 0.19 + # memory parameter has been added in 0.19, verbose in 0.21 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) + # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -374,8 +374,10 @@ def test_serialize_pipeline_clustering(self): # memory parameter has been added in 0.19 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -624,7 +626,7 @@ def test_serialize_feature_union_switched_names(self): .format(module_name_encoder)) def test_serialize_complex_flow(self): - ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0]) + ohe = sklearn.preprocessing.OneHotEncoder() scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.tree.DecisionTreeClassifier()) @@ -747,15 +749,16 @@ def test_serialize_simple_parameter_grid(self): # Examples from the scikit-learn documentation models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()] grids = \ - [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, - {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], - 'kernel': ['rbf']}], - {"max_depth": [3, None], - "max_features": [1, 3, 10], - "min_samples_split": [1, 3, 10], - "min_samples_leaf": [1, 3, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]}] + [[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]), + OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]), + ('kernel', ['rbf'])])], + OrderedDict([("bootstrap", [True, False]), + ("criterion", ["gini", "entropy"]), + ("max_depth", [3, None]), + ("max_features", [1, 3, 10]), + ("min_samples_leaf", [1, 3, 10]), + ("min_samples_split", [1, 3, 10]) + ])] for grid, model in zip(grids, models): serialized = self.extension.model_to_flow(grid) @@ -763,9 +766,9 @@ def test_serialize_simple_parameter_grid(self): self.assertEqual(deserialized, grid) self.assertIsNot(deserialized, grid) - + # providing error_score because nan != nan hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, estimator=model) + param_grid=grid, estimator=model, error_score=-1000) serialized = self.extension.model_to_flow(hpo) deserialized = self.extension.flow_to_model(serialized) @@ -943,7 +946,7 @@ def test_illegal_parameter_names(self): def test_illegal_parameter_names_pipeline(self): # illegal name: steps steps = [ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')), @@ -956,7 +959,7 @@ def test_illegal_parameter_names_featureunion(self): # illegal name: transformer_list transformer_list = [ ('transformer_list', - Imputer(strategy='median')), + SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')) @@ -1015,18 +1018,25 @@ def test_paralizable_check(self): self.extension._prevent_optimize_n_jobs(model) def test__get_fn_arguments_with_defaults(self): - if LooseVersion(sklearn.__version__) < "0.19": + sklearn_version = LooseVersion(sklearn.__version__) + if sklearn_version < "0.19": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 15), (sklearn.tree.DecisionTreeClassifier.__init__, 12), (sklearn.pipeline.Pipeline.__init__, 0) ] - else: + elif sklearn_version < "0.21": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 1) ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 16), + (sklearn.tree.DecisionTreeClassifier.__init__, 13), + (sklearn.pipeline.Pipeline.__init__, 2) + ] for fn, num_params_with_defaults in fns: defaults, defaultless = ( @@ -1047,7 +1057,7 @@ def test_deserialize_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.tree.DecisionTreeClassifier())] pipe_orig = sklearn.pipeline.Pipeline(steps=steps) @@ -1071,7 +1081,7 @@ def test_deserialize_adaboost_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.ensemble.AdaBoostClassifier( sklearn.tree.DecisionTreeClassifier()))] @@ -1097,7 +1107,7 @@ def test_deserialize_complex_with_defaults(self): # method to return a flow that contains default hyperparameter # settings. steps = [ - ('Imputer', Imputer()), + ('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ( 'Estimator', @@ -1237,7 +1247,7 @@ def test_run_model_on_task(self): class MyPipe(sklearn.pipeline.Pipeline): pass task = openml.tasks.get_task(1) - pipe = MyPipe([('imp', Imputer()), + pipe = MyPipe([('imp', SimpleImputer()), ('dummy', sklearn.dummy.DummyClassifier())]) openml.runs.run_model_on_task(pipe, task) @@ -1309,7 +1319,7 @@ def test_run_model_on_fold_classification_1(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeClassifier()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1435,11 +1445,11 @@ def predict_proba(*args, **kwargs): y_train = y[train_indices] X_test = X[test_indices] clf1 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.naive_bayes.GaussianNB()) ]) clf2 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', HardNaiveBayes()) ]) @@ -1492,7 +1502,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeRegressor()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1537,7 +1547,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format='array') pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.cluster.KMeans()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1626,7 +1636,7 @@ def test_trim_flow_name(self): long = """sklearn.pipeline.Pipeline( columntransformer=sklearn.compose._column_transformer.ColumnTransformer( numeric=sklearn.pipeline.Pipeline( - imputer=sklearn.preprocessing.imputation.Imputer, + SimpleImputer=sklearn.preprocessing.imputation.Imputer, standardscaler=sklearn.preprocessing.data.StandardScaler), nominal=sklearn.pipeline.Pipeline( simpleimputer=sklearn.impute.SimpleImputer, @@ -1650,7 +1660,7 @@ def test_trim_flow_name(self): self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) long = """sklearn.pipeline.Pipeline( - Imputer=sklearn.preprocessing.imputation.Imputer, + SimpleImputer=sklearn.preprocessing.imputation.Imputer, VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 Estimator=sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.tree.tree.DecisionTreeClassifier))""" @@ -1660,7 +1670,7 @@ def test_trim_flow_name(self): long = """sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.pipeline.Pipeline( - Imputer=sklearn.preprocessing.imputation.Imputer, + SimpleImputer=sklearn.preprocessing.imputation.Imputer, classifier=sklearn.ensemble.forest.RandomForestClassifier))""" short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" long_stripped, _ = re.subn(r'\s', '', long) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 6e7eb7fbb..25e2dacfb 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -19,18 +19,13 @@ import sklearn.naive_bayes import sklearn.tree -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer - import xmltodict import openml from openml._api_calls import _perform_api_call import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml.utils @@ -318,8 +313,8 @@ def test_illegal_flow(self): # should throw error as it contains two imputers illegal = sklearn.pipeline.Pipeline( steps=[ - ('imputer1', Imputer()), - ('imputer2', Imputer()), + ('imputer1', SimpleImputer()), + ('imputer2', SimpleImputer()), ('classif', sklearn.tree.DecisionTreeClassifier()) ] ) @@ -350,7 +345,7 @@ def test_existing_flow_exists(self): if LooseVersion(sklearn.__version__) >= '0.20': ohe_params['categories'] = 'auto' steps = [ - ('imputation', Imputer(strategy='median')), + ('imputation', SimpleImputer(strategy='median')), ('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)), ( 'variencethreshold', diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index de933731a..95b4fa3f0 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -288,7 +288,7 @@ def test_get_flow_reinstantiate_model_no_extension(self): def test_get_flow_reinstantiate_model_wrong_version(self): # Note that CI does not test against 0.19.1. openml.config.server = self.production_server - _, sklearn_major, _ = LooseVersion(sklearn.__version__).version + _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] flow = 8175 expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.' self.assertRaisesRegex(ValueError, diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 23ab43df0..88fe8d6ef 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -7,9 +7,8 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from sklearn.preprocessing import Imputer -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn @@ -106,7 +105,7 @@ def _check_array(array, type_): def test_to_from_filesystem_vanilla(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) task = openml.tasks.get_task(119) @@ -139,7 +138,7 @@ def test_to_from_filesystem_vanilla(self): def test_to_from_filesystem_search(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) model = GridSearchCV( @@ -175,7 +174,7 @@ def test_to_from_filesystem_search(self): def test_to_from_filesystem_no_model(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) @@ -205,7 +204,7 @@ def test_publish_with_local_loaded_flow(self): extension = openml.extensions.sklearn.SklearnExtension() model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index bd123cd37..2b09ef501 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -17,7 +17,7 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer from openml.runs.functions import ( _run_task_get_arffcontent, run_exists, @@ -28,7 +28,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.model_selection._search import BaseSearchCV from sklearn.tree import DecisionTreeClassifier -from sklearn.preprocessing.imputation import Imputer + from sklearn.dummy import DummyClassifier from sklearn.preprocessing import StandardScaler from sklearn.feature_selection import VarianceThreshold @@ -550,7 +550,7 @@ def get_ct_cf(nominal_indices, numeric_indices): '62501', sentinel=sentinel) def test_run_and_upload_decision_tree_pipeline(self): - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -657,7 +657,7 @@ def test_learning_curve_task_2(self): num_folds = 10 num_samples = 8 - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -714,9 +714,9 @@ def _test_local_evaluations(self, run): np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores) - # also check if we can obtain some other scores: # TODO: how to do AUC? + # also check if we can obtain some other scores: tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}), - (sklearn.metrics.auc, {'reorder': True}), + (sklearn.metrics.roc_auc_score, {}), (sklearn.metrics.average_precision_score, {}), (sklearn.metrics.jaccard_similarity_score, {}), (sklearn.metrics.precision_score, {'average': 'macro'}), @@ -734,7 +734,7 @@ def _test_local_evaluations(self, run): def test_local_run_swapped_parameter_order_model(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -752,7 +752,7 @@ def test_local_run_swapped_parameter_order_model(self): def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) flow = self.extension.model_to_flow(clf) @@ -771,7 +771,7 @@ def test_local_run_swapped_parameter_order_flow(self): def test_local_run_metric_score(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -798,7 +798,7 @@ def test_online_run_metric_score(self): def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', GaussianNB())]) task = openml.tasks.get_task(11) @@ -882,12 +882,12 @@ def test__run_exists(self): rs = 1 clfs = [ sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='mean')), + ('Imputer', SimpleImputer(strategy='mean')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', DecisionTreeClassifier(max_depth=4)) ]), sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='most_frequent')), + ('Imputer', SimpleImputer(strategy='most_frequent')), ('VarianceThreshold', VarianceThreshold(threshold=0.1)), ('Estimator', DecisionTreeClassifier(max_depth=4))] ) @@ -1251,7 +1251,7 @@ def test_run_on_dataset_with_missing_labels(self): flow.name = 'dummy' task = openml.tasks.get_task(2) - model = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + model = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('Estimator', DecisionTreeClassifier())]) data_content, _, _, _ = _run_task_get_arffcontent( diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index 62d1a98c8..1d9c56d54 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,4 +1,4 @@ -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer class TestStudyFunctions(TestBase): @@ -30,12 +30,13 @@ def test_Figure1a(self): import sklearn.pipeline import sklearn.preprocessing import sklearn.tree + benchmark_suite = openml.study.get_study( 'OpenML100', 'tasks' ) # obtain the benchmark suite clf = sklearn.pipeline.Pipeline( steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.tree.DecisionTreeClassifier()) ] ) # build a sklearn classifier