From 5665c9788d9fb4f8d4e491a134118b1ef193abd9 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 24 Jul 2019 15:39:48 -0400 Subject: [PATCH 01/17] test against scikit-learn 0.21 --- .travis.yml | 2 +- .../test_sklearn_extension.py | 31 +++++++++---------- tests/test_flows/test_flow.py | 11 +++---- tests/test_runs/test_run.py | 10 +++--- tests/test_runs/test_run_functions.py | 21 +++++++------ tests/test_study/test_study_examples.py | 3 +- 6 files changed, 38 insertions(+), 40 deletions(-) diff --git a/.travis.yml b/.travis.yml index 675186469..96effeee9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,7 +18,7 @@ env: - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0" - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" # Checks for older scikit-learn versions (which also don't nicely work with # Python3.7) - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 2728076fe..67b5cc419 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -27,10 +27,6 @@ import sklearn.tree import sklearn.cluster -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer import openml from openml.extensions.sklearn import SklearnExtension @@ -39,6 +35,7 @@ from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace from openml.testing import TestBase +from 
openml._backport import SimpleImputer this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) @@ -941,7 +938,7 @@ def test_illegal_parameter_names(self): def test_illegal_parameter_names_pipeline(self): # illegal name: steps steps = [ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')), @@ -954,7 +951,7 @@ def test_illegal_parameter_names_featureunion(self): # illegal name: transformer_list transformer_list = [ ('transformer_list', - Imputer(strategy='median')), + SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')) @@ -1045,7 +1042,7 @@ def test_deserialize_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.tree.DecisionTreeClassifier())] pipe_orig = sklearn.pipeline.Pipeline(steps=steps) @@ -1069,7 +1066,7 @@ def test_deserialize_adaboost_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.ensemble.AdaBoostClassifier( sklearn.tree.DecisionTreeClassifier()))] @@ -1095,7 +1092,7 @@ def test_deserialize_complex_with_defaults(self): # method to return a flow that contains default hyperparameter # settings. 
steps = [ - ('Imputer', Imputer()), + ('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ( 'Estimator', @@ -1299,7 +1296,7 @@ def test_run_model_on_fold_classification_1(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeClassifier()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1425,11 +1422,11 @@ def predict_proba(*args, **kwargs): y_train = y[train_indices] X_test = X[test_indices] clf1 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.naive_bayes.GaussianNB()) ]) clf2 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', HardNaiveBayes()) ]) @@ -1482,7 +1479,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeRegressor()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1527,7 +1524,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format='array') pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.cluster.KMeans()), ]) # TODO add some mocking here to actually test the innards of this function, too! 
@@ -1616,7 +1613,7 @@ def test_trim_flow_name(self): long = """sklearn.pipeline.Pipeline( columntransformer=sklearn.compose._column_transformer.ColumnTransformer( numeric=sklearn.pipeline.Pipeline( - imputer=sklearn.preprocessing.imputation.Imputer, + SimpleImputer=sklearn.preprocessing.imputation.Imputer, standardscaler=sklearn.preprocessing.data.StandardScaler), nominal=sklearn.pipeline.Pipeline( simpleimputer=sklearn.impute.SimpleImputer, @@ -1640,7 +1637,7 @@ def test_trim_flow_name(self): self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) long = """sklearn.pipeline.Pipeline( - Imputer=sklearn.preprocessing.imputation.Imputer, + SimpleImputer=sklearn.preprocessing.imputation.Imputer, VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 Estimator=sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.tree.tree.DecisionTreeClassifier))""" @@ -1650,7 +1647,7 @@ def test_trim_flow_name(self): long = """sklearn.model_selection._search.RandomizedSearchCV( estimator=sklearn.pipeline.Pipeline( - Imputer=sklearn.preprocessing.imputation.Imputer, + SimpleImputer=sklearn.preprocessing.imputation.Imputer, classifier=sklearn.ensemble.forest.RandomForestClassifier))""" short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" long_stripped, _ = re.subn(r'\s', '', long) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 6e7eb7fbb..f6829838c 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -19,10 +19,7 @@ import sklearn.naive_bayes import sklearn.tree -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer +from openml._backport import SimpleImputer import xmltodict @@ -76,6 +73,8 @@ def test_get_flow(self): self.assertEqual(subflow_3.parameters['L'], '-1') self.assertEqual(len(subflow_3.components), 0) + + def 
test_get_structure(self): # also responsible for testing: flow.get_subflow # We need to use the production server here because 4024 is not the @@ -302,8 +301,8 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): flow.flow_id)) fixture = ( - "The flow on the server is inconsistent with the local flow. " - "The server flow ID is 1. Please check manually and remove " + "Flow was not stored correctly on the server. " + "New flow ID is 1. Please check manually and remove " "the flow if necessary! Error is:\n" "'Flow sklearn.ensemble.forest.RandomForestClassifier: " "values for attribute 'name' differ: " diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 23ab43df0..a9651a785 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -7,8 +7,8 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from sklearn.preprocessing import Imputer +from openml._backport import SimpleImputer from openml.testing import TestBase import openml import openml.extensions.sklearn @@ -106,7 +106,7 @@ def _check_array(array, type_): def test_to_from_filesystem_vanilla(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) task = openml.tasks.get_task(119) @@ -139,7 +139,7 @@ def test_to_from_filesystem_vanilla(self): def test_to_from_filesystem_search(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) model = GridSearchCV( @@ -175,7 +175,7 @@ def test_to_from_filesystem_search(self): def test_to_from_filesystem_no_model(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) @@ 
-205,7 +205,7 @@ def test_publish_with_local_loaded_flow(self): extension = openml.extensions.sklearn.SklearnExtension() model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index bd123cd37..56ee7d909 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -24,11 +24,12 @@ ) from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskTypeEnum +from openml._backport import SimpleImputer from sklearn.naive_bayes import GaussianNB from sklearn.model_selection._search import BaseSearchCV from sklearn.tree import DecisionTreeClassifier -from sklearn.preprocessing.imputation import Imputer + from sklearn.dummy import DummyClassifier from sklearn.preprocessing import StandardScaler from sklearn.feature_selection import VarianceThreshold @@ -550,7 +551,7 @@ def get_ct_cf(nominal_indices, numeric_indices): '62501', sentinel=sentinel) def test_run_and_upload_decision_tree_pipeline(self): - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -657,7 +658,7 @@ def test_learning_curve_task_2(self): num_folds = 10 num_samples = 8 - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -734,7 +735,7 @@ def _test_local_evaluations(self, run): def test_local_run_swapped_parameter_order_model(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = 
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -752,7 +753,7 @@ def test_local_run_swapped_parameter_order_model(self): def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) flow = self.extension.model_to_flow(clf) @@ -771,7 +772,7 @@ def test_local_run_swapped_parameter_order_flow(self): def test_local_run_metric_score(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -798,7 +799,7 @@ def test_online_run_metric_score(self): def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', GaussianNB())]) task = openml.tasks.get_task(11) @@ -882,12 +883,12 @@ def test__run_exists(self): rs = 1 clfs = [ sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='mean')), + ('Imputer', SimpleImputer(strategy='mean')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', DecisionTreeClassifier(max_depth=4)) ]), sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='most_frequent')), + ('Imputer', SimpleImputer(strategy='most_frequent')), ('VarianceThreshold', VarianceThreshold(threshold=0.1)), ('Estimator', DecisionTreeClassifier(max_depth=4))] ) @@ -1251,7 +1252,7 @@ def test_run_on_dataset_with_missing_labels(self): flow.name = 'dummy' task = openml.tasks.get_task(2) - model = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + model = Pipeline(steps=[('Imputer', 
SimpleImputer(strategy='median')), ('Estimator', DecisionTreeClassifier())]) data_content, _, _, _ = _run_task_get_arffcontent( diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index 62d1a98c8..c4919abb9 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -30,12 +30,13 @@ def test_Figure1a(self): import sklearn.pipeline import sklearn.preprocessing import sklearn.tree + from openml._backport import SimpleImputer benchmark_suite = openml.study.get_study( 'OpenML100', 'tasks' ) # obtain the benchmark suite clf = sklearn.pipeline.Pipeline( steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.tree.DecisionTreeClassifier()) ] ) # build a sklearn classifier From fba9efa6c6e6d8b7cbb2939f443b961281fe64ce Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 11:27:27 -0400 Subject: [PATCH 02/17] fix call to roc_auc --- tests/test_runs/test_run_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 56ee7d909..746546c04 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -715,9 +715,9 @@ def _test_local_evaluations(self, run): np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores) - # also check if we can obtain some other scores: # TODO: how to do AUC? 
+ # also check if we can obtain some other scores: tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}), - (sklearn.metrics.auc, {'reorder': True}), + (sklearn.metrics.roc_auc, {}), (sklearn.metrics.average_precision_score, {}), (sklearn.metrics.jaccard_similarity_score, {}), (sklearn.metrics.precision_score, {'average': 'macro'}), From 29b13c1927a8f8b6d457ba68443be8e6b3cba9e1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 11:44:18 -0400 Subject: [PATCH 03/17] added verbose parameter to pipeline in 0.21 --- .../test_sklearn_extension/test_sklearn_extension.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index a3e6f8d00..5073c605f 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1012,18 +1012,25 @@ def test_paralizable_check(self): self.extension._prevent_optimize_n_jobs(model) def test__get_fn_arguments_with_defaults(self): - if LooseVersion(sklearn.__version__) < "0.19": + sklearn_version = LooseVersion(sklearn.__version__) + if sklearn_version < "0.19": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 15), (sklearn.tree.DecisionTreeClassifier.__init__, 12), (sklearn.pipeline.Pipeline.__init__, 0) ] - else: + elif sklearn_version < "0.21": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 1) ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 16), + (sklearn.tree.DecisionTreeClassifier.__init__, 13), + (sklearn.pipeline.Pipeline.__init__, 2) + ] for fn, num_params_with_defaults in fns: defaults, defaultless = ( From 609ad77cf896fe0c5cbc8d70eb0a9b6906c511cb Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 
25 Jul 2019 11:46:41 -0400 Subject: [PATCH 04/17] remove no-longer-existent categorical_features parameter --- .../test_sklearn_extension/test_sklearn_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 5073c605f..3b87ab0d0 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -621,7 +621,7 @@ def test_serialize_feature_union_switched_names(self): .format(module_name_encoder)) def test_serialize_complex_flow(self): - ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0]) + ohe = sklearn.preprocessing.OneHotEncoder() scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.tree.DecisionTreeClassifier()) From 1b5dbd125a761ea5587b2e35b25a29dd6fa7766a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 11:52:05 -0400 Subject: [PATCH 05/17] more pipeline parameter checks --- .../test_sklearn_extension/test_sklearn_extension.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 3b87ab0d0..e502d309c 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -282,11 +282,14 @@ def test_serialize_pipeline(self): # Comparing the pipeline # The parameters only have the name of base objects(not the whole flow) # as value - # memory parameter has been added in 0.19 + # memory parameter has been added in 0.19, verbose in 0.21 if LooseVersion(sklearn.__version__) < "0.19":
self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) + # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -371,8 +374,10 @@ def test_serialize_pipeline_clustering(self): # memory parameter has been added in 0.19 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( From ebaa18fdca666226c42577fa8d36b38812a1b1d4 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 11:52:13 -0400 Subject: [PATCH 06/17] more imputer replacements --- tests/test_flows/test_flow.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index f6829838c..f539a86e5 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -74,7 +74,6 @@ def test_get_flow(self): self.assertEqual(len(subflow_3.components), 0) - def test_get_structure(self): # also responsible for testing: flow.get_subflow # We need to use the production server here because 4024 is not the @@ -317,8 +316,8 @@ def test_illegal_flow(self): # should throw error as it contains two imputers illegal = sklearn.pipeline.Pipeline( steps=[ - ('imputer1', Imputer()), - ('imputer2', Imputer()), + ('imputer1', SimpleImputer()), + ('imputer2', SimpleImputer()), ('classif', sklearn.tree.DecisionTreeClassifier()) ] ) @@ -349,7 +348,7 @@ def test_existing_flow_exists(self): if LooseVersion(sklearn.__version__) >= '0.20': 
ohe_params['categories'] = 'auto' steps = [ - ('imputation', Imputer(strategy='median')), + ('imputation', SimpleImputer(strategy='median')), ('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)), ( 'variencethreshold', From 1098594b5ac91cc11b46697204f002107f588fc8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 11:53:42 -0400 Subject: [PATCH 07/17] don't break on dev versions --- tests/test_flows/test_flow_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index de933731a..95b4fa3f0 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -288,7 +288,7 @@ def test_get_flow_reinstantiate_model_no_extension(self): def test_get_flow_reinstantiate_model_wrong_version(self): # Note that CI does not test against 0.19.1. openml.config.server = self.production_server - _, sklearn_major, _ = LooseVersion(sklearn.__version__).version + _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] flow = 8175 expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.' 
self.assertRaisesRegex(ValueError, From 2eb7f7b2e90f50100f9dd5a2cce00246afeb6d41 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 11:53:51 -0400 Subject: [PATCH 08/17] typo on roc_auc_score name --- tests/test_runs/test_run_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 746546c04..ab4999586 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -717,7 +717,7 @@ def _test_local_evaluations(self, run): # also check if we can obtain some other scores: tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}), - (sklearn.metrics.roc_auc, {}), + (sklearn.metrics.roc_auc_score, {}), (sklearn.metrics.average_precision_score, {}), (sklearn.metrics.jaccard_similarity_score, {}), (sklearn.metrics.precision_score, {'average': 'macro'}), From 0b033d9a938f0b3187e4bc78403a741c1012a509 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 12:07:30 -0400 Subject: [PATCH 09/17] use ordered dicts, avoid nan comparison --- .../test_sklearn_extension.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index e502d309c..3554555d1 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -749,15 +749,16 @@ def test_serialize_simple_parameter_grid(self): # Examples from the scikit-learn documentation models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()] grids = \ - [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, - {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], - 'kernel': ['rbf']}], - {"max_depth": [3, None], - "max_features": [1, 3, 10], - "min_samples_split": [1, 3, 10], - 
"min_samples_leaf": [1, 3, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]}] + [[OrderedDict({'C': [1, 10, 100, 1000], 'kernel': ['linear']}), + OrderedDict({'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], + 'kernel': ['rbf']})], + OrderedDict({"bootstrap": [True, False], + "criterion": ["gini", "entropy"], + "max_depth": [3, None], + "max_features": [1, 3, 10], + "min_samples_leaf": [1, 3, 10], + "min_samples_split": [1, 3, 10] + })] for grid, model in zip(grids, models): serialized = self.extension.model_to_flow(grid) @@ -765,9 +766,9 @@ def test_serialize_simple_parameter_grid(self): self.assertEqual(deserialized, grid) self.assertIsNot(deserialized, grid) - + # providing error_score because nan != nan hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, estimator=model) + param_grid=grid, estimator=model, error_score=-1000) serialized = self.extension.model_to_flow(hpo) deserialized = self.extension.flow_to_model(serialized) From 4e9f75c8fb6f5c120f340f548c1947772ff699c5 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 12:09:56 -0400 Subject: [PATCH 10/17] undid weird merge artifact --- tests/test_flows/test_flow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index f539a86e5..841e9d1cb 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -300,8 +300,8 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): flow.flow_id)) fixture = ( - "Flow was not stored correctly on the server. " - "New flow ID is 1. Please check manually and remove " + "The flow on the server is inconsistent with the local flow. " + "The server flow ID is 1. Please check manually and remove " "the flow if necessary! 
Error is:\n" "'Flow sklearn.ensemble.forest.RandomForestClassifier: " "values for attribute 'name' differ: " From 54382a4ea0ab0ff4bbc0aa0df205932494f40f45 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 13:05:15 -0400 Subject: [PATCH 11/17] add missing file whoops --- openml/_backport.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 openml/_backport.py diff --git a/openml/_backport.py b/openml/_backport.py new file mode 100644 index 000000000..b8642dd10 --- /dev/null +++ b/openml/_backport.py @@ -0,0 +1,6 @@ +try: + from sklearn.impute import SimpleImputer +except ImportError: + from sklearn.preprocessing.impute import Imputer as SimpleImputer + +__all__ = ['SimpleImputer'] From a8083571ffd14c9e20d7acc861f02958d9a16d11 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 13:07:01 -0400 Subject: [PATCH 12/17] flake8 --- .../test_sklearn_extension/test_sklearn_extension.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 3554555d1..49c753ee8 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1311,7 +1311,7 @@ def test_run_model_on_fold_classification_1(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', SimpleImputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeClassifier()), ]) # TODO add some mocking here to actually test the innards of this function, too! 
@@ -1437,11 +1437,11 @@ def predict_proba(*args, **kwargs): y_train = y[train_indices] X_test = X[test_indices] clf1 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', SimpleImputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.naive_bayes.GaussianNB()) ]) clf2 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', SimpleImputer()), + ('imputer', SimpleImputer()), ('estimator', HardNaiveBayes()) ]) @@ -1494,7 +1494,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', SimpleImputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeRegressor()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1539,7 +1539,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format='array') pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', SimpleImputer()), + ('imp', SimpleImputer()), ('clf', sklearn.cluster.KMeans()), ]) # TODO add some mocking here to actually test the innards of this function, too! 
From 5d4db1ebfdd807c3c7955ad1f274333a68602d51 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 25 Jul 2019 14:00:32 -0400 Subject: [PATCH 13/17] try fixing import in backport, pep8 --- openml/_backport.py | 2 +- tests/test_flows/test_flow.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/openml/_backport.py b/openml/_backport.py index b8642dd10..cc68ab222 100644 --- a/openml/_backport.py +++ b/openml/_backport.py @@ -1,6 +1,6 @@ try: from sklearn.impute import SimpleImputer except ImportError: - from sklearn.preprocessing.impute import Imputer as SimpleImputer + from sklearn.preprocessing import Imputer as SimpleImputer __all__ = ['SimpleImputer'] diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 841e9d1cb..eb5d23e7f 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -73,7 +73,6 @@ def test_get_flow(self): self.assertEqual(subflow_3.parameters['L'], '-1') self.assertEqual(len(subflow_3.components), 0) - def test_get_structure(self): # also responsible for testing: flow.get_subflow # We need to use the production server here because 4024 is not the From 1686dfd75014cb603e8ae85e162f2ca958c24fc1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 26 Jul 2019 10:47:13 -0400 Subject: [PATCH 14/17] move SimpleImputer to testing module --- openml/_backport.py | 6 ------ openml/testing.py | 8 +++++++- .../test_sklearn_extension/test_sklearn_extension.py | 4 ++-- tests/test_flows/test_flow.py | 4 +--- tests/test_runs/test_run_functions.py | 3 +-- tests/test_study/test_study_examples.py | 4 ++-- 6 files changed, 13 insertions(+), 16 deletions(-) delete mode 100644 openml/_backport.py diff --git a/openml/_backport.py b/openml/_backport.py deleted file mode 100644 index cc68ab222..000000000 --- a/openml/_backport.py +++ /dev/null @@ -1,6 +0,0 @@ -try: - from sklearn.impute import SimpleImputer -except ImportError: - from sklearn.preprocessing import Imputer as SimpleImputer - 
-__all__ = ['SimpleImputer'] diff --git a/openml/testing.py b/openml/testing.py index dad1aa9f5..c5a12068c 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -319,4 +319,10 @@ def _check_fold_timing_evaluations( self.assertLessEqual(evaluation, max_val) -__all__ = ['TestBase'] +try: + from sklearn.impute import SimpleImputer +except ImportError: + from sklearn.preprocessing import Imputer as SimpleImputer + + +__all__ = ['TestBase', 'SimpleImputer'] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 49c753ee8..3fbe94b5d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -35,8 +35,8 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase -from openml._backport import SimpleImputer +from openml.testing import TestBase, SimpleImputer + this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index eb5d23e7f..25e2dacfb 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -19,15 +19,13 @@ import sklearn.naive_bayes import sklearn.tree -from openml._backport import SimpleImputer - import xmltodict import openml from openml._api_calls import _perform_api_call import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml.utils diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index ab4999586..2b09ef501 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -17,14 +17,13 @@ import pandas as pd import 
openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer from openml.runs.functions import ( _run_task_get_arffcontent, run_exists, ) from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskTypeEnum -from openml._backport import SimpleImputer from sklearn.naive_bayes import GaussianNB from sklearn.model_selection._search import BaseSearchCV diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index c4919abb9..1d9c56d54 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,4 +1,4 @@ -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer class TestStudyFunctions(TestBase): @@ -30,7 +30,7 @@ def test_Figure1a(self): import sklearn.pipeline import sklearn.preprocessing import sklearn.tree - from openml._backport import SimpleImputer + benchmark_suite = openml.study.get_study( 'OpenML100', 'tasks' ) # obtain the benchmark suite From 34c24236523252e976ebc7e92641d16193c14207 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 26 Jul 2019 10:51:47 -0400 Subject: [PATCH 15/17] don't trust dicts to be ordered --- .../test_sklearn_extension.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 3fbe94b5d..7a854a20e 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -749,16 +749,16 @@ def test_serialize_simple_parameter_grid(self): # Examples from the scikit-learn documentation models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()] grids = \ - [[OrderedDict({'C': [1, 10, 100, 1000], 'kernel': ['linear']}), - OrderedDict({'C': [1, 10, 100, 1000], 'gamma': [0.001, 
0.0001], - 'kernel': ['rbf']})], - OrderedDict({"bootstrap": [True, False], - "criterion": ["gini", "entropy"], - "max_depth": [3, None], - "max_features": [1, 3, 10], - "min_samples_leaf": [1, 3, 10], - "min_samples_split": [1, 3, 10] - })] + [[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]), + OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]), + ('kernel', ['rbf'])])], + OrderedDict([("bootstrap", [True, False]), + ("criterion", ["gini", "entropy"]), + ("max_depth", [3, None]), + ("max_features", [1, 3, 10]), + ("min_samples_leaf", [1, 3, 10]), + ("min_samples_split", [1, 3, 10]) + ])] for grid, model in zip(grids, models): serialized = self.extension.model_to_flow(grid) From 87a1366b1d36b6318614fdf8366ba19679d5421b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 26 Jul 2019 11:06:01 -0400 Subject: [PATCH 16/17] run CI mostly on 0.21.2 --- .travis.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 96effeee9..beaa3b53e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,11 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" # Checks for older scikit-learn versions (which also don't nicely work with # Python3.7) - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" From bdbb1f43244f32a6d33026fc405326882eef8e55 Mon Sep 17 
00:00:00 2001 From: Andreas Mueller Date: Fri, 26 Jul 2019 11:15:23 -0400 Subject: [PATCH 17/17] failed to safe lol --- .../test_sklearn_extension/test_sklearn_extension.py | 2 +- tests/test_runs/test_run.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 2203a568e..8bc615516 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1247,7 +1247,7 @@ def test_run_model_on_task(self): class MyPipe(sklearn.pipeline.Pipeline): pass task = openml.tasks.get_task(1) - pipe = MyPipe([('imp', Imputer()), + pipe = MyPipe([('imp', SimpleImputer()), ('dummy', sklearn.dummy.DummyClassifier())]) openml.runs.run_model_on_task(pipe, task) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index a9651a785..88fe8d6ef 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -8,8 +8,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from openml._backport import SimpleImputer -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn