diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
new file mode 100644
index 000000000..714e64221
--- /dev/null
+++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
@@ -0,0 +1,80 @@
+"""
+================================
+Plotting hyperparameter surfaces
+================================
+
+Fetch SVM evaluations from OpenML and plot accuracy over ``C`` and ``gamma``.
+"""
+import openml
+import numpy as np
+
+####################################################################################################
+# First step - obtaining the data
+# ===============================
+# First, we need to choose an SVM flow, for example 8353, and a task. Finding their IDs is not
+# part of this tutorial; this could for example be done via the website.
+#
+# For this we use the function ``list_evaluations_setups`` which can automatically join
+# evaluations conducted by the server with the hyperparameter settings extracted from the
+# uploaded runs (called *setup*).
+df = openml.evaluations.list_evaluations_setups(
+    function='predictive_accuracy',
+    flow=[8353],
+    task=[6],
+    output_format='dataframe',
+    # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
+    # the dataframe would contain a field ``parameters`` containing an unparsed dictionary.
+    parameters_in_separate_columns=True,
+)
+print(df.head(n=10))
+
+####################################################################################################
+# We can see all the hyperparameter names in the columns of the dataframe:
+for name in df.columns:
+    print(name)
+
+####################################################################################################
+# Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we
+# can nicely plot them (base-10 logarithm, matching the axis labels used below).
+hyperparameters = ['sklearn.svm.classes.SVC(16)_C', 'sklearn.svm.classes.SVC(16)_gamma']
+df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10)
+
+####################################################################################################
+# Option 1 - plotting via the pandas helper functions
+# ===================================================
+#
+df.plot.hexbin(
+    x='sklearn.svm.classes.SVC(16)_C',
+    y='sklearn.svm.classes.SVC(16)_gamma',
+    C='value',
+    reduce_C_function=np.mean,
+    gridsize=25,
+    title='SVM performance landscape',
+)
+
+####################################################################################################
+# Option 2 - plotting via matplotlib
+# ==================================
+#
+import matplotlib.pyplot as plt
+
+fig, ax = plt.subplots()
+
+C = df['sklearn.svm.classes.SVC(16)_C']
+gamma = df['sklearn.svm.classes.SVC(16)_gamma']
+score = df['value']
+
+# Plotting all evaluations:
+ax.plot(C, gamma, 'ko', ms=1)
+# Create a contour plot
+cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+# Adjusting the colorbar
+fig.colorbar(cntr, ax=ax, label="accuracy")
+# Adjusting the axis limits
+ax.set(
+    xlim=(min(C), max(C)),
+    ylim=(min(gamma), max(gamma)),
+    xlabel="C (log10)",
+    ylabel="gamma (log10)",
+)
+ax.set_title('SVM performance landscape')
diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py
new file mode 100644
index 000000000..0d75c928e
--- /dev/null
+++ b/tests/test_evaluations/test_evaluations_example.py
@@ -0,0 +1,31 @@
+import unittest
+
+
+class TestEvaluationsExample(unittest.TestCase):
+
+    def test_example_python_paper(self):
+        # Example script which will appear in the upcoming OpenML-Python paper
+        # This test ensures that the example will keep running!
+
+        import openml
+        import numpy as np
+        import matplotlib.pyplot as plt
+
+        df = openml.evaluations.list_evaluations_setups(
+            'predictive_accuracy',
+            flow=[8353],
+            task=[6],
+            output_format='dataframe',
+            parameters_in_separate_columns=True,
+        )  # Choose an SVM flow, for example 8353, and a task.
+
+        hp_names = ['sklearn.svm.classes.SVC(16)_C', 'sklearn.svm.classes.SVC(16)_gamma']
+        df[hp_names] = df[hp_names].astype(float).apply(np.log10)
+        C, gamma, score = df[hp_names[0]], df[hp_names[1]], df['value']
+
+        cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+        plt.colorbar(cntr, label="accuracy")
+        plt.xlim((min(C), max(C)))
+        plt.ylim((min(gamma), max(gamma)))
+        plt.xlabel("C (log10)")
+        plt.ylabel("gamma (log10)")