From 5359f3ad081abbec16d8e09d8dbc61ab4bb53ad4 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Tue, 22 Jan 2019 18:44:56 -0500
Subject: [PATCH 1/3] Add ability to obtain per-fold evaluation measures

---
 openml/evaluations/evaluation.py              |  9 ++--
 openml/evaluations/functions.py               | 42 +++++++++++-----
 openml/runs/functions.py                      |  2 +-
 .../test_evaluation_functions.py              | 48 +++++++++++++++++--
 4 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py
index 70acf0029..f297d7054 100644
--- a/openml/evaluations/evaluation.py
+++ b/openml/evaluations/evaluation.py
@@ -1,6 +1,6 @@
 
 class OpenMLEvaluation(object):
-    '''
+    """
     Contains all meta-information about a run / evaluation combination,
     according to the evaluation/list function
 
@@ -26,11 +26,13 @@ class OpenMLEvaluation(object):
         the time of evaluation
     value : float
         the value of this evaluation
+    values : List[float]
+        the values per repeat and fold (if requested)
     array_data : str
         list of information per class (e.g., in case of precision, auroc, recall)
-    '''
+    """
     def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
-                 data_id, data_name, function, upload_time, value,
+                 data_id, data_name, function, upload_time, value, values,
                  array_data=None):
         self.run_id = run_id
         self.task_id = task_id
@@ -42,4 +44,5 @@ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name,
         self.function = function
         self.upload_time = upload_time
         self.value = value
+        self.values = values
         self.array_data = array_data
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index a7691a72e..88916026d 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -1,13 +1,14 @@
+import csv
 import xmltodict
 
-from openml.exceptions import OpenMLServerNoResult
 import openml.utils
 import openml._api_calls
 from ..evaluations import OpenMLEvaluation
 
 
 def list_evaluations(function, offset=None, size=None, id=None, task=None,
-                     setup=None, flow=None, uploader=None, tag=None):
+                     setup=None, flow=None, uploader=None, tag=None,
+                     per_fold=None):
     """
     List all run-evaluation pairs matching all of the given filters.
     (Supports large amount of results)
@@ -33,13 +34,19 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None,
 
     tag : str, optional
 
+    per_fold : bool, optional
+
     Returns
     -------
     dict
     """
+    if per_fold is not None:
+        per_fold = str(per_fold).lower()
 
-    return openml.utils._list_all(_list_evaluations, function, offset=offset, size=size,
-                                  id=id, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag)
+    return openml.utils._list_all(_list_evaluations, function, offset=offset,
+                                  size=size, id=id, task=task, setup=setup,
+                                  flow=flow, uploader=uploader, tag=tag,
+                                  per_fold=per_fold)
 
 
 def _list_evaluations(function, id=None, task=None,
@@ -94,11 +101,12 @@ def _list_evaluations(function, id=None, task=None,
 def __list_evaluations(api_call):
     """Helper function to parse API calls which are lists of runs"""
     xml_string = openml._api_calls._perform_api_call(api_call)
+    print(xml_string)
     evals_dict = xmltodict.parse(xml_string, force_list=('oml:evaluation',))
     # Minimalistic check if the XML is useful
     if 'oml:evaluations' not in evals_dict:
-        raise ValueError('Error in return XML, does not contain "oml:evaluations": %s'
-                         % str(evals_dict))
+        raise ValueError('Error in return XML, does not contain '
+                         '"oml:evaluations": %s' % str(evals_dict))
 
     assert type(evals_dict['oml:evaluations']['oml:evaluation']) == list, \
         type(evals_dict['oml:evaluations'])
@@ -106,15 +114,25 @@ def __list_evaluations(api_call):
     evals = dict()
     for eval_ in evals_dict['oml:evaluations']['oml:evaluation']:
         run_id = int(eval_['oml:run_id'])
+        value = None
+        values = None
         array_data = None
+        if 'oml:value' in eval_:
+            value = float(eval_['oml:value'])
+        if 'oml:values' in eval_:
+            values = csv.reader(eval_['oml:values'])
         if 'oml:array_data' in eval_:
             array_data = eval_['oml:array_data']
 
-        evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']), int(eval_['oml:task_id']),
-                                      int(eval_['oml:setup_id']), int(eval_['oml:flow_id']),
-                                      eval_['oml:flow_name'], eval_['oml:data_id'],
-                                      eval_['oml:data_name'], eval_['oml:function'],
-                                      eval_['oml:upload_time'], float(eval_['oml:value']),
-                                      array_data)
+        evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
+                                         int(eval_['oml:task_id']),
+                                         int(eval_['oml:setup_id']),
+                                         int(eval_['oml:flow_id']),
+                                         eval_['oml:flow_name'],
+                                         eval_['oml:data_id'],
+                                         eval_['oml:data_name'],
+                                         eval_['oml:function'],
+                                         eval_['oml:upload_time'],
+                                         value, values, array_data)
 
     return evals
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 9dcb96a42..5dbfe1948 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -179,7 +179,7 @@ def _publish_flow_if_necessary(flow):
     except OpenMLServerException as e:
         if e.message == "flow already exists":
             # TODO: JvR: the following lines of code can be replaced by
-            # a pass (after changing the unit test) as run_flow_on_task does
+            # a pass (after changing the unit tests) as run_flow_on_task does
             # not longer rely on it
             flow_id = openml.flows.flow_exists(flow.name,
                                                flow.external_version)
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index be55c2cd8..598655de9 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -2,6 +2,7 @@
 import openml.evaluations
 from openml.testing import TestBase
 
+
 class TestEvaluationFunctions(TestBase):
     _multiprocess_can_split_ = True
 
@@ -15,6 +16,10 @@ def test_evaluation_list_filter_task(self):
         self.assertGreater(len(evaluations), 100)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].task_id, task_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_filter_uploader_ID_16(self):
         openml.config.server = self.production_server
@@ -23,7 +28,7 @@ def test_evaluation_list_filter_uploader_ID_16(self):
 
         evaluations = openml.evaluations.list_evaluations("predictive_accuracy", uploader=[uploader_id])
 
-        self.assertGreater(len(evaluations), 100)
+        self.assertGreater(len(evaluations), 50)
 
     def test_evaluation_list_filter_uploader_ID_10(self):
         openml.config.server = self.production_server
@@ -32,9 +37,13 @@ def test_evaluation_list_filter_uploader_ID_10(self):
 
         evaluations = openml.evaluations.list_evaluations("predictive_accuracy", setup=[setup_id])
 
-        self.assertGreater(len(evaluations), 100)
+        self.assertGreater(len(evaluations), 50)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].setup_id, setup_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_filter_flow(self):
         openml.config.server = self.production_server
@@ -46,17 +55,25 @@ def test_evaluation_list_filter_flow(self):
         self.assertGreater(len(evaluations), 2)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].flow_id, flow_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_filter_run(self):
         openml.config.server = self.production_server
 
-        run_id = 1
+        run_id = 12
 
         evaluations = openml.evaluations.list_evaluations("predictive_accuracy", id=[run_id])
 
         self.assertEquals(len(evaluations), 1)
         for run_id in evaluations.keys():
             self.assertEquals(evaluations[run_id].run_id, run_id)
+            # default behaviour of this method: return aggregated results (not
+            # per fold)
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)
 
     def test_evaluation_list_limit(self):
         openml.config.server = self.production_server
@@ -70,3 +87,28 @@ def test_list_evaluations_empty(self):
             raise ValueError('UnitTest Outdated, got somehow results')
 
         self.assertIsInstance(evaluations, dict)
+
+    def test_evaluation_list_per_fold(self):
+        openml.config.server = self.production_server
+        size = 1000
+        task_ids = [6]
+        uploader_ids = [1]
+        flow_ids = [6969]
+
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=size, offset=0, task=task_ids,
+            flow=flow_ids, uploader=uploader_ids, per_fold=True)
+
+        self.assertEquals(len(evaluations), size)
+        for run_id in evaluations.keys():
+            self.assertIsNone(evaluations[run_id].value)
+            self.assertIsNotNone(evaluations[run_id].values)
+            # potentially we could also test array values, but these might be
+            # added in the future
+
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=size, offset=0, task=task_ids,
+            flow=flow_ids, uploader=uploader_ids, per_fold=False)
+        for run_id in evaluations.keys():
+            self.assertIsNotNone(evaluations[run_id].value)
+            self.assertIsNone(evaluations[run_id].values)

From 1884c6c5894803e371d2744266c33a81ab7ed7d4 Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Wed, 23 Jan 2019 15:17:38 -0500
Subject: [PATCH 2/3] Parse per-fold values with json.loads; remove debug print

---
 openml/evaluations/functions.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 88916026d..02a3152bb 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -1,4 +1,4 @@
-import csv
+import json
 import xmltodict
 
 import openml.utils
@@ -101,7 +101,6 @@ def _list_evaluations(function, id=None, task=None,
 def __list_evaluations(api_call):
     """Helper function to parse API calls which are lists of runs"""
     xml_string = openml._api_calls._perform_api_call(api_call)
-    print(xml_string)
     evals_dict = xmltodict.parse(xml_string, force_list=('oml:evaluation',))
     # Minimalistic check if the XML is useful
     if 'oml:evaluations' not in evals_dict:
@@ -120,7 +119,7 @@ def __list_evaluations(api_call):
         if 'oml:value' in eval_:
             value = float(eval_['oml:value'])
         if 'oml:values' in eval_:
-            values = csv.reader(eval_['oml:values'])
+            values = json.loads(eval_['oml:values'])
         if 'oml:array_data' in eval_:
             array_data = eval_['oml:array_data']
 

From 1457bc052a85c749f8af1ae7dc0f542e53074b4f Mon Sep 17 00:00:00 2001
From: janvanrijn <janvanrijn@gmail.com>
Date: Mon, 11 Feb 2019 15:52:34 +0100
Subject: [PATCH 3/3] Pass show_errors=True to list_runs in test_get_runs_list

---
 tests/test_runs/test_run_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 0c983d861..1bee66d3d 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -999,7 +999,7 @@ def _check_run(self, run):
     def test_get_runs_list(self):
         # TODO: comes from live, no such lists on test
         openml.config.server = self.production_server
-        runs = openml.runs.list_runs(id=[2])
+        runs = openml.runs.list_runs(id=[2], show_errors=True)
         self.assertEqual(len(runs), 1)
         for rid in runs:
             self._check_run(runs[rid])