diff --git a/openml/base.py b/openml/base.py new file mode 100644 index 000000000..64d8a770a --- /dev/null +++ b/openml/base.py @@ -0,0 +1,129 @@ +from abc import ABC, abstractmethod +from collections import OrderedDict +import re +from typing import Optional, List, Tuple, Union +import webbrowser + +import xmltodict + +import openml.config +from .utils import _tag_openml_base + + +class OpenMLBase(ABC): + """ Base object for functionality that is shared across entities. """ + + def __repr__(self): + body_fields = self._get_repr_body_fields() + return self._apply_repr_template(body_fields) + + @property + @abstractmethod + def id(self) -> Optional[int]: + """ The id of the entity, it is unique for its entity type. """ + pass + + @property + def openml_url(self) -> Optional[str]: + """ The URL of the object on the server, if it was uploaded, else None. """ + if self.id is None: + return None + return self.__class__.url_for_id(self.id) + + @classmethod + def url_for_id(cls, id_: int) -> str: + """ Return the OpenML URL for the object of the class entity with the given id. """ + # Sample url for a flow: openml.org/f/123 + return "{}/{}/{}".format(openml.config.server_base_url, cls._entity_letter(), id_) + + @classmethod + def _entity_letter(cls) -> str: + """ Return the letter which represents the entity type in urls, e.g. 'f' for flow.""" + # We take advantage of the class naming convention (OpenMLX), + # which holds for all entities except studies and tasks, which overwrite this method. + return cls.__name__.lower()[len('OpenML'):][0] + + @abstractmethod + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. + + Returns + ------ + body_fields : List[Tuple[str, Union[str, int, List[str]]]] + A list of (name, value) pairs to display in the body of the __repr__. 
+ E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] + If value is a List of str, then each item of the list will appear in a separate row. + """ + # Should be implemented in the derived class. + pass + + def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: + """ Generates the header and formats the body for string representation of the object. + + Parameters + ---------- + body_fields: List[Tuple[str, str]] + A list of (name, value) pairs to display in the body of the __repr__. + """ + # We add spaces between capitals, e.g. ClassificationTask -> Classification Task + name_with_spaces = re.sub(r"(\w)([A-Z])", r"\1 \2", + self.__class__.__name__[len('OpenML'):]) + header_text = 'OpenML {}'.format(name_with_spaces) + header = '{}\n{}\n'.format(header_text, '=' * len(header_text)) + + longest_field_name_length = max(len(name) for name, value in body_fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in body_fields) + return header + body + + @abstractmethod + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. + + Uses OrderedDict to ensure consistent ordering when converting to xml. + The return value (OrderedDict) will be used to create the upload xml file. + The xml file must have the tags in exactly the order of the object's xsd. + (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/). + + Returns + ------- + OrderedDict + Entity represented as OrderedDict. + + """ + # Should be implemented in the derived class. + pass + + def _to_xml(self) -> str: + """ Generate xml representation of self for upload to server. 
""" + dict_representation = self._to_dict() + xml_representation = xmltodict.unparse(dict_representation, pretty=True) + + # A task may not be uploaded with the xml encoding specification: + # + encoding_specification, xml_body = xml_representation.split('\n', 1) + return xml_body + + def open_in_browser(self): + """ Opens the OpenML web page corresponding to this object in your default browser. """ + webbrowser.open(self.openml_url) + + def push_tag(self, tag: str): + """Annotates this entity with a tag on the server. + + Parameters + ---------- + tag : str + Tag to attach to the flow. + """ + _tag_openml_base(self, tag) + + def remove_tag(self, tag: str): + """Removes a tag from this entity on the server. + + Parameters + ---------- + tag : str + Tag to attach to the flow. + """ + _tag_openml_base(self, tag, untag=True) diff --git a/openml/config.py b/openml/config.py index 91d7345e0..0a2332e18 100644 --- a/openml/config.py +++ b/openml/config.py @@ -28,7 +28,8 @@ # Default values are actually added here in the _setup() function which is # called at the end of this module -server = _defaults['server'] +server = str(_defaults['server']) # so mypy knows it is a string +server_base_url = server[:-len('/api/v1/xml')] apikey = _defaults['apikey'] # The current cache directory (without the server name) cache_directory = _defaults['cachedir'] diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index be50c0378..61c7da000 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -15,15 +15,15 @@ from warnings import warn import openml._api_calls +from openml.base import OpenMLBase from .data_feature import OpenMLDataFeature from ..exceptions import PyOpenMLError -from ..utils import _tag_entity logger = logging.getLogger(__name__) -class OpenMLDataset(object): +class OpenMLDataset(OpenMLBase): """Dataset object. Allows fetching and uploading datasets to OpenML. 
@@ -184,11 +184,12 @@ def __init__(self, name, description, format=None, else: self.data_pickle_file = None - def __repr__(self): - header = "OpenML Dataset" - header = '{}\n{}\n'.format(header, '=' * len(header)) + @property + def id(self) -> Optional[int]: + return self.dataset_id - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. """ fields = {"Name": self.name, "Version": self.version, "Format": self.format, @@ -201,19 +202,14 @@ def __repr__(self): if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace('T', ' ') if self.dataset_id is not None: - fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id) + fields["OpenML URL"] = self.openml_url if self.qualities is not None and self.qualities['NumberOfInstances'] is not None: fields["# of instances"] = int(self.qualities['NumberOfInstances']) # determines the order in which the information will be printed order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL", "OpenML URL", "Data File", "Pickle File", "# of features", "# of instances"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body + return [(key, fields[key]) for key in order if key in fields] def __eq__(self, other): @@ -462,26 +458,6 @@ def _load_data(self): return data, categorical, attribute_names - def push_tag(self, tag): - """Annotates this data set with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the dataset. 
- """ - _tag_entity('data', self.dataset_id, tag) - - def remove_tag(self, tag): - """Removes a tag from this dataset on the server. - - Parameters - ---------- - tag : str - Tag to attach to the dataset. - """ - _tag_entity('data', self.dataset_id, tag, untag=True) - @staticmethod def _convert_array_format(data, array_format, attribute_names): """Convert a dataset to a given array format. @@ -796,14 +772,8 @@ def publish(self): self.dataset_id = int(response['oml:upload_data_set']['oml:id']) return self.dataset_id - def _to_xml(self): - """ Serialize object to xml for upload - - Returns - ------- - xml_dataset : str - XML description of the data. - """ + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. """ props = ['id', 'name', 'version', 'description', 'format', 'creator', 'contributor', 'collection_date', 'upload_date', 'language', 'licence', 'url', 'default_target_attribute', @@ -811,7 +781,7 @@ def _to_xml(self): 'citation', 'tag', 'visibility', 'original_data_url', 'paper_url', 'update_comment', 'md5_checksum'] - data_container = OrderedDict() + data_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' data_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')]) data_container['oml:data_set_description'] = data_dict @@ -820,14 +790,7 @@ def _to_xml(self): if content is not None: data_dict["oml:" + prop] = content - xml_string = xmltodict.unparse( - input_dict=data_container, - pretty=True, - ) - # A flow may not be uploaded with the xml encoding specification: - # - xml_string = xml_string.split('\n', 1)[-1] - return xml_string + return data_container def _check_qualities(qualities): diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 2dc5999cb..9d8507708 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -61,18 +61,17 @@ def __repr__(self): header = "OpenML 
Evaluation" header = '{}\n{}\n'.format(header, '=' * len(header)) - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) fields = {"Upload Date": self.upload_time, "Run ID": self.run_id, - "OpenML Run URL": "{}r/{}".format(base_url, self.run_id), + "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id), "Task ID": self.task_id, - "OpenML Task URL": "{}t/{}".format(base_url, self.task_id), + "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id), "Flow ID": self.flow_id, - "OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id), + "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "Setup ID": self.setup_id, "Data ID": self.data_id, "Data Name": self.data_name, - "OpenML Data URL": "{}d/{}".format(base_url, self.data_id), + "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id), "Metric Used": self.function, "Result": self.value} diff --git a/openml/flows/flow.py b/openml/flows/flow.py index ec3598914..7d66a8433 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -1,17 +1,16 @@ from collections import OrderedDict import os -from typing import Dict, List, Union # noqa: F401 +from typing import Dict, List, Union, Tuple, Optional # noqa: F401 import logging import xmltodict +from openml.base import OpenMLBase from ..extensions import get_extension_by_flow -from ..utils import extract_xml_tags, _tag_entity +from ..utils import extract_xml_tags -import openml.config - -class OpenMLFlow(object): +class OpenMLFlow(OpenMLBase): """OpenML Flow. Stores machine learning models. 
Flows should not be generated manually, but by the function @@ -137,6 +136,10 @@ def __init__(self, name, description, model, components, parameters, else: self._extension = extension + @property + def id(self) -> Optional[int]: + return self.flow_id + @property def extension(self): if self._extension is not None: @@ -145,20 +148,16 @@ def extension(self): raise RuntimeError("No extension could be found for flow {}: {}" .format(self.flow_id, self.name)) - def __repr__(self): - header = "OpenML Flow" - header = '{}\n{}\n'.format(header, '=' * len(header)) - - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. """ fields = {"Flow Name": self.name, "Flow Description": self.description, "Dependencies": self.dependencies} if self.flow_id is not None: + fields["Flow URL"] = self.openml_url + fields["Flow ID"] = str(self.flow_id) if self.version is not None: - fields["Flow ID"] = "{} (version {})".format(self.flow_id, self.version) - else: - fields["Flow ID"] = self.flow_id - fields["Flow URL"] = "{}f/{}".format(base_url, self.flow_id) + fields["Flow ID"] += " (version {})".format(self.version) if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace('T', ' ') if self.binary_url is not None: @@ -167,48 +166,10 @@ def __repr__(self): # determines the order in which the information will be printed order = ["Flow ID", "Flow URL", "Flow Name", "Flow Description", "Binary URL", "Upload Date", "Dependencies"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body - - def _to_xml(self) -> str: - """Generate xml representation of self 
for upload to server. - - Returns - ------- - str - Flow represented as XML string. - """ - flow_dict = self._to_dict() - flow_xml = xmltodict.unparse(flow_dict, pretty=True) - - # A flow may not be uploaded with the xml encoding specification: - # - flow_xml = flow_xml.split('\n', 1)[-1] - return flow_xml - - def _to_dict(self) -> dict: - """ Helper function used by _to_xml and itself. - - Creates a dictionary representation of self which can be serialized - to xml by the function _to_xml. Since a flow can contain subflows - (components) this helper function calls itself recursively to also - serialize these flows to dictionaries. - - Uses OrderedDict to ensure consistent ordering when converting to xml. - The return value (OrderedDict) will be used to create the upload xml - file. The xml file must have the tags in exactly the order given in the - xsd schema of a flow (see class docstring). + return [(key, fields[key]) for key in order if key in fields] - Returns - ------- - OrderedDict - Flow represented as OrderedDict. - - """ + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. """ flow_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' flow_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')]) # type: 'OrderedDict[str, Union[List, str]]' # noqa E501 flow_container['oml:flow'] = flow_dict @@ -506,26 +467,6 @@ def get_subflow(self, structure): structure.pop(0) return self.components[sub_identifier].get_subflow(structure) - def push_tag(self, tag): - """Annotates this flow with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the flow. - """ - _tag_entity('flow', self.flow_id, tag) - - def remove_tag(self, tag): - """Removes a tag from this flow on the server. - - Parameters - ---------- - tag : str - Tag to attach to the flow. 
- """ - _tag_entity('flow', self.flow_id, tag, untag=True) - def _copy_server_fields(source_flow, target_flow): fields_added_by_the_server = ['flow_id', 'uploader', 'version', diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 2aa3df85e..4389eb3c0 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -425,7 +425,7 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, # but the uploader has no control over them! 'tags'] ignored_by_python_api = ['binary_url', 'binary_format', 'binary_md5', - 'model'] + 'model', '_entity_id'] for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()): if key in generated_by_the_server + ignored_by_python_api: diff --git a/openml/runs/run.py b/openml/runs/run.py index 6a4818f30..08f99d345 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -1,7 +1,7 @@ from collections import OrderedDict import pickle import time -from typing import Any, IO, TextIO # noqa F401 +from typing import Any, IO, TextIO, List, Union, Tuple, Optional # noqa F401 import os import arff @@ -10,6 +10,7 @@ import openml import openml._api_calls +from openml.base import OpenMLBase from ..exceptions import PyOpenMLError from ..flows import get_flow from ..tasks import (get_task, @@ -19,10 +20,9 @@ OpenMLClusteringTask, OpenMLRegressionTask ) -from ..utils import _tag_entity -class OpenMLRun(object): +class OpenMLRun(OpenMLBase): """OpenML Run: result of running a model on an openml dataset. 
Parameters @@ -67,28 +67,30 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None, self.tags = tags self.predictions_url = predictions_url - def __repr__(self): - header = "OpenML Run" - header = '{}\n{}\n'.format(header, '=' * len(header)) + @property + def id(self) -> Optional[int]: + return self.run_id - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. """ fields = {"Uploader Name": self.uploader_name, "Metric": self.task_evaluation_measure, "Run ID": self.run_id, "Task ID": self.task_id, "Task Type": self.task_type, - "Task URL": "{}t/{}".format(base_url, self.task_id), + "Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id), "Flow ID": self.flow_id, "Flow Name": self.flow_name, - "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "Setup ID": self.setup_id, "Setup String": self.setup_string, "Dataset ID": self.dataset_id, - "Dataset URL": "{}d/{}".format(base_url, self.dataset_id)} + "Dataset URL": openml.datasets.OpenMLDataset.url_for_id(self.dataset_id)} if self.uploader is not None: - fields["Uploader Profile"] = "{}u/{}".format(base_url, self.uploader) + fields["Uploader Profile"] = "{}/u/{}".format(openml.config.server_base_url, + self.uploader) if self.run_id is not None: - fields["Run URL"] = "{}r/{}".format(base_url, self.run_id) + fields["Run URL"] = self.openml_url if self.evaluations is not None and self.task_evaluation_measure in self.evaluations: fields["Result"] = self.evaluations[self.task_evaluation_measure] @@ -96,15 +98,7 @@ def __repr__(self): order = ["Uploader Name", "Uploader Profile", "Metric", "Result", "Run ID", "Run URL", "Task ID", "Task Type", "Task URL", "Flow ID", "Flow Name", "Flow URL", "Setup ID", "Setup String", "Dataset ID", "Dataset URL"] - fields = [(key, fields[key]) for key 
in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body - - def _repr_pretty_(self, pp, cycle): - pp.text(str(self)) + return [(key, fields[key]) for key in order if key in fields] @classmethod def from_filesystem(cls, directory: str, expect_model: bool = True) -> 'OpenMLRun': @@ -201,7 +195,7 @@ def to_filesystem( 'Output directory {} should be empty'.format(os.path.abspath(directory)) ) - run_xml = self._create_description_xml() + run_xml = self._to_xml() predictions_arff = arff.dumps(self._generate_arff_dict()) # It seems like typing does not allow to define the same variable multiple times @@ -469,7 +463,7 @@ def publish(self) -> 'OpenMLRun': self.model, ) - description_xml = self._create_description_xml() + description_xml = self._to_xml() file_elements = {'description': ("description.xml", description_xml)} if self.error_message is None: @@ -487,115 +481,41 @@ def publish(self) -> 'OpenMLRun': self.run_id = int(result['oml:upload_run']['oml:run_id']) return self - def _create_description_xml(self): - """Create xml representation of run for upload. - - Returns - ------- - xml_string : string - XML description of run. - """ - - # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+ - # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss' - # well_formatted_time = time.strftime("%c").replace( - # ' ', '_').replace('/', '-').replace(':', '.') - # tags = run_environment + [well_formatted_time] + ['run_task'] + \ - # [self.model.__module__ + "." 
+ self.model.__class__.__name__] - description = _to_dict(taskid=self.task_id, flow_id=self.flow_id, - setup_string=self.setup_string, - parameter_settings=self.parameter_settings, - error_message=self.error_message, - fold_evaluations=self.fold_evaluations, - sample_evaluations=self.sample_evaluations, - tags=self.tags) - description_xml = xmltodict.unparse(description, pretty=True) - return description_xml - - def push_tag(self, tag: str) -> None: - """Annotates this run with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the run. - """ - _tag_entity('run', self.run_id, tag) - - def remove_tag(self, tag: str) -> None: - """Removes a tag from this run on the server. - - Parameters - ---------- - tag : str - Tag to attach to the run. - """ - _tag_entity('run', self.run_id, tag, untag=True) - - -############################################################################### -# Functions which cannot be in runs/functions due to circular imports - -def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, - tags=None, fold_evaluations=None, sample_evaluations=None): - """ Creates a dictionary corresponding to the desired xml desired by openML - - Parameters - ---------- - taskid : int - the identifier of the task - setup_string : string - a CLI string which can invoke the learning with the correct parameter - settings - parameter_settings : array of dicts - each dict containing keys name, value and component, one per parameter - setting - tags : array of strings - information that give a description of the run, must conform to - regex ``([a-zA-Z0-9_\-\.])+`` - fold_evaluations : dict mapping from evaluation measure to a dict mapping - repeat_nr to a dict mapping from fold nr to a value (double) - sample_evaluations : dict mapping from evaluation measure to a dict - mapping repeat_nr to a dict mapping from fold nr to a dict mapping to - a sample nr to a value (double) - sample_evaluations : - Returns - ------- - 
result : an array with version information of the above packages - """ # noqa: W605 - description = OrderedDict() - description['oml:run'] = OrderedDict() - description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml' - description['oml:run']['oml:task_id'] = taskid - description['oml:run']['oml:flow_id'] = flow_id - if error_message is not None: - description['oml:run']['oml:error_message'] = error_message - description['oml:run']['oml:parameter_setting'] = parameter_settings - if tags is not None: - description['oml:run']['oml:tag'] = tags # Tags describing the run - if (fold_evaluations is not None and len(fold_evaluations) > 0) or \ - (sample_evaluations is not None and len(sample_evaluations) > 0): - description['oml:run']['oml:output_data'] = OrderedDict() - description['oml:run']['oml:output_data']['oml:evaluation'] = list() - if fold_evaluations is not None: - for measure in fold_evaluations: - for repeat in fold_evaluations[measure]: - for fold, value in fold_evaluations[measure][repeat].items(): - current = OrderedDict([ - ('@repeat', str(repeat)), ('@fold', str(fold)), - ('oml:name', measure), ('oml:value', str(value))]) - description['oml:run']['oml:output_data'][ - 'oml:evaluation'].append(current) - if sample_evaluations is not None: - for measure in sample_evaluations: - for repeat in sample_evaluations[measure]: - for fold in sample_evaluations[measure][repeat]: - for sample, value in sample_evaluations[measure][repeat][ - fold].items(): + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. 
""" + description = OrderedDict() # type: 'OrderedDict' + description['oml:run'] = OrderedDict() + description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml' + description['oml:run']['oml:task_id'] = self.task_id + description['oml:run']['oml:flow_id'] = self.flow_id + if self.error_message is not None: + description['oml:run']['oml:error_message'] = self.error_message + description['oml:run']['oml:parameter_setting'] = self.parameter_settings + if self.tags is not None: + description['oml:run']['oml:tag'] = self.tags # Tags describing the run + if (self.fold_evaluations is not None and len(self.fold_evaluations) > 0) or \ + (self.sample_evaluations is not None and len(self.sample_evaluations) > 0): + description['oml:run']['oml:output_data'] = OrderedDict() + description['oml:run']['oml:output_data']['oml:evaluation'] = list() + if self.fold_evaluations is not None: + for measure in self.fold_evaluations: + for repeat in self.fold_evaluations[measure]: + for fold, value in self.fold_evaluations[measure][repeat].items(): current = OrderedDict([ ('@repeat', str(repeat)), ('@fold', str(fold)), - ('@sample', str(sample)), ('oml:name', measure), - ('oml:value', str(value))]) + ('oml:name', measure), ('oml:value', str(value))]) description['oml:run']['oml:output_data'][ 'oml:evaluation'].append(current) - return description + if self.sample_evaluations is not None: + for measure in self.sample_evaluations: + for repeat in self.sample_evaluations[measure]: + for fold in self.sample_evaluations[measure][repeat]: + for sample, value in \ + self.sample_evaluations[measure][repeat][fold].items(): + current = OrderedDict([ + ('@repeat', str(repeat)), ('@fold', str(fold)), + ('@sample', str(sample)), ('oml:name', measure), + ('oml:value', str(value))]) + description['oml:run']['oml:output_data'][ + 'oml:evaluation'].append(current) + return description diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 
1786120e8..c6ca1f057 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -1,7 +1,7 @@ from collections import OrderedDict import json import os -from typing import List, Tuple # noqa F401 +from typing import List, Tuple, Optional # noqa F401 import arff import xmltodict @@ -381,7 +381,7 @@ def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace': return cls(None, merged_trace) def __repr__(self): - return '[Run id: %d, %d trace iterations]'.format( + return '[Run id: {}, {} trace iterations]'.format( -1 if self.run_id is None else self.run_id, len(self.trace_iterations), ) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index aee1aa0bf..31fdc15a4 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -31,10 +31,9 @@ def __repr__(self): header = "OpenML Setup" header = '{}\n{}\n'.format(header, '=' * len(header)) - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) fields = {"Setup ID": self.setup_id, "Flow ID": self.flow_id, - "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "# of Parameters": len(self.parameters)} # determines the order in which the information will be printed @@ -86,12 +85,11 @@ def __repr__(self): header = "OpenML Parameter" header = '{}\n{}\n'.format(header, '=' * len(header)) - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) fields = {"ID": self.id, "Flow ID": self.flow_id, # "Flow Name": self.flow_name, "Flow Name": self.full_name, - "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "Parameter Name": self.parameter_name} # indented prints for parameter attributes # indention = 2 spaces + 1 | + 2 underscores diff --git a/openml/study/functions.py b/openml/study/functions.py index ccd523016..25ebea5fd 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -120,7 +120,7 @@ def _get_study(id_: Union[int, 
str], entity_type) -> BaseStudy: if 'oml:setups' in result_dict: setups = [int(x) for x in result_dict['oml:setups']['oml:setup_id']] else: - raise ValueError('No setups attached to study!'.format(id_)) + raise ValueError('No setups attached to study {}!'.format(id_)) if 'oml:runs' in result_dict: runs = [ int(x) for x in result_dict['oml:runs']['oml:run_id'] @@ -130,7 +130,7 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: # Legacy studies did not require runs runs = None else: - raise ValueError('No runs attached to study!'.format(id_)) + raise ValueError('No runs attached to study {}!'.format(id_)) study = OpenMLStudy( study_id=study_id, diff --git a/openml/study/study.py b/openml/study/study.py index 54e71691c..9d1df9337 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -1,12 +1,13 @@ -import collections -from typing import Dict, List, Optional +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union, Any import xmltodict import openml +from openml.base import OpenMLBase -class BaseStudy(object): +class BaseStudy(OpenMLBase): """ An OpenMLStudy represents the OpenML concept of a study. It contains the following information: name, id, description, creation date, @@ -87,19 +88,25 @@ def __init__( self.flows = flows self.setups = setups self.runs = runs - pass - def __repr__(self): - # header is provided by the sub classes - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + @classmethod + def _entity_letter(cls) -> str: + return 's' + + @property + def id(self) -> Optional[int]: + return self.study_id + + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" fields = {"Name": self.name, "Status": self.status, - "Main Entity Type": self.main_entity_type} + "Main Entity Type": self.main_entity_type} # type: Dict[str, Any] if self.study_id is not None: fields["ID"] = self.study_id - fields["Study URL"] = "{}s/{}".format(base_url, self.study_id) + fields["Study URL"] = self.openml_url if self.creator is not None: - fields["Creator"] = "{}u/{}".format(base_url, self.creator) + fields["Creator"] = "{}/u/{}".format(openml.config.server_base_url, self.creator) if self.creation_date is not None: fields["Upload Time"] = self.creation_date.replace('T', ' ') if self.data is not None: @@ -115,12 +122,7 @@ def __repr__(self): order = ["ID", "Name", "Status", "Main Entity Type", "Study URL", "# of Data", "# of Tasks", "# of Flows", "# of Runs", "Creator", "Upload Time"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return body + return [(key, fields[key]) for key in order if key in fields] def publish(self) -> int: """ @@ -143,14 +145,8 @@ def publish(self) -> int: self.study_id = int(study_res['oml:study_upload']['oml:id']) return self.study_id - def _to_xml(self) -> str: - """Serialize object to xml for upload - - Returns - ------- - xml_study : str - XML description of the data. - """ + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. 
""" # some can not be uploaded, e.g., id, creator, creation_date simple_props = ['alias', 'main_entity_type', 'name', 'description'] # maps from attribute name (which is used as outer tag name) to immer @@ -161,9 +157,9 @@ def _to_xml(self) -> str: 'runs': 'run_id', } - study_container = collections.OrderedDict() # type: 'collections.OrderedDict' + study_container = OrderedDict() # type: 'OrderedDict' namespace_list = [('@xmlns:oml', 'http://openml.org/openml')] - study_dict = collections.OrderedDict(namespace_list) # type: 'collections.OrderedDict' + study_dict = OrderedDict(namespace_list) # type: 'OrderedDict' study_container['oml:study'] = study_dict for prop_name in simple_props: @@ -177,15 +173,13 @@ def _to_xml(self) -> str: 'oml:' + inner_name: content } study_dict["oml:" + prop_name] = sub_dict + return study_container - xml_string = xmltodict.unparse( - input_dict=study_container, - pretty=True, - ) - # A flow may not be uploaded with the xml encoding specification: - # - xml_string = xml_string.split('\n', 1)[-1] - return xml_string + def push_tag(self, tag: str): + raise NotImplementedError("Tags for studies is not (yet) supported.") + + def remove_tag(self, tag: str): + raise NotImplementedError("Tags for studies is not (yet) supported.") class OpenMLStudy(BaseStudy): @@ -268,12 +262,6 @@ def __init__( setups=setups, ) - def __repr__(self): - header = "OpenML Study" - header = '{}\n{}\n'.format(header, '=' * len(header)) - body = super(OpenMLStudy, self).__repr__() - return header + body - class OpenMLBenchmarkSuite(BaseStudy): """ @@ -345,9 +333,3 @@ def __init__( runs=None, setups=None, ) - - def __repr__(self): - header = "OpenML Benchmark Suite" - header = '{}\n{}\n'.format(header, '=' * len(header)) - body = super(OpenMLBenchmarkSuite, self).__repr__() - return header + body diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 83af79373..2358160ef 100644 --- a/openml/tasks/task.py +++ 
b/openml/tasks/task.py @@ -2,7 +2,7 @@ from collections import OrderedDict import io import os -from typing import Union, Tuple, Dict, List, Optional +from typing import Union, Tuple, Dict, List, Optional, Any from warnings import warn import numpy as np @@ -11,12 +11,13 @@ import xmltodict import openml._api_calls +from openml.base import OpenMLBase from .. import datasets from .split import OpenMLSplit -from ..utils import _create_cache_directory_for_id, _tag_entity +from ..utils import _create_cache_directory_for_id -class OpenMLTask(ABC): +class OpenMLTask(OpenMLBase): """OpenML Task object. Parameters @@ -55,35 +56,36 @@ def __init__( self.estimation_procedure_id = estimation_procedure_id self.split = None # type: Optional[OpenMLSplit] - def __repr__(self): - header = "OpenML Task" - header = '{}\n{}\n'.format(header, '=' * len(header)) + @classmethod + def _entity_letter(cls) -> str: + return 't' - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) - fields = {"Task Type": self.task_type} + @property + def id(self) -> Optional[int]: + return self.task_id + + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" + fields = {"Task Type Description": '{}/tt/{}'.format( + openml.config.server_base_url, self.task_type_id)} # type: Dict[str, Any] if self.task_id is not None: fields["Task ID"] = self.task_id - fields["Task URL"] = "{}t/{}".format(base_url, self.task_id) + fields["Task URL"] = self.openml_url if self.evaluation_measure is not None: fields["Evaluation Measure"] = self.evaluation_measure if self.estimation_procedure is not None: fields["Estimation Procedure"] = self.estimation_procedure['type'] - if self.target_name is not None: - fields["Target Feature"] = self.target_name + if getattr(self, 'target_name', None) is not None: + fields["Target Feature"] = getattr(self, 'target_name') if hasattr(self, 'class_labels'): - fields["# of Classes"] = len(self.class_labels) + fields["# of Classes"] = len(getattr(self, 'class_labels')) if hasattr(self, 'cost_matrix'): fields["Cost Matrix"] = "Available" # determines the order in which the information will be printed - order = ["Task Type", "Task ID", "Task URL", "Estimation Procedure", "Evaluation Measure", - "Target Feature", "# of Classes", "Cost Matrix"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body + order = ["Task Type Description", "Task ID", "Task URL", "Estimation Procedure", + "Evaluation Measure", "Target Feature", "# of Classes", "Cost Matrix"] + return [(key, fields[key]) for key in order if key in fields] def get_dataset(self) -> datasets.OpenMLDataset: """Download dataset associated with task""" @@ -144,28 +146,8 @@ def get_split_dimensions(self) -> Tuple[int, int, int]: return self.split.repeats, self.split.folds, self.split.samples - def push_tag(self, tag: str): - """Annotates this task with a tag on the server. 
- - Parameters - ---------- - tag : str - Tag to attach to the task. - """ - _tag_entity('task', self.task_id, tag) - - def remove_tag(self, tag: str): - """Removes a tag from this task on the server. - - Parameters - ---------- - tag : str - Tag to attach to the task. - """ - _tag_entity('task', self.task_id, tag, untag=True) - def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': - + """ Creates a dictionary representation of self. """ task_container = OrderedDict() # type: OrderedDict[str, OrderedDict] task_dict = OrderedDict([ ('@xmlns:oml', 'http://openml.org/openml') @@ -199,23 +181,6 @@ def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': return task_container - def _to_xml(self) -> str: - """Generate xml representation of self for upload to server. - - Returns - ------- - str - Task represented as XML string. - """ - task_dict = self._to_dict() - task_xml = xmltodict.unparse(task_dict, pretty=True) - - # A task may not be uploaded with the xml encoding specification: - # - task_xml = task_xml.split('\n', 1)[-1] - - return task_xml - def publish(self) -> int: """Publish task to OpenML server. diff --git a/openml/utils.py b/openml/utils.py index f6cc81ff7..f4042f8a4 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -2,6 +2,7 @@ import hashlib import xmltodict import shutil +from typing import TYPE_CHECKING import warnings import pandas as pd from functools import wraps @@ -11,6 +12,11 @@ import openml.exceptions from . 
import config +# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles +if TYPE_CHECKING: + from openml.base import OpenMLBase + + oslo_installed = False try: # Currently, importing oslo raises a lot of warning that it will stop working @@ -62,6 +68,19 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): (xml_tag_name, str(node))) +def _tag_openml_base(oml_object: 'OpenMLBase', tag: str, untag: bool = False): + rest_api_mapping = [ + (openml.datasets.OpenMLDataset, 'data'), + (openml.flows.OpenMLFlow, 'flow'), + (openml.tasks.OpenMLTask, 'task'), + (openml.runs.OpenMLRun, 'run') + ] + _, api_type_alias = [(python_type, api_alias) + for (python_type, api_alias) in rest_api_mapping + if isinstance(oml_object, python_type)][0] + _tag_entity(api_type_alias, oml_object.id, tag, untag) + + def _tag_entity(entity_type, entity_id, tag, untag=False): """ Function that tags or untags a given entity on OpenML. 
As the OpenML diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index dacade858..0266ca4d9 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -46,8 +46,8 @@ def _test_run_obj_equals(self, run, run_prime): other = getattr(run_prime, dictionary) if other is not None: self.assertDictEqual(other, dict()) - self.assertEqual(run._create_description_xml(), - run_prime._create_description_xml()) + self.assertEqual(run._to_xml(), + run_prime._to_xml()) numeric_part = \ np.array(np.array(run.data_content)[:, 0:-2], dtype=float) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 652d38711..2ec293950 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -206,7 +206,7 @@ def _remove_random_state(flow): # This is only a smoke check right now # TODO add a few asserts here - run._create_description_xml() + run._to_xml() if run.trace is not None: # This is only a smoke check right now # TODO add a few asserts here