diff --git a/doc/usage.rst b/doc/usage.rst index 0801c2c03..a4bf8ee0b 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -55,7 +55,7 @@ API: .. code:: python >>> import os - >>> openml.config.set_cache_directory(os.path.expanduser('~/.openml/cache')) + >>> openml.config.cache_directory = os.path.expanduser('~/.openml/cache') Config file: diff --git a/openml/config.py b/openml/config.py index 192b5fcaa..949fe869f 100644 --- a/openml/config.py +++ b/openml/config.py @@ -6,6 +6,7 @@ from six import StringIO from six.moves import configparser +from six.moves.urllib_parse import urlparse logger = logging.getLogger(__name__) @@ -13,10 +14,23 @@ format='[%(levelname)s] [%(asctime)s:%(name)s] %(' 'message)s', datefmt='%H:%M:%S') +# Default values! +_defaults = { + 'apikey': None, + 'server': "https://www.openml.org/api/v1/xml", + 'verbosity': 0, + 'cachedir': os.path.expanduser('~/.openml/cache'), + 'avoid_duplicate_runs': 'True', +} + config_file = os.path.expanduser('~/.openml/config') -server = "https://www.openml.org/api/v1/xml" + +# Default values are actually added here in the _setup() function which is +# called at the end of this module +server = "" apikey = "" -cachedir = "" +# The current cache directory (without the server name) +cache_directory = "" def _setup(): @@ -26,12 +40,11 @@ def _setup(): key and server can be set by the user simply using openml.config.apikey = THEIRKEY openml.config.server = SOMESERVER - The cache dir needs to be set up calling set_cache_directory - because it needs some setup. We could also make it a property but that's less clear.
""" global apikey global server + global cache_directory global avoid_duplicate_runs # read config file, create cache directory try: @@ -42,52 +55,15 @@ def _setup(): config = _parse_config() apikey = config.get('FAKE_SECTION', 'apikey') server = config.get('FAKE_SECTION', 'server') - cache_dir = config.get('FAKE_SECTION', 'cachedir') + cache_directory = config.get('FAKE_SECTION', 'cachedir') avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs') - set_cache_directory(cache_dir) - - -def set_cache_directory(cachedir): - """Set module-wide cache directory. - - Sets the cache directory into which to download datasets, tasks etc. - - Parameters - ---------- - cachedir : string - Path to use as cache directory. - - See also - -------- - get_cache_directory - """ - - global _cachedir - _cachedir = cachedir - - # Set up the cache directories - dataset_cache_dir = os.path.join(cachedir, "datasets") - task_cache_dir = os.path.join(cachedir, "tasks") - run_cache_dir = os.path.join(cachedir, 'runs') - lock_dir = os.path.join(cachedir, 'locks') - - for dir_ in [ - cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir, lock_dir, - ]: - if not os.path.exists(dir_) and not os.path.isdir(dir_): - os.mkdir(dir_) def _parse_config(): """Parse the config file, set up defaults.
""" - defaults = {'apikey': apikey, - 'server': server, - 'verbosity': 0, - 'cachedir': os.path.expanduser('~/.openml/cache'), - 'avoid_duplicate_runs': 'True'} - config = configparser.RawConfigParser(defaults=defaults) + config = configparser.RawConfigParser(defaults=_defaults) if not os.path.exists(config_file): # Create an empty config file if there was none so far @@ -106,8 +82,7 @@ def _parse_config(): config_file_.seek(0) config.readfp(config_file_) except OSError as e: - logging.info("Error opening file %s: %s" % - config_file, e.message) + logging.info("Error opening file %s: %s", config_file, e) return config @@ -119,13 +94,21 @@ def get_cache_directory(): cachedir : string The current cache directory. - See also - -------- - set_cache_directory """ + # Reverse the host name, e.g. "test.openml.org" -> "org/openml/test" + url_suffix = urlparse(server).netloc + reversed_url_suffix = '/'.join(url_suffix.split('.')[::-1]) + if not cache_directory: + # Fall back to the default location if no cache directory is set + _cachedir = _defaults['cachedir'] + else: + _cachedir = cache_directory + _cachedir = os.path.join(_cachedir, reversed_url_suffix) return _cachedir -__all__ = ["set_cache_directory", 'get_cache_directory'] +__all__ = [ + 'get_cache_directory', +] _setup() diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 48569ea81..b447c671d 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -14,13 +14,22 @@ from .dataset import OpenMLDataset from ..exceptions import OpenMLCacheException, OpenMLServerException, \ OpenMLHashException, PrivateDatasetError -from ..
import config -from .._api_calls import _read_url +from ..utils import ( + _create_cache_directory, + _remove_cache_dir_for_id, + _create_cache_directory_for_id, + _create_lockfiles_dir, +) + + +DATASETS_CACHE_DIR_NAME = 'datasets' + ############################################################################ # Local getters/accessors to the cache directory + def _list_cached_datasets(): """Return list with ids of all cached datasets @@ -31,8 +40,7 @@ def _list_cached_datasets(): """ datasets = [] - dataset_cache = config.get_cache_directory() - dataset_cache_dir = os.path.join(dataset_cache, "datasets") + dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME) directory_content = os.listdir(dataset_cache_dir) directory_content.sort() @@ -88,8 +96,9 @@ def _get_cached_dataset(dataset_id): def _get_cached_dataset_description(dataset_id): - cache_dir = config.get_cache_directory() - did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id)) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, dataset_id, + ) description_file = os.path.join(did_cache_dir, "description.xml") try: with io.open(description_file, encoding='utf8') as fh: @@ -102,8 +111,9 @@ def _get_cached_dataset_description(dataset_id): def _get_cached_dataset_features(dataset_id): - cache_dir = config.get_cache_directory() - did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id)) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, dataset_id, + ) features_file = os.path.join(did_cache_dir, "features.xml") try: with io.open(features_file, encoding='utf8') as fh: @@ -115,8 +125,9 @@ def _get_cached_dataset_features(dataset_id): def _get_cached_dataset_qualities(dataset_id): - cache_dir = config.get_cache_directory() - did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id)) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, dataset_id, + ) qualities_file = os.path.join(did_cache_dir, 
"qualities.xml") try: with io.open(qualities_file, encoding='utf8') as fh: @@ -128,8 +139,9 @@ def _get_cached_dataset_qualities(dataset_id): def _get_cached_dataset_arff(dataset_id): - cache_dir = config.get_cache_directory() - did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id)) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, dataset_id, + ) output_file = os.path.join(did_cache_dir, "dataset.arff") try: @@ -311,9 +323,11 @@ def get_dataset(dataset_id): with lockutils.external_lock( name='datasets.functions.get_dataset:%d' % dataset_id, - lock_path=os.path.join(config.get_cache_directory(), 'locks'), + lock_path=_create_lockfiles_dir(), ): - did_cache_dir = _create_dataset_cache_directory(dataset_id) + did_cache_dir = _create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, dataset_id, + ) try: remove_dataset_cache = True @@ -330,7 +344,7 @@ def get_dataset(dataset_id): raise e finally: if remove_dataset_cache: - _remove_dataset_cache_dir(did_cache_dir) + _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) dataset = _create_dataset_from_description( description, features, qualities, arff_file @@ -412,7 +426,7 @@ def _get_dataset_arff(did_cache_dir, description): pass url = description['oml:url'] - arff_string = _read_url(url) + arff_string = openml._api_calls._read_url(url) md5 = hashlib.md5() md5.update(arff_string.encode('utf-8')) md5_checksum = md5.hexdigest() @@ -505,55 +519,6 @@ def _get_dataset_qualities(did_cache_dir, dataset_id): return qualities -def _create_dataset_cache_directory(dataset_id): - """Create a dataset cache directory - - In order to have a clearer cache structure and because every dataset - is cached in several files (description, arff, features, qualities), there - is a directory for each dataset witch the dataset ID being the directory - name. This function creates this cache directory. - - This function is NOT thread/multiprocessing safe. 
- - Parameters - ---------- - did : int - Dataset ID - - Returns - ------- - str - Path of the created dataset cache directory. - """ - dataset_cache_dir = os.path.join( - config.get_cache_directory(), - "datasets", - str(dataset_id), - ) - if os.path.exists(dataset_cache_dir) and os.path.isdir(dataset_cache_dir): - pass - elif os.path.exists(dataset_cache_dir) and not os.path.isdir(dataset_cache_dir): - raise ValueError('Dataset cache dir exists but is not a directory!') - else: - os.makedirs(dataset_cache_dir) - return dataset_cache_dir - - -def _remove_dataset_cache_dir(did_cache_dir): - """Remove the dataset cache directory - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - """ - try: - shutil.rmtree(did_cache_dir) - except (OSError, IOError): - raise ValueError('Cannot remove faulty dataset cache directory %s.' - 'Please do this manually!' % did_cache_dir) - - def _create_dataset_from_description(description, features, qualities, arff_file): """Create a dataset object from a description dict. diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 5190797c7..e12c4ccd7 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -2,6 +2,7 @@ import io import json import os +import shutil import sys import time import warnings @@ -28,6 +29,8 @@ # _get_version_info, _get_dict and _create_setup_string are in run.py to avoid # circular imports +RUNS_CACHE_DIR_NAME = 'runs' + def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None): @@ -643,7 +646,7 @@ def get_run(run_id): run : OpenMLRun Run corresponding to ID, fetched from the server. 
""" - run_dir = os.path.join(config.get_cache_directory(), "runs", str(run_id)) + run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) run_file = os.path.join(run_dir, "description.xml") if not os.path.exists(run_dir): @@ -878,8 +881,9 @@ def _create_trace_from_arff(arff_obj): def _get_cached_run(run_id): """Load a run from the cache.""" - cache_dir = config.get_cache_directory() - run_cache_dir = os.path.join(cache_dir, "runs", str(run_id)) + run_cache_dir = openml.utils._create_cache_directory_for_id( + RUNS_CACHE_DIR_NAME, run_id, + ) try: run_file = os.path.join(run_cache_dir, "description.xml") with io.open(run_file, encoding='utf8') as fh: diff --git a/openml/runs/run.py b/openml/runs/run.py index 7a01433c5..9d80999d6 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -10,7 +10,6 @@ import openml import openml._api_calls from ..tasks import get_task -from .._api_calls import _file_id_to_url from ..exceptions import PyOpenMLError @@ -142,7 +141,9 @@ def get_metric_fn(self, sklearn_fn, kwargs={}): if self.data_content is not None and self.task_id is not None: predictions_arff = self._generate_arff_dict() elif 'predictions' in self.output_files: - predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff') + predictions_file_url = openml._api_calls._file_id_to_url( + self.output_files['predictions'], 'predictions.arff', + ) predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url)) # TODO: make this a stream reader else: diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 512d86a2e..0fbdc9b21 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -2,26 +2,25 @@ import io import re import os -import shutil from oslo_concurrency import lockutils import xmltodict from ..exceptions import OpenMLCacheException from ..datasets import get_dataset -from .task import OpenMLTask, _create_task_cache_dir -from .. 
import config +from .task import OpenMLTask import openml.utils import openml._api_calls +TASKS_CACHE_DIR_NAME = 'tasks' + + def _get_cached_tasks(): tasks = OrderedDict() - cache_dir = config.get_cache_directory() - task_cache_dir = os.path.join(cache_dir, "tasks") + task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME) directory_content = os.listdir(task_cache_dir) directory_content.sort() - # Find all dataset ids for which we have downloaded the dataset # description @@ -36,15 +35,19 @@ def _get_cached_tasks(): def _get_cached_task(tid): - cache_dir = config.get_cache_directory() - task_cache_dir = os.path.join(cache_dir, "tasks") - task_file = os.path.join(task_cache_dir, str(tid), "task.xml") + + tid_cache_dir = openml.utils._create_cache_directory_for_id( + TASKS_CACHE_DIR_NAME, + tid + ) + task_file = os.path.join(tid_cache_dir, "task.xml") try: with io.open(task_file, encoding='utf8') as fh: task = _create_task_from_xml(xml=fh.read()) return task except (OSError, IOError): + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) raise OpenMLCacheException("Task file for tid %d not " "cached" % tid) @@ -275,11 +278,13 @@ def get_task(task_id): raise ValueError("Task ID is neither an Integer nor can be " "cast to an Integer.") - tid_cache_dir = _create_task_cache_dir(task_id) + tid_cache_dir = openml.utils._create_cache_directory_for_id( + TASKS_CACHE_DIR_NAME, task_id, + ) with lockutils.external_lock( name='task.functions.get_task:%d' % task_id, - lock_path=os.path.join(config.get_cache_directory(), 'locks'), + lock_path=openml.utils._create_lockfiles_dir(), ): try: task = _get_task_description(task_id) @@ -287,9 +292,8 @@ def get_task(task_id): class_labels = dataset.retrieve_class_labels(task.target_name) task.class_labels = class_labels task.download_split() - except Exception as e: - _remove_task_cache_dir(tid_cache_dir) + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) raise e return 
task @@ -300,7 +304,10 @@ def _get_task_description(task_id): try: return _get_cached_task(task_id) except OpenMLCacheException: - xml_file = os.path.join(_create_task_cache_dir(task_id), "task.xml") + xml_file = os.path.join( + openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id), + "task.xml", + ) task_xml = openml._api_calls._perform_api_call("task/%d" % task_id) with io.open(xml_file, "w", encoding='utf8') as fh: @@ -310,53 +317,6 @@ def _get_task_description(task_id): return task -def _create_task_cache_directory(task_id): - """Create a task cache directory - - In order to have a clearer cache structure and because every task - is cached in several files (description, split), there - is a directory for each task witch the task ID being the directory - name. This function creates this cache directory. - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - tid : int - Task ID - - Returns - ------- - str - Path of the created dataset cache directory. - """ - task_cache_dir = os.path.join( - config.get_cache_directory(), "tasks", str(task_id) - ) - if os.path.exists(task_cache_dir) and os.path.isdir(task_cache_dir): - pass - elif os.path.exists(task_cache_dir) and not os.path.isdir(task_cache_dir): - raise ValueError('Task cache dir exists but is not a directory!') - else: - os.makedirs(task_cache_dir) - return task_cache_dir - - -def _remove_task_cache_dir(tid_cache_dir): - """Remove the task cache directory - - This function is NOT thread/multiprocessing safe. - - Parameters - ---------- - """ - try: - shutil.rmtree(tid_cache_dir) - except (OSError, IOError): - raise ValueError('Cannot remove faulty task cache directory %s.' - 'Please do this manually!' 
% tid_cache_dir) - - def _create_task_from_xml(xml): dic = xmltodict.parse(xml)["oml:task"] diff --git a/openml/tasks/split.py b/openml/tasks/split.py index ae7f3a85f..6f4b13730 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -10,6 +10,10 @@ Split = namedtuple("Split", ["train", "test"]) +if six.PY2: + FileNotFoundError = IOError + + class OpenMLSplit(object): def __init__(self, name, description, split): @@ -78,6 +82,8 @@ def _from_arff_file(cls, filename, cache=True): # Cache miss if repetitions is None: # Faster than liac-arff and sufficient in this situation! + if not os.path.exists(filename): + raise FileNotFoundError('Split arff %s does not exist!' % filename) splits, meta = scipy.io.arff.loadarff(filename) name = meta.name diff --git a/openml/tasks/task.py b/openml/tasks/task.py index fb331b178..cc7dd6731 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -4,8 +4,8 @@ from .. import config from .. import datasets from .split import OpenMLSplit -from .._api_calls import _read_url import openml._api_calls +from ..utils import _create_cache_directory_for_id class OpenMLTask(object): @@ -64,7 +64,7 @@ def _download_split(self, cache_file): pass except (OSError, IOError): split_url = self.estimation_procedure["data_splits_url"] - split_arff = _read_url(split_url) + split_arff = openml._api_calls._read_url(split_url) with io.open(cache_file, "w", encoding='utf8') as fh: fh.write(split_arff) @@ -74,12 +74,12 @@ def download_split(self): """Download the OpenML split for a given task. """ cached_split_file = os.path.join( - _create_task_cache_dir(self.task_id), "datasplits.arff") + _create_cache_directory_for_id('tasks', self.task_id), + "datasplits.arff", + ) try: split = OpenMLSplit._from_arff_file(cached_split_file) - # Add FileNotFoundError in python3 version (which should be a - # subclass of OSError. 
except (OSError, IOError): # Next, download and cache the associated split file self._download_split(cached_split_file) diff --git a/openml/testing.py b/openml/testing.py index 62c383a95..0b75da06f 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -26,7 +26,6 @@ def setUp(self): self.maxDiff = None self.static_cache_dir = None static_cache_dir = os.path.dirname(os.path.abspath(inspect.getfile(self.__class__))) - static_cache_dir = os.path.abspath(os.path.join(static_cache_dir, '..')) content = os.listdir(static_cache_dir) if 'files' in content: @@ -52,10 +51,12 @@ def setUp(self): openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" self.production_server = openml.config.server self.test_server = "https://test.openml.org/api/v1/xml" + openml.config.cache_directory = None + openml.config.server = self.test_server openml.config.avoid_duplicate_runs = False - openml.config.set_cache_directory(self.workdir) + openml.config.cache_directory = self.workdir # If we're on travis, we save the api key in the config file to allow # the notebook tests to read them. diff --git a/openml/utils.py b/openml/utils.py index 1fe16ab04..afe83f141 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -1,9 +1,13 @@ +import os import xmltodict import six -import openml._api_calls +import shutil +import openml._api_calls +from . import config from openml.exceptions import OpenMLServerException + def extract_xml_tags(xml_tag_name, node, allow_none=True): """Helper to extract xml tags from xmltodict.
@@ -159,3 +163,87 @@ def list_all(listing_call, *args, **filters): batch_size = limit return result + + +def _create_cache_directory(key): + """Return the cache sub-directory for ``key``, creating it if it is missing.""" + cache = config.get_cache_directory() + cache_dir = os.path.join(cache, key) + try: + os.makedirs(cache_dir) + except OSError: + # Best effort: the directory most likely exists already. + pass + return cache_dir + + +def _create_cache_directory_for_id(key, id_): + """Create the cache directory for a specific ID + + In order to have a clearer cache structure and because every entity + is cached in several files (e.g. description, split), there + is a directory for each entity with the entity ID being the directory + name. This function creates this cache directory. + + This function is NOT thread/multiprocessing safe. + + Parameters + ---------- + key : str + Name of the cache sub-directory, e.g. 'datasets', 'tasks' or 'runs'. + id_ : int + ID of the entity; used as the name of the created directory. + + Returns + ------- + str + Path of the created cache directory. + + Raises + ------ + ValueError + If the path exists but is not a directory. + """ + cache_dir = os.path.join( + _create_cache_directory(key), str(id_) + ) + if os.path.exists(cache_dir) and os.path.isdir(cache_dir): + pass + elif os.path.exists(cache_dir) and not os.path.isdir(cache_dir): + raise ValueError('%s cache dir exists but is not a directory!' % key) + else: + os.makedirs(cache_dir) + return cache_dir + + +def _remove_cache_dir_for_id(key, cache_dir): + """Remove the cache directory of a specific ID + + This function is NOT thread/multiprocessing safe. + + Parameters + ---------- + key : str + Name of the cache sub-directory; only used in the error message. + cache_dir : str + Path of the directory to remove. + + Raises + ------ + ValueError + If the directory cannot be removed. + """ + try: + shutil.rmtree(cache_dir) + except (OSError, IOError): + raise ValueError('Cannot remove faulty %s cache directory %s. ' + 'Please do this manually!'
% (key, cache_dir)) + + +def _create_lockfiles_dir(): + lock_dir = os.path.join(config.get_cache_directory(), 'locks') + try: + os.makedirs(lock_dir) + except OSError: + pass  # best effort: the directory most likely exists already + return lock_dir diff --git a/tests/files/datasets/-1/dataset.arff b/tests/files/org/openml/test/datasets/-1/dataset.arff similarity index 100% rename from tests/files/datasets/-1/dataset.arff rename to tests/files/org/openml/test/datasets/-1/dataset.arff diff --git a/tests/files/datasets/-1/description.xml b/tests/files/org/openml/test/datasets/-1/description.xml similarity index 100% rename from tests/files/datasets/-1/description.xml rename to tests/files/org/openml/test/datasets/-1/description.xml diff --git a/tests/files/datasets/-1/features.xml b/tests/files/org/openml/test/datasets/-1/features.xml similarity index 100% rename from tests/files/datasets/-1/features.xml rename to tests/files/org/openml/test/datasets/-1/features.xml diff --git a/tests/files/datasets/-1/qualities.xml b/tests/files/org/openml/test/datasets/-1/qualities.xml similarity index 100% rename from tests/files/datasets/-1/qualities.xml rename to tests/files/org/openml/test/datasets/-1/qualities.xml diff --git a/tests/files/datasets/2/dataset.arff b/tests/files/org/openml/test/datasets/2/dataset.arff similarity index 100% rename from tests/files/datasets/2/dataset.arff rename to tests/files/org/openml/test/datasets/2/dataset.arff diff --git a/tests/files/datasets/2/description.xml b/tests/files/org/openml/test/datasets/2/description.xml similarity index 100% rename from tests/files/datasets/2/description.xml rename to tests/files/org/openml/test/datasets/2/description.xml diff --git a/tests/files/datasets/2/features.xml b/tests/files/org/openml/test/datasets/2/features.xml similarity index 100% rename from tests/files/datasets/2/features.xml rename to tests/files/org/openml/test/datasets/2/features.xml diff --git a/tests/files/datasets/2/qualities.xml b/tests/files/org/openml/test/datasets/2/qualities.xml similarity index 100% rename from
tests/files/datasets/2/qualities.xml rename to tests/files/org/openml/test/datasets/2/qualities.xml diff --git a/tests/files/runs/1/description.xml b/tests/files/org/openml/test/runs/1/description.xml similarity index 100% rename from tests/files/runs/1/description.xml rename to tests/files/org/openml/test/runs/1/description.xml diff --git a/tests/files/setups/1/description.xml b/tests/files/org/openml/test/setups/1/description.xml similarity index 100% rename from tests/files/setups/1/description.xml rename to tests/files/org/openml/test/setups/1/description.xml diff --git a/tests/files/tasks/1/datasplits.arff b/tests/files/org/openml/test/tasks/1/datasplits.arff similarity index 100% rename from tests/files/tasks/1/datasplits.arff rename to tests/files/org/openml/test/tasks/1/datasplits.arff diff --git a/tests/files/tasks/1/task.xml b/tests/files/org/openml/test/tasks/1/task.xml similarity index 100% rename from tests/files/tasks/1/task.xml rename to tests/files/org/openml/test/tasks/1/task.xml diff --git a/tests/files/tasks/1882/datasplits.arff b/tests/files/org/openml/test/tasks/1882/datasplits.arff similarity index 100% rename from tests/files/tasks/1882/datasplits.arff rename to tests/files/org/openml/test/tasks/1882/datasplits.arff diff --git a/tests/files/tasks/1882/task.xml b/tests/files/org/openml/test/tasks/1882/task.xml similarity index 100% rename from tests/files/tasks/1882/task.xml rename to tests/files/org/openml/test/tasks/1882/task.xml diff --git a/tests/files/tasks/3/datasplits.arff b/tests/files/org/openml/test/tasks/3/datasplits.arff similarity index 100% rename from tests/files/tasks/3/datasplits.arff rename to tests/files/org/openml/test/tasks/3/datasplits.arff diff --git a/tests/files/tasks/3/task.xml b/tests/files/org/openml/test/tasks/3/task.xml similarity index 100% rename from tests/files/tasks/3/task.xml rename to tests/files/org/openml/test/tasks/3/task.xml diff --git a/tests/test_datasets/test_dataset_functions.py 
b/tests/test_datasets/test_dataset_functions.py index f208d4ea1..24c2bb77c 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -20,7 +20,7 @@ from openml.exceptions import OpenMLCacheException, PyOpenMLError, \ OpenMLHashException, PrivateDatasetError from openml.testing import TestBase -from openml.utils import _tag_entity +from openml.utils import _tag_entity, _create_cache_directory_for_id from openml.datasets.functions import (_get_cached_dataset, _get_cached_dataset_features, @@ -29,7 +29,8 @@ _get_dataset_description, _get_dataset_arff, _get_dataset_features, - _get_dataset_qualities) + _get_dataset_qualities, + DATASETS_CACHE_DIR_NAME) class TestOpenMLDataset(TestBase): @@ -57,7 +58,7 @@ def _remove_pickle_files(self): pass def test__list_cached_datasets(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir cached_datasets = openml.datasets.functions._list_cached_datasets() self.assertIsInstance(cached_datasets, list) self.assertEqual(len(cached_datasets), 2) @@ -65,7 +66,7 @@ def test__list_cached_datasets(self): @mock.patch('openml.datasets.functions._list_cached_datasets') def test__get_cached_datasets(self, _list_cached_datasets_mock): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir _list_cached_datasets_mock.return_value = [-1, 2] datasets = _get_cached_datasets() self.assertIsInstance(datasets, dict) @@ -73,7 +74,7 @@ def test__get_cached_datasets(self, _list_cached_datasets_mock): self.assertIsInstance(list(datasets.values())[0], OpenMLDataset) def test__get_cached_dataset(self, ): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir dataset = _get_cached_dataset(2) features = _get_cached_dataset_features(2) qualities = _get_cached_dataset_qualities(2) @@ -83,25 +84,25 @@ def 
test__get_cached_dataset(self, ): self.assertTrue(len(dataset.qualities) == len(qualities)) def test_get_cached_dataset_description(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir description = openml.datasets.functions._get_cached_dataset_description(2) self.assertIsInstance(description, dict) def test_get_cached_dataset_description_not_cached(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir self.assertRaisesRegexp(OpenMLCacheException, "Dataset description for " "dataset id 3 not cached", openml.datasets.functions._get_cached_dataset_description, 3) def test_get_cached_dataset_arff(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir description = openml.datasets.functions._get_cached_dataset_arff( dataset_id=2) self.assertIsInstance(description, str) def test_get_cached_dataset_arff_not_cached(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir self.assertRaisesRegexp(OpenMLCacheException, "ARFF file for " "dataset id 3 not cached", openml.datasets.functions._get_cached_dataset_arff, @@ -185,7 +186,6 @@ def test_list_datasets_empty(self): self.assertIsInstance(datasets, dict) - @unittest.skip('See https://github.com/openml/openml-python/issues/149') def test_check_datasets_active(self): active = openml.datasets.check_datasets_active([1, 17]) @@ -261,7 +261,7 @@ def test__get_dataset_description(self): self.assertTrue(os.path.exists(description_xml_path)) def test__getarff_path_dataset_arff(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir description = openml.datasets.functions._get_cached_dataset_description(2) arff_path = _get_dataset_arff(self.workdir, description) 
self.assertIsInstance(arff_path, str) @@ -294,10 +294,13 @@ def test__get_dataset_qualities(self): def test_deletion_of_cache_dir(self): # Simple removal - did_cache_dir = openml.datasets.functions.\ - _create_dataset_cache_directory(1) + did_cache_dir = openml.utils._create_cache_directory_for_id( + DATASETS_CACHE_DIR_NAME, 1, + ) self.assertTrue(os.path.exists(did_cache_dir)) - openml.datasets.functions._remove_dataset_cache_dir(did_cache_dir) + openml.utils._remove_cache_dir_for_id( + DATASETS_CACHE_DIR_NAME, did_cache_dir, + ) self.assertFalse(os.path.exists(did_cache_dir)) # Use _get_dataset_arff to load the description, trigger an exception in the @@ -307,7 +310,9 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception('Boom!') self.assertRaisesRegexp(Exception, 'Boom!', openml.datasets.get_dataset, 1) - datasets_cache_dir = os.path.join(self.workdir, 'datasets') + datasets_cache_dir = os.path.join( + self.workdir, 'org', 'openml', 'test', 'datasets' + ) self.assertEqual(len(os.listdir(datasets_cache_dir)), 0) def test_publish_dataset(self): @@ -321,7 +326,7 @@ def test_publish_dataset(self): self.assertIsInstance(dataset.dataset_id, int) def test__retrieve_class_labels(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir labels = openml.datasets.get_dataset(2).retrieve_class_labels() self.assertEqual(labels, ['1', '2', '3', '4', '5', 'U']) labels = openml.datasets.get_dataset(2).retrieve_class_labels( diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 771ee2cd4..be55c2cd8 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -69,4 +69,4 @@ def test_list_evaluations_empty(self): if len(evaluations) > 0: raise ValueError('UnitTest Outdated, got somehow results') - self.assertIsInstance(evaluations, dict) \ No 
newline at end of file + self.assertIsInstance(evaluations, dict) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index eccda841d..deafbcacc 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -52,14 +52,17 @@ def test_parse_parameters(self): self.assertEqual(parameter['oml:component'], 2) def test_tagging(self): - run = openml.runs.get_run(1) + + runs = openml.runs.list_runs(size=1) + run_id = list(runs.keys())[0] + run = openml.runs.get_run(run_id) tag = "testing_tag_{}_{}".format(self.id(), time()) run_list = openml.runs.list_runs(tag=tag) self.assertEqual(len(run_list), 0) run.push_tag(tag) run_list = openml.runs.list_runs(tag=tag) self.assertEqual(len(run_list), 1) - self.assertIn(1, run_list) + self.assertIn(run_id, run_list) run.remove_tag(tag) run_list = openml.runs.list_runs(tag=tag) self.assertEqual(len(run_list), 0) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index d28a834b3..f824e1ed1 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -987,10 +987,10 @@ def test_predict_proba_hardclassifier(self): np.testing.assert_array_equal(predictionsA, predictionsB) def test_get_cached_run(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir openml.runs.functions._get_cached_run(1) def test_get_uncached_run(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir with self.assertRaises(openml.exceptions.OpenMLCacheException): - openml.runs.functions._get_cached_run(10) \ No newline at end of file + openml.runs.functions._get_cached_run(10) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index e2c705a6e..928874837 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -159,11 +159,11 @@ def 
test_setuplist_offset(self): self.assertEqual(len(all), size * 2) def test_get_cached_setup(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir openml.setups.functions._get_cached_setup(1) def test_get_uncached_setup(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir with self.assertRaises(openml.exceptions.OpenMLCacheException): - openml.setups.functions._get_cached_setup(10) \ No newline at end of file + openml.setups.functions._get_cached_setup(10) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 0bf0496da..c2d0b7258 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -23,4 +23,4 @@ def test_get_tasks(self): self.assertEquals(study.data, None) self.assertGreater(len(study.tasks), 0) self.assertEquals(study.flows, None) - self.assertEquals(study.setups, None) \ No newline at end of file + self.assertEquals(study.setups, None) diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py index e58e2dc2d..6fd2926e5 100644 --- a/tests/test_tasks/test_split.py +++ b/tests/test_tasks/test_split.py @@ -16,7 +16,9 @@ def setUp(self): self.directory = os.path.dirname(__file__) # This is for dataset self.arff_filename = os.path.join( - self.directory, "..", "files", "tasks", "1882", "datasplits.arff") + self.directory, "..", "files", "org", "openml", "test", + "tasks", "1882", "datasplits.arff" + ) self.pd_filename = self.arff_filename.replace(".arff", ".pkl") def tearDown(self): diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index 704ce8f39..fdbfa06d1 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -59,7 +59,7 @@ def test_tagging(self): self.assertEqual(len(task_list), 0) def test_get_train_and_test_split_indices(self): - 
openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir task = openml.tasks.get_task(1882) train_indices, test_indices = task.get_train_test_split_indices(0, 0) self.assertEqual(16, train_indices[0]) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index b9d4368e7..a711534c6 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -18,19 +18,19 @@ class TestTask(TestBase): _multiprocess_can_split_ = True def test__get_cached_tasks(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir tasks = openml.tasks.functions._get_cached_tasks() self.assertIsInstance(tasks, dict) self.assertEqual(len(tasks), 3) self.assertIsInstance(list(tasks.values())[0], OpenMLTask) def test__get_cached_task(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir task = openml.tasks.functions._get_cached_task(1) self.assertIsInstance(task, OpenMLTask) def test__get_cached_task_not_cached(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir self.assertRaisesRegexp(OpenMLCacheException, 'Task file for tid 2 not cached', openml.tasks.functions._get_cached_task, 2) @@ -109,7 +109,7 @@ def test_list_tasks_per_type_paginate(self): self._check_task(tasks[tid]) def test__get_task(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir task = openml.tasks.get_task(1882) # Test the following task as it used to throw an Unicode Error. 
# https://github.com/openml/openml-python/issues/378 @@ -119,12 +119,15 @@ def test__get_task(self): def test_get_task(self): task = openml.tasks.get_task(1) self.assertIsInstance(task, OpenMLTask) - self.assertTrue(os.path.exists( - os.path.join(os.getcwd(), "tasks", "1", "task.xml"))) - self.assertTrue(os.path.exists( - os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff"))) - self.assertTrue(os.path.exists( - os.path.join(os.getcwd(), "datasets", "1", "dataset.arff"))) + self.assertTrue(os.path.exists(os.path.join( + self.workdir, 'org', 'openml', 'test', "tasks", "1", "task.xml", + ))) + self.assertTrue(os.path.exists(os.path.join( + self.workdir, 'org', 'openml', 'test', "tasks", "1", "datasplits.arff" + ))) + self.assertTrue(os.path.exists(os.path.join( + self.workdir, 'org', 'openml', 'test', "datasets", "1", "dataset.arff" + ))) @mock.patch('openml.tasks.functions.get_dataset') def test_removal_upon_download_failure(self, get_dataset): @@ -145,7 +148,7 @@ def assert_and_raise(*args, **kwargs): )) def test_get_task_with_cache(self): - openml.config.set_cache_directory(self.static_cache_dir) + openml.config.cache_directory = self.static_cache_dir task = openml.tasks.get_task(1) self.assertIsInstance(task, OpenMLTask) @@ -153,13 +156,15 @@ def test_download_split(self): task = openml.tasks.get_task(1) split = task.download_split() self.assertEqual(type(split), OpenMLSplit) - self.assertTrue(os.path.exists( - os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff"))) + self.assertTrue(os.path.exists(os.path.join( + self.workdir, 'org', 'openml', 'test', "tasks", "1", "datasplits.arff" + ))) def test_deletion_of_cache_dir(self): # Simple removal - tid_cache_dir = openml.tasks.functions.\ - _create_task_cache_directory(1) + tid_cache_dir = openml.utils._create_cache_directory_for_id( + 'tasks', 1, + ) self.assertTrue(os.path.exists(tid_cache_dir)) - openml.tasks.functions._remove_task_cache_dir(tid_cache_dir) + 
openml.utils._remove_cache_dir_for_id('tasks', tid_cache_dir) self.assertFalse(os.path.exists(tid_cache_dir))