Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ API:
.. code:: python

>>> import os
>>> openml.config.set_cache_directory(os.path.expanduser('~/.openml/cache'))
>>> openml.config.cache_directory = os.path.expanduser('~/.openml/cache')

Config file:

Expand Down
79 changes: 30 additions & 49 deletions openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,31 @@

from six import StringIO
from six.moves import configparser
from six.moves.urllib_parse import urlparse


logger = logging.getLogger(__name__)
logging.basicConfig(
format='[%(levelname)s] [%(asctime)s:%(name)s] %('
'message)s', datefmt='%H:%M:%S')

# Default values!
_defaults = {
'apikey': None,
'server': "https://www.openml.org/api/v1/xml",
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache'),
'avoid_duplicate_runs': 'True',
}

config_file = os.path.expanduser('~/.openml/config')
server = "https://www.openml.org/api/v1/xml"

# Default values are actually added here in the _setup() function which is
# called at the end of this module
server = ""
apikey = ""
cachedir = ""
# The current cache directory (without the server name)
cache_directory = ""


def _setup():
Expand All @@ -26,12 +40,11 @@ def _setup():
key and server can be set by the user simply using
openml.config.apikey = THEIRKEY
openml.config.server = SOMESERVER
The cache dir needs to be set up calling set_cache_directory
because it needs some setup.
We could also make it a property but that's less clear.
"""
global apikey
global server
global cache_directory
global avoid_duplicate_runs
# read config file, create cache directory
try:
Expand All @@ -42,52 +55,15 @@ def _setup():
config = _parse_config()
apikey = config.get('FAKE_SECTION', 'apikey')
server = config.get('FAKE_SECTION', 'server')
cache_dir = config.get('FAKE_SECTION', 'cachedir')
cache_directory = config.get('FAKE_SECTION', 'cachedir')
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
set_cache_directory(cache_dir)


def set_cache_directory(cachedir):
    """Set the module-wide cache directory.

    Sets the cache directory into which to download datasets, tasks etc.,
    and eagerly creates the standard sub-directories (``datasets``,
    ``tasks``, ``runs`` and ``locks``) inside it.

    Parameters
    ----------
    cachedir : str
        Path to use as cache directory. Missing intermediate directories
        are created as well.

    See also
    --------
    get_cache_directory
    """
    global _cachedir
    _cachedir = cachedir

    # Set up the cache directory and its sub-directories.
    cache_dirs = [
        cachedir,
        os.path.join(cachedir, "datasets"),
        os.path.join(cachedir, "tasks"),
        os.path.join(cachedir, "runs"),
        os.path.join(cachedir, "locks"),
    ]
    for dir_ in cache_dirs:
        if not os.path.isdir(dir_):
            # makedirs (instead of mkdir) also creates missing parent
            # directories, e.g. ``~/.openml`` on a fresh installation.
            # If a non-directory file occupies the path this raises an
            # OSError instead of silently leaving a broken cache.
            os.makedirs(dir_)


def _parse_config():
"""Parse the config file, set up defaults.
"""
defaults = {'apikey': apikey,
'server': server,
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache'),
'avoid_duplicate_runs': 'True'}

config = configparser.RawConfigParser(defaults=defaults)
config = configparser.RawConfigParser(defaults=_defaults)

if not os.path.exists(config_file):
# Create an empty config file if there was none so far
Expand All @@ -106,8 +82,7 @@ def _parse_config():
config_file_.seek(0)
config.readfp(config_file_)
except OSError as e:
logging.info("Error opening file %s: %s" %
config_file, e.message)
logging.info("Error opening file %s: %s", config_file, e.message)
return config


Expand All @@ -119,13 +94,19 @@ def get_cache_directory():
cachedir : string
The current cache directory.

See also
--------
set_cache_directory
"""
url_suffix = urlparse(server).netloc
reversed_url_suffix = '/'.join(url_suffix.split('.')[::-1])
if not cache_directory:
_cachedir = _defaults(cache_directory)
else:
_cachedir = cache_directory
_cachedir = os.path.join(_cachedir, reversed_url_suffix)
return _cachedir


__all__ = ["set_cache_directory", 'get_cache_directory']
__all__ = [
'get_cache_directory',
]

_setup()
95 changes: 30 additions & 65 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,22 @@
from .dataset import OpenMLDataset
from ..exceptions import OpenMLCacheException, OpenMLServerException, \
OpenMLHashException, PrivateDatasetError
from .. import config
from .._api_calls import _read_url
from ..utils import (
_create_cache_directory,
_remove_cache_dir_for_id,
_create_cache_directory_for_id,
_create_lockfiles_dir,
)


DATASETS_CACHE_DIR_NAME = 'datasets'



############################################################################
# Local getters/accessors to the cache directory


def _list_cached_datasets():
"""Return list with ids of all cached datasets

Expand All @@ -31,8 +40,7 @@ def _list_cached_datasets():
"""
datasets = []

dataset_cache = config.get_cache_directory()
dataset_cache_dir = os.path.join(dataset_cache, "datasets")
dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME)
directory_content = os.listdir(dataset_cache_dir)
directory_content.sort()

Expand Down Expand Up @@ -88,8 +96,9 @@ def _get_cached_dataset(dataset_id):


def _get_cached_dataset_description(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
description_file = os.path.join(did_cache_dir, "description.xml")
try:
with io.open(description_file, encoding='utf8') as fh:
Expand All @@ -102,8 +111,9 @@ def _get_cached_dataset_description(dataset_id):


def _get_cached_dataset_features(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
features_file = os.path.join(did_cache_dir, "features.xml")
try:
with io.open(features_file, encoding='utf8') as fh:
Expand All @@ -115,8 +125,9 @@ def _get_cached_dataset_features(dataset_id):


def _get_cached_dataset_qualities(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
try:
with io.open(qualities_file, encoding='utf8') as fh:
Expand All @@ -128,8 +139,9 @@ def _get_cached_dataset_qualities(dataset_id):


def _get_cached_dataset_arff(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
output_file = os.path.join(did_cache_dir, "dataset.arff")

try:
Expand Down Expand Up @@ -311,9 +323,11 @@ def get_dataset(dataset_id):

with lockutils.external_lock(
name='datasets.functions.get_dataset:%d' % dataset_id,
lock_path=os.path.join(config.get_cache_directory(), 'locks'),
lock_path=_create_lockfiles_dir(),
):
did_cache_dir = _create_dataset_cache_directory(dataset_id)
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)

try:
remove_dataset_cache = True
Expand All @@ -330,7 +344,7 @@ def get_dataset(dataset_id):
raise e
finally:
if remove_dataset_cache:
_remove_dataset_cache_dir(did_cache_dir)
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

dataset = _create_dataset_from_description(
description, features, qualities, arff_file
Expand Down Expand Up @@ -412,7 +426,7 @@ def _get_dataset_arff(did_cache_dir, description):
pass

url = description['oml:url']
arff_string = _read_url(url)
arff_string = openml._api_calls._read_url(url)
md5 = hashlib.md5()
md5.update(arff_string.encode('utf-8'))
md5_checksum = md5.hexdigest()
Expand Down Expand Up @@ -505,55 +519,6 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
return qualities


def _create_dataset_cache_directory(dataset_id):
    """Create a dataset cache directory.

    In order to have a clearer cache structure and because every dataset
    is cached in several files (description, arff, features, qualities),
    there is a directory for each dataset with the dataset ID being the
    directory name. This function creates this cache directory.

    This function is NOT thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID.

    Returns
    -------
    str
        Path of the created dataset cache directory.

    Raises
    ------
    ValueError
        If the cache path exists but is not a directory.
    """
    dataset_cache_dir = os.path.join(
        config.get_cache_directory(),
        "datasets",
        str(dataset_id),
    )
    if os.path.isdir(dataset_cache_dir):
        # Already created by an earlier call; nothing to do.
        pass
    elif os.path.exists(dataset_cache_dir):
        # The path exists but is not a directory: the cache is corrupt.
        raise ValueError('Dataset cache dir exists but is not a directory!')
    else:
        os.makedirs(dataset_cache_dir)
    return dataset_cache_dir


def _remove_dataset_cache_dir(did_cache_dir):
"""Remove the dataset cache directory

This function is NOT thread/multiprocessing safe.

Parameters
----------
"""
try:
shutil.rmtree(did_cache_dir)
except (OSError, IOError):
raise ValueError('Cannot remove faulty dataset cache directory %s.'
'Please do this manually!' % did_cache_dir)


def _create_dataset_from_description(description, features, qualities, arff_file):
"""Create a dataset object from a description dict.

Expand Down
10 changes: 7 additions & 3 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import json
import os
import shutil
import sys
import time
import warnings
Expand All @@ -28,6 +29,8 @@
# _get_version_info, _get_dict and _create_setup_string are in run.py to avoid
# circular imports

RUNS_CACHE_DIR_NAME = 'runs'


def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
seed=None):
Expand Down Expand Up @@ -643,7 +646,7 @@ def get_run(run_id):
run : OpenMLRun
Run corresponding to ID, fetched from the server.
"""
run_dir = os.path.join(config.get_cache_directory(), "runs", str(run_id))
run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)
run_file = os.path.join(run_dir, "description.xml")

if not os.path.exists(run_dir):
Expand Down Expand Up @@ -878,8 +881,9 @@ def _create_trace_from_arff(arff_obj):

def _get_cached_run(run_id):
"""Load a run from the cache."""
cache_dir = config.get_cache_directory()
run_cache_dir = os.path.join(cache_dir, "runs", str(run_id))
run_cache_dir = openml.utils._create_cache_directory_for_id(
RUNS_CACHE_DIR_NAME, run_id,
)
try:
run_file = os.path.join(run_cache_dir, "description.xml")
with io.open(run_file, encoding='utf8') as fh:
Expand Down
5 changes: 3 additions & 2 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import openml
import openml._api_calls
from ..tasks import get_task
from .._api_calls import _file_id_to_url
from ..exceptions import PyOpenMLError


Expand Down Expand Up @@ -142,7 +141,9 @@ def get_metric_fn(self, sklearn_fn, kwargs={}):
if self.data_content is not None and self.task_id is not None:
predictions_arff = self._generate_arff_dict()
elif 'predictions' in self.output_files:
predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
predictions_file_url = openml._api_calls._file_id_to_url(
self.output_files['predictions'], 'predictions.arff',
)
predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
# TODO: make this a stream reader
else:
Expand Down
Loading