Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ API:
.. code:: python

>>> import os
>>> openml.config.set_cache_directory(os.path.expanduser('~/.openml/cache'))
>>> openml.config.cache_directory = os.path.expanduser('~/.openml/cache')

Config file:

Expand Down
79 changes: 30 additions & 49 deletions openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,31 @@

from six import StringIO
from six.moves import configparser
from six.moves.urllib_parse import urlparse


logger = logging.getLogger(__name__)
logging.basicConfig(
format='[%(levelname)s] [%(asctime)s:%(name)s] %('
'message)s', datefmt='%H:%M:%S')

# Default values!
_defaults = {
'apikey': None,
'server': "https://www.openml.org/api/v1/xml",
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache'),
'avoid_duplicate_runs': 'True',
}

config_file = os.path.expanduser('~/.openml/config')
server = "https://www.openml.org/api/v1/xml"

# Default values are actually added here in the _setup() function which is
# called at the end of this module
server = ""
apikey = ""
cachedir = ""
# The current cache directory (without the server name)
cache_directory = ""


def _setup():
Expand All @@ -26,12 +40,11 @@ def _setup():
key and server can be set by the user simply using
openml.config.apikey = THEIRKEY
openml.config.server = SOMESERVER
The cache dir needs to be set up calling set_cache_directory
because it needs some setup.
We could also make it a property but that's less clear.
"""
global apikey
global server
global cache_directory
global avoid_duplicate_runs
# read config file, create cache directory
try:
Expand All @@ -42,52 +55,15 @@ def _setup():
config = _parse_config()
apikey = config.get('FAKE_SECTION', 'apikey')
server = config.get('FAKE_SECTION', 'server')
cache_dir = config.get('FAKE_SECTION', 'cachedir')
cache_directory = config.get('FAKE_SECTION', 'cachedir')
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
set_cache_directory(cache_dir)


def set_cache_directory(cachedir):
    """Set the module-wide cache directory.

    Sets the cache directory into which to download datasets, tasks etc.,
    and eagerly creates the standard sub-directories (``datasets``,
    ``tasks``, ``runs`` and ``locks``) inside it.

    Parameters
    ----------
    cachedir : str
        Path to use as cache directory. Missing intermediate directories
        are created as well.

    See also
    --------
    get_cache_directory
    """
    global _cachedir
    _cachedir = cachedir

    # Set up the cache directory and its sub-directories.
    cache_dirs = [
        cachedir,
        os.path.join(cachedir, "datasets"),
        os.path.join(cachedir, "tasks"),
        os.path.join(cachedir, "runs"),
        os.path.join(cachedir, "locks"),
    ]
    for dir_ in cache_dirs:
        if not os.path.isdir(dir_):
            # makedirs (instead of mkdir) also creates missing parent
            # directories, e.g. ``~/.openml`` on a fresh installation.
            # If a non-directory file occupies the path this raises an
            # OSError instead of silently leaving a broken cache.
            os.makedirs(dir_)


def _parse_config():
"""Parse the config file, set up defaults.
"""
defaults = {'apikey': apikey,
'server': server,
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache'),
'avoid_duplicate_runs': 'True'}

config = configparser.RawConfigParser(defaults=defaults)
config = configparser.RawConfigParser(defaults=_defaults)

if not os.path.exists(config_file):
# Create an empty config file if there was none so far
Expand All @@ -106,8 +82,7 @@ def _parse_config():
config_file_.seek(0)
config.readfp(config_file_)
except OSError as e:
logging.info("Error opening file %s: %s" %
config_file, e.message)
logging.info("Error opening file %s: %s", config_file, e.message)
return config


Expand All @@ -119,13 +94,19 @@ def get_cache_directory():
cachedir : string
The current cache directory.

See also
--------
set_cache_directory
"""
url_suffix = urlparse(server).netloc
reversed_url_suffix = '/'.join(url_suffix.split('.')[::-1])
if not cache_directory:
_cachedir = _defaults(cache_directory)
else:
_cachedir = cache_directory
_cachedir = os.path.join(_cachedir, reversed_url_suffix)
return _cachedir


__all__ = ["set_cache_directory", 'get_cache_directory']
__all__ = [
'get_cache_directory',
]

_setup()
95 changes: 30 additions & 65 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,22 @@
from .dataset import OpenMLDataset
from ..exceptions import OpenMLCacheException, OpenMLServerException, \
OpenMLHashException, PrivateDatasetError
from .. import config
from .._api_calls import _read_url
from ..utils import (
_create_cache_directory,
_remove_cache_dir_for_id,
_create_cache_directory_for_id,
_create_lockfiles_dir,
)


DATASETS_CACHE_DIR_NAME = 'datasets'



############################################################################
# Local getters/accessors to the cache directory


def _list_cached_datasets():
"""Return list with ids of all cached datasets

Expand All @@ -31,8 +40,7 @@ def _list_cached_datasets():
"""
datasets = []

dataset_cache = config.get_cache_directory()
dataset_cache_dir = os.path.join(dataset_cache, "datasets")
dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME)
directory_content = os.listdir(dataset_cache_dir)
directory_content.sort()

Expand Down Expand Up @@ -88,8 +96,9 @@ def _get_cached_dataset(dataset_id):


def _get_cached_dataset_description(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
description_file = os.path.join(did_cache_dir, "description.xml")
try:
with io.open(description_file, encoding='utf8') as fh:
Expand All @@ -102,8 +111,9 @@ def _get_cached_dataset_description(dataset_id):


def _get_cached_dataset_features(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
features_file = os.path.join(did_cache_dir, "features.xml")
try:
with io.open(features_file, encoding='utf8') as fh:
Expand All @@ -115,8 +125,9 @@ def _get_cached_dataset_features(dataset_id):


def _get_cached_dataset_qualities(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
try:
with io.open(qualities_file, encoding='utf8') as fh:
Expand All @@ -128,8 +139,9 @@ def _get_cached_dataset_qualities(dataset_id):


def _get_cached_dataset_arff(dataset_id):
cache_dir = config.get_cache_directory()
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)
output_file = os.path.join(did_cache_dir, "dataset.arff")

try:
Expand Down Expand Up @@ -311,9 +323,11 @@ def get_dataset(dataset_id):

with lockutils.external_lock(
name='datasets.functions.get_dataset:%d' % dataset_id,
lock_path=os.path.join(config.get_cache_directory(), 'locks'),
lock_path=_create_lockfiles_dir(),
):
did_cache_dir = _create_dataset_cache_directory(dataset_id)
did_cache_dir = _create_cache_directory_for_id(
DATASETS_CACHE_DIR_NAME, dataset_id,
)

try:
remove_dataset_cache = True
Expand All @@ -330,7 +344,7 @@ def get_dataset(dataset_id):
raise e
finally:
if remove_dataset_cache:
_remove_dataset_cache_dir(did_cache_dir)
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

dataset = _create_dataset_from_description(
description, features, qualities, arff_file
Expand Down Expand Up @@ -412,7 +426,7 @@ def _get_dataset_arff(did_cache_dir, description):
pass

url = description['oml:url']
arff_string = _read_url(url)
arff_string = openml._api_calls._read_url(url)
md5 = hashlib.md5()
md5.update(arff_string.encode('utf-8'))
md5_checksum = md5.hexdigest()
Expand Down Expand Up @@ -505,55 +519,6 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
return qualities


def _create_dataset_cache_directory(dataset_id):
    """Create a dataset cache directory.

    In order to have a clearer cache structure and because every dataset
    is cached in several files (description, arff, features, qualities),
    there is a directory for each dataset with the dataset ID being the
    directory name. This function creates this cache directory.

    This function is NOT thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID.

    Returns
    -------
    str
        Path of the created dataset cache directory.

    Raises
    ------
    ValueError
        If the cache path exists but is not a directory.
    """
    dataset_cache_dir = os.path.join(
        config.get_cache_directory(),
        "datasets",
        str(dataset_id),
    )
    if os.path.isdir(dataset_cache_dir):
        # Already created by an earlier call; nothing to do.
        pass
    elif os.path.exists(dataset_cache_dir):
        # The path exists but is not a directory: the cache is corrupt.
        raise ValueError('Dataset cache dir exists but is not a directory!')
    else:
        os.makedirs(dataset_cache_dir)
    return dataset_cache_dir


def _remove_dataset_cache_dir(did_cache_dir):
"""Remove the dataset cache directory

This function is NOT thread/multiprocessing safe.

Parameters
----------
"""
try:
shutil.rmtree(did_cache_dir)
except (OSError, IOError):
raise ValueError('Cannot remove faulty dataset cache directory %s.'
'Please do this manually!' % did_cache_dir)


def _create_dataset_from_description(description, features, qualities, arff_file):
"""Create a dataset object from a description dict.

Expand Down
10 changes: 7 additions & 3 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import io
import json
import os
import shutil
import sys
import time
import warnings
Expand All @@ -28,6 +29,8 @@
# _get_version_info, _get_dict and _create_setup_string are in run.py to avoid
# circular imports

RUNS_CACHE_DIR_NAME = 'runs'


def run_model_on_task(task, model, avoid_duplicate_runs=True, flow_tags=None,
seed=None):
Expand Down Expand Up @@ -643,7 +646,7 @@ def get_run(run_id):
run : OpenMLRun
Run corresponding to ID, fetched from the server.
"""
run_dir = os.path.join(config.get_cache_directory(), "runs", str(run_id))
run_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)
run_file = os.path.join(run_dir, "description.xml")

if not os.path.exists(run_dir):
Expand Down Expand Up @@ -878,8 +881,9 @@ def _create_trace_from_arff(arff_obj):

def _get_cached_run(run_id):
"""Load a run from the cache."""
cache_dir = config.get_cache_directory()
run_cache_dir = os.path.join(cache_dir, "runs", str(run_id))
run_cache_dir = openml.utils._create_cache_directory_for_id(
RUNS_CACHE_DIR_NAME, run_id,
)
try:
run_file = os.path.join(run_cache_dir, "description.xml")
with io.open(run_file, encoding='utf8') as fh:
Expand Down
5 changes: 3 additions & 2 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import openml
import openml._api_calls
from ..tasks import get_task
from .._api_calls import _file_id_to_url
from ..exceptions import PyOpenMLError


Expand Down Expand Up @@ -142,7 +141,9 @@ def get_metric_fn(self, sklearn_fn, kwargs={}):
if self.data_content is not None and self.task_id is not None:
predictions_arff = self._generate_arff_dict()
elif 'predictions' in self.output_files:
predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
predictions_file_url = openml._api_calls._file_id_to_url(
self.output_files['predictions'], 'predictions.arff',
)
predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
# TODO: make this a stream reader
else:
Expand Down
Loading