Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ci_scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ cd $curr_dir
# compares with $before to check for remaining files
after="`git status --porcelain -b`"
if [[ "$before" != "$after" ]]; then
echo 'git status from before: '$before
echo 'git status from after: '$after
echo "All generated files have not been deleted!"
exit 1
fi
fi
86 changes: 3 additions & 83 deletions openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import openml
from openml.tasks import TaskTypeEnum

import pytest
import logging


Expand All @@ -35,12 +34,9 @@ class TestBase(unittest.TestCase):
# amueller's read/write key that he will throw away later
apikey = "610344db6388d9ba34f6db45a3cf71de"

# creating logger for unit test file deletion status
logger = logging.getLogger("unit_tests")
logger.setLevel(logging.INFO)
fh = logging.FileHandler('TestBase.log')
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# creating logger for tracking files uploaded to test server
logger = logging.getLogger("unit_tests_published_entities")
logger.setLevel(logging.DEBUG)

def setUp(self, n_levels: int = 1):
"""Setup variables and temporary directories.
Expand Down Expand Up @@ -151,82 +147,6 @@ def _delete_entity_from_tracker(self, entity_type, entity):
if id_ == entity][0]
TestBase.publish_tracker[entity_type].pop(delete_index)

@pytest.fixture(scope="session", autouse=True)
def _cleanup_fixture(self):
"""Cleans up files generated by unit tests

This function is called at the beginning of the invocation of
TestBase (defined below), by each of class that inherits TestBase.
The 'yield' creates a checkpoint and breaks away to continue running
the unit tests of the sub class. When all the tests end, execution
resumes from the checkpoint.
"""

abspath_this_file = os.path.abspath(inspect.getfile(self.__class__))
static_cache_dir = os.path.dirname(abspath_this_file)
# Could be a risky while condition, however, going up a directory
# n-times will eventually end at main directory
while True:
if 'openml' in os.listdir(static_cache_dir):
break
else:
static_cache_dir = os.path.join(static_cache_dir, '../')
directory = os.path.join(static_cache_dir, 'tests/files/')
files = os.walk(directory)
old_file_list = []
for root, _, filenames in files:
for filename in filenames:
old_file_list.append(os.path.join(root, filename))
# context switches to other remaining tests
# pauses the code execution here till all tests in the 'session' is over
yield
# resumes from here after all collected tests are completed

#
# Local file deletion
#
files = os.walk(directory)
new_file_list = []
for root, _, filenames in files:
for filename in filenames:
new_file_list.append(os.path.join(root, filename))
# filtering the files generated during this run
new_file_list = list(set(new_file_list) - set(old_file_list))
for file in new_file_list:
os.remove(file)

#
# Test server deletion
#
openml.config.server = TestBase.test_server
openml.config.apikey = TestBase.apikey

# legal_entities defined in openml.utils._delete_entity - {'user'}
entity_types = {'run', 'data', 'flow', 'task', 'study'}
# 'run' needs to be first entity to allow other dependent entities to be deleted
# cloning file tracker to allow deletion of entries of deleted files
tracker = TestBase.publish_tracker.copy()

# reordering to delete sub flows at the end of flows
# sub-flows have shorter names, hence, sorting by descending order of flow name length
if 'flow' in entity_types:
flow_deletion_order = [entity_id for entity_id, _ in
sorted(tracker['flow'], key=lambda x: len(x[1]), reverse=True)]
tracker['flow'] = flow_deletion_order

# deleting all collected entities published to test server
for entity_type in entity_types:
for i, entity in enumerate(tracker[entity_type]):
try:
openml.utils._delete_entity(entity_type, entity)
TestBase.logger.info("Deleted ({}, {})".format(entity_type, entity))
# deleting actual entry from tracker
TestBase._delete_entity_from_tracker(entity_type, entity)
except Exception as e:
TestBase.logger.warning("Cannot delete ({},{}): {}".format(
entity_type, entity, e))
TestBase.logger.info("End of cleanup_fixture from {}".format(self.__class__))

def _get_sentinel(self, sentinel=None):
if sentinel is None:
# Create a unique prefix for the flow. Necessary because the flow
Expand Down
181 changes: 181 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
'''This file is recognized by pytest for defining specified behaviour

'conftest.py' files are directory-scope files that are shared by all
sub-directories from where this file is placed. pytest recognises
'conftest.py' for any unit test executed from within this directory
tree. This file is used to define fixtures, hooks, plugins, and other
functionality that can be shared by the unit tests.

This file has been created for the OpenML testing to primarily make use
of the pytest hooks 'pytest_sessionstart' and 'pytest_sessionfinish',
which are being used for managing the deletion of local and remote files
created by the unit tests, run across more than one process.

This design allows one to comment or remove the conftest.py file to
disable file deletions, without editing any of the test case files.


Possible Future: class TestBase from openml/testing.py can be included
under this file and there would not be any requirements to import
testing.py in each of the unit test modules.
'''

import os
import logging
from typing import List

import openml
from openml.testing import TestBase

# creating logger for unit test file deletion status
logger = logging.getLogger("unit_tests")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks a lot like a copy of the logger above. Also, I assume that the logger names should be swapped.

logger.setLevel(logging.DEBUG)

file_list = []
directory = None

# finding the root directory of conftest.py and going up to OpenML main directory
# exploiting the fact that conftest.py always resides in the root directory for tests
static_dir = os.path.dirname(os.path.abspath(__file__))
logging.info("static directory: {}".format(static_dir))
print("static directory: {}".format(static_dir))
while True:
if 'openml' in os.listdir(static_dir):
break
static_dir = os.path.join(static_dir, '..')


def worker_id() -> str:
    '''Returns the name of the worker process owning this function call.

    :return: str
        Possible outputs from the set of {'master', 'gw0', 'gw1', ..., 'gw(n-1)'}
        where n is the number of workers being used by pytest-xdist.
        'master' is returned when the tests are not run under pytest-xdist.
    '''
    # pytest-xdist sets PYTEST_XDIST_WORKER in each worker process.
    # The previous implementation also accepted PYTEST_XDIST_WORKER_COUNT as a
    # trigger but then unconditionally read PYTEST_XDIST_WORKER, which could
    # raise KeyError when only the COUNT variable was present. Using .get()
    # with a default is both safer and simpler.
    return os.environ.get('PYTEST_XDIST_WORKER', 'master')


def read_file_list() -> List[str]:
    '''Returns paths of all files that currently exist in 'openml/tests/files/'

    :return: List[str]
    '''
    directory = os.path.join(static_dir, 'tests/files/')
    if worker_id() == 'master':
        logger.info("Collecting file lists from: {}".format(directory))
    # Flatten the directory tree into a single list of file paths.
    return [
        os.path.join(parent, name)
        for parent, _, names in os.walk(directory)
        for name in names
    ]


def compare_delete_files(old_list, new_list) -> None:
    '''Deletes files that are there in the new_list but not in the old_list

    :param old_list: List[str]
    :param new_list: List[str]
    :return: None
    '''
    # Files created since the snapshot are exactly the set difference.
    for created in set(new_list) - set(old_list):
        os.remove(created)
        logger.info("Deleted from local: {}".format(created))


def delete_remote_files(tracker) -> None:
    '''Function that deletes the entities passed as input, from the OpenML test server

    The TestBase class in openml/testing.py has an attribute called publish_tracker.
    This function expects a dictionary of the same structure.
    It is a dictionary of lists, where the keys are entity types, while the values are
    lists of integer IDs, except for key 'flow' where the value is a tuple (ID, flow name).

    Iteratively, requests are made to the OpenML test server using
    openml.utils._delete_entity() to remove the entities uploaded by all the unit tests.

    :param tracker: Dict
    :return: None
    '''
    openml.config.server = TestBase.test_server
    openml.config.apikey = TestBase.apikey

    # reordering to delete sub flows at the end of flows
    # sub-flows have shorter names, hence, sorting by descending order of flow name length
    if 'flow' in tracker:
        flow_deletion_order = [entity_id for entity_id, _ in
                               sorted(tracker['flow'], key=lambda x: len(x[1]), reverse=True)]
        tracker['flow'] = flow_deletion_order

    # deleting all collected entities published to test server
    # 'run's are deleted first to prevent dependency issue of entities on deletion
    entity_types = ['run', 'data', 'flow', 'task', 'study']
    logger.info("Entity Types: {}".format(entity_types))
    for entity_type in entity_types:
        logger.info("Deleting {}s...".format(entity_type))
        # .get() keeps cleanup running even if no entity of this type was published
        for entity in tracker.get(entity_type, []):
            try:
                openml.utils._delete_entity(entity_type, entity)
                logger.info("Deleted ({}, {})".format(entity_type, entity))
            except Exception as e:
                # best-effort cleanup: log and continue with the next entity
                # (logger.warn is deprecated in favor of logger.warning)
                logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e))


def pytest_sessionstart() -> None:
    '''pytest hook that is executed before any unit test starts

    This function will be called by each of the worker processes, along with the master
    process, when they are spawned. This happens even before the collection of unit tests.
    If the number of workers is n=4, there will be a total of 5 (1 master + 4 workers)
    calls of this function before execution of any unit test begins. The master pytest
    process has the name 'master' while the worker processes are named 'gw{i}' where
    i = 0, 1, ..., n-1. The order of process spawning is: 'master' -> random ordering
    of the 'gw{i}' workers.

    Since master is always executed first, only the master process stores a list of
    strings of paths of all files in the directory (pre-unit test snapshot).

    :return: None
    '''
    # file_list is global to maintain the directory snapshot during tear down
    global file_list
    if worker_id() == 'master':
        file_list = read_file_list()


def pytest_sessionfinish() -> None:
    '''pytest hook that is executed after all unit tests of a worker end

    This function will be called by each of the worker processes, along with the master
    process, when they are done with the unit tests allocated to them.
    If the number of workers is n=4, there will be a total of 5 (1 master + 4 workers)
    calls of this function, after all collected unit tests have been executed. The master
    pytest process has the name 'master' while the worker processes are named 'gw{i}'
    where i = 0, 1, ..., n-1.
    The order of invocation is: random ordering of the 'gw{i}' workers -> 'master'.

    Every worker deletes the remote entities it tracked. Since master is always executed
    last, it is checked if the current process is 'master' and, if so:
    * Compares file list with pre-unit test snapshot and deletes all local files generated
    * (local deletion only happens in master, which holds the snapshot)

    :return: None
    '''
    # allows access to the file_list read in the set up phase
    global file_list
    worker = worker_id()
    logger.info("Finishing worker {}".format(worker))

    # Test server deletion: each worker cleans up the entities it published
    logger.info("Deleting files uploaded to test server for worker {}".format(worker))
    delete_remote_files(TestBase.publish_tracker)

    if worker == 'master':
        # Local file deletion: only master has the pre-test snapshot in file_list
        new_file_list = read_file_list()
        compare_delete_files(file_list, new_file_list)
        logger.info("Local files deleted")

    # was logging.info (root logger); use the module-level logger for consistency
    logger.info("{} is killed".format(worker))
49 changes: 9 additions & 40 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unittest import mock

import arff
import time

import pytest
import numpy as np
Expand Down Expand Up @@ -1088,22 +1089,8 @@ def test_ignore_attributes_dataset(self):
paper_url=paper_url
)

def test___publish_fetch_ignore_attribute(self):
"""(Part 1) Test to upload and retrieve dataset and check ignore_attributes

DEPENDS on test_publish_fetch_ignore_attribute() to be executed after this
This test is split into two parts:
1) test___publish_fetch_ignore_attribute()
This will be executed earlier, owing to alphabetical sorting.
This test creates and publish() a dataset and checks for a valid ID.
2) test_publish_fetch_ignore_attribute()
This will be executed after test___publish_fetch_ignore_attribute(),
owing to alphabetical sorting. The time gap is to allow the server
more time to compute data qualities.
The dataset ID obtained previously is used to fetch the dataset.
The retrieved dataset is checked for valid ignore_attributes.
"""
# the returned fixt
def test_publish_fetch_ignore_attribute(self):
"""Test to upload and retrieve dataset and check ignore_attributes"""
data = [
['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
Expand Down Expand Up @@ -1158,40 +1145,22 @@ def test___publish_fetch_ignore_attribute(self):
upload_did))
# test if publish was successful
self.assertIsInstance(upload_did, int)
# variables to carry forward for test_publish_fetch_ignore_attribute()
self.__class__.test_publish_fetch_ignore_attribute_did = upload_did
self.__class__.test_publish_fetch_ignore_attribute_list = ignore_attribute

def test_publish_fetch_ignore_attribute(self):
"""(Part 2) Test to upload and retrieve dataset and check ignore_attributes

DEPENDS on test___publish_fetch_ignore_attribute() to be executed first
This will be executed after test___publish_fetch_ignore_attribute(),
owing to alphabetical sorting. The time gap is to allow the server
more time to compute data qualities.
The dataset ID obtained previously is used to fetch the dataset.
The retrieved dataset is checked for valid ignore_attributes.
"""
# Retrieving variables from test___publish_fetch_ignore_attribute()
upload_did = self.__class__.test_publish_fetch_ignore_attribute_did
ignore_attribute = self.__class__.test_publish_fetch_ignore_attribute_list
trials = 1
timeout_limit = 200
dataset = None
# fetching from server
# loop till timeout or fetch not successful
while True:
if trials > timeout_limit:
break
max_waiting_time_seconds = 400
# time.time() works in seconds
start_time = time.time()
while time.time() - start_time < max_waiting_time_seconds:
try:
dataset = openml.datasets.get_dataset(upload_did)
break
except Exception as e:
# returned code 273: Dataset not processed yet
# returned code 362: No qualities found
print("Trial {}/{}: ".format(trials, timeout_limit))
print("\tFailed to fetch dataset:{} with '{}'.".format(upload_did, str(e)))
trials += 1
print("Failed to fetch dataset:{} with '{}'.".format(upload_did, str(e)))
time.sleep(10)
continue
if dataset is None:
raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did))
Expand Down
2 changes: 1 addition & 1 deletion tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ def determine_grid_size(param_grid):
# suboptimal (slow), and not guaranteed to work if evaluation
# engine is behind.
# TODO: mock this? We have the arff already on the server
self._wait_for_processed_run(run.run_id, 200)
self._wait_for_processed_run(run.run_id, 400)
try:
model_prime = openml.runs.initialize_model_from_trace(
run_id=run.run_id,
Expand Down
Loading