diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 29ada2298..7fa3450ca 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -114,9 +114,9 @@ jobs: fi if [ "${{ matrix.sklearn-only }}" = "true" ]; then - marks="sklearn and not production and not uses_test_server" + marks="sklearn and not production_server and not test_server" else - marks="not production and not uses_test_server" + marks="not production_server and not test_server" fi pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" @@ -131,9 +131,9 @@ jobs: fi if [ "${{ matrix.sklearn-only }}" = "true" ]; then - marks="sklearn and production and not uses_test_server" + marks="sklearn and production_server and not test_server" else - marks="production and not uses_test_server" + marks="production_server and not test_server" fi pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" @@ -143,7 +143,7 @@ jobs: env: OPENML_TEST_SERVER_ADMIN_KEY: ${{ secrets.OPENML_TEST_SERVER_ADMIN_KEY }} run: | # we need a separate step because of the bash-specific if-statement in the previous one. - pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not uses_test_server" + pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not test_server" - name: Check for files left behind by test if: matrix.os != 'windows-latest' && always() diff --git a/docs/developer_setup.md b/docs/developer_setup.md new file mode 100644 index 000000000..55a73fef9 --- /dev/null +++ b/docs/developer_setup.md @@ -0,0 +1,210 @@ +# OpenML Local Development Environment Setup + +This guide outlines the standard procedures for setting up a local development environment for the OpenML ecosystem. It covers the configuration of the backend servers (API v1 and API v2) and the Python Client SDK. + +OpenML currently has two backend architectures: + +* **API v1**: The PHP-based server currently serving production traffic. 
+* **API v2**: The Python-based server (FastAPI) currently under active development. + +> Note on Migration: API v1 is projected to remain operational through at least 2026. API v2 is the target architecture for future development. + +## 1. API v1 Setup (PHP Backend) + +This section details the deployment of the legacy PHP backend. + +### Prerequisites + +* **Docker**: Docker Desktop (Ensure the daemon is running). +* **Version Control**: Git. + +### Installation Steps + +#### 1. Clone the Repository + +Retrieve the OpenML services source code: + +```bash +git clone https://github.com/openml/services +cd services +``` + +#### 2. Configure File Permissions + +To ensure the containerized PHP service can write to the local filesystem, initialize the data directory permissions. + +From the repository root: + +```bash +chown -R www-data:www-data data/php +``` + +If the `www-data` user does not exist on the host system, grant full permissions as a fallback: + +```bash +chmod -R 777 data/php +``` + +#### 3. Launch Services + +Initialize the container stack: + +```bash +docker compose --profile all up -d +``` + +#### Warning: Container Conflicts + +If API v2 (Python backend) containers are present on the system, name conflicts may occur. To resolve this, stop and remove existing containers before launching API v1: + +```bash +docker compose --profile all down +docker compose --profile all up -d +``` + +#### 4. Verification + +Validate the deployment by accessing the flow endpoint. A successful response will return structured JSON data. + +* **Endpoint**: http://localhost:8080/api/v1/json/flow/181 + +### Client Configuration + +To direct the `openml-python` client to the local API v1 instance, modify the configuration as shown below. The API key corresponds to the default key located in `services/config/php/.env`. 
+ +```python +import openml +from openml_sklearn.extension import SklearnExtension +from sklearn.neighbors import KNeighborsClassifier + +# Configure client to use local Docker instance +openml.config.server = "http://localhost:8080/api/v1/xml" +openml.config.apikey = "AD000000000000000000000000000000" + +# Test flow publication +clf = KNeighborsClassifier(n_neighbors=3) +extension = SklearnExtension() +knn_flow = extension.model_to_flow(clf) + +knn_flow.publish() +``` + +## 2. API v2 Setup (Python Backend) + +This section details the deployment of the FastAPI backend. + +### Prerequisites + +* **Docker**: Docker Desktop (Ensure the daemon is running). +* **Version Control**: Git. + +### Installation Steps + +#### 1. Clone the Repository + +Retrieve the API v2 source code: + +```bash +git clone https://github.com/openml/server-api +cd server-api +``` + +#### 2. Launch Services + +Build and start the container stack: + +```bash +docker compose --profile all up +``` + +#### 3. Verification + +Validate the deployment using the following endpoints: + +* **Task Endpoint**: http://localhost:8001/tasks/31 +* **Swagger UI (Documentation)**: http://localhost:8001/docs + +## 3. Python SDK (`openml-python`) Setup + +This section outlines the environment setup for contributing to the OpenML Python client. + +### Installation Steps + +#### 1. Clone the Repository + +```bash +git clone https://github.com/openml/openml-python +cd openml-python +``` + +#### 2. Environment Initialization + +Create an isolated virtual environment (example using Conda): + +```bash +conda create -n openml-python-dev python=3.12 +conda activate openml-python-dev +``` + +#### 3. 
Install Dependencies + +Install the package in editable mode, including development and documentation dependencies: + +```bash +python -m pip install -e ".[dev,docs]" +``` + +#### 4. Configure Quality Gates + +Install pre-commit hooks to enforce coding standards: + +```bash +pre-commit install +pre-commit run --all-files +``` + +## 4. Testing Guidelines + +The OpenML Python SDK utilizes `pytest` markers to categorize tests based on dependencies and execution context. + +| Marker | Description | +|-------------------|-----------------------------------------------------------------------------| +| `sklearn` | Tests requiring `scikit-learn`. Skipped if the library is missing. | +| `production_server`| Tests that interact with the live OpenML server (real API calls). | +| `test_server` | Tests requiring the OpenML test server environment. | + +### Execution Examples + +Run the full test suite: + +```bash +pytest +``` + +Run a specific subset (e.g., `scikit-learn` tests): + +```bash +pytest -m sklearn +``` + +Exclude production tests (local only): + +```bash +pytest -m "not production_server" +``` + +### Admin Privilege Tests + +Certain tests require administrative privileges on the test server. These are skipped automatically unless an admin API key is provided via environment variables. + +#### Windows (PowerShell): + +```shell +$env:OPENML_TEST_SERVER_ADMIN_KEY = "admin-key" +``` + +#### Linux/macOS: + +```bash +export OPENML_TEST_SERVER_ADMIN_KEY="admin-key" +``` diff --git a/mkdocs.yml b/mkdocs.yml index 0dba42557..419cc249e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,7 @@ nav: - Advanced User Guide: details.md - API: reference/ - Contributing: contributing.md + - Developer Setup: developer_setup.md markdown_extensions: - pymdownx.highlight: diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 9e53bd9fa..5da635c70 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -22,8 +22,8 @@ from . 
import config from .__version__ import __version__ from .exceptions import ( + OpenMLAuthenticationError, OpenMLHashException, - OpenMLNotAuthorizedError, OpenMLServerError, OpenMLServerException, OpenMLServerNoResult, @@ -515,11 +515,7 @@ def __parse_server_exception( 400, # run/42 delete 460, # task/42 delete ]: - msg = ( - f"The API call {url} requires authentication via an API key.\nPlease configure " - "OpenML-Python to use your API as described in this example:" - "\nhttps://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication" - ) - return OpenMLNotAuthorizedError(message=msg) + msg = f"The API call {url} requires authentication via an API key." + return OpenMLAuthenticationError(message=msg) return OpenMLServerException(code=code, message=full_message, url=url) diff --git a/openml/cli.py b/openml/cli.py index 0afb089c2..c33578f6e 100644 --- a/openml/cli.py +++ b/openml/cli.py @@ -102,15 +102,15 @@ def check_apikey(apikey: str) -> str: def configure_server(value: str) -> None: def check_server(server: str) -> str: - is_shorthand = server in ["test", "production"] + is_shorthand = server in ["test", "production_server"] if is_shorthand or looks_like_url(server): return "" - return "Must be 'test', 'production' or a url." + return "Must be 'test', 'production_server' or a url." 
def replace_shorthand(server: str) -> str: if server == "test": - return "https://test.openml.org/api/v1/xml" - if server == "production": + return f"{config.TEST_SERVER_URL}/api/v1/xml" + if server == "production_server": return "https://www.openml.org/api/v1/xml" return server @@ -119,7 +119,7 @@ def replace_shorthand(server: str) -> str: value=value, check_with_message=check_server, intro_message="Specify which server you wish to connect to.", - input_message="Specify a url or use 'test' or 'production' as a shorthand: ", + input_message="Specify a url or use 'test' or 'production_server' as a shorthand: ", sanitize=replace_shorthand, ) diff --git a/openml/config.py b/openml/config.py index 9758b6fff..638b45650 100644 --- a/openml/config.py +++ b/openml/config.py @@ -28,6 +28,8 @@ OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR = "OPENML_TEST_SERVER_ADMIN_KEY" _TEST_SERVER_NORMAL_USER_KEY = "normaluser" +TEST_SERVER_URL = "https://test.openml.org" + class _Config(TypedDict): apikey: str @@ -214,7 +216,7 @@ class ConfigurationForExamples: _last_used_server = None _last_used_key = None _start_last_called = False - _test_server = "https://test.openml.org/api/v1/xml" + _test_server = f"{TEST_SERVER_URL}/api/v1/xml" _test_apikey = _TEST_SERVER_NORMAL_USER_KEY @classmethod @@ -470,7 +472,8 @@ def get_cache_directory() -> str: """ url_suffix = urlparse(server).netloc - reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1]) # noqa: PTH118 + url_parts = url_suffix.replace(":", "_").split(".")[::-1] + reversed_url_suffix = os.sep.join(url_parts) # noqa: PTH118 return os.path.join(_root_cache_directory, reversed_url_suffix) # noqa: PTH118 diff --git a/openml/exceptions.py b/openml/exceptions.py index fe63b8a58..1c1343ff3 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -63,5 +63,28 @@ class 
OpenMLNotAuthorizedError(OpenMLServerError): """Indicates an authenticated user is not authorized to execute the requested action.""" +class OpenMLAuthenticationError(OpenMLServerError): + """Exception raised when API authentication fails. + + This typically occurs when: + - No API key is configured + - The API key is invalid or expired + - The API key format is incorrect + + This is different from authorization (OpenMLNotAuthorizedError), which occurs + when a valid API key lacks permissions for the requested operation. + """ + + def __init__(self, message: str): + help_text = ( + "\n\nTo fix this:\n" + "1. Get your API key from https://www.openml.org/\n" + " (you'll need to register for a free account if you don't have one)\n" + "2. Configure your API key by following the authentication guide:\n" + " https://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication" + ) + super().__init__(message + help_text) + + class ObjectNotPublishedError(PyOpenMLError): """Indicates an object has not been published yet.""" diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 3df2861c0..3fbc7adee 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -415,9 +415,10 @@ def get_task( if not isinstance(task_id, int): raise TypeError(f"Task id should be integer, is {type(task_id)}") - cache_key_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) - tid_cache_dir = cache_key_dir / str(task_id) - tid_cache_dir_existed = tid_cache_dir.exists() + task_cache_directory = openml.utils._create_cache_directory_for_id( + TASKS_CACHE_DIR_NAME, task_id + ) + task_cache_directory_existed = task_cache_directory.exists() try: task = _get_task_description(task_id) dataset = get_dataset(task.dataset_id, **get_dataset_kwargs) @@ -425,14 +426,17 @@ def get_task( # Including class labels as part of task meta data 
handles # the case where data download was initially disabled if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + assert task.target_name is not None, ( + "Supervised tasks must define a target feature before retrieving class labels." + ) task.class_labels = dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels # and do not offer download_split if download_splits and isinstance(task, OpenMLSupervisedTask): task.download_split() except Exception as e: - if not tid_cache_dir_existed: - openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) + if not task_cache_directory_existed: + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory) raise e return task @@ -598,6 +602,7 @@ def create_task( ) return task_cls( + task_id=None, task_type_id=task_type, task_type="None", # TODO: refactor to get task type string from ID. data_set_id=dataset_id, diff --git a/openml/tasks/task.py b/openml/tasks/task.py index b297a105c..385b1f949 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -1,6 +1,4 @@ # License: BSD 3-Clause -# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting -# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code. from __future__ import annotations import warnings @@ -8,7 +6,7 @@ from collections.abc import Sequence from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, ClassVar from typing_extensions import TypedDict import openml._api_calls @@ -71,31 +69,45 @@ class OpenMLTask(OpenMLBase): Refers to the URL of the data splits used for the OpenML task. 
""" + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, + target_name: str | None = None, ): self.task_id = int(task_id) if task_id is not None else None self.task_type_id = task_type_id self.task_type = task_type self.dataset_id = int(data_set_id) + self.target_name = target_name + resolved_estimation_procedure_id = self._resolve_estimation_procedure_id( + estimation_procedure_id, + ) self.evaluation_measure = evaluation_measure self.estimation_procedure: _EstimationProcedure = { "type": estimation_procedure_type, "parameters": estimation_parameters, "data_splits_url": data_splits_url, } - self.estimation_procedure_id = estimation_procedure_id + self.estimation_procedure_id = resolved_estimation_procedure_id self.split: OpenMLSplit | None = None + def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int: + return ( + estimation_procedure_id + if estimation_procedure_id is not None + else self.DEFAULT_ESTIMATION_PROCEDURE_ID + ) + @classmethod def _entity_letter(cls) -> str: return "t" @@ -129,7 +141,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: if class_labels is not None: fields["# of Classes"] = len(class_labels) - if hasattr(self, "cost_matrix"): + cost_matrix = getattr(self, "cost_matrix", None) + if cost_matrix is not None: fields["Cost Matrix"] = "Available" # determines the order in which the information will be printed @@ -250,13 +263,15 @@ class OpenMLSupervisedTask(OpenMLTask, ABC): Refers to the unique identifier of task. 
""" + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, @@ -273,10 +288,9 @@ def __init__( # noqa: PLR0913 estimation_parameters=estimation_parameters, evaluation_measure=evaluation_measure, data_splits_url=data_splits_url, + target_name=target_name, ) - self.target_name = target_name - def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: """Get data associated with the current task. @@ -331,6 +345,8 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- + task_id : Union[int, None] + ID of the Classification task (if it already exists on OpenML). task_type_id : TaskType ID of the Classification task type. task_type : str @@ -339,7 +355,7 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): ID of the OpenML dataset associated with the Classification task. target_name : str Name of the target variable. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=1 ID of the estimation procedure for the Classification task. estimation_procedure_type : str, default=None Type of the estimation procedure. @@ -349,21 +365,21 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Name of the evaluation measure. data_splits_url : str, default=None URL of the data splits for the Classification task. - task_id : Union[int, None] - ID of the Classification task (if it already exists on OpenML). class_labels : List of str, default=None A list of class labels (for classification tasks). cost_matrix : array, default=None A cost matrix (for classification tasks). 
""" + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, @@ -373,20 +389,19 @@ def __init__( # noqa: PLR0913 cost_matrix: np.ndarray | None = None, ): super().__init__( - task_id=task_id, task_type_id=task_type_id, task_type=task_type, data_set_id=data_set_id, + target_name=target_name, estimation_procedure_id=estimation_procedure_id, estimation_procedure_type=estimation_procedure_type, estimation_parameters=estimation_parameters, evaluation_measure=evaluation_measure, - target_name=target_name, data_splits_url=data_splits_url, + task_id=task_id, ) self.class_labels = class_labels self.cost_matrix = cost_matrix - if cost_matrix is not None: raise NotImplementedError("Costmatrix functionality is not yet implemented.") @@ -396,6 +411,8 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): Parameters ---------- + task_id : Union[int, None] + ID of the OpenML Regression task. task_type_id : TaskType Task type ID of the OpenML Regression task. task_type : str @@ -404,7 +421,7 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): ID of the OpenML dataset. target_name : str Name of the target feature used in the Regression task. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=7 ID of the OpenML estimation procedure. estimation_procedure_type : str, default=None Type of the OpenML estimation procedure. @@ -412,37 +429,11 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): Parameters used by the OpenML estimation procedure. data_splits_url : str, default=None URL of the OpenML data splits for the Regression task. - task_id : Union[int, None] - ID of the OpenML Regression task. 
evaluation_measure : str, default=None Evaluation measure used in the Regression task. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 7, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - evaluation_measure: str | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - ) + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7 class OpenMLClusteringTask(OpenMLTask): @@ -450,16 +441,16 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- + task_id : Union[int, None] + ID of the OpenML clustering task. task_type_id : TaskType Task type ID of the OpenML clustering task. task_type : str Task type of the OpenML clustering task. data_set_id : int ID of the OpenML dataset used in clustering the task. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=17 ID of the OpenML estimation procedure. - task_id : Union[int, None] - ID of the OpenML clustering task. estimation_procedure_type : str, default=None Type of the OpenML estimation procedure used in the clustering task. estimation_parameters : dict, default=None @@ -473,32 +464,7 @@ class OpenMLClusteringTask(OpenMLTask): feature set for the clustering task. 
""" - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - estimation_procedure_id: int = 17, - task_id: int | None = None, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - evaluation_measure: str | None = None, - target_name: str | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - evaluation_measure=evaluation_measure, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - data_splits_url=data_splits_url, - ) - - self.target_name = target_name + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17 def get_X(self) -> pd.DataFrame: """Get data associated with the current task. @@ -534,6 +500,8 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Parameters ---------- + task_id : Union[int, None] + ID of the Learning Curve task. task_type_id : TaskType ID of the Learning Curve task. task_type : str @@ -542,7 +510,7 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): ID of the dataset that this task is associated with. target_name : str Name of the target feature in the dataset. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=13 ID of the estimation procedure to use for evaluating models. estimation_procedure_type : str, default=None Type of the estimation procedure. @@ -550,8 +518,6 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Additional parameters for the estimation procedure. data_splits_url : str, default=None URL of the file containing the data splits for Learning Curve task. - task_id : Union[int, None] - ID of the Learning Curve task. evaluation_measure : str, default=None Name of the evaluation measure to use for evaluating models. 
class_labels : list of str, default=None @@ -560,32 +526,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Cost matrix for Learning Curve tasks. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 13, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - evaluation_measure: str | None = None, - class_labels: list[str] | None = None, - cost_matrix: np.ndarray | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - class_labels=class_labels, - cost_matrix=cost_matrix, - ) + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13 diff --git a/openml/testing.py b/openml/testing.py index 304a4e0be..9f694f9bf 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -47,7 +47,7 @@ class TestBase(unittest.TestCase): "user": [], } flow_name_tracker: ClassVar[list[str]] = [] - test_server = "https://test.openml.org/api/v1/xml" + test_server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml" admin_key = os.environ.get(openml.config.OPENML_TEST_SERVER_ADMIN_KEY_ENV_VAR) user_key = openml.config._TEST_SERVER_NORMAL_USER_KEY diff --git a/openml/utils/__init__.py b/openml/utils/__init__.py new file mode 100644 index 000000000..1e74a3684 --- /dev/null +++ b/openml/utils/__init__.py @@ -0,0 +1,39 @@ +"""Utilities module.""" + +from openml.utils._openml import ( + ProgressBar, + ReprMixin, + _create_cache_directory, + _create_cache_directory_for_id, + _create_lockfiles_dir, + 
_delete_entity, + _get_cache_dir_for_id, + _get_cache_dir_for_key, + _get_rest_api_type_alias, + _list_all, + _remove_cache_dir_for_id, + _tag_entity, + _tag_openml_base, + extract_xml_tags, + get_cache_size, + thread_safe_if_oslo_installed, +) + +__all__ = [ + "ProgressBar", + "ReprMixin", + "_create_cache_directory", + "_create_cache_directory_for_id", + "_create_lockfiles_dir", + "_delete_entity", + "_get_cache_dir_for_id", + "_get_cache_dir_for_key", + "_get_rest_api_type_alias", + "_list_all", + "_remove_cache_dir_for_id", + "_tag_entity", + "_tag_openml_base", + "extract_xml_tags", + "get_cache_size", + "thread_safe_if_oslo_installed", +] diff --git a/openml/utils.py b/openml/utils/_openml.py similarity index 99% rename from openml/utils.py rename to openml/utils/_openml.py index 30dc4e53c..f18dbe3e0 100644 --- a/openml/utils.py +++ b/openml/utils/_openml.py @@ -26,8 +26,7 @@ import openml import openml._api_calls import openml.exceptions - -from . import config +from openml import config # Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: diff --git a/pyproject.toml b/pyproject.toml index 93a6ffbfa..47013271d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -133,10 +133,10 @@ filterwarnings=[ "ignore:the matrix subclass:PendingDeprecationWarning" ] markers = [ - "server: anything that connects to a server", "upload: anything that uploads to a server", - "production: any interaction with the production server", + "production_server: any interaction with the production server", "cache: anything that interacts with the (test) cache", + "test_server: tests that require the OpenML test server", ] # https://github.com/charliermarsh/ruff diff --git a/tests/conftest.py b/tests/conftest.py index bd974f3f3..2a7a6dcc7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -272,12 +272,12 @@ def as_robot() -> 
Iterator[None]: @pytest.fixture(autouse=True) def with_server(request): - if "production" in request.keywords: + if "production_server" in request.keywords: openml.config.server = "https://www.openml.org/api/v1/xml" openml.config.apikey = None yield return - openml.config.server = "https://test.openml.org/api/v1/xml" + openml.config.server = f"{openml.config.TEST_SERVER_URL}/api/v1/xml" openml.config.apikey = TestBase.user_key yield diff --git a/tests/files/localhost_8000 b/tests/files/localhost_8000 new file mode 120000 index 000000000..334c709ef --- /dev/null +++ b/tests/files/localhost_8000 @@ -0,0 +1 @@ +org/openml/test \ No newline at end of file diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index b13bac30b..c651845fb 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -18,7 +18,7 @@ import pytest -@pytest.mark.production() +@pytest.mark.production_server() class OpenMLDatasetTest(TestBase): _multiprocess_can_split_ = True @@ -281,7 +281,7 @@ def test_equality_comparison(self): self.assertNotEqual(self.titanic, "Wrong_object") -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_tagging(): dataset = openml.datasets.get_dataset(125, download_data=False) @@ -298,7 +298,7 @@ def test_tagging(): datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_get_feature_with_ontology_data_id_11(): # test on car dataset, which has built-in ontology references dataset = openml.datasets.get_dataset(11) @@ -307,7 +307,7 @@ def test_get_feature_with_ontology_data_id_11(): assert len(dataset.features[2].ontologies) >= 1 assert len(dataset.features[3].ontologies) >= 1 -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_add_remove_ontology_to_dataset(): did = 1 feature_index = 1 @@ -315,7 +315,7 
@@ def test_add_remove_ontology_to_dataset(): openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_add_same_ontology_multiple_features(): did = 1 ontology = "https://www.openml.org/unittest/" + str(time()) @@ -324,7 +324,7 @@ def test_add_same_ontology_multiple_features(): openml.datasets.functions.data_feature_add_ontology(did, i, ontology) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_add_illegal_long_ontology(): did = 1 ontology = "http://www.google.com/" + ("a" * 257) @@ -336,7 +336,7 @@ def test_add_illegal_long_ontology(): -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_add_illegal_url_ontology(): did = 1 ontology = "not_a_url" + str(time()) @@ -347,7 +347,7 @@ def test_add_illegal_url_ontology(): assert e.code == 1106 -@pytest.mark.production() +@pytest.mark.production_server() class OpenMLDatasetTestSparse(TestBase): _multiprocess_can_split_ = True @@ -408,7 +408,7 @@ def test_get_sparse_categorical_data_id_395(self): assert len(feature.nominal_values) == 25 -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test__read_features(mocker, workdir, static_cache_dir): """Test we read the features from the xml if no cache pickle is available. This test also does some simple checks to verify that the features are read correctly @@ -440,7 +440,7 @@ def test__read_features(mocker, workdir, static_cache_dir): assert pickle_mock.dump.call_count == 1 -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test__read_qualities(static_cache_dir, workdir, mocker): """Test we read the qualities from the xml if no cache pickle is available. This test also does some minor checks to ensure that the qualities are read correctly. 
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index d80743a8c..151a9ac23 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -107,7 +107,7 @@ def _check_datasets(self, datasets): for did in datasets: self._check_dataset(datasets[did]) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_tag_untag_dataset(self): tag = "test_tag_%d" % random.randint(1, 1000000) all_tags = _tag_entity("data", 1, tag) @@ -115,12 +115,12 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) assert tag not in all_tags - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_datasets_length(self): datasets = openml.datasets.list_datasets() assert len(datasets) >= 100 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_datasets_paginate(self): size = 10 max = 100 @@ -135,12 +135,12 @@ def test_list_datasets_paginate(self): categories=["in_preparation", "active", "deactivated"], ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") assert datasets.empty - @pytest.mark.production() + @pytest.mark.production_server() def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. 
self.use_production_server() @@ -159,7 +159,7 @@ def test_check_datasets_active(self): ) openml.config.server = self.test_server - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_illegal_character_tag(self): dataset = openml.datasets.get_dataset(1) tag = "illegal_tag&" @@ -169,7 +169,7 @@ def test_illegal_character_tag(self): except openml.exceptions.OpenMLServerException as e: assert e.code == 477 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_illegal_length_tag(self): dataset = openml.datasets.get_dataset(1) tag = "a" * 65 @@ -179,7 +179,7 @@ def test_illegal_length_tag(self): except openml.exceptions.OpenMLServerException as e: assert e.code == 477 - @pytest.mark.production() + @pytest.mark.production_server() def test__name_to_id_with_deactivated(self): """Check that an activated dataset is returned if an earlier deactivated one exists.""" self.use_production_server() @@ -187,19 +187,19 @@ def test__name_to_id_with_deactivated(self): assert openml.datasets.functions._name_to_id("anneal") == 2 openml.config.server = self.test_server - @pytest.mark.production() + @pytest.mark.production_server() def test__name_to_id_with_multiple_active(self): """With multiple active datasets, retrieve the least recent active.""" self.use_production_server() assert openml.datasets.functions._name_to_id("iris") == 61 - @pytest.mark.production() + @pytest.mark.production_server() def test__name_to_id_with_version(self): """With multiple active datasets, retrieve the least recent active.""" self.use_production_server() assert openml.datasets.functions._name_to_id("iris", version=3) == 969 - @pytest.mark.production() + @pytest.mark.production_server() def test__name_to_id_with_multiple_active_error(self): """With multiple active datasets, retrieve the least recent active.""" self.use_production_server() @@ -211,7 +211,7 @@ def test__name_to_id_with_multiple_active_error(self): error_if_multiple=True, ) - @pytest.mark.uses_test_server() + 
@pytest.mark.test_server() def test__name_to_id_name_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -221,7 +221,7 @@ def test__name_to_id_name_does_not_exist(self): dataset_name="does_not_exist", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__name_to_id_version_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -232,7 +232,7 @@ def test__name_to_id_version_does_not_exist(self): version=100000, ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] @@ -240,7 +240,7 @@ def test_get_datasets_by_name(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] @@ -248,14 +248,14 @@ def test_get_datasets_by_mixed(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset @@ -274,7 +274,7 @@ def test_get_dataset_download_all_files(self): # test_get_dataset_lazy raise NotImplementedError - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) assert type(dataset) == OpenMLDataset @@ -282,7 +282,7 @@ def test_get_dataset_uint8_dtype(self): df, _, _, _ = dataset.get_data() assert df["carbon"].dtype == "uint8" - 
@pytest.mark.production() + @pytest.mark.production_server() def test_get_dataset_cannot_access_private_data(self): # Issue324 Properly handle private datasets when trying to access them self.use_production_server() @@ -293,7 +293,7 @@ def test_dataset_by_name_cannot_access_private_data(self): self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -323,28 +323,28 @@ def ensure_absence_of_real_data(): assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data() assert isinstance(X, pd.DataFrame) assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 dataset = openml.datasets.get_dataset(did) assert dataset.row_id_attribute == "Counter" - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_dataset_description(self): description = _get_dataset_description(self.workdir, 2) assert isinstance(description, dict) description_xml_path = os.path.join(self.workdir, "description.xml") assert os.path.exists(description_xml_path) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) @@ -408,7 +408,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self): @mock.patch("openml._api_calls._download_minio_file") - 
@pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( @@ -449,21 +449,21 @@ def test__getarff_md5_issue(self): openml.config.connection_n_retries = n - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) assert isinstance(features_file, Path) features_xml_path = self.workdir / "features.xml" assert features_xml_path.exists() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) assert isinstance(qualities, Path) qualities_xml_path = self.workdir / "qualities.xml" assert qualities_xml_path.exists() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -486,7 +486,7 @@ def test_get_dataset_force_refresh_cache(self): did_cache_dir, ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_force_refresh_cache_clean_start(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -523,23 +523,16 @@ def test_deletion_of_cache_dir(self): # get_dataset_description is the only data guaranteed to be downloaded @mock.patch("openml.datasets.functions._get_dataset_description") - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) - datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") + datasets_cache_dir = os.path.join(openml.config.get_cache_directory(), "datasets") assert len(os.listdir(datasets_cache_dir)) 
== 0 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_dataset(self): - # lazy loading not possible as we need the arff-file. - openml.datasets.get_dataset(3, download_data=True) - file_path = os.path.join( - openml.config.get_cache_directory(), - "datasets", - "3", - "dataset.arff", - ) + arff_file_path = self.static_cache_dir / "org" / "openml" / "test" / "datasets" / "2" / "dataset.arff" dataset = OpenMLDataset( "anneal", "test", @@ -547,7 +540,7 @@ def test_publish_dataset(self): version=1, licence="public", default_target_attribute="class", - data_file=file_path, + data_file=arff_file_path, ) dataset.publish() TestBase._mark_entity_for_removal("data", dataset.dataset_id) @@ -556,7 +549,7 @@ def test_publish_dataset(self): ) assert isinstance(dataset.dataset_id, int) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2).retrieve_class_labels() @@ -573,7 +566,7 @@ def test__retrieve_class_labels(self): labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) assert labels == ["COIL", "SHEET"] - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_upload_dataset_with_url(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -604,7 +597,7 @@ def _assert_status_of_dataset(self, *, did: int, status: str): reason="Test requires admin key. 
Set OPENML_TEST_SERVER_ADMIN_KEY environment variable.", ) @pytest.mark.flaky() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_status(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -696,7 +689,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T @@ -730,7 +723,7 @@ def test_create_dataset_numpy(self): ), "Uploaded arff does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_create_dataset_list(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -785,7 +778,7 @@ def test_create_dataset_list(self): ), "Uploaded ARFF does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( @@ -888,9 +881,9 @@ def test_create_invalid_dataset(self): param["data"] = data[0] self.assertRaises(ValueError, create_dataset, **param) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_online_dataset_arff(self): - dataset_id = 100 # Australian + dataset_id = 128 # iris -- one of the few datasets without parquet file # lazy loading not used as arff file is checked. 
dataset = openml.datasets.get_dataset(dataset_id, download_data=True) decoder = arff.ArffDecoder() @@ -904,7 +897,7 @@ def test_get_online_dataset_arff(self): return_type=arff.DENSE if d_format == "arff" else arff.COO, ), "ARFF files are not equal" - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_topic_api_error(self): # Check server exception when non-admin accessses apis self.assertRaisesRegex( @@ -923,7 +916,7 @@ def test_topic_api_error(self): topic="business", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_online_dataset_format(self): # Phoneme dataset dataset_id = 77 @@ -933,7 +926,7 @@ def test_get_online_dataset_format(self): dataset_id ), "The format of the ARFF files is different" - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1158,7 +1151,7 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url, ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_fetch_ignore_attribute(self): """Test to upload and retrieve dataset and check ignore_attributes""" data = [ @@ -1277,7 +1270,7 @@ def test_create_dataset_row_id_attribute_error(self): paper_url=paper_url, ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_create_dataset_row_id_attribute_inference(self): # meta-information name = f"{self._get_sentinel()}-pandas_testing_dataset" @@ -1368,13 +1361,13 @@ def test_create_dataset_attributes_auto_without_df(self): paper_url=paper_url, ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_qualities(self): qualities = openml.datasets.list_qualities() assert isinstance(qualities, list) is True assert all(isinstance(q, str) for q in qualities) is True - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() @@ 
-1390,7 +1383,7 @@ def test_get_dataset_cache_format_pickle(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded # from minio. However, there is a mismatch between OpenML test server and minio IDs. @@ -1423,7 +1416,7 @@ def test_get_dataset_cache_format_feather(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets @@ -1445,7 +1438,7 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets @@ -1468,11 +1461,12 @@ def test_data_edit_critical_field(self): raise e time.sleep(10) # Delete the cache dir to get the newer version of the dataset + shutil.rmtree( - os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), + os.path.join(openml.config.get_cache_directory(), "datasets", str(did)), ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_edit_requires_field(self): # Check server exception when no field to edit is provided self.assertRaisesRegex( @@ -1485,7 +1479,7 @@ def test_data_edit_requires_field(self): data_id=64, # blood-transfusion-service-center ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_edit_requires_valid_dataset(self): # Check server exception when unknown dataset is provided self.assertRaisesRegex( @@ -1496,7 +1490,7 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation 
dataset", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user @@ -1523,7 +1517,7 @@ def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): default_target_attribute="y", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): # Check server exception when a non-owner or non-admin tries to edit critical fields self.assertRaisesRegex( @@ -1535,7 +1529,7 @@ def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): default_target_attribute="y", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_data_fork(self): did = 1 result = fork_dataset(did) @@ -1549,7 +1543,7 @@ def test_data_fork(self): ) - @pytest.mark.production() + @pytest.mark.production_server() def test_list_datasets_with_high_size_parameter(self): # Testing on prod since concurrent deletion of uploded datasets make the test fail self.use_production_server() @@ -1734,7 +1728,6 @@ def test_delete_dataset(self): @mock.patch.object(requests.Session, "delete") def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" ) @@ -1749,14 +1742,13 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke ): openml.datasets.delete_dataset(40_000) - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", 
{}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" ) @@ -1771,14 +1763,13 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key ): openml.datasets.delete_dataset(40_000) - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" ) @@ -1790,14 +1781,13 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key) success = openml.datasets.delete_dataset(40000) assert success - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/40000" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" ) @@ -1812,7 +1802,7 @@ def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key) ): openml.datasets.delete_dataset(9_999_999) - dataset_url = 
"https://test.openml.org/api/v1/xml/data/9999999" + dataset_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/9999999" assert dataset_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @@ -1827,7 +1817,7 @@ def all_datasets(): return openml.datasets.list_datasets() -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets(all_datasets: pd.DataFrame): # We can only perform a smoke test here because we test on dynamic # data from the internet... @@ -1836,49 +1826,49 @@ def test_list_datasets(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(all_datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_by_tag(all_datasets: pd.DataFrame): tag_datasets = openml.datasets.list_datasets(tag="study_14") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_by_size(): datasets = openml.datasets.list_datasets(size=5) assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): small_datasets = openml.datasets.list_datasets(number_instances="5..100") assert 0 < len(small_datasets) <= len(all_datasets) _assert_datasets_have_id_and_valid_status(small_datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): 
five_class_datasets = openml.datasets.list_datasets(number_classes="5") assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): combined_filter_datasets = openml.datasets.list_datasets( tag="study_14", @@ -1907,9 +1897,8 @@ def _dataset_features_is_downloaded(did: int): def _dataset_data_file_is_downloaded(did: int): - parquet_present = _dataset_file_is_downloaded(did, "dataset.pq") - arff_present = _dataset_file_is_downloaded(did, "dataset.arff") - return parquet_present or arff_present + cache_directory = Path(openml.config.get_cache_directory()) / "datasets" / str(did) + return any(f.suffix in (".pq", ".arff") for f in cache_directory.iterdir()) def _assert_datasets_retrieved_successfully( @@ -1951,7 +1940,7 @@ def isolate_for_test(): ("with_data", "with_qualities", "with_features"), itertools.product([True, False], repeat=3), ) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_get_dataset_lazy_behavior( isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool ): @@ -1978,7 +1967,7 @@ def test_get_dataset_lazy_behavior( ) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_get_dataset_with_invalid_id() -> None: INVALID_ID = 123819023109238 # Well, at some point this will probably be valid... 
with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e: @@ -2006,7 +1995,7 @@ def test_read_features_from_xml_with_whitespace() -> None: assert dict[1].nominal_values == [" - 50000.", " 50000+."] -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_get_dataset_parquet(requests_mock, test_files_directory): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. @@ -2014,7 +2003,7 @@ def test_get_dataset_parquet(requests_mock, test_files_directory): test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml" ) # While the mocked example is from production, unit tests by default connect to the test server. - requests_mock.get("https://test.openml.org/api/v1/xml/data/61", text=content_file.read_text()) + requests_mock.get(f"{openml.config.TEST_SERVER_URL}/api/v1/xml/data/61", text=content_file.read_text()) dataset = openml.datasets.get_dataset(61, download_data=True) assert dataset._parquet_url is not None assert dataset.parquet_file is not None diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index ee7c306a1..e15556d7b 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -50,7 +50,7 @@ def _check_list_evaluation_setups(self, **kwargs): self.assertSequenceEqual(sorted(list1), sorted(list2)) return evals_setups - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_filter_task(self): self.use_production_server() @@ -70,7 +70,7 @@ def test_evaluation_list_filter_task(self): assert evaluations[run_id].value is not None assert evaluations[run_id].values is None - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_filter_uploader_ID_16(self): self.use_production_server() @@ -85,7 +85,7 @@ def 
test_evaluation_list_filter_uploader_ID_16(self): assert len(evaluations) > 50 - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_filter_uploader_ID_10(self): self.use_production_server() @@ -104,7 +104,7 @@ def test_evaluation_list_filter_uploader_ID_10(self): assert evaluations[run_id].value is not None assert evaluations[run_id].values is None - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_filter_flow(self): self.use_production_server() @@ -124,7 +124,7 @@ def test_evaluation_list_filter_flow(self): assert evaluations[run_id].value is not None assert evaluations[run_id].values is None - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_filter_run(self): self.use_production_server() @@ -144,7 +144,7 @@ def test_evaluation_list_filter_run(self): assert evaluations[run_id].value is not None assert evaluations[run_id].values is None - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_limit(self): self.use_production_server() @@ -155,7 +155,7 @@ def test_evaluation_list_limit(self): ) assert len(evaluations) == 100 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_evaluations_empty(self): evaluations = openml.evaluations.list_evaluations("unexisting_measure") if len(evaluations) > 0: @@ -163,7 +163,7 @@ def test_list_evaluations_empty(self): assert isinstance(evaluations, dict) - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_per_fold(self): self.use_production_server() size = 1000 @@ -201,7 +201,7 @@ def test_evaluation_list_per_fold(self): assert evaluations[run_id].value is not None assert evaluations[run_id].values is None - @pytest.mark.production() + @pytest.mark.production_server() def test_evaluation_list_sort(self): self.use_production_server() size = 10 @@ -233,13 +233,13 @@ def test_evaluation_list_sort(self): test_output = 
sorted(unsorted_output, reverse=True) assert test_output[:size] == sorted_output - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_evaluation_measures(self): measures = openml.evaluations.list_evaluation_measures() assert isinstance(measures, list) is True assert all(isinstance(s, str) for s in measures) is True - @pytest.mark.production() + @pytest.mark.production_server() def test_list_evaluations_setups_filter_flow(self): self.use_production_server() flow_id = [405] @@ -257,7 +257,7 @@ def test_list_evaluations_setups_filter_flow(self): keys = list(evals["parameters"].values[0].keys()) assert all(elem in columns for elem in keys) - @pytest.mark.production() + @pytest.mark.production_server() @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_list_evaluations_setups_filter_task(self): self.use_production_server() diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 527ad1f8c..b942c0ab9 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -44,7 +44,7 @@ def setUp(self): def tearDown(self): super().tearDown() - @pytest.mark.production() + @pytest.mark.production_server() def test_get_flow(self): # We need to use the production server here because 4024 is not the # test server @@ -77,7 +77,7 @@ def test_get_flow(self): assert subflow_3.parameters["L"] == "-1" assert len(subflow_3.components) == 0 - @pytest.mark.production() + @pytest.mark.production_server() @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_structure(self): # also responsible for testing: flow.get_subflow @@ -103,7 +103,7 @@ def test_get_structure(self): subflow = flow.get_subflow(structure) assert subflow.flow_id == sub_flow_id - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_tagging(self): flows = openml.flows.list_flows(size=1) flow_id = flows["id"].iloc[0] @@ -121,7 +121,7 @@ def test_tagging(self): flows = openml.flows.list_flows(tag=tag) 
assert len(flows) == 0 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_from_xml_to_xml(self): # Get the raw xml thing # TODO maybe get this via get_flow(), which would have to be refactored @@ -181,7 +181,7 @@ def test_to_xml_from_xml(self): assert new_flow is not flow @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -223,7 +223,7 @@ def test_publish_existing_flow(self, flow_exists_mock): ) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], @@ -274,7 +274,7 @@ def test_publish_flow_with_similar_components(self): TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}") @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of @@ -366,7 +366,7 @@ def test_illegal_flow(self): ) self.assertRaises(ValueError, self.extension.model_to_flow, illegal) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_nonexisting_flow_exists(self): def get_sentinel(): # Create a unique prefix for the flow. 
Necessary because the flow @@ -384,7 +384,7 @@ def get_sentinel(): assert not flow_id @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -425,7 +425,7 @@ def test_existing_flow_exists(self): assert downloaded_flow_id == flow.flow_id @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data @@ -565,7 +565,7 @@ def test_extract_tags(self): tags = openml.utils.extract_xml_tags("oml:tag", flow_dict["oml:flow"]) assert tags == ["OpenmlWeka", "weka"] - @pytest.mark.production() + @pytest.mark.production_server() def test_download_non_scikit_learn_flows(self): self.use_production_server() diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 5aa99cd62..ce0d5e782 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -47,7 +47,7 @@ def _check_flow(self, flow): ) assert ext_version_str_or_none - @pytest.mark.production() + @pytest.mark.production_server() def test_list_flows(self): self.use_production_server() # We can only perform a smoke test here because we test on dynamic @@ -58,7 +58,7 @@ def test_list_flows(self): for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) - @pytest.mark.production() + @pytest.mark.production_server() def test_list_flows_output_format(self): self.use_production_server() # We can only perform a smoke test here because we test on dynamic @@ -67,13 +67,13 @@ def test_list_flows_output_format(self): assert isinstance(flows, pd.DataFrame) assert len(flows) >= 1500 - @pytest.mark.production() + @pytest.mark.production_server() def test_list_flows_empty(self): self.use_production_server() flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123") assert flows.empty - 
@pytest.mark.production() + @pytest.mark.production_server() def test_list_flows_by_tag(self): self.use_production_server() flows = openml.flows.list_flows(tag="weka") @@ -81,7 +81,7 @@ def test_list_flows_by_tag(self): for flow in flows.to_dict(orient="index").values(): self._check_flow(flow) - @pytest.mark.production() + @pytest.mark.production_server() def test_list_flows_paginate(self): self.use_production_server() size = 10 @@ -280,7 +280,7 @@ def test_are_flows_equal_ignore_if_older(self): reason="OrdinalEncoder introduced in 0.20. " "No known models with list of lists parameters in older versions.", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder @@ -301,7 +301,7 @@ def test_sklearn_to_flow_list_of_lists(self): assert server_flow.parameters["categories"] == "[[0, 1], [0, 1]]" assert server_flow.model.categories == flow.model.categories - @pytest.mark.production() + @pytest.mark.production_server() def test_get_flow1(self): # Regression test for issue #305 # Basically, this checks that a flow without an external version can be loaded @@ -310,7 +310,7 @@ def test_get_flow1(self): assert flow.external_version is None @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) @@ -322,7 +322,7 @@ def test_get_flow_reinstantiate_model(self): downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow self.assertRaisesRegex( @@ -338,7 +338,7 @@ def 
test_get_flow_reinstantiate_model_no_extension(self): Version(sklearn.__version__) == Version("0.19.1"), reason="Requires scikit-learn!=0.19.1, because target flow is from that version.", ) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception(self): self.use_production_server() flow = 8175 @@ -359,7 +359,7 @@ def test_get_flow_with_reinstantiate_strict_with_wrong_version_raises_exception( # Because scikit-learn dropped min_impurity_split hyperparameter in 1.0, # and the requested flow is from 1.0.0 exactly. ) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_flow_reinstantiate_flow_not_strict_post_1(self): self.use_production_server() flow = openml.flows.get_flow(flow_id=19190, reinstantiate=True, strict_version=False) @@ -373,7 +373,7 @@ def test_get_flow_reinstantiate_flow_not_strict_post_1(self): reason="Requires scikit-learn 0.23.2 or ~0.24.", # Because these still have min_impurity_split, but with new scikit-learn module structure." 
) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): self.use_production_server() flow = openml.flows.get_flow(flow_id=18587, reinstantiate=True, strict_version=False) @@ -385,7 +385,7 @@ def test_get_flow_reinstantiate_flow_not_strict_023_and_024(self): Version(sklearn.__version__) > Version("0.23"), reason="Requires scikit-learn<=0.23, because the scikit-learn module structure changed.", ) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): self.use_production_server() flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False) @@ -393,7 +393,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): assert "sklearn==0.19.1" not in flow.dependencies @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all @@ -428,7 +428,7 @@ def test_get_flow_id(self): pytest.skip(reason="Not sure why there should only be one version of this flow.") assert flow_ids_exact_version_True == flow_ids_exact_version_False - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_delete_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -453,7 +453,6 @@ def test_delete_flow(self): @mock.patch.object(requests.Session, "delete") def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_owned.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -466,14 +465,13 @@ def test_delete_flow_not_owned(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(40_000) - flow_url = 
"https://test.openml.org/api/v1/xml/flow/40000" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_has_runs.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -486,14 +484,13 @@ def test_delete_flow_with_run(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(40_000) - flow_url = "https://test.openml.org/api/v1/xml/flow/40000" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_subflow(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_is_subflow.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -506,14 +503,13 @@ def test_delete_subflow(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(40_000) - flow_url = "https://test.openml.org/api/v1/xml/flow/40000" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/40000" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): - 
openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_successful.xml" mock_delete.return_value = create_request_response( status_code=200, @@ -523,7 +519,7 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): success = openml.flows.delete_flow(33364) assert success - flow_url = "https://test.openml.org/api/v1/xml/flow/33364" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/33364" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @@ -531,7 +527,6 @@ def test_delete_flow_success(mock_delete, test_files_directory, test_api_key): @mock.patch.object(requests.Session, "delete") @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "flows" / "flow_delete_not_exist.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -544,6 +539,6 @@ def test_delete_unknown_flow(mock_delete, test_files_directory, test_api_key): ): openml.flows.delete_flow(9_999_999) - flow_url = "https://test.openml.org/api/v1/xml/flow/9999999" + flow_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/flow/9999999" assert flow_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index a295259ef..3f30f38ba 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -15,14 +15,14 @@ class TestConfig(openml.testing.TestBase): - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def 
test_too_long_uri(self): with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): openml.datasets.list_datasets(data_id=list(range(10000))) @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_retry_on_database_error(self, Session_class_mock, _): response_mock = unittest.mock.Mock() response_mock.text = ( @@ -117,12 +117,12 @@ def test_download_minio_failure(mock_minio, tmp_path: Path) -> None: ("task/42", "delete"), # 460 ], ) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_authentication_endpoints_requiring_api_key_show_relevant_help_link( endpoint: str, method: str, ) -> None: # We need to temporarily disable the API key to test the error message with openml.config.overwrite_config_context({"apikey": None}): - with pytest.raises(openml.exceptions.OpenMLNotAuthorizedError, match=API_TOKEN_HELP_LINK): + with pytest.raises(openml.exceptions.OpenMLAuthenticationError, match=API_TOKEN_HELP_LINK): openml._api_calls._perform_api_call(call=endpoint, request_method=method, data=None) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index c5ddc4ecc..13b06223a 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -78,7 +78,7 @@ def test_get_config_as_dict(self): config = openml.config.get_config_as_dict() _config = {} _config["apikey"] = TestBase.user_key - _config["server"] = "https://test.openml.org/api/v1/xml" + _config["server"] = f"{openml.config.TEST_SERVER_URL}/api/v1/xml" _config["cachedir"] = self.workdir _config["avoid_duplicate_runs"] = False _config["connection_n_retries"] = 20 @@ -106,7 +106,7 @@ def test_setup_with_config(self): class TestConfigurationForExamples(openml.testing.TestBase): - @pytest.mark.production() + @pytest.mark.production_server() def test_switch_to_example_configuration(self): """Verifies the 
test configuration is loaded properly.""" # Below is the default test key which would be used anyway, but just for clarity: @@ -118,7 +118,7 @@ def test_switch_to_example_configuration(self): assert openml.config.apikey == TestBase.user_key assert openml.config.server == self.test_server - @pytest.mark.production() + @pytest.mark.production_server() def test_switch_from_example_configuration(self): """Verifies the previous configuration is loaded after stopping.""" # Below is the default test key which would be used anyway, but just for clarity: @@ -143,7 +143,7 @@ def test_example_configuration_stop_before_start(self): openml.config.stop_using_configuration_for_example, ) - @pytest.mark.production() + @pytest.mark.production_server() def test_example_configuration_start_twice(self): """Checks that the original config can be returned to if `start..` is called twice.""" openml.config.apikey = TestBase.user_key diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 1a66b76c0..17349fca8 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -25,7 +25,7 @@ class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take # less than 1 seconds - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_tagging(self): runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" @@ -119,7 +119,7 @@ def _check_array(array, type_): assert run_prime_trace_content is None @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_to_from_filesystem_vanilla(self): model = Pipeline( [ @@ -155,7 +155,7 @@ def test_to_from_filesystem_vanilla(self): @pytest.mark.sklearn() @pytest.mark.flaky() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_to_from_filesystem_search(self): model = Pipeline( [ @@ -190,7 +190,7 @@ def test_to_from_filesystem_search(self): ) @pytest.mark.sklearn() - 
@pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_to_from_filesystem_no_model(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], @@ -296,7 +296,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_test, saved_y_test) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -340,7 +340,7 @@ def test_publish_with_local_loaded_flow(self): openml.runs.get_run(loaded_run.run_id) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_offline_and_online_run_identical(self): extension = SklearnExtension() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 8f2c505b7..9bc8d74fa 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -398,7 +398,7 @@ def _check_sample_evaluations( assert evaluation < max_time_allowed @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_regression_on_classif_task(self): task_id = 259 # collins; crossvalidation; has numeric targets @@ -415,7 +415,7 @@ def test_run_regression_on_classif_task(self): ) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) @@ -628,7 +628,7 @@ def _run_and_upload_regression( ) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_logistic_regression(self): lr = LogisticRegression(solver="lbfgs", max_iter=1000) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] @@ -637,7 +637,7 @@ def test_run_and_upload_logistic_regression(self): self._run_and_upload_classification(lr, task_id, 
n_missing_vals, n_test_obs, "62501") @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_linear_regression(self): lr = LinearRegression() task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"] @@ -668,7 +668,7 @@ def test_run_and_upload_linear_regression(self): self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_pipeline_dummy_pipeline(self): pipeline1 = Pipeline( steps=[ @@ -686,7 +686,7 @@ def test_run_and_upload_pipeline_dummy_pipeline(self): Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_column_transformer_pipeline(self): import sklearn.compose import sklearn.impute @@ -799,7 +799,7 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): assert call_count == 3 @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_gridsearch(self): estimator_name = ( "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" @@ -822,7 +822,7 @@ def test_run_and_upload_gridsearch(self): assert len(run.trace.trace_iterations) == 9 @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_randomsearch(self): randomsearch = RandomizedSearchCV( RandomForestClassifier(n_estimators=5), @@ -855,7 +855,7 @@ def test_run_and_upload_randomsearch(self): assert len(trace.trace_iterations) == 5 @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: # 1) it verifies the correct handling of masked arrays (not all @@ -883,7 +883,7 @@ def test_run_and_upload_maskedarrays(self): 
########################################################################## @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -908,7 +908,7 @@ def test_learning_curve_task_1(self): self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -949,7 +949,7 @@ def test_learning_curve_task_2(self): Version(sklearn.__version__) < Version("0.21"), reason="Pipelines don't support indexing (used for the assert check)", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_initialize_cv_from_run(self): randomsearch = Pipeline( [ @@ -1024,7 +1024,7 @@ def _test_local_evaluations(self, run): assert alt_scores[idx] <= 1 @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() australian_task = 595 # Australian; crossvalidation @@ -1044,7 +1044,7 @@ def test_local_run_swapped_parameter_order_model(self): Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1073,7 +1073,7 @@ def test_local_run_swapped_parameter_order_flow(self): Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_local_run_metric_score(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1096,7 +1096,7 @@ 
def test_local_run_metric_score(self): self._test_local_evaluations(run) - @pytest.mark.production() + @pytest.mark.production_server() def test_online_run_metric_score(self): self.use_production_server() @@ -1111,7 +1111,7 @@ def test_online_run_metric_score(self): Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline( steps=[ @@ -1173,7 +1173,7 @@ def test_initialize_model_from_run(self): Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__run_exists(self): # would be better to not sentinel these clfs, # so we do not have to perform the actual runs @@ -1229,7 +1229,7 @@ def test__run_exists(self): assert run_ids, (run_ids, clf) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flo @@ -1249,7 +1249,7 @@ def test_run_with_illegal_flow_id(self): ) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. @@ -1281,7 +1281,7 @@ def test_run_with_illegal_flow_id_after_load(self): TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. 
Comes to a different value error than the previous test @@ -1307,7 +1307,7 @@ def test_run_with_illegal_flow_id_1(self): ) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first. @@ -1350,7 +1350,7 @@ def test_run_with_illegal_flow_id_1_after_load(self): Version(sklearn.__version__) < Version("0.20"), reason="OneHotEncoder cannot handle mixed type DataFrame as input", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__run_task_get_arffcontent(self): task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation num_instances = 3196 @@ -1407,7 +1407,7 @@ def test__create_trace_from_arff(self): trace_arff = arff.load(arff_file) OpenMLRunTrace.trace_from_arff(trace_arff) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_run(self): # this run is not available on test self.use_production_server() @@ -1442,7 +1442,7 @@ def _check_run(self, run): assert isinstance(run, dict) assert len(run) == 8, str(run) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_runs_list(self): # TODO: comes from live, no such lists on test self.use_production_server() @@ -1451,12 +1451,12 @@ def test_get_runs_list(self): for run in runs.to_dict(orient="index").values(): self._check_run(run) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_runs_empty(self): runs = openml.runs.list_runs(task=[0]) assert runs.empty - @pytest.mark.production() + @pytest.mark.production_server() def test_get_runs_list_by_task(self): # TODO: comes from live, no such lists on test self.use_production_server() @@ -1475,7 +1475,7 @@ def test_get_runs_list_by_task(self): assert run["task_id"] in task_ids self._check_run(run) - @pytest.mark.production() + @pytest.mark.production_server() def 
test_get_runs_list_by_uploader(self): # TODO: comes from live, no such lists on test self.use_production_server() @@ -1497,7 +1497,7 @@ def test_get_runs_list_by_uploader(self): assert run["uploader"] in uploader_ids self._check_run(run) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_runs_list_by_flow(self): # TODO: comes from live, no such lists on test self.use_production_server() @@ -1516,7 +1516,7 @@ def test_get_runs_list_by_flow(self): assert run["flow_id"] in flow_ids self._check_run(run) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_runs_pagination(self): # TODO: comes from live, no such lists on test self.use_production_server() @@ -1529,7 +1529,7 @@ def test_get_runs_pagination(self): for run in runs.to_dict(orient="index").values(): assert run["uploader"] in uploader_ids - @pytest.mark.production() + @pytest.mark.production_server() def test_get_runs_list_by_filters(self): # TODO: comes from live, no such lists on test self.use_production_server() @@ -1566,7 +1566,7 @@ def test_get_runs_list_by_filters(self): ) assert len(runs) == 2 - @pytest.mark.production() + @pytest.mark.production_server() @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_runs_list_by_tag(self): # We don't have tagged runs on the test server @@ -1580,7 +1580,7 @@ def test_get_runs_list_by_tag(self): Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1617,7 +1617,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def 
test_run_on_dataset_with_missing_labels_array(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1656,7 +1656,7 @@ def test_run_on_dataset_with_missing_labels_array(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_cached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.runs.functions._get_cached_run(1) @@ -1667,7 +1667,7 @@ def test_get_uncached_run(self): openml.runs.functions._get_cached_run(10) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) @@ -1687,7 +1687,7 @@ def test_run_flow_on_task_downloaded_flow(self): TestBase._mark_entity_for_removal("run", run.run_id) TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {run.run_id}") - @pytest.mark.production() + @pytest.mark.production_server() def test_format_prediction_non_supervised(self): # non-supervised tasks don't exist on the test server self.use_production_server() @@ -1698,7 +1698,7 @@ def test_format_prediction_non_supervised(self): ): format_prediction(clustering, *ignored_input) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1708,7 +1708,7 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_format_prediction_classification_incomplete_probabilities(self): 
classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1719,7 +1719,7 @@ def test_format_prediction_classification_incomplete_probabilities(self): with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1730,7 +1730,7 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} @@ -1738,7 +1738,7 @@ def test_format_prediction_task_learning_curve_sample_not_set(self): with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_format_prediction_task_regression(self): task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] _task_id = check_task_existence(**task_meta_data) @@ -1773,7 +1773,7 @@ def test_format_prediction_task_regression(self): reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_delete_run(self): rs = np.random.randint(1, 2**31 - 1) clf = sklearn.pipeline.Pipeline( @@ -1813,7 +1813,6 @@ def test_initialize_model_from_run_nonstrict(self): 
@mock.patch.object(requests.Session, "delete") def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_owned.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -1826,14 +1825,13 @@ def test_delete_run_not_owned(mock_delete, test_files_directory, test_api_key): ): openml.runs.delete_run(40_000) - run_url = "https://test.openml.org/api/v1/xml/run/40000" + run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/40000" assert run_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_run_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_successful.xml" mock_delete.return_value = create_request_response( status_code=200, @@ -1843,14 +1841,13 @@ def test_delete_run_success(mock_delete, test_files_directory, test_api_key): success = openml.runs.delete_run(10591880) assert success - run_url = "https://test.openml.org/api/v1/xml/run/10591880" + run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/10591880" assert run_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "runs" / "run_delete_not_exist.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -1863,7 +1860,7 @@ def 
test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): ): openml.runs.delete_run(9_999_999) - run_url = "https://test.openml.org/api/v1/xml/run/9999999" + run_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/run/9999999" assert run_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @@ -1873,8 +1870,12 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) +@unittest.skipIf( + Version(sklearn.__version__) >= Version("1.8"), + reason="predictions differ significantly", + ) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test__run_task_get_arffcontent_2(parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp @@ -1965,7 +1966,7 @@ def test__run_task_get_arffcontent_2(parallel_mock): (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing ] ) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_joblib_backends(parallel_mock, n_jobs, backend, call_count): """Tests evaluation of a run using various joblib backends and n_jobs.""" if backend is None: diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index a0469f9a5..0df3a0b3b 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -35,7 +35,7 @@ def setUp(self): super().setUp() @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() @@ -83,7 +83,7 @@ def _existing_setup_exists(self, 
classif): assert setup_id == run.setup_id @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_existing_setup_exists_1(self): def side_effect(self): self.var_smoothing = 1e-9 @@ -99,13 +99,13 @@ def side_effect(self): self._existing_setup_exists(nb) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) @pytest.mark.sklearn() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters self._existing_setup_exists( @@ -118,7 +118,7 @@ def test_existing_setup_exists_3(self): ), ) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_setup(self): self.use_production_server() # no setups in default test server @@ -135,7 +135,7 @@ def test_get_setup(self): else: assert len(current.parameters) == num_params[idx] - @pytest.mark.production() + @pytest.mark.production_server() def test_setup_list_filter_flow(self): self.use_production_server() @@ -147,7 +147,7 @@ def test_setup_list_filter_flow(self): for setup_id in setups: assert setups[setup_id].flow_id == flow_id - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_setups_empty(self): setups = openml.setups.list_setups(setup=[0]) if len(setups) > 0: @@ -155,7 +155,7 @@ def test_list_setups_empty(self): assert isinstance(setups, dict) - @pytest.mark.production() + @pytest.mark.production_server() def test_list_setups_output_format(self): self.use_production_server() flow_id = 6794 @@ -168,7 +168,7 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) @@ -180,7 +180,7 @@ 
def test_setuplist_offset(self): assert len(all) == size * 2 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_cached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.setups.functions._get_cached_setup(1) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 4b662524b..2a2d276ec 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -12,7 +12,7 @@ class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True - @pytest.mark.production() + @pytest.mark.production_server() @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_study_old(self): self.use_production_server() @@ -24,7 +24,7 @@ def test_get_study_old(self): assert len(study.setups) == 30 assert study.runs is None - @pytest.mark.production() + @pytest.mark.production_server() def test_get_study_new(self): self.use_production_server() @@ -35,7 +35,7 @@ def test_get_study_new(self): assert len(study.setups) == 1253 assert len(study.runs) == 1693 - @pytest.mark.production() + @pytest.mark.production_server() def test_get_openml100(self): self.use_production_server() @@ -45,7 +45,7 @@ def test_get_openml100(self): assert isinstance(study_2, openml.study.OpenMLBenchmarkSuite) assert study.study_id == study_2.study_id - @pytest.mark.production() + @pytest.mark.production_server() def test_get_study_error(self): self.use_production_server() @@ -54,7 +54,7 @@ def test_get_study_error(self): ): openml.study.get_study(99) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_suite(self): self.use_production_server() @@ -65,7 +65,7 @@ def test_get_suite(self): assert study.runs is None assert study.setups is None - @pytest.mark.production() + @pytest.mark.production_server() def test_get_suite_error(self): self.use_production_server() @@ -74,7 +74,7 @@ def test_get_suite_error(self): ): openml.study.get_suite(123) - 
@pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_benchmark_suite(self): fixture_alias = None fixture_name = "unit tested benchmark suite" @@ -143,16 +143,16 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): assert study_downloaded.main_entity_type == "run" assert study_downloaded.runs is None - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_empty_study_explicit(self): self._test_publish_empty_study_is_allowed(explicit=True) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_empty_study_implicit(self): self._test_publish_empty_study_is_allowed(explicit=False) @pytest.mark.flaky() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -222,7 +222,7 @@ def test_publish_study(self): res = openml.study.delete_study(study.id) assert res - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) assert len(run_list) == 10 diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index fed0c0a00..65dcebc1d 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -26,13 +26,13 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_class_labels(self): task = get_task(self.task_id) assert 
task.class_labels == ["tested_negative", "tested_positive"] -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_get_X_and_Y(): task = get_task(119) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index 2bbb015c6..29f5663c4 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -20,15 +20,15 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.CLUSTERING self.estimation_procedure = 17 - @pytest.mark.production() + @pytest.mark.production_server() def test_get_dataset(self): # no clustering tasks on test server self.use_production_server() task = openml.tasks.get_task(self.task_id) task.get_dataset() - @pytest.mark.production() - @pytest.mark.uses_test_server() + @pytest.mark.production_server() + @pytest.mark.test_server() def test_download_task(self): # no clustering tasks on test server self.use_production_server() @@ -37,7 +37,7 @@ def test_download_task(self): assert task.task_type_id == TaskType.CLUSTERING assert task.dataset_id == 36 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_upload_task(self): compatible_datasets = self._get_compatible_rand_dataset() for i in range(100): diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index fbcbfe9bf..465d9c0be 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -27,14 +27,14 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def 
test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index a834cdf0f..26d7dc94b 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -49,7 +49,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) @@ -58,7 +58,7 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_numeric_dtype(Y) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 3f7b06ee4..99df3cace 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -28,7 +28,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index b77782847..1d0df1210 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -32,11 +32,11 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_download_task(self): 
return get_task(self.task_id) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_upload_task(self): # We don't know if the task in question already exists, so we try a few times. Checking # beforehand would not be an option because a concurrent unit test could potentially diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index d44717177..df3c0a3b6 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -26,7 +26,7 @@ def setUp(self): def tearDown(self): super().tearDown() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_cached_tasks(self): openml.config.set_root_cache_directory(self.static_cache_dir) tasks = openml.tasks.functions._get_cached_tasks() @@ -34,7 +34,7 @@ def test__get_cached_tasks(self): assert len(tasks) == 3 assert isinstance(next(iter(tasks.values())), OpenMLTask) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_cached_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.functions._get_cached_task(1) @@ -49,14 +49,14 @@ def test__get_cached_task_not_cached(self): 2, ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() assert isinstance(estimation_procedures, list) assert isinstance(estimation_procedures[0], dict) assert estimation_procedures[0]["task_type_id"] == TaskType.SUPERVISED_CLASSIFICATION - @pytest.mark.production() + @pytest.mark.production_server() @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_list_clustering_task(self): self.use_production_server() @@ -73,7 +73,7 @@ def _check_task(self, task): assert isinstance(task["status"], str) assert task["status"] in ["in_preparation", "active", "deactivated"] - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def 
test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE @@ -83,33 +83,35 @@ def test_list_tasks_by_type(self): assert ttid == task["ttid"] self._check_task(task) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_tasks_length(self): ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) > 100 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_tasks_empty(self): tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") assert tasks.empty - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_tasks_by_tag(self): - num_basic_tasks = 100 # number is flexible, check server if fails + # Server starts with 99 active tasks with the tag, and one 'in_preparation', + # so depending on the processing of the last dataset, there may be 99 or 100 matches. + num_basic_tasks = 99 tasks = openml.tasks.list_tasks(tag="OpenML100") assert len(tasks) >= num_basic_tasks for task in tasks.to_dict(orient="index").values(): self._check_task(task) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_tasks(self): tasks = openml.tasks.list_tasks() assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): self._check_task(task) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_tasks_paginate(self): size = 10 max = 100 @@ -119,7 +121,7 @@ def test_list_tasks_paginate(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_list_tasks_per_type_paginate(self): size = 40 max = 100 @@ -136,7 +138,7 @@ def test_list_tasks_per_type_paginate(self): assert j == task["ttid"] self._check_task(task) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test__get_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) 
openml.tasks.get_task(1882) @@ -144,51 +146,51 @@ def test__get_task(self): @unittest.skip( "Please await outcome of discussion: https://github.com/openml/OpenML/issues/776", ) - @pytest.mark.production() + @pytest.mark.production_server() def test__get_task_live(self): self.use_production_server() # Test the following task as it used to throw an Unicode Error. # https://github.com/openml/openml-python/issues/378 openml.tasks.get_task(34536) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "task.xml") + os.path.join(openml.config.get_cache_directory(), "tasks", "1", "task.xml") ) assert not os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff") ) assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") + os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset_1.pq") ) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "task.xml") + os.path.join(openml.config.get_cache_directory(), "tasks", "2", "task.xml") ) assert task.class_labels == ["1", "2", "3", "4", "5", "U"] assert not os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff") ) # Since the download_data=False 
is propagated to get_dataset assert not os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "datasets", "2", "dataset.arff") + os.path.join(openml.config.get_cache_directory(), "datasets", "2", "dataset.arff") ) task.download_split() assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "2", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "2", "datasplits.arff") ) @mock.patch("openml.tasks.functions.get_dataset") - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_removal_upon_download_failure(self, get_dataset): class WeirdException(Exception): pass @@ -206,13 +208,13 @@ def assert_and_raise(*args, **kwargs): # Now the file should no longer exist assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) assert isinstance(task, OpenMLTask) - @pytest.mark.production() + @pytest.mark.production_server() def test_get_task_different_types(self): self.use_production_server() # Regression task @@ -222,13 +224,13 @@ def test_get_task_different_types(self): # Issue 538, get_task failing with clustering task. 
openml.tasks.functions.get_task(126033) - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() assert type(split) == OpenMLSplit assert os.path.exists( - os.path.join(self.workdir, "org", "openml", "test", "tasks", "1", "datasplits.arff") + os.path.join(openml.config.get_cache_directory(), "tasks", "1", "datasplits.arff") ) def test_deletion_of_cache_dir(self): @@ -244,7 +246,6 @@ def test_deletion_of_cache_dir(self): @mock.patch.object(requests.Session, "delete") def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_owned.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -257,14 +258,13 @@ def test_delete_task_not_owned(mock_delete, test_files_directory, test_api_key): ): openml.tasks.delete_task(1) - task_url = "https://test.openml.org/api/v1/xml/task/1" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/1" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_has_runs.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -277,14 +277,13 @@ def test_delete_task_with_run(mock_delete, test_files_directory, test_api_key): ): openml.tasks.delete_task(3496) - task_url = "https://test.openml.org/api/v1/xml/task/3496" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/3496" assert 
task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_success(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_successful.xml" mock_delete.return_value = create_request_response( status_code=200, @@ -294,14 +293,13 @@ def test_delete_success(mock_delete, test_files_directory, test_api_key): success = openml.tasks.delete_task(361323) assert success - task_url = "https://test.openml.org/api/v1/xml/task/361323" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/361323" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") @mock.patch.object(requests.Session, "delete") def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key): - openml.config.start_using_configuration_for_example() content_file = test_files_directory / "mock_responses" / "tasks" / "task_delete_not_exist.xml" mock_delete.return_value = create_request_response( status_code=412, @@ -314,6 +312,6 @@ def test_delete_unknown_task(mock_delete, test_files_directory, test_api_key): ): openml.tasks.delete_task(9_999_999) - task_url = "https://test.openml.org/api/v1/xml/task/9999999" + task_url = f"{openml.config.TEST_SERVER_URL}/api/v1/xml/task/9999999" assert task_url == mock_delete.call_args.args[0] assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 6b8804b9f..9316d0876 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -16,7 +16,7 @@ def setUp(self): def tearDown(self): 
super().tearDown() - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # tags can be at most 64 alphanumeric (+ underscore) chars @@ -32,7 +32,7 @@ def test_tagging(self): tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 - @pytest.mark.uses_test_server() + @pytest.mark.test_server() def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1882) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 8dbdd30b5..38e004bfb 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -48,18 +48,18 @@ def _mocked_perform_api_call(call, request_method): return openml._api_calls._download_text_file(url) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all(): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_for_tasks(min_number_tasks_on_test_server): tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) assert min_number_tasks_on_test_server == len(tasks) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): # By setting the batch size one lower than the minimum we guarantee at least two # batches and at the same time do as few batches (roundtrips) as possible. 
@@ -72,7 +72,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( size=min_number_datasets_on_test_server, @@ -83,14 +83,14 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): _check_dataset(dataset) -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_for_flows(min_number_flows_on_test_server): flows = openml.flows.list_flows(size=min_number_flows_on_test_server) assert min_number_flows_on_test_server == len(flows) @pytest.mark.flaky() # Other tests might need to upload runs first -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_for_setups(min_number_setups_on_test_server): # TODO apparently list_setups function does not support kwargs setups = openml.setups.list_setups(size=min_number_setups_on_test_server) @@ -98,14 +98,14 @@ def test_list_all_for_setups(min_number_setups_on_test_server): @pytest.mark.flaky() # Other tests might need to upload runs first -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_for_runs(min_number_runs_on_test_server): runs = openml.runs.list_runs(size=min_number_runs_on_test_server) assert min_number_runs_on_test_server == len(runs) @pytest.mark.flaky() # Other tests might need to upload runs first -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_for_evaluations(min_number_evaluations_on_test_server): # TODO apparently list_evaluations function does not support kwargs evaluations = openml.evaluations.list_evaluations( @@ -116,7 +116,7 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server): @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) 
-@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_list_all_few_results_available(_perform_api_call): datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) assert len(datasets) == 1, "only one iris dataset version 1 should be present" @@ -141,7 +141,7 @@ def test__create_cache_directory(config_mock, tmp_path): openml.utils._create_cache_directory("ghi") -@pytest.mark.uses_test_server() +@pytest.mark.test_server() def test_correct_test_server_download_state(): """This test verifies that the test server downloads the data from the correct source.