diff --git a/doc/conf.py b/doc/conf.py index e5de2d551..f0f26318c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -49,7 +49,7 @@ autosummary_generate = True numpydoc_show_class_members = False -autodoc_default_flags = ["members", "inherited-members"] +autodoc_default_options = {"members": True, "inherited-members": True} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/doc/index.rst b/doc/index.rst index e38e4d877..b78b7c009 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -38,7 +38,7 @@ Example # Publish the experiment on OpenML (optional, requires an API key. # You can get your own API key by signing up to OpenML.org) run.publish() - print(f'View the run online: {openml.config.server}/run/{run.run_id}') + print(f'View the run online: {run.openml_url}') You can find more examples in our `examples gallery `_. diff --git a/doc/progress.rst b/doc/progress.rst index 1ca1e1d0e..f27dd1137 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,7 +6,20 @@ Changelog ========= -0.11.1 +0.12.1 +~~~~~~ + +* ADD #895/#1038: Measure runtimes of scikit-learn runs also for models which are parallelized + via the joblib. +* DOC #1050: Refer to the webpage instead of the XML file in the main example. +* DOC #1051: Document existing extensions to OpenML-Python besides the shipped scikit-learn + extension. +* FIX #1035: Render class attributes and methods again. +* FIX #1042: Fixes a rare concurrency issue with OpenML-Python and joblib which caused the joblib + worker pool to fail. +* FIX #1053: Fixes a bug which could prevent importing the package in a docker container. + +0.12.0 ~~~~~~ * ADD #964: Validate ``ignore_attribute``, ``default_target_attribute``, ``row_id_attribute`` are set to attributes that exist on the dataset when calling ``create_dataset``. * ADD #979: Dataset features and qualities are now also cached in pickle format. diff --git a/doc/usage.rst b/doc/usage.rst index 1d54baa62..23ef4ec84 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -145,10 +145,22 @@ obtained on. Learn how to share your datasets in the following tutorial: * `Upload a dataset `_ -~~~~~~~~~~~~~~~~~~~~~~~ +*********************** Extending OpenML-Python -~~~~~~~~~~~~~~~~~~~~~~~ +*********************** OpenML-Python provides an extension interface to connect other machine learning libraries than scikit-learn to OpenML. Please check the :ref:`api_extensions` and use the scikit-learn extension in :class:`openml.extensions.sklearn.SklearnExtension` as a starting point. + +Runtime measurement is incorporated in the OpenML sklearn-extension. Example usage and potential +usage for Hyperparameter Optimisation can be found in the example tutorial: +`HPO using OpenML `_ + + +Here is a list of currently maintained OpenML extensions: + +* `openml-keras `_ +* `openml-pytorch `_ +* `openml-tensorflow(for tensorflow 2+) `_ + diff --git a/examples/30_extended/fetch_runtimes_tutorial.py b/examples/30_extended/fetch_runtimes_tutorial.py new file mode 100644 index 000000000..3d5183613 --- /dev/null +++ b/examples/30_extended/fetch_runtimes_tutorial.py @@ -0,0 +1,479 @@ +""" + +========================================== +Measuring runtimes for Scikit-learn models +========================================== + +The runtime of machine learning models on specific datasets can be a deciding +factor on the choice of algorithms, especially for benchmarking and comparison +purposes. 
OpenML's scikit-learn extension provides runtime data from runs of +model fit and prediction on tasks or datasets, for both the CPU-clock as well +as the actual wallclock-time incurred. The objective of this example is to +illustrate how to retrieve such timing measures, and also offer some potential +means of usage and interpretation of the same. + +It should be noted that there are multiple levels at which parallelism can occur. + +* At the outermost level, OpenML tasks contain fixed data splits, on which the + defined model/flow is executed. Thus, a model can be fit on each OpenML dataset fold + in parallel using the `n_jobs` parameter to `run_model_on_task` or `run_flow_on_task` + (illustrated under Case 2 & 3 below). + +* The model/flow specified can also include scikit-learn models that perform their own + parallelization. For instance, by specifying `n_jobs` in a Random Forest model definition + (covered under Case 2 below). + +* The sklearn model can further be an HPO estimator and contain it's own parallelization. + If the base estimator used also supports `parallelization`, then there's at least a 2-level nested + definition for parallelization possible (covered under Case 3 below). + +We shall cover these 5 representative scenarios for: + +* (Case 1) Retrieving runtimes for Random Forest training and prediction on each of the + cross-validation folds + +* (Case 2) Testing the above setting in a parallel setup and monitor the difference using + runtimes retrieved + +* (Case 3) Comparing RandomSearchCV and GridSearchCV on the above task based on runtimes + +* (Case 4) Running models that don't run in parallel or models which scikit-learn doesn't + parallelize + +* (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL) +""" + +############################################################################ + +# License: BSD 3-Clause + +import openml +import numpy as np +from matplotlib import pyplot as plt +from joblib.parallel import parallel_backend + +from sklearn.naive_bayes import GaussianNB +from sklearn.tree import DecisionTreeClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV + + +############################################################################ +# Preparing tasks and scikit-learn models +# *************************************** + +task_id = 167119 + +task = openml.tasks.get_task(task_id) +print(task) + +# Viewing associated data +n_repeats, n_folds, n_samples = task.get_split_dimensions() +print( + "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( + task_id, n_repeats, n_folds, n_samples, + ) +) + +# Creating utility function +def print_compare_runtimes(measures): + for repeat, val1 in measures["usercpu_time_millis_training"].items(): + for fold, val2 in val1.items(): + print( + "Repeat #{}-Fold #{}: CPU-{:.3f} vs Wall-{:.3f}".format( + repeat, fold, val2, measures["wall_clock_time_millis_training"][repeat][fold] + ) + ) + + +############################################################################ +# Case 1: Running a Random Forest model on an OpenML task +# ******************************************************* +# We'll run a Random Forest model and obtain an OpenML run object. We can +# see the evaluations recorded per fold for the dataset and the information +# available for this run. 
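+#
+# A small optional sketch first (the ``get_extension_by_model`` helper in
+# ``openml.extensions`` is assumed here; it is not used elsewhere in this
+# example): looking up the extension that OpenML-Python selects for a
+# scikit-learn estimator shows which component is responsible for recording
+# the runtime measures discussed below.
+from openml.extensions import get_extension_by_model
+
+print(get_extension_by_model(RandomForestClassifier(n_estimators=10)))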
+ +clf = RandomForestClassifier(n_estimators=10) + +run1 = openml.runs.run_model_on_task( + model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False, +) +measures = run1.fold_evaluations + +print("The timing and performance metrics available: ") +for key in measures.keys(): + print(key) +print() + +print( + "The performance metric is recorded under `predictive_accuracy` per " + "fold and can be retrieved as: " +) +for repeat, val1 in measures["predictive_accuracy"].items(): + for fold, val2 in val1.items(): + print("Repeat #{}-Fold #{}: {:.4f}".format(repeat, fold, val2)) + print() + +################################################################################ +# The remaining entries recorded in `measures` are the runtime records +# related as: +# +# usercpu_time_millis = usercpu_time_millis_training + usercpu_time_millis_testing +# +# wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing +# +# The timing measures recorded as `*_millis_training` contain the per +# repeat-per fold timing incurred for the execution of the `.fit()` procedure +# of the model. For `usercpu_time_*` the time recorded using `time.process_time()` +# is converted to `milliseconds` and stored. Similarly, `time.time()` is used +# to record the time entry for `wall_clock_time_*`. The `*_millis_testing` entry +# follows the same procedure but for time taken for the `.predict()` procedure. + +# Comparing the CPU and wall-clock training times of the Random Forest model +print_compare_runtimes(measures) + +###################################################################### +# Case 2: Running Scikit-learn model on an OpenML task in parallel +# **************************************************************** +# Redefining the model to allow parallelism with `n_jobs=2` (2 cores) + +clf = RandomForestClassifier(n_estimators=10, n_jobs=2) + +run2 = openml.runs.run_model_on_task( + model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False +) +measures = run2.fold_evaluations +# The wall-clock time recorded per fold should be lesser than Case 1 above +print_compare_runtimes(measures) + +#################################################################################### +# Running a Random Forest model on an OpenML task in parallel (all cores available): + +# Redefining the model to use all available cores with `n_jobs=-1` +clf = RandomForestClassifier(n_estimators=10, n_jobs=-1) + +run3 = openml.runs.run_model_on_task( + model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False +) +measures = run3.fold_evaluations +# The wall-clock time recorded per fold should be lesser than the case above, +# if more than 2 CPU cores are available. The speed-up is more pronounced for +# larger datasets. +print_compare_runtimes(measures) + +#################################################################################### +# We can now observe that the ratio of CPU time to wallclock time is lower +# than in case 1. This happens because joblib by default spawns subprocesses +# for the workloads for which CPU time cannot be tracked. Therefore, interpreting +# the reported CPU and wallclock time requires knowledge of the parallelization +# applied at runtime. + +#################################################################################### +# Running the same task with a different parallel backend. Joblib provides multiple +# backends: {`loky` (default), `multiprocessing`, `dask`, `threading`, `sequential`}. 
+# The backend can be explicitly set using a joblib context manager. The behaviour of +# the job distribution can change and therefore the scale of runtimes recorded too. + +with parallel_backend(backend="multiprocessing", n_jobs=-1): + run3_ = openml.runs.run_model_on_task( + model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False + ) +measures = run3_.fold_evaluations +print_compare_runtimes(measures) + +#################################################################################### +# The CPU time interpretation becomes ambiguous when jobs are distributed over an +# unknown number of cores or when subprocesses are spawned for which the CPU time +# cannot be tracked, as in the examples above. It is impossible for OpenML-Python +# to capture the availability of the number of cores/threads, their eventual +# utilisation and whether workloads are executed in subprocesses, for various +# cases that can arise as demonstrated in the rest of the example. Therefore, +# the final interpretation of the runtimes is left to the `user`. + +##################################################################### +# Case 3: Running and benchmarking HPO algorithms with their runtimes +# ******************************************************************* +# We shall now optimize a similar RandomForest model for the same task using +# scikit-learn's HPO support by using GridSearchCV to optimize our earlier +# RandomForest model's hyperparameter `n_estimators`. Scikit-learn also provides a +# `refit_time_` for such HPO models, i.e., the time incurred by training +# and evaluating the model on the best found parameter setting. This is +# included in the `wall_clock_time_millis_training` measure recorded. + +from sklearn.model_selection import GridSearchCV + + +clf = RandomForestClassifier(n_estimators=10, n_jobs=2) + +# GridSearchCV model +n_iter = 5 +grid_pipe = GridSearchCV( + estimator=clf, + param_grid={"n_estimators": np.linspace(start=1, stop=50, num=n_iter).astype(int).tolist()}, + cv=2, + n_jobs=2, +) + +run4 = openml.runs.run_model_on_task( + model=grid_pipe, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2 +) +measures = run4.fold_evaluations +print_compare_runtimes(measures) + +################################################################################## +# Like any optimisation problem, scikit-learn's HPO estimators also generate +# a sequence of configurations which are evaluated, using which the best found +# configuration is tracked throughout the trace. +# The OpenML run object stores these traces as OpenMLRunTrace objects accessible +# using keys of the pattern (repeat, fold, iterations). Here `fold` implies the +# outer-cross validation fold as obtained from the task data splits in OpenML. +# GridSearchCV here performs grid search over the inner-cross validation folds as +# parameterized by the `cv` parameter. Since `GridSearchCV` in this example performs a +# `2-fold` cross validation, the runtime recorded per repeat-per fold in the run object +# is for the entire `fit()` procedure of GridSearchCV thus subsuming the runtimes of +# the 2-fold (inner) CV search performed. 
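+#
+# As a short illustrative sketch, relying only on objects already created in
+# this example: listing a few of the (repeat, fold, iteration) keys stored in
+# the run trace shows which Grid Search evaluations can be queried.
+all_trace_keys = sorted(run4.trace.trace_iterations.keys())
+print("Number of trace entries stored: {}".format(len(all_trace_keys)))
+print("First few trace keys: {}".format(all_trace_keys[:n_iter]))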
+ +# We earlier extracted the number of repeats and folds for this task: +print("# repeats: {}\n# folds: {}".format(n_repeats, n_folds)) + +# To extract the training runtime of the first repeat, first fold: +print(run4.fold_evaluations["wall_clock_time_millis_training"][0][0]) + +################################################################################## +# To extract the training runtime of the 1-st repeat, 4-th (outer) fold and also +# to fetch the parameters and performance of the evaluations made during +# the 1-st repeat, 4-th fold evaluation by the Grid Search model. + +_repeat = 0 +_fold = 3 +print( + "Total runtime for repeat {}'s fold {}: {:4f} ms".format( + _repeat, _fold, run4.fold_evaluations["wall_clock_time_millis_training"][_repeat][_fold] + ) +) +for i in range(n_iter): + key = (_repeat, _fold, i) + r = run4.trace.trace_iterations[key] + print( + "n_estimators: {:>2} - score: {:.3f}".format( + r.parameters["parameter_n_estimators"], r.evaluation + ) + ) + +################################################################################## +# Scikit-learn's HPO estimators also come with an argument `refit=True` as a default. +# In our previous model definition it was set to True by default, which meant that the best +# found hyperparameter configuration was used to refit or retrain the model without any inner +# cross validation. This extra refit time measure is provided by the scikit-learn model as the +# attribute `refit_time_`. +# This time is included in the `wall_clock_time_millis_training` measure. +# +# For non-HPO estimators, `wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing`. +# +# For HPO estimators, `wall_clock_time_millis = wall_clock_time_millis_training + wall_clock_time_millis_testing + refit_time`. +# +# This refit time can therefore be explicitly extracted in this manner: + + +def extract_refit_time(run, repeat, fold): + refit_time = ( + run.fold_evaluations["wall_clock_time_millis"][repeat][fold] + - run.fold_evaluations["wall_clock_time_millis_training"][repeat][fold] + - run.fold_evaluations["wall_clock_time_millis_testing"][repeat][fold] + ) + return refit_time + + +for repeat in range(n_repeats): + for fold in range(n_folds): + print( + "Repeat #{}-Fold #{}: {:.4f}".format( + repeat, fold, extract_refit_time(run4, repeat, fold) + ) + ) + +############################################################################ +# Along with the GridSearchCV already used above, we demonstrate how such +# optimisation traces can be retrieved by showing an application of these +# traces - comparing the speed of finding the best configuration using +# RandomizedSearchCV and GridSearchCV available with scikit-learn. + +# RandomizedSearchCV model +rs_pipe = RandomizedSearchCV( + estimator=clf, + param_distributions={ + "n_estimators": np.linspace(start=1, stop=50, num=15).astype(int).tolist() + }, + cv=2, + n_iter=n_iter, + n_jobs=2, +) +run5 = openml.runs.run_model_on_task( + model=rs_pipe, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2 +) + +################################################################################ +# Since for the call to ``openml.runs.run_model_on_task`` the parameter +# ``n_jobs`` is set to its default ``None``, the evaluations across the OpenML folds +# are not parallelized. Hence, the time recorded is agnostic to the ``n_jobs`` +# being set at both the HPO estimator ``GridSearchCV`` as well as the base +# estimator ``RandomForestClassifier`` in this case. 
The OpenML extension only records the +# time taken for the completion of the complete ``fit()`` call, per-repeat per-fold. +# +# This notion can be used to extract and plot the best found performance per +# fold by the HPO model and the corresponding time taken for search across +# that fold. Moreover, since ``n_jobs=None`` for ``openml.runs.run_model_on_task`` +# the runtimes per fold can be cumulatively added to plot the trace against time. + + +def extract_trace_data(run, n_repeats, n_folds, n_iter, key=None): + key = "wall_clock_time_millis_training" if key is None else key + data = {"score": [], "runtime": []} + for i_r in range(n_repeats): + for i_f in range(n_folds): + data["runtime"].append(run.fold_evaluations[key][i_r][i_f]) + for i_i in range(n_iter): + r = run.trace.trace_iterations[(i_r, i_f, i_i)] + if r.selected: + data["score"].append(r.evaluation) + break + return data + + +def get_incumbent_trace(trace): + best_score = 1 + inc_trace = [] + for i, r in enumerate(trace): + if i == 0 or (1 - r) < best_score: + best_score = 1 - r + inc_trace.append(best_score) + return inc_trace + + +grid_data = extract_trace_data(run4, n_repeats, n_folds, n_iter) +rs_data = extract_trace_data(run5, n_repeats, n_folds, n_iter) + +plt.clf() +plt.plot( + np.cumsum(grid_data["runtime"]), get_incumbent_trace(grid_data["score"]), label="Grid Search" +) +plt.plot( + np.cumsum(rs_data["runtime"]), get_incumbent_trace(rs_data["score"]), label="Random Search" +) +plt.xscale("log") +plt.yscale("log") +plt.xlabel("Wallclock time (in milliseconds)") +plt.ylabel("1 - Accuracy") +plt.title("Optimisation Trace Comparison") +plt.legend() +plt.show() + +################################################################################ +# Case 4: Running models that scikit-learn doesn't parallelize +# ************************************************************* +# Both scikit-learn and OpenML depend on parallelism implemented through `joblib`. +# However, there can be cases where either models cannot be parallelized or don't +# depend on joblib for its parallelism. 2 such cases are illustrated below. +# +# Running a Decision Tree model that doesn't support parallelism implicitly, but +# using OpenML to parallelize evaluations for the outer-cross validation folds. + +dt = DecisionTreeClassifier() + +run6 = openml.runs.run_model_on_task( + model=dt, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2 +) +measures = run6.fold_evaluations +print_compare_runtimes(measures) + +################################################################################ +# Although the decision tree does not run in parallel, it can release the +# `Python GIL `_. +# This can result in surprising runtime measures as demonstrated below: + +with parallel_backend("threading", n_jobs=-1): + run7 = openml.runs.run_model_on_task( + model=dt, task=task, upload_flow=False, avoid_duplicate_runs=False + ) +measures = run7.fold_evaluations +print_compare_runtimes(measures) + +################################################################################ +# Running a Neural Network from scikit-learn that uses scikit-learn independent +# parallelism using libraries such as `MKL, OpenBLAS or BLIS +# `_. 
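+#
+# (An optional aside, assuming the third-party ``threadpoolctl`` package is
+# installed; it ships with recent scikit-learn versions. The BLAS/OpenMP thread
+# pools providing this implicit parallelism can be inspected, which helps when
+# interpreting the CPU and wallclock times reported below.)
+try:
+    from threadpoolctl import threadpool_info
+
+    for pool in threadpool_info():
+        print(pool.get("internal_api"), "-", pool.get("num_threads"), "threads")
+except ImportError:
+    print("threadpoolctl is not installed; skipping the thread-pool inspection")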
+ +mlp = MLPClassifier(max_iter=10) + +run8 = openml.runs.run_model_on_task( + model=mlp, task=task, upload_flow=False, avoid_duplicate_runs=False +) +measures = run8.fold_evaluations +print_compare_runtimes(measures) + +################################################################################ +# Case 5: Running Scikit-learn models that don't release GIL +# ********************************************************** +# Certain Scikit-learn models do not release the `Python GIL +# `_ and +# are also not executed in parallel via a BLAS library. In such cases, the +# CPU times and wallclock times are most likely trustworthy. Note however +# that only very few models such as naive Bayes models are of this kind. + +clf = GaussianNB() + +with parallel_backend("multiprocessing", n_jobs=-1): + run9 = openml.runs.run_model_on_task( + model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False + ) +measures = run9.fold_evaluations +print_compare_runtimes(measures) + +################################################################################ +# Summmary +# ********* +# The scikit-learn extension for OpenML-Python records model runtimes for the +# CPU-clock and the wall-clock times. The above examples illustrated how these +# recorded runtimes can be extracted when using a scikit-learn model and under +# parallel setups too. To summarize, the scikit-learn extension measures the: +# +# * `CPU-time` & `wallclock-time` for the whole run +# +# * A run here corresponds to a call to `run_model_on_task` or `run_flow_on_task` +# * The recorded time is for the model fit for each of the outer-cross validations folds, +# i.e., the OpenML data splits +# +# * Python's `time` module is used to compute the runtimes +# +# * `CPU-time` is recorded using the responses of `time.process_time()` +# * `wallclock-time` is recorded using the responses of `time.time()` +# +# * The timings recorded by OpenML per outer-cross validation fold is agnostic to +# model parallelisation +# +# * The wallclock times reported in Case 2 above highlights the speed-up on using `n_jobs=-1` +# in comparison to `n_jobs=2`, since the timing recorded by OpenML is for the entire +# `fit()` procedure, whereas the parallelisation is performed inside `fit()` by scikit-learn +# * The CPU-time for models that are run in parallel can be difficult to interpret +# +# * `CPU-time` & `wallclock-time` for each search per outer fold in an HPO run +# +# * Reports the total time for performing search on each of the OpenML data split, subsuming +# any sort of parallelism that happened as part of the HPO estimator or the underlying +# base estimator +# * Also allows extraction of the `refit_time` that scikit-learn measures using `time.time()` +# for retraining the model per outer fold, for the best found configuration +# +# * `CPU-time` & `wallclock-time` for models that scikit-learn doesn't parallelize +# +# * Models like Decision Trees or naive Bayes don't parallelize and thus both the wallclock and +# CPU times are similar in runtime for the OpenML call +# * However, models implemented in Cython, such as the Decision Trees can release the GIL and +# still run in parallel if a `threading` backend is used by joblib. +# * Scikit-learn Neural Networks can undergo parallelization implicitly owing to thread-level +# parallelism involved in the linear algebraic operations and thus the wallclock-time and +# CPU-time can differ. 
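+#
+# As a final, optional illustration (using only names already introduced in
+# this example), the ratio of CPU-time to wallclock-time per fold is a quick
+# indicator of which of the cases above applies to a given run::
+#
+#     ratio = (
+#         measures["usercpu_time_millis_training"][repeat][fold]
+#         / measures["wall_clock_time_millis_training"][repeat][fold]
+#     )
+#
+# A ratio close to 1 points at a sequential, GIL-holding model (Case 5), a
+# ratio well below 1 typically indicates work done in subprocesses whose CPU
+# time is not tracked (Case 2), and a ratio well above 1 typically indicates
+# in-process multi-threading such as the BLAS-backed models of Case 4.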
+# +# Because of all the cases mentioned above it is crucial to understand which case is triggered +# when reporting runtimes for scikit-learn models measured with OpenML-Python! diff --git a/openml/__version__.py b/openml/__version__.py index ff4effa59..700e61f6a 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -3,4 +3,4 @@ # License: BSD 3-Clause # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.12.0" +__version__ = "0.12.1" diff --git a/openml/config.py b/openml/config.py index 9e2e697d5..4516e96e1 100644 --- a/openml/config.py +++ b/openml/config.py @@ -204,7 +204,7 @@ def _setup(config=None): # read config file, create directory for config file if not os.path.exists(config_dir): try: - os.mkdir(config_dir) + os.makedirs(config_dir, exist_ok=True) cache_exists = True except PermissionError: cache_exists = False @@ -235,7 +235,7 @@ def _get(config, key): # create the cache subdirectory if not os.path.exists(cache_directory): try: - os.mkdir(cache_directory) + os.makedirs(cache_directory, exist_ok=True) except PermissionError: openml_logger.warning( "No permission to create openml cache directory at %s! This can result in " diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 3441b4a4e..a0c551e83 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -1455,53 +1455,6 @@ def _prevent_optimize_n_jobs(self, model): "openml-python should not be used to " "optimize the n_jobs parameter." ) - def _can_measure_cputime(self, model: Any) -> bool: - """ - Returns True if the parameter settings of model are chosen s.t. the model - will run on a single core (if so, openml-python can measure cpu-times) - - Parameters: - ----------- - model: - The model that will be fitted - - Returns: - -------- - bool: - True if all n_jobs parameters will be either set to None or 1, False otherwise - """ - if not (isinstance(model, sklearn.base.BaseEstimator) or self._is_hpo_class(model)): - raise ValueError("model should be BaseEstimator or BaseSearchCV") - - # check the parameters for n_jobs - n_jobs_vals = SklearnExtension._get_parameter_values_recursive(model.get_params(), "n_jobs") - for val in n_jobs_vals: - if val is not None and val != 1 and val != "deprecated": - return False - return True - - def _can_measure_wallclocktime(self, model: Any) -> bool: - """ - Returns True if the parameter settings of model are chosen s.t. 
the model - will run on a preset number of cores (if so, openml-python can measure wall-clock time) - - Parameters: - ----------- - model: - The model that will be fitted - - Returns: - -------- - bool: - True if no n_jobs parameters is set to -1, False otherwise - """ - if not (isinstance(model, sklearn.base.BaseEstimator) or self._is_hpo_class(model)): - raise ValueError("model should be BaseEstimator or BaseSearchCV") - - # check the parameters for n_jobs - n_jobs_vals = SklearnExtension._get_parameter_values_recursive(model.get_params(), "n_jobs") - return -1 not in n_jobs_vals - ################################################################################################ # Methods for performing runs with extension modules @@ -1725,12 +1678,8 @@ def _prediction_to_probabilities( model_copy = sklearn.base.clone(model, safe=True) # sanity check: prohibit users from optimizing n_jobs self._prevent_optimize_n_jobs(model_copy) - # Runtime can be measured if the model is run sequentially - can_measure_cputime = self._can_measure_cputime(model_copy) - can_measure_wallclocktime = self._can_measure_wallclocktime(model_copy) - + # measures and stores runtimes user_defined_measures = OrderedDict() # type: 'OrderedDict[str, float]' - try: # for measuring runtime. Only available since Python 3.3 modelfit_start_cputime = time.process_time() @@ -1742,14 +1691,11 @@ def _prediction_to_probabilities( model_copy.fit(X_train) modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000 - if can_measure_cputime: - user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime - modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 - if hasattr(model_copy, "refit_time_"): - modelfit_dur_walltime += model_copy.refit_time_ - if can_measure_wallclocktime: - user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime + + user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime + refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 + user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime except AttributeError as e: # typically happens when training a regressor on classification task @@ -1792,20 +1738,16 @@ def _prediction_to_probabilities( else: raise ValueError(task) - if can_measure_cputime: - modelpredict_duration_cputime = ( - time.process_time() - modelpredict_start_cputime - ) * 1000 - user_defined_measures["usercpu_time_millis_testing"] = modelpredict_duration_cputime - user_defined_measures["usercpu_time_millis"] = ( - modelfit_dur_cputime + modelpredict_duration_cputime - ) - if can_measure_wallclocktime: - modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000 - user_defined_measures["wall_clock_time_millis_testing"] = modelpredict_duration_walltime - user_defined_measures["wall_clock_time_millis"] = ( - modelfit_dur_walltime + modelpredict_duration_walltime - ) + modelpredict_duration_cputime = (time.process_time() - modelpredict_start_cputime) * 1000 + user_defined_measures["usercpu_time_millis_testing"] = modelpredict_duration_cputime + user_defined_measures["usercpu_time_millis"] = ( + modelfit_dur_cputime + modelpredict_duration_cputime + ) + modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000 + user_defined_measures["wall_clock_time_millis_testing"] = modelpredict_duration_walltime + user_defined_measures["wall_clock_time_millis"] = ( + modelfit_dur_walltime + 
modelpredict_duration_walltime + refit_time + ) if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): diff --git a/setup.py b/setup.py index dc1a58863..2d2a638b5 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ "seaborn", ], "examples_unix": ["fanova"], - "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"], + "docs": ["sphinx>=3", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc",], }, test_suite="pytest", classifiers=[ diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index c1f88bcda..e45eeea53 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1280,19 +1280,13 @@ def test_paralizable_check(self): sklearn.model_selection.GridSearchCV(multicore_bagging, illegal_param_dist), ] - can_measure_cputime_answers = [True, False, False, True, False, False, True, False, False] - can_measure_walltime_answers = [True, True, False, True, True, False, True, True, False] if LooseVersion(sklearn.__version__) < "0.20": has_refit_time = [False, False, False, False, False, False, False, False, False] else: has_refit_time = [False, False, False, False, False, False, True, True, False] X, y = sklearn.datasets.load_iris(return_X_y=True) - for model, allowed_cputime, allowed_walltime, refit_time in zip( - legal_models, can_measure_cputime_answers, can_measure_walltime_answers, has_refit_time - ): - self.assertEqual(self.extension._can_measure_cputime(model), allowed_cputime) - self.assertEqual(self.extension._can_measure_wallclocktime(model), allowed_walltime) + for model, refit_time in zip(legal_models, has_refit_time): model.fit(X, y) self.assertEqual(refit_time, hasattr(model, "refit_time_")) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 5b15f781e..2e2c609db 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -26,6 +26,16 @@ def test_non_writable_home(self, log_handler_mock, warnings_mock, expanduser_moc self.assertEqual(log_handler_mock.call_count, 1) self.assertFalse(log_handler_mock.call_args_list[0][1]["create_file_handler"]) + @unittest.mock.patch("os.path.expanduser") + def test_XDG_directories_do_not_exist(self, expanduser_mock): + with tempfile.TemporaryDirectory(dir=self.workdir) as td: + + def side_effect(path_): + return os.path.join(td, str(path_).replace("~/", "")) + + expanduser_mock.side_effect = side_effect + openml.config._setup() + def test_get_config_as_dict(self): """ Checks if the current configuration is returned accurately as a dict. 
""" config = openml.config.get_config_as_dict() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 4534f26a4..c8f1729b7 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1635,13 +1635,13 @@ def test_joblib_backends(self, parallel_mock): line_length = 6 + len(task.class_labels) backend_choice = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing" - for n_jobs, backend, len_time_stats, call_count in [ - (1, backend_choice, 7, 10), - (2, backend_choice, 4, 10), - (-1, backend_choice, 1, 10), - (1, "threading", 7, 20), - (-1, "threading", 1, 30), - (1, "sequential", 7, 40), + for n_jobs, backend, call_count in [ + (1, backend_choice, 10), + (2, backend_choice, 10), + (-1, backend_choice, 10), + (1, "threading", 20), + (-1, "threading", 30), + (1, "sequential", 40), ]: clf = sklearn.model_selection.RandomizedSearchCV( estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), @@ -1674,8 +1674,6 @@ def test_joblib_backends(self, parallel_mock): self.assertEqual(len(res[0][0]), line_length) # usercpu_time_millis_* not recorded when n_jobs > 1 # *_time_millis_* not recorded when n_jobs = -1 - self.assertEqual(len(res[2]), len_time_stats) - self.assertEqual(len(res[3]), len_time_stats) self.assertEqual(len(res[2]["predictive_accuracy"][0]), 10) self.assertEqual(len(res[3]["predictive_accuracy"][0]), 10) self.assertEqual(parallel_mock.call_count, call_count)