diff --git a/.travis.yml b/.travis.yml index 80f3bda42..9fd33403c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,20 +15,21 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" - - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" - # Checks for older scikit-learn versions (which also don't nicely work with - # Python3.7) - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" COVERAGE="true" DOCPUSH="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.2" + # Checks for older scikit-learn versions (which also don't nicely work with + # Python3.7) + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 # Travis issue # https://github.com/travis-ci/travis-ci/issues/8920 diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index e129b7718..b15260fb4 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -66,7 +66,7 @@ ############################################################################ # Get the actual data. # -# The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy +# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy # sparse matrix, or as a Pandas DataFrame. The format is # controlled with the parameter ``dataset_format`` which can be either 'array' # (default) or 'dataframe'. 
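As an aside, a minimal sketch of the two ``dataset_format`` values described above (the dataset id below, 61 for iris, is only an assumed example):

import openml

dataset = openml.datasets.get_dataset(61)  # assumed example dataset id (iris)
# 'dataframe' returns a pandas DataFrame (and a pandas Series for the target)
X_df, y_df, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)
# 'array' returns a NumPy array; sparse datasets come back as a SciPy sparse matrix
X_arr, y_arr, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="array"
)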
Let's first build our dataset from a NumPy array diff --git a/examples/30_extended/flow_id_tutorial.py b/examples/30_extended/flow_id_tutorial.py index ef3689ea1..e77df8d1a 100644 --- a/examples/30_extended/flow_id_tutorial.py +++ b/examples/30_extended/flow_id_tutorial.py @@ -15,6 +15,11 @@ import openml + +# Activating test server +openml.config.start_using_configuration_for_example() + + clf = sklearn.tree.DecisionTreeClassifier() #################################################################################################### @@ -69,3 +74,6 @@ # This also works with the actual model (generalizing the first part of this example): flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) print(flow_ids) + +# Deactivating test server +openml.config.stop_using_configuration_for_example() diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index be438e728..a46bf9699 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -37,6 +37,11 @@ import sklearn.ensemble import sklearn.impute import sklearn.preprocessing +from sklearn.pipeline import make_pipeline, Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer +from sklearn.experimental import enable_hist_gradient_boosting openml.config.start_using_configuration_for_example() @@ -52,22 +57,39 @@ # we will create a fairly complex model, with many preprocessing components and # many potential hyperparameters. Of course, the model can be as complex and as # easy as you want it to be -model_original = sklearn.pipeline.make_pipeline( - sklearn.impute.SimpleImputer(), sklearn.ensemble.RandomForestClassifier() -) +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.decomposition import TruncatedSVD + + +# Helper functions to return required columns for ColumnTransformer +def cont(X): + return X.dtypes != "category" + + +def cat(X): + return X.dtypes == "category" + + +cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore", sparse=False), + TruncatedSVD(), +) +ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)]) +model_original = sklearn.pipeline.Pipeline( + steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),] +) # Let's change some hyperparameters. 
Of course, in any good application we # would tune them using, e.g., Random Search or Bayesian Optimization, but for # the purpose of this tutorial we set them to some specific values that might # or might not be optimal hyperparameters_original = { - "simpleimputer__strategy": "median", - "randomforestclassifier__criterion": "entropy", - "randomforestclassifier__max_features": 0.2, - "randomforestclassifier__min_samples_leaf": 1, - "randomforestclassifier__n_estimators": 16, - "randomforestclassifier__random_state": 42, + "estimator__loss": "auto", + "estimator__learning_rate": 0.15, + "estimator__max_iter": 50, + "estimator__min_samples_leaf": 1, } model_original.set_params(**hyperparameters_original) diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index b9202d7ce..c02a5c038 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -17,8 +17,11 @@ import numpy as np import sklearn.tree -import sklearn.pipeline -import sklearn.impute +from sklearn.pipeline import make_pipeline, Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.decomposition import TruncatedSVD +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer import openml @@ -68,7 +71,7 @@ ) print(evaluations.head()) ############################################################ # Uploading studies # ================= # @@ -78,12 +81,30 @@ openml.config.start_using_configuration_for_example() -# Very simple classifier which ignores the feature type +# Model that can handle missing values +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier + + +# Helper functions to return required columns for ColumnTransformer +def cont(X): + return X.dtypes != "category" + + +def cat(X): + return X.dtypes == "category" + + +cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore", sparse=False), + TruncatedSVD(), +) +ct = ColumnTransformer( + [("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)] +) clf = sklearn.pipeline.Pipeline( - steps=[ - ("imputer", sklearn.impute.SimpleImputer()), - ("estimator", sklearn.tree.DecisionTreeClassifier(max_depth=5)), - ] + steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),] ) suite = openml.study.get_suite(1) diff --git a/openml/__init__.py b/openml/__init__.py index 621703332..0bab3b1d5 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -113,6 +113,7 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None) "study", "utils", "_api_calls", + "__version__", ] # Load the scikit-learn extension by default diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 0f3037a74..550747eac 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -815,12 +815,12 @@ def edit_dataset( ) -> int: """ Edits an OpenMLDataset. - Specify atleast one field to edit, apart from data_id + Specify at least one field to edit, apart from data_id - For certain fields, a new dataset version is created : attributes, data, default_target_attribute, ignore_attribute, row_id_attribute. - - For other fields, the uploader can edit the exisiting version.
- Noone except the uploader can edit the exisitng version. + - For other fields, the uploader can edit the existing version. + No one except the uploader can edit the existing version. Parameters ---------- diff --git a/openml/exceptions.py b/openml/exceptions.py index 07eb64e6c..781784ee2 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -27,7 +27,7 @@ def __init__(self, message: str, code: int = None, url: str = None): self.url = url super().__init__(message) - def __repr__(self): + def __str__(self): return "%s returned code %s: %s" % (self.url, self.code, self.message,) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 2b94d2cfd..edb14487b 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -11,7 +11,7 @@ from re import IGNORECASE import sys import time -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast import warnings import numpy as np @@ -1546,7 +1546,7 @@ def _run_model_on_fold( fold_no: int, y_train: Optional[np.ndarray] = None, X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None, - ) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]: + ) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]: """Run a model on a repeat,fold,subsample triplet of the task and return prediction information. @@ -1579,24 +1579,21 @@ def _run_model_on_fold( Returns ------- - arff_datacontent : List[List] - Arff representation (list of lists) of the predictions that were - generated by this fold (required to populate predictions.arff) - arff_tracecontent : List[List] - Arff representation (list of lists) of the trace data that was generated by this - fold - (will be used to populate trace.arff, leave it empty if the model did not perform - any - hyperparameter optimization). + pred_y : np.ndarray + Predictions on the training/test set, depending on the task type. + For supervised tasks, predictions are on the test set. + For unsupervised tasks, predictions are on the training set. + proba_y : pd.DataFrame + Predicted probabilities for the test set. + None if the task is not a classification or learning curve task. user_defined_measures : OrderedDict[str, float] User defined measures that were generated on this fold - model : Any - The model trained on this repeat,fold,subsample triple. Will be used to generate - trace - information later on (in ``obtain_arff_trace``). + trace : Optional[OpenMLRunTrace] + ARFF trace object from the fitted model, or None if the model did not + perform any hyperparameter optimization. """ - def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarray: + def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame: """Transforms predicted probabilities to match with OpenML class indices.
Parameters @@ -1609,16 +1606,31 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra Returns ------- - np.ndarray + pd.DataFrame """ + + if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + if task.class_labels is not None: + if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str): + # mapping (decoding) the predictions to the categories + # creating a separate copy to not change the expected pred_y type + y = [task.class_labels[pred] for pred in y] + else: + raise ValueError("The task has no class labels") + else: + return None + # y: list or numpy array of predictions # model_classes: sklearn classifier mapping from original array id to # prediction index id - if not isinstance(classes, list): - raise ValueError("please convert model classes to list prior to " "calling this fn") - result = np.zeros((len(y), len(classes)), dtype=np.float32) - for obs, prediction_idx in enumerate(y): - result[obs][prediction_idx] = 1.0 + if not isinstance(model_classes, list): + raise ValueError("please convert model classes to list prior to calling this fn") + # DataFrame allows more accurate mapping of classes as column names + result = pd.DataFrame( + 0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32 + ) + for obs, prediction in enumerate(y): + result.loc[obs, prediction] = 1.0 return result if isinstance(task, OpenMLSupervisedTask): @@ -1677,6 +1689,16 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra else: model_classes = used_estimator.classes_ + if not isinstance(model_classes, list): + model_classes = model_classes.tolist() + + # to handle the case when dataset is numpy and categories are encoded + # however the class labels stored in task are still categories + if isinstance(y_train, np.ndarray) and isinstance( + cast(List, task.class_labels)[0], str + ): + model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes] + modelpredict_start_cputime = time.process_time() modelpredict_start_walltime = time.time() @@ -1708,9 +1730,10 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra try: proba_y = model_copy.predict_proba(X_test) - except AttributeError: + proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy + except AttributeError: # predict_proba is not available when probability=False if task.class_labels is not None: - proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels)) + proba_y = _prediction_to_probabilities(pred_y, model_classes) else: raise ValueError("The task has no class labels") @@ -1726,20 +1749,24 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra # then we need to add a column full of zeros into the probabilities # for class 3 because the rest of the library expects that the # probabilities are ordered the same way as the classes are ordered). 
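# Editorial aside (not part of the patch): a minimal, self-contained sketch of the
# zero-padding described in the comment above, using assumed class labels.
import pandas as pd

task_class_labels = ["0", "1", "2", "3", "4", "5"]  # labels declared by the task (assumed)
model_classes = ["0", "1", "2", "4", "5"]  # labels seen at training time (assumed, '3' missing)
proba_y = pd.DataFrame(
    [[0.5, 0.2, 0.1, 0.1, 0.1], [0.1, 0.6, 0.1, 0.1, 0.1]], columns=model_classes
)
# add a zero column for every class the model never saw, then order the columns
# exactly as the task declares its class labels
for col in task_class_labels:
    if col not in proba_y.columns:
        proba_y[col] = 0.0
proba_y = proba_y[task_class_labels]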
- proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels))) - for idx, model_class in enumerate(model_classes): - proba_y_new[:, model_class] = proba_y[:, idx] - proba_y = proba_y_new - - if proba_y.shape[1] != len(task.class_labels): message = "Estimator only predicted for {}/{} classes!".format( proba_y.shape[1], len(task.class_labels), ) warnings.warn(message) openml.config.logger.warn(message) + + for i, col in enumerate(task.class_labels): + # adding missing columns with 0 probability + if col not in model_classes: + proba_y[col] = 0 + proba_y = proba_y[task.class_labels] else: raise ValueError("The task has no class labels") + if not np.all(set(proba_y.columns) == set(task.class_labels)): + missing_cols = list(set(task.class_labels) - set(proba_y.columns)) + raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) + elif isinstance(task, OpenMLRegressionTask): proba_y = None diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 47939c867..5aaf70a9d 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -263,7 +263,13 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": for key in self.components: component_dict = OrderedDict() # type: 'OrderedDict[str, Dict]' component_dict["oml:identifier"] = key - component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"] + if self.components[key] in ["passthrough", "drop"]: + component_dict["oml:flow"] = { + "oml-python:serialized_object": "component_reference", + "value": {"key": self.components[key], "step_name": self.components[key]}, + } + else: + component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"] for key_ in component_dict: # We only need to check if the key is a string, because the diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 2b767eaa1..99007aa2a 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -52,6 +52,7 @@ def run_model_on_task( add_local_measures: bool = True, upload_flow: bool = False, return_flow: bool = False, + dataset_format: str = "dataframe", ) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]: """Run the model on the dataset defined by the task. @@ -79,6 +80,9 @@ def run_model_on_task( If False, do not upload the flow to OpenML. return_flow : bool (default=False) If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun. + dataset_format : str (default='dataframe') + If 'array', the dataset is passed to the model as a numpy array. + If 'dataframe', the dataset is passed to the model as a pandas dataframe. Returns ------- @@ -125,6 +129,7 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas seed=seed, add_local_measures=add_local_measures, upload_flow=upload_flow, + dataset_format=dataset_format, ) if return_flow: return run, flow @@ -139,6 +144,7 @@ def run_flow_on_task( seed: int = None, add_local_measures: bool = True, upload_flow: bool = False, + dataset_format: str = "dataframe", ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -171,6 +177,9 @@ def run_flow_on_task( upload_flow : bool (default=False) If True, upload the flow to OpenML if it does not exist yet. If False, do not upload the flow to OpenML. + dataset_format : str (default='dataframe') + If 'array', the dataset is passed to the model as a numpy array. + If 'dataframe', the dataset is passed to the model as a pandas dataframe. 
Returns ------- @@ -248,6 +257,7 @@ def run_flow_on_task( task=task, extension=flow.extension, add_local_measures=add_local_measures, + dataset_format=dataset_format, ) data_content, trace, fold_evaluations, sample_evaluations = res @@ -407,6 +417,7 @@ def _run_task_get_arffcontent( task: OpenMLTask, extension: "Extension", add_local_measures: bool, + dataset_format: str, ) -> Tuple[ List[List], Optional[OpenMLRunTrace], @@ -437,14 +448,23 @@ def _run_task_get_arffcontent( repeat=rep_no, fold=fold_no, sample=sample_no ) if isinstance(task, OpenMLSupervisedTask): - x, y = task.get_X_and_y(dataset_format="array") - train_x = x[train_indices] - train_y = y[train_indices] - test_x = x[test_indices] - test_y = y[test_indices] + x, y = task.get_X_and_y(dataset_format=dataset_format) + if dataset_format == "dataframe": + train_x = x.iloc[train_indices] + train_y = y.iloc[train_indices] + test_x = x.iloc[test_indices] + test_y = y.iloc[test_indices] + else: + train_x = x[train_indices] + train_y = y[train_indices] + test_x = x[test_indices] + test_y = y[test_indices] elif isinstance(task, OpenMLClusteringTask): - x = task.get_X(dataset_format="array") - train_x = x[train_indices] + x = task.get_X(dataset_format=dataset_format) + if dataset_format == "dataframe": + train_x = x.iloc[train_indices] + else: + train_x = x[train_indices] train_y = None test_x = None test_y = None @@ -480,17 +500,33 @@ def _calculate_local_measure(sklearn_fn, openml_name): if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): for i, tst_idx in enumerate(test_indices): - if task.class_labels is not None: + prediction = ( + task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i] + ) + if isinstance(test_y, pd.Series): + test_prediction = ( + task.class_labels[test_y.iloc[i]] + if isinstance(test_y.iloc[i], int) + else test_y.iloc[i] + ) + else: + test_prediction = ( + task.class_labels[test_y[i]] + if isinstance(test_y[i], int) + else test_y[i] + ) + pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i] + arff_line = format_prediction( task=task, repeat=rep_no, fold=fold_no, sample=sample_no, index=tst_idx, - prediction=task.class_labels[pred_y[i]], - truth=task.class_labels[test_y[i]], - proba=dict(zip(task.class_labels, proba_y[i])), + prediction=prediction, + truth=test_prediction, + proba=dict(zip(task.class_labels, pred_prob)), ) else: raise ValueError("The task has no class labels") @@ -504,14 +540,15 @@ def _calculate_local_measure(sklearn_fn, openml_name): elif isinstance(task, OpenMLRegressionTask): - for i in range(0, len(test_indices)): + for i, _ in enumerate(test_indices): + test_prediction = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i] arff_line = format_prediction( task=task, repeat=rep_no, fold=fold_no, index=test_indices[i], prediction=pred_y[i], - truth=test_y[i], + truth=test_prediction, ) arff_datacontent.append(arff_line) @@ -522,7 +559,8 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) elif isinstance(task, OpenMLClusteringTask): - for i in range(0, len(test_indices)): + + for i, _ in enumerate(test_indices): arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID arff_datacontent.append(arff_line) diff --git a/openml/testing.py b/openml/testing.py index 0b4c50972..da07b0ed7 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -258,4 +258,21 @@ def _check_fold_timing_evaluations( from sklearn.preprocessing import Imputer as SimpleImputer -__all__ = ["TestBase", "SimpleImputer"] +class 
CustomImputer(SimpleImputer): + """Duplicate class alias for sklearn's SimpleImputer + + Helps bypass the sklearn extension duplicate operation check + """ + + pass + + +def cont(X): + return X.dtypes != "category" + + +def cat(X): + return X.dtypes == "category" + + +__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 90f69df17..d34dc2ad3 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -13,6 +13,7 @@ from packaging import version import numpy as np +import pandas as pd import scipy.optimize import scipy.stats import sklearn.base @@ -39,7 +40,7 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -537,8 +538,7 @@ def test_serialize_column_transformer(self): fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" "numeric=sklearn.preprocessing.{}.StandardScaler," - "nominal=sklearn.preprocessing._encoders.OneHotEncoder," - "drop=drop)".format(scaler_name) + "nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)".format(scaler_name) ) fixture_short_name = "sklearn.ColumnTransformer" @@ -564,13 +564,7 @@ def test_serialize_column_transformer(self): "drop": ["drop"], } - serialization, new_model = self._serialization_test_helper( - model, - X=None, - y=None, - subcomponent_parameters=["transformers", "numeric", "nominal"], - dependencies_mock_call_count=(4, 8), - ) + serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture) self.assertEqual(serialization.custom_name, fixture_short_name) @@ -1566,12 +1560,15 @@ def setUp(self): # Test methods for performing runs with this extension module def test_run_model_on_task(self): - class MyPipe(sklearn.pipeline.Pipeline): - pass - task = openml.tasks.get_task(1) - pipe = MyPipe([("imp", SimpleImputer()), ("dummy", sklearn.dummy.DummyClassifier())]) - openml.runs.run_model_on_task(pipe, task) + # using most_frequent imputer since dataset has mixed types and to keep things simple + pipe = sklearn.pipeline.Pipeline( + [ + ("imp", SimpleImputer(strategy="most_frequent")), + ("dummy", sklearn.dummy.DummyClassifier()), + ] + ) + openml.runs.run_model_on_task(pipe, task, dataset_format="array") def test_seed_model(self): # randomized models that are initialized without seeds, can be seeded @@ -1627,7 +1624,7 @@ def test_seed_model_raises(self): with self.assertRaises(ValueError): self.extension.seed_model(model=clf, seed=42) - def test_run_model_on_fold_classification_1(self): + def test_run_model_on_fold_classification_1_array(self): task = openml.tasks.get_task(1) X, y = task.get_X_and_y() @@ -1656,14 +1653,87 @@ def test_run_model_on_fold_classification_1(self): # predictions self.assertIsInstance(y_hat, np.ndarray) self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, np.ndarray) + self.assertIsInstance(y_hat_proba, pd.DataFrame) self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), 
np.ones(y_test.shape)) # The class '4' (at index 3) is not present in the training data. We check that the # predicted probabilities for that class are zero! - np.testing.assert_array_almost_equal(y_hat_proba[:, 3], np.zeros(y_test.shape)) + np.testing.assert_array_almost_equal( + y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + ) for i in (0, 1, 2, 4, 5): - self.assertTrue(np.any(y_hat_proba[:, i] != np.zeros(y_test.shape))) + self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) + + # check user defined measures + fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + for measure in user_defined_measures: + fold_evaluations[measure][0][0] = user_defined_measures[measure] + + # trace. SGD does not produce any + self.assertIsNone(trace) + + self._check_fold_timing_evaluations( + fold_evaluations, + num_repeats=1, + num_folds=1, + task_type=task.task_type_id, + check_scores=False, + ) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="SimpleImputer, ColumnTransformer available only after 0.19 and " + "Pipeline till 0.20 doesn't support indexing and 'passthrough'", + ) + def test_run_model_on_fold_classification_1_dataframe(self): + from sklearn.compose import ColumnTransformer + + task = openml.tasks.get_task(1) + + # diff test_run_model_on_fold_classification_1_array() + X, y = task.get_X_and_y(dataset_format="dataframe") + train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] + + # Helper functions to return required columns for ColumnTransformer + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore", sparse=False), + ) + cont_imp = make_pipeline(CustomImputer(strategy="mean"), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + pipeline = sklearn.pipeline.Pipeline( + steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) + # TODO add some mocking here to actually test the innards of this function, too! + res = self.extension._run_model_on_fold( + model=pipeline, + task=task, + fold_no=0, + rep_no=0, + X_train=X_train, + y_train=y_train, + X_test=X_test, + ) + + y_hat, y_hat_proba, user_defined_measures, trace = res + + # predictions + self.assertIsInstance(y_hat, np.ndarray) + self.assertEqual(y_hat.shape, y_test.shape) + self.assertIsInstance(y_hat_proba, pd.DataFrame) + self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) + np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) + # The class '4' (at index 3) is not present in the training data. We check that the + # predicted probabilities for that class are zero! 
+ np.testing.assert_array_almost_equal( + y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + ) + for i in (0, 1, 2, 4, 5): + self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) # check user defined measures fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) @@ -1710,11 +1780,11 @@ def test_run_model_on_fold_classification_2(self): # predictions self.assertIsInstance(y_hat, np.ndarray) self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, np.ndarray) + self.assertIsInstance(y_hat_proba, pd.DataFrame) self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 2)) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) for i in (0, 1): - self.assertTrue(np.any(y_hat_proba[:, i] != np.zeros(y_test.shape))) + self.assertTrue(np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape))) # check user defined measures fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) @@ -1791,14 +1861,14 @@ def predict_proba(*args, **kwargs): np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0])) # Test that there are predictions other than ones and zeros self.assertLess( - np.sum(proba_1 == 0) + np.sum(proba_1 == 1), + np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1), X_test.shape[0] * len(task.class_labels), ) np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0])) # Test that there are only ones and zeros predicted self.assertEqual( - np.sum(proba_2 == 0) + np.sum(proba_2 == 1), + np.sum(proba_2.to_numpy() == 0) + np.sum(proba_2.to_numpy() == 1), X_test.shape[0] * len(task.class_labels), ) @@ -2099,3 +2169,49 @@ def test_sklearn_serialization_with_none_step(self): ) with self.assertRaisesRegex(ValueError, msg): self.extension.model_to_flow(clf) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_failed_serialization_of_custom_class(self): + """Test to check if any custom class inherited from sklearn expectedly fails serialization + """ + try: + from sklearn.impute import SimpleImputer + except ImportError: + # for lower versions + from sklearn.preprocessing import Imputer as SimpleImputer + + class CustomImputer(SimpleImputer): + pass + + def cont(X): + return X.dtypes != "category" + + def cat(X): + return X.dtypes == "category" + + import sklearn.metrics + import sklearn.tree + from sklearn.pipeline import Pipeline, make_pipeline + from sklearn.compose import ColumnTransformer + from sklearn.preprocessing import OneHotEncoder, StandardScaler + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + clf = Pipeline( + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) # build a sklearn classifier + + task = openml.tasks.get_task(253) # data with mixed types from test server + try: + _ = openml.runs.run_model_on_task(clf, task) + except AttributeError as e: + if e.args[0] == "module '__main__' has no attribute '__version__'": + raise AttributeError(e) + else: + raise Exception(e) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index dcc7b0b96..89f01c72e 100644 --- a/tests/test_runs/test_run_functions.py +++ 
b/tests/test_runs/test_run_functions.py @@ -21,7 +21,7 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskType @@ -31,13 +31,13 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.dummy import DummyClassifier -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.feature_selection import VarianceThreshold from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression from sklearn.ensemble import RandomForestClassifier, BaggingClassifier from sklearn.svm import SVC from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold -from sklearn.pipeline import Pipeline +from sklearn.pipeline import Pipeline, make_pipeline class TestRun(TestBase): @@ -348,9 +348,13 @@ def test_run_regression_on_classif_task(self): clf = LinearRegression() task = openml.tasks.get_task(task_id) - with self.assertRaises(AttributeError): + # internally dataframe is loaded and targets are categorical + # which LinearRegression() cannot handle + with self.assertRaisesRegex( + AttributeError, "'LinearRegression' object has no attribute 'classes_'" + ): openml.runs.run_model_on_task( - model=clf, task=task, avoid_duplicate_runs=False, + model=clf, task=task, avoid_duplicate_runs=False, dataset_format="array", ) def test_check_erronous_sklearn_flow_fails(self): @@ -553,18 +557,26 @@ def test_run_and_upload_column_transformer_pipeline(self): def get_ct_cf(nominal_indices, numeric_indices): inner = sklearn.compose.ColumnTransformer( transformers=[ - ("numeric", sklearn.preprocessing.StandardScaler(), nominal_indices), ( - "nominal", - sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), + "numeric", + make_pipeline( + SimpleImputer(strategy="mean"), sklearn.preprocessing.StandardScaler() + ), numeric_indices, ), + ( + "nominal", + make_pipeline( + CustomImputer(strategy="most_frequent"), + sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), + ), + nominal_indices, + ), ], remainder="passthrough", ) return sklearn.pipeline.Pipeline( steps=[ - ("imputer", sklearn.impute.SimpleImputer(strategy="constant", fill_value=-1)), ("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier()), ] @@ -590,25 +602,36 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel=sentinel, ) - def test_run_and_upload_decision_tree_pipeline(self): + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_run_and_upload_knn_pipeline(self): + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + from sklearn.compose import ColumnTransformer + from sklearn.neighbors import KNeighborsClassifier + + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) pipeline2 = Pipeline( steps=[ - ("Imputer", SimpleImputer(strategy="median")), + ("Imputer", ct), ("VarianceThreshold", VarianceThreshold()), ( "Estimator", RandomizedSearchCV( - DecisionTreeClassifier(), - { - "min_samples_split": [2 ** x for x in range(1, 8)], - "min_samples_leaf": [2 ** x for x in range(0, 7)], - }, + 
KNeighborsClassifier(), + {"n_neighbors": [x for x in range(2, 10)]}, cv=3, n_iter=10, ), ), ] ) + task_id = self.TEST_SERVER_TASK_MISSING_VALS[0] n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1] n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2] @@ -732,19 +755,31 @@ def test_learning_curve_task_2(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="Pipelines don't support indexing (used for the assert check)", + ) def test_initialize_cv_from_run(self): - randomsearch = RandomizedSearchCV( - RandomForestClassifier(n_estimators=5), - { - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], - "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], - }, - cv=StratifiedKFold(n_splits=2, shuffle=True), - n_iter=2, + randomsearch = Pipeline( + [ + ("enc", OneHotEncoder(handle_unknown="ignore")), + ( + "rs", + RandomizedSearchCV( + RandomForestClassifier(n_estimators=5), + { + "max_depth": [3, None], + "max_features": [1, 2, 3, 4], + "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], + "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"], + }, + cv=StratifiedKFold(n_splits=2, shuffle=True), + n_iter=2, + ), + ), + ] ) task = openml.tasks.get_task(11) @@ -759,8 +794,8 @@ def test_initialize_cv_from_run(self): modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) modelS = openml.setups.initialize_model(setup_id=run.setup_id) - self.assertEqual(modelS.cv.random_state, 62501) - self.assertEqual(modelR.cv.random_state, 62501) + self.assertEqual(modelS[-1].cv.random_state, 62501) + self.assertEqual(modelR[-1].cv.random_state, 62501) def _test_local_evaluations(self, run): @@ -793,12 +828,23 @@ def _test_local_evaluations(self, run): self.assertGreaterEqual(alt_scores[idx], 0) self.assertLessEqual(alt_scores[idx], 1) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_local_run_swapped_parameter_order_model(self): # construct sci-kit learn classifier clf = Pipeline( steps=[ - ("imputer", SimpleImputer(strategy="median")), + ( + "imputer", + make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), + ), + ), + # random forest doesn't take categoricals ("estimator", RandomForestClassifier()), ] ) @@ -813,13 +859,18 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier clf = Pipeline( steps=[ - ("imputer", SimpleImputer(strategy="median")), - ("estimator", RandomForestClassifier()), + ("imputer", SimpleImputer(strategy="most_frequent")), + ("encoder", OneHotEncoder(handle_unknown="ignore")), + ("estimator", RandomForestClassifier(n_estimators=10)), ] ) @@ -834,13 +885,18 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_local_run_metric_score(self): # construct sci-kit learn classifier clf = 
Pipeline( steps=[ - ("imputer", SimpleImputer(strategy="median")), - ("estimator", RandomForestClassifier()), + ("imputer", SimpleImputer(strategy="most_frequent")), + ("encoder", OneHotEncoder(handle_unknown="ignore")), + ("estimator", RandomForestClassifier(n_estimators=10)), ] ) @@ -863,15 +919,19 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline( steps=[ - ("Imputer", SimpleImputer(strategy="median")), + ("Imputer", SimpleImputer(strategy="most_frequent")), ("VarianceThreshold", VarianceThreshold(threshold=0.05)), ("Estimator", GaussianNB()), ] ) - task = openml.tasks.get_task(11) + task = openml.tasks.get_task(1198) run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=False,) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) @@ -887,7 +947,7 @@ def test_initialize_model_from_run(self): openml.flows.assert_flows_equal(flowR, flowL) openml.flows.assert_flows_equal(flowS, flowL) - self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"median"') + self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"') self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05") @pytest.mark.flaky() @@ -939,6 +999,10 @@ def test_get_run_trace(self): run_trace = openml.runs.get_run_trace(run_id) self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test__run_exists(self): # would be better to not sentinel these clfs, # so we do not have to perform the actual runs @@ -1080,6 +1144,10 @@ def test_run_with_illegal_flow_id_1_after_load(self): openml.exceptions.PyOpenMLError, expected_message_regex, loaded_run.publish ) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="OneHotEncoder cannot handle mixed type DataFrame as input", + ) def test__run_task_get_arffcontent(self): task = openml.tasks.get_task(7) num_instances = 3196 @@ -1088,9 +1156,16 @@ def test__run_task_get_arffcontent(self): flow = unittest.mock.Mock() flow.name = "dummy" - clf = SGDClassifier(loss="log", random_state=1) + clf = make_pipeline( + OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1) + ) res = openml.runs.functions._run_task_get_arffcontent( - flow=flow, extension=self.extension, model=clf, task=task, add_local_measures=True, + flow=flow, + extension=self.extension, + model=clf, + task=task, + add_local_measures=True, + dataset_format="dataframe", ) arff_datacontent, trace, fold_evaluations, _ = res # predictions @@ -1288,24 +1363,81 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves") self.assertGreaterEqual(len(runs), 1) - def test_run_on_dataset_with_missing_labels(self): + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the # actual data - flow = unittest.mock.Mock() flow.name = "dummy" task = openml.tasks.get_task(2) + from sklearn.compose import 
ColumnTransformer + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model = Pipeline( - steps=[ - ("Imputer", SimpleImputer(strategy="median")), - ("Estimator", DecisionTreeClassifier()), - ] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) # build a sklearn classifier + + data_content, _, _, _ = _run_task_get_arffcontent( + flow=flow, + model=model, + task=task, + extension=self.extension, + add_local_measures=True, + dataset_format="dataframe", ) + # 2 folds, 5 repeats; keep in mind that this task comes from the test + # server, the task on the live server is different + self.assertEqual(len(data_content), 4490) + for row in data_content: + # repeat, fold, row_id, 6 confidences, prediction and correct label + self.assertEqual(len(row), 12) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_run_on_dataset_with_missing_labels_array(self): + # Check that _run_task_get_arffcontent works when one of the class + # labels only declared in the arff file, but is not present in the + # actual data + flow = unittest.mock.Mock() + flow.name = "dummy" + task = openml.tasks.get_task(2) + # task_id=2 on test server has 38 columns with 6 numeric columns + cont_idx = [3, 4, 8, 32, 33, 34] + cat_idx = list(set(np.arange(38)) - set(cont_idx)) + cont = np.array([False] * 38) + cat = np.array([False] * 38) + cont[cont_idx] = True + cat[cat_idx] = True + + from sklearn.compose import ColumnTransformer + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + model = Pipeline( + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( - flow=flow, model=model, task=task, extension=self.extension, add_local_measures=True, + flow=flow, + model=model, + task=task, + extension=self.extension, + add_local_measures=True, + dataset_format="array", # diff test_run_on_dataset_with_missing_labels_dataframe() ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index 14e2405f2..fdb2747ec 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,12 +1,20 @@ # License: BSD 3-Clause -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont + +import sklearn +import unittest +from distutils.version import LooseVersion class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True """Test the example code of Bischl et al. (2018)""" + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) def test_Figure1a(self): """Test listing in Figure 1a on a single task and the old OpenML100 study. 
@@ -29,16 +37,19 @@ def test_Figure1a(self): """ # noqa: E501 import openml import sklearn.metrics - import sklearn.pipeline - import sklearn.preprocessing import sklearn.tree + from sklearn.pipeline import Pipeline, make_pipeline + from sklearn.compose import ColumnTransformer + from sklearn.preprocessing import OneHotEncoder, StandardScaler benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite - clf = sklearn.pipeline.Pipeline( - steps=[ - ("imputer", SimpleImputer()), - ("estimator", sklearn.tree.DecisionTreeClassifier()), - ] + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + clf = Pipeline( + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] ) # build a sklearn classifier for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks task = openml.tasks.get_task(task_id) # download the OpenML task
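As a closing illustration, a hedged end-to-end sketch of the pattern this diff introduces throughout the examples and tests: a ColumnTransformer that selects categorical and continuous columns via callables, run on a task with the new ``dataset_format`` argument (the task id is only an assumed example):

import sklearn.tree
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import openml


# column selectors operating on a pandas DataFrame with 'category' dtypes
def cont(X):
    return X.dtypes != "category"


def cat(X):
    return X.dtypes == "category"


cat_imp = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)
cont_imp = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
clf = Pipeline(
    steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]
)

task = openml.tasks.get_task(1)  # assumed example task id
# 'dataframe' (the new default) hands the model a pandas DataFrame;
# dataset_format="array" restores the previous NumPy behaviour
run = openml.runs.run_model_on_task(
    clf, task, avoid_duplicate_runs=False, dataset_format="dataframe"
)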