diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index e8aa94f2b..78ada4fde 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -21,10 +21,9 @@ # * Use the output_format parameter to select output type # * Default gives 'dict' (other option: 'dataframe', see below) # -openml_list = openml.datasets.list_datasets() # returns a dict - -# Show a nice table with some key data properties -datalist = pd.DataFrame.from_dict(openml_list, orient="index") +# Note: list_datasets will return a pandas dataframe by default from 0.15. When using +# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'. +datalist = openml.datasets.list_datasets(output_format="dataframe") datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]] print(f"First 10 of {len(datalist)} datasets...") diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py index 9b8c1d73d..ff9902356 100644 --- a/examples/30_extended/suites_tutorial.py +++ b/examples/30_extended/suites_tutorial.py @@ -75,7 +75,7 @@ # We'll take a random subset of at least ten tasks of all available tasks on # the test server: -all_tasks = list(openml.tasks.list_tasks().keys()) +all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"]) task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20)) # The study needs a machine-readable and unique alias. To obtain this, diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py index 3f70d64fe..19a7e542c 100644 --- a/examples/30_extended/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -29,28 +29,19 @@ # Listing tasks # ^^^^^^^^^^^^^ # -# We will start by simply listing only *supervised classification* tasks: - -tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION) - -############################################################################ -# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert -# into a +# We will start by simply listing only *supervised classification* tasks. +# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we +# request a # `pandas dataframe `_ -# to have better visualization capabilities and easier access: +# instead to have better visualization capabilities and easier access: -tasks = pd.DataFrame.from_dict(tasks, orient="index") +tasks = openml.tasks.list_tasks( + task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe" +) print(tasks.columns) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) -# As conversion to a pandas dataframe is a common task, we have added this functionality to the -# OpenML-Python library which can be used by passing ``output_format='dataframe'``: -tasks_df = openml.tasks.list_tasks( - task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe" -) -print(tasks_df.head()) - ############################################################################ # We can filter the list of tasks to only contain datasets with more than # 500 samples, but less than 1000 samples: diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 7faae48c2..a1be2aa97 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -128,6 +128,15 @@ def list_datasets( "Invalid output format selected. 
" "Only 'dict' or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) + return openml.utils._list_all( data_id=data_id, output_format=output_format, @@ -241,7 +250,8 @@ def check_datasets_active( Check if the dataset ids provided are active. Raises an error if a dataset_id in the given list - of dataset_ids does not exist on the server. + of dataset_ids does not exist on the server and + `raise_error_if_not_exist` is set to True (default). Parameters ---------- @@ -256,18 +266,12 @@ def check_datasets_active( dict A dictionary with items {did: bool} """ - dataset_list = list_datasets(status="all", data_id=dataset_ids) - active = {} - - for did in dataset_ids: - dataset = dataset_list.get(did, None) - if dataset is None: - if raise_error_if_not_exist: - raise ValueError(f"Could not find dataset {did} in OpenML dataset list.") - else: - active[did] = dataset["status"] == "active" - - return active + datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe") + missing = set(dataset_ids) - set(datasets.get("did", [])) + if raise_error_if_not_exist and missing: + missing_str = ", ".join(str(did) for did in missing) + raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.") + return dict(datasets["status"] == "active") def _name_to_id( @@ -285,7 +289,7 @@ def _name_to_id( ---------- dataset_name : str The name of the dataset for which to find its id. - version : int + version : int, optional Version to retrieve. If not specified, the oldest active version is returned. error_if_multiple : bool (default=False) If `False`, if multiple datasets match, return the least recent active dataset. @@ -299,16 +303,22 @@ def _name_to_id( The id of the dataset. """ status = None if version is not None else "active" - candidates = list_datasets(data_name=dataset_name, status=status, data_version=version) + candidates = cast( + pd.DataFrame, + list_datasets( + data_name=dataset_name, status=status, data_version=version, output_format="dataframe" + ), + ) if error_if_multiple and len(candidates) > 1: - raise ValueError("Multiple active datasets exist with name {}".format(dataset_name)) - if len(candidates) == 0: - no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name) - and_version = " and version {}".format(version) if version is not None else "" + msg = f"Multiple active datasets exist with name '{dataset_name}'." + raise ValueError(msg) + if candidates.empty: + no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'" + and_version = f" and version '{version}'." if version is not None else "." raise RuntimeError(no_dataset_for_name + and_version) # Dataset ids are chronological so we can just sort based on ids (instead of version) - return sorted(candidates)[0] + return candidates["did"].min() def get_datasets( diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 693ec06cf..214348345 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -1,6 +1,8 @@ # License: BSD 3-Clause import json +import warnings + import xmltodict import pandas as pd import numpy as np @@ -77,6 +79,15 @@ def list_evaluations( "Invalid output format selected. 
" "Only 'object', 'dataframe', or 'dict' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15. " + "To ensure your code will continue to work, " + "use `output_format`='dataframe' or `output_format`='object'." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) + per_fold_str = None if per_fold is not None: per_fold_str = str(per_fold).lower() diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 42cf9a6af..0e278d33a 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -1,4 +1,5 @@ # License: BSD 3-Clause +import warnings import dateutil.parser from collections import OrderedDict @@ -188,6 +189,15 @@ def list_flows( "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) + return openml.utils._list_all( output_format=output_format, listing_call=_list_flows, diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 1e5f519df..96e031aee 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -5,7 +5,7 @@ import itertools import os import time -from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401 +from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING, cast # noqa F401 import warnings import sklearn.metrics @@ -103,7 +103,7 @@ def run_model_on_task( "avoid_duplicate_runs is set to True, but no API key is set. " "Please set your API key in the OpenML configuration file, see" "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial" - + ".html#authentication for more information on authentication.", + ".html#authentication for more information on authentication.", ) # TODO: At some point in the future do not allow for arguments in old order (6-2018). @@ -428,11 +428,10 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]: return set() try: - result = list_runs(task=[task_id], setup=[setup_id]) - if len(result) > 0: - return set(result.keys()) - else: - return set() + result = cast( + pd.DataFrame, list_runs(task=[task_id], setup=[setup_id], output_format="dataframe") + ) + return set() if result.empty else set(result["run_id"]) except OpenMLServerException as exception: # error code 512 implies no results. The run does not exist yet assert exception.code == 512 @@ -1012,6 +1011,14 @@ def list_runs( raise ValueError( "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." 
+ ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) if id is not None and (not isinstance(id, list)): raise TypeError("id must be of type list.") diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 1e3d44e0b..5a81f7248 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -1,5 +1,5 @@ # License: BSD 3-Clause - +import warnings from collections import OrderedDict import io import os @@ -140,6 +140,15 @@ def list_setups( "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15. " + "To ensure your code will continue to work, " + "use `output_format`='dataframe' or `output_format`='object'." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) + batch_size = 1000 # batch size for setups is lower return openml.utils._list_all( output_format=output_format, diff --git a/openml/study/functions.py b/openml/study/functions.py index ae257dd9c..08ca44361 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -459,6 +459,14 @@ def list_suites( raise ValueError( "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) return openml.utils._list_all( output_format=output_format, @@ -532,6 +540,14 @@ def list_studies( raise ValueError( "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) return openml.utils._list_all( output_format=output_format, diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 1c8daf613..b038179fc 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -176,6 +176,14 @@ def list_tasks( raise ValueError( "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ) + # TODO: [0.15] + if output_format == "dict": + msg = ( + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." 
+ ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) return openml.utils._list_all( output_format=output_format, listing_call=_list_tasks, diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 55be58f51..93e0247d2 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -302,15 +302,15 @@ def setUp(self): def test_tagging(self): tag = "testing_tag_{}_{}".format(self.id(), time()) - ds_list = openml.datasets.list_datasets(tag=tag) - self.assertEqual(len(ds_list), 0) + datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + self.assertTrue(datasets.empty) self.dataset.push_tag(tag) - ds_list = openml.datasets.list_datasets(tag=tag) - self.assertEqual(len(ds_list), 1) - self.assertIn(125, ds_list) + datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + self.assertEqual(len(datasets), 1) + self.assertIn(125, datasets["did"]) self.dataset.remove_tag(tag) - ds_list = openml.datasets.list_datasets(tag=tag) - self.assertEqual(len(ds_list), 0) + datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe") + self.assertTrue(datasets.empty) class OpenMLDatasetTestSparse(TestBase): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 437d4d342..fe04f7d96 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -109,56 +109,11 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) self.assertTrue(tag not in all_tags) - def test_list_datasets(self): - # We can only perform a smoke test here because we test on dynamic - # data from the internet... - datasets = openml.datasets.list_datasets() - # 1087 as the number of datasets on openml.org - self.assertGreaterEqual(len(datasets), 100) - self._check_datasets(datasets) - def test_list_datasets_output_format(self): datasets = openml.datasets.list_datasets(output_format="dataframe") self.assertIsInstance(datasets, pd.DataFrame) self.assertGreaterEqual(len(datasets), 100) - def test_list_datasets_by_tag(self): - datasets = openml.datasets.list_datasets(tag="study_14") - self.assertGreaterEqual(len(datasets), 100) - self._check_datasets(datasets) - - def test_list_datasets_by_size(self): - datasets = openml.datasets.list_datasets(size=10050) - self.assertGreaterEqual(len(datasets), 120) - self._check_datasets(datasets) - - def test_list_datasets_by_number_instances(self): - datasets = openml.datasets.list_datasets(number_instances="5..100") - self.assertGreaterEqual(len(datasets), 4) - self._check_datasets(datasets) - - def test_list_datasets_by_number_features(self): - datasets = openml.datasets.list_datasets(number_features="50..100") - self.assertGreaterEqual(len(datasets), 8) - self._check_datasets(datasets) - - def test_list_datasets_by_number_classes(self): - datasets = openml.datasets.list_datasets(number_classes="5") - self.assertGreaterEqual(len(datasets), 3) - self._check_datasets(datasets) - - def test_list_datasets_by_number_missing_values(self): - datasets = openml.datasets.list_datasets(number_missing_values="5..100") - self.assertGreaterEqual(len(datasets), 5) - self._check_datasets(datasets) - - def test_list_datasets_combined_filters(self): - datasets = openml.datasets.list_datasets( - tag="study_14", number_instances="100..1000", number_missing_values="800..1000" - ) - self.assertGreaterEqual(len(datasets), 1) - self._check_datasets(datasets) - def 
test_list_datasets_paginate(self): size = 10 max = 100 @@ -168,11 +123,10 @@ def test_list_datasets_paginate(self): self._check_datasets(datasets) def test_list_datasets_empty(self): - datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") - if len(datasets) > 0: - raise ValueError("UnitTest Outdated, tag was already used (please remove)") - - self.assertIsInstance(datasets, dict) + datasets = openml.datasets.list_datasets( + tag="NoOneWouldUseThisTagAnyway", output_format="dataframe" + ) + self.assertTrue(datasets.empty) def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. @@ -186,7 +140,7 @@ def test_check_datasets_active(self): self.assertIsNone(active.get(79)) self.assertRaisesRegex( ValueError, - "Could not find dataset 79 in OpenML dataset list.", + r"Could not find dataset\(s\) 79 in OpenML dataset list.", openml.datasets.check_datasets_active, [79], ) @@ -255,7 +209,7 @@ def test__name_to_id_with_multiple_active_error(self): openml.config.server = self.production_server self.assertRaisesRegex( ValueError, - "Multiple active datasets exist with name iris", + "Multiple active datasets exist with name 'iris'.", openml.datasets.functions._name_to_id, dataset_name="iris", error_if_multiple=True, @@ -265,7 +219,7 @@ def test__name_to_id_name_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( RuntimeError, - "No active datasets exist with name does_not_exist", + "No active datasets exist with name 'does_not_exist'.", openml.datasets.functions._name_to_id, dataset_name="does_not_exist", ) @@ -274,7 +228,7 @@ def test__name_to_id_version_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( RuntimeError, - "No active datasets exist with name iris and version 100000", + "No active datasets exist with name 'iris' and version '100000'.", openml.datasets.functions._name_to_id, dataset_name="iris", version=100000, @@ -667,6 +621,18 @@ def test_upload_dataset_with_url(self): ) self.assertIsInstance(dataset.dataset_id, int) + def _assert_status_of_dataset(self, *, did: int, status: str): + """Asserts there is exactly one dataset with id `did` and its current status is `status`""" + # need to use listing fn, as this is immune to cache + result = openml.datasets.list_datasets( + data_id=[did], status="all", output_format="dataframe" + ) + result = result.to_dict(orient="index") + # I think we should drop the test that one result is returned, + # the server should never return multiple results? 
+ self.assertEqual(len(result), 1) + self.assertEqual(result[did]["status"], status) + @pytest.mark.flaky() def test_data_status(self): dataset = OpenMLDataset( @@ -686,26 +652,17 @@ def test_data_status(self): openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3" openml.datasets.status_update(did, "active") - # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=[did], status="all") - self.assertEqual(len(result), 1) - self.assertEqual(result[did]["status"], "active") + self._assert_status_of_dataset(did=did, status="active") + openml.datasets.status_update(did, "deactivated") - # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=[did], status="all") - self.assertEqual(len(result), 1) - self.assertEqual(result[did]["status"], "deactivated") + self._assert_status_of_dataset(did=did, status="deactivated") + openml.datasets.status_update(did, "active") - # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=[did], status="all") - self.assertEqual(len(result), 1) - self.assertEqual(result[did]["status"], "active") + self._assert_status_of_dataset(did=did, status="active") + with self.assertRaises(ValueError): openml.datasets.status_update(did, "in_preparation") - # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=[did], status="all") - self.assertEqual(len(result), 1) - self.assertEqual(result[did]["status"], "active") + self._assert_status_of_dataset(did=did, status="active") def test_attributes_arff_from_df(self): # DataFrame case @@ -1608,6 +1565,17 @@ def test_get_dataset_parquet(self): self.assertIsNotNone(dataset.parquet_file) self.assertTrue(os.path.isfile(dataset.parquet_file)) + def test_list_datasets_with_high_size_parameter(self): + # Testing on prod since concurrent deletion of uploded datasets make the test fail + openml.config.server = self.production_server + + datasets_a = openml.datasets.list_datasets(output_format="dataframe") + datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf) + + # Reverting to test server + openml.config.server = self.test_server + self.assertEqual(len(datasets_a), len(datasets_b)) + @pytest.mark.parametrize( "default_target_attribute,row_id_attribute,ignore_attribute", @@ -1857,3 +1825,76 @@ def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key) {"params": {"api_key": test_api_key}}, ] assert expected_call_args == list(mock_delete.call_args) + + +def _assert_datasets_have_id_and_valid_status(datasets: pd.DataFrame): + assert pd.api.types.is_integer_dtype(datasets["did"]) + assert {"in_preparation", "active", "deactivated"} >= set(datasets["status"]) + + +@pytest.fixture(scope="module") +def all_datasets(): + return openml.datasets.list_datasets(output_format="dataframe") + + +def test_list_datasets(all_datasets: pd.DataFrame): + # We can only perform a smoke test here because we test on dynamic + # data from the internet... 
+ # 1087 as the number of datasets on openml.org + assert 100 <= len(all_datasets) + _assert_datasets_have_id_and_valid_status(all_datasets) + + +def test_list_datasets_by_tag(all_datasets: pd.DataFrame): + tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe") + assert 0 < len(tag_datasets) < len(all_datasets) + _assert_datasets_have_id_and_valid_status(tag_datasets) + + +def test_list_datasets_by_size(): + datasets = openml.datasets.list_datasets(size=5, output_format="dataframe") + assert 5 == len(datasets) + _assert_datasets_have_id_and_valid_status(datasets) + + +def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): + small_datasets = openml.datasets.list_datasets( + number_instances="5..100", output_format="dataframe" + ) + assert 0 < len(small_datasets) <= len(all_datasets) + _assert_datasets_have_id_and_valid_status(small_datasets) + + +def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): + wide_datasets = openml.datasets.list_datasets( + number_features="50..100", output_format="dataframe" + ) + assert 8 <= len(wide_datasets) < len(all_datasets) + _assert_datasets_have_id_and_valid_status(wide_datasets) + + +def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): + five_class_datasets = openml.datasets.list_datasets( + number_classes="5", output_format="dataframe" + ) + assert 3 <= len(five_class_datasets) < len(all_datasets) + _assert_datasets_have_id_and_valid_status(five_class_datasets) + + +def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): + na_datasets = openml.datasets.list_datasets( + number_missing_values="5..100", output_format="dataframe" + ) + assert 5 <= len(na_datasets) < len(all_datasets) + _assert_datasets_have_id_and_valid_status(na_datasets) + + +def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): + combined_filter_datasets = openml.datasets.list_datasets( + tag="study_14", + number_instances="100..1000", + number_missing_values="800..1000", + output_format="dataframe", + ) + assert 1 <= len(combined_filter_datasets) < len(all_datasets) + _assert_datasets_have_id_and_valid_status(combined_filter_datasets) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index c3c72f267..983ea206d 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -99,19 +99,19 @@ def test_get_structure(self): self.assertEqual(subflow.flow_id, sub_flow_id) def test_tagging(self): - flow_list = openml.flows.list_flows(size=1) - flow_id = list(flow_list.keys())[0] + flows = openml.flows.list_flows(size=1, output_format="dataframe") + flow_id = flows["id"].iloc[0] flow = openml.flows.get_flow(flow_id) tag = "testing_tag_{}_{}".format(self.id(), time.time()) - flow_list = openml.flows.list_flows(tag=tag) - self.assertEqual(len(flow_list), 0) + flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + self.assertEqual(len(flows), 0) flow.push_tag(tag) - flow_list = openml.flows.list_flows(tag=tag) - self.assertEqual(len(flow_list), 1) - self.assertIn(flow_id, flow_list) + flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + self.assertEqual(len(flows), 1) + self.assertIn(flow_id, flows["id"]) flow.remove_tag(tag) - flow_list = openml.flows.list_flows(tag=tag) - self.assertEqual(len(flow_list), 0) + flows = openml.flows.list_flows(tag=tag, output_format="dataframe") + self.assertEqual(len(flows), 0) def test_from_xml_to_xml(self): # Get the raw xml thing diff --git 
a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index f2520cb36..3814a8f9d 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -48,11 +48,11 @@ def test_list_flows(self): openml.config.server = self.production_server # We can only perform a smoke test here because we test on dynamic # data from the internet... - flows = openml.flows.list_flows() + flows = openml.flows.list_flows(output_format="dataframe") # 3000 as the number of flows on openml.org self.assertGreaterEqual(len(flows), 1500) - for fid in flows: - self._check_flow(flows[fid]) + for flow in flows.to_dict(orient="index").values(): + self._check_flow(flow) def test_list_flows_output_format(self): openml.config.server = self.production_server @@ -64,28 +64,25 @@ def test_list_flows_output_format(self): def test_list_flows_empty(self): openml.config.server = self.production_server - flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123") - if len(flows) > 0: - raise ValueError("UnitTest Outdated, got somehow results (please adapt)") - - self.assertIsInstance(flows, dict) + flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123", output_format="dataframe") + assert flows.empty def test_list_flows_by_tag(self): openml.config.server = self.production_server - flows = openml.flows.list_flows(tag="weka") + flows = openml.flows.list_flows(tag="weka", output_format="dataframe") self.assertGreaterEqual(len(flows), 5) - for did in flows: - self._check_flow(flows[did]) + for flow in flows.to_dict(orient="index").values(): + self._check_flow(flow) def test_list_flows_paginate(self): openml.config.server = self.production_server size = 10 maximum = 100 for i in range(0, maximum, size): - flows = openml.flows.list_flows(offset=i, size=size) + flows = openml.flows.list_flows(offset=i, size=size, output_format="dataframe") self.assertGreaterEqual(size, len(flows)) - for did in flows: - self._check_flow(flows[did]) + for flow in flows.to_dict(orient="index").values(): + self._check_flow(flow) def test_are_flows_equal(self): flow = openml.flows.OpenMLFlow( diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index ecc7111fa..4a4764bed 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -10,7 +10,7 @@ def test_too_long_uri(self): openml.exceptions.OpenMLServerError, "URI too long!", ): - openml.datasets.list_datasets(data_id=list(range(10000))) + openml.datasets.list_datasets(data_id=list(range(10000)), output_format="dataframe") @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 062d5a6aa..0396d0f19 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -26,19 +26,20 @@ class TestRun(TestBase): # less than 1 seconds def test_tagging(self): - runs = openml.runs.list_runs(size=1) - run_id = list(runs.keys())[0] + runs = openml.runs.list_runs(size=1, output_format="dataframe") + assert not runs.empty, "Test server state is incorrect" + run_id = runs["run_id"].iloc[0] run = openml.runs.get_run(run_id) tag = "testing_tag_{}_{}".format(self.id(), time()) - run_list = openml.runs.list_runs(tag=tag) - self.assertEqual(len(run_list), 0) + runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + self.assertEqual(len(runs), 0) run.push_tag(tag) - run_list = openml.runs.list_runs(tag=tag) - self.assertEqual(len(run_list), 1) - self.assertIn(run_id, 
run_list) + runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + self.assertEqual(len(runs), 1) + self.assertIn(run_id, runs["run_id"]) run.remove_tag(tag) - run_list = openml.runs.list_runs(tag=tag) - self.assertEqual(len(run_list), 0) + runs = openml.runs.list_runs(tag=tag, output_format="dataframe") + self.assertEqual(len(runs), 0) @staticmethod def _test_prediction_data_equal(run, run_prime): diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 1f8d1df70..8f3c0a71b 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1366,17 +1366,14 @@ def _check_run(self, run): def test_get_runs_list(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server - runs = openml.runs.list_runs(id=[2], show_errors=True) + runs = openml.runs.list_runs(id=[2], show_errors=True, output_format="dataframe") self.assertEqual(len(runs), 1) - for rid in runs: - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self._check_run(run) def test_list_runs_empty(self): - runs = openml.runs.list_runs(task=[0]) - if len(runs) > 0: - raise ValueError("UnitTest Outdated, got somehow results") - - self.assertIsInstance(runs, dict) + runs = openml.runs.list_runs(task=[0], output_format="dataframe") + assert runs.empty def test_list_runs_output_format(self): runs = openml.runs.list_runs(size=1000, output_format="dataframe") @@ -1386,19 +1383,19 @@ def test_get_runs_list_by_task(self): # TODO: comes from live, no such lists on test openml.config.server = self.production_server task_ids = [20] - runs = openml.runs.list_runs(task=task_ids) + runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") self.assertGreaterEqual(len(runs), 590) - for rid in runs: - self.assertIn(runs[rid]["task_id"], task_ids) - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["task_id"], task_ids) + self._check_run(run) num_runs = len(runs) task_ids.append(21) - runs = openml.runs.list_runs(task=task_ids) + runs = openml.runs.list_runs(task=task_ids, output_format="dataframe") self.assertGreaterEqual(len(runs), num_runs + 1) - for rid in runs: - self.assertIn(runs[rid]["task_id"], task_ids) - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["task_id"], task_ids) + self._check_run(run) def test_get_runs_list_by_uploader(self): # TODO: comes from live, no such lists on test @@ -1406,38 +1403,38 @@ def test_get_runs_list_by_uploader(self): # 29 is Dominik Kirchhoff uploader_ids = [29] - runs = openml.runs.list_runs(uploader=uploader_ids) + runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") self.assertGreaterEqual(len(runs), 2) - for rid in runs: - self.assertIn(runs[rid]["uploader"], uploader_ids) - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["uploader"], uploader_ids) + self._check_run(run) num_runs = len(runs) uploader_ids.append(274) - runs = openml.runs.list_runs(uploader=uploader_ids) + runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe") self.assertGreaterEqual(len(runs), num_runs + 1) - for rid in runs: - self.assertIn(runs[rid]["uploader"], uploader_ids) - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["uploader"], uploader_ids) + self._check_run(run) def test_get_runs_list_by_flow(self): # TODO: comes from 
live, no such lists on test openml.config.server = self.production_server flow_ids = [1154] - runs = openml.runs.list_runs(flow=flow_ids) + runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") self.assertGreaterEqual(len(runs), 1) - for rid in runs: - self.assertIn(runs[rid]["flow_id"], flow_ids) - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["flow_id"], flow_ids) + self._check_run(run) num_runs = len(runs) flow_ids.append(1069) - runs = openml.runs.list_runs(flow=flow_ids) + runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe") self.assertGreaterEqual(len(runs), num_runs + 1) - for rid in runs: - self.assertIn(runs[rid]["flow_id"], flow_ids) - self._check_run(runs[rid]) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["flow_id"], flow_ids) + self._check_run(run) def test_get_runs_pagination(self): # TODO: comes from live, no such lists on test @@ -1446,10 +1443,12 @@ def test_get_runs_pagination(self): size = 10 max = 100 for i in range(0, max, size): - runs = openml.runs.list_runs(offset=i, size=size, uploader=uploader_ids) + runs = openml.runs.list_runs( + offset=i, size=size, uploader=uploader_ids, output_format="dataframe" + ) self.assertGreaterEqual(size, len(runs)) - for rid in runs: - self.assertIn(runs[rid]["uploader"], uploader_ids) + for run in runs.to_dict(orient="index").values(): + self.assertIn(run["uploader"], uploader_ids) def test_get_runs_list_by_filters(self): # TODO: comes from live, no such lists on test @@ -1468,25 +1467,28 @@ def test_get_runs_list_by_filters(self): # self.assertRaises(openml.exceptions.OpenMLServerError, # openml.runs.list_runs) - runs = openml.runs.list_runs(id=ids) + runs = openml.runs.list_runs(id=ids, output_format="dataframe") self.assertEqual(len(runs), 2) - runs = openml.runs.list_runs(task=tasks) + runs = openml.runs.list_runs(task=tasks, output_format="dataframe") self.assertGreaterEqual(len(runs), 2) - runs = openml.runs.list_runs(uploader=uploaders_2) + runs = openml.runs.list_runs(uploader=uploaders_2, output_format="dataframe") self.assertGreaterEqual(len(runs), 10) - runs = openml.runs.list_runs(flow=flows) + runs = openml.runs.list_runs(flow=flows, output_format="dataframe") self.assertGreaterEqual(len(runs), 100) - runs = openml.runs.list_runs(id=ids, task=tasks, uploader=uploaders_1) + runs = openml.runs.list_runs( + id=ids, task=tasks, uploader=uploaders_1, output_format="dataframe" + ) + self.assertEqual(len(runs), 2) def test_get_runs_list_by_tag(self): # TODO: comes from live, no such lists on test # Unit test works on production server only openml.config.server = self.production_server - runs = openml.runs.list_runs(tag="curves") + runs = openml.runs.list_runs(tag="curves", output_format="dataframe") self.assertGreaterEqual(len(runs), 1) @pytest.mark.sklearn diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 33b2a5551..ef1acc405 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -162,7 +162,9 @@ def test_list_setups_output_format(self): self.assertIsInstance(setups, pd.DataFrame) self.assertEqual(len(setups), 10) - setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10) + # TODO: [0.15] Remove section as `dict` is no longer supported. 
+ with pytest.warns(FutureWarning): + setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10) self.assertIsInstance(setups, Dict) self.assertIsInstance(setups[list(setups.keys())[0]], Dict) self.assertEqual(len(setups), 10) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 3d7811f6e..333c12d7c 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -241,7 +241,7 @@ def test_study_attach_illegal(self): self.assertListEqual(study_original.runs, study_downloaded.runs) def test_study_list(self): - study_list = openml.study.list_studies(status="in_preparation") + study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") # might fail if server is recently reset self.assertGreaterEqual(len(study_list), 2) diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index 09a0024ac..cd8e515c1 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -71,29 +71,19 @@ def test_upload_task(self): ) def _get_compatible_rand_dataset(self) -> List: - compatible_datasets = [] - active_datasets = list_datasets(status="active") + active_datasets = list_datasets(status="active", output_format="dataframe") # depending on the task type, find either datasets # with only symbolic features or datasets with only # numerical features. if self.task_type == TaskType.SUPERVISED_REGRESSION: - # regression task - for dataset_id, dataset_info in active_datasets.items(): - if "NumberOfSymbolicFeatures" in dataset_info: - if dataset_info["NumberOfSymbolicFeatures"] == 0: - compatible_datasets.append(dataset_id) + compatible_datasets = active_datasets[active_datasets["NumberOfSymbolicFeatures"] == 0] elif self.task_type == TaskType.CLUSTERING: - # clustering task - compatible_datasets = list(active_datasets.keys()) + compatible_datasets = active_datasets else: - for dataset_id, dataset_info in active_datasets.items(): - # extra checks because of: - # https://github.com/openml/OpenML/issues/959 - if "NumberOfNumericFeatures" in dataset_info: - if dataset_info["NumberOfNumericFeatures"] == 0: - compatible_datasets.append(dataset_id) + compatible_datasets = active_datasets[active_datasets["NumberOfNumericFeatures"] == 0] + compatible_datasets = list(compatible_datasets["did"]) # in-place shuffling shuffle(compatible_datasets) return compatible_datasets diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index cf59974e5..481ef2d83 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -1,6 +1,7 @@ # License: BSD 3-Clause import os +from typing import cast from unittest import mock import pytest @@ -56,7 +57,7 @@ def test__get_estimation_procedure_list(self): def test_list_clustering_task(self): # as shown by #383, clustering tasks can give list/dict casting problems openml.config.server = self.production_server - openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10) + openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10, output_format="dataframe") # the expected outcome is that it doesn't crash. No assertions. 
def _check_task(self, task): @@ -71,11 +72,11 @@ def _check_task(self, task): def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE - tasks = openml.tasks.list_tasks(task_type=ttid) + tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe") self.assertGreaterEqual(len(tasks), num_curves_tasks) - for tid in tasks: - self.assertEqual(ttid, tasks[tid]["ttid"]) - self._check_task(tasks[tid]) + for task in tasks.to_dict(orient="index").values(): + self.assertEqual(ttid, task["ttid"]) + self._check_task(task) def test_list_tasks_output_format(self): ttid = TaskType.LEARNING_CURVE @@ -84,33 +85,33 @@ def test_list_tasks_output_format(self): self.assertGreater(len(tasks), 100) def test_list_tasks_empty(self): - tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") - if len(tasks) > 0: - raise ValueError("UnitTest Outdated, got somehow results (tag is used, please adapt)") - - self.assertIsInstance(tasks, dict) + tasks = cast( + pd.DataFrame, + openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag", output_format="dataframe"), + ) + assert tasks.empty def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails - tasks = openml.tasks.list_tasks(tag="OpenML100") + tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe") self.assertGreaterEqual(len(tasks), num_basic_tasks) - for tid in tasks: - self._check_task(tasks[tid]) + for task in tasks.to_dict(orient="index").values(): + self._check_task(task) def test_list_tasks(self): - tasks = openml.tasks.list_tasks() + tasks = openml.tasks.list_tasks(output_format="dataframe") self.assertGreaterEqual(len(tasks), 900) - for tid in tasks: - self._check_task(tasks[tid]) + for task in tasks.to_dict(orient="index").values(): + self._check_task(task) def test_list_tasks_paginate(self): size = 10 max = 100 for i in range(0, max, size): - tasks = openml.tasks.list_tasks(offset=i, size=size) + tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe") self.assertGreaterEqual(size, len(tasks)) - for tid in tasks: - self._check_task(tasks[tid]) + for task in tasks.to_dict(orient="index").values(): + self._check_task(task) def test_list_tasks_per_type_paginate(self): size = 40 @@ -122,11 +123,13 @@ def test_list_tasks_per_type_paginate(self): ] for j in task_types: for i in range(0, max, size): - tasks = openml.tasks.list_tasks(task_type=j, offset=i, size=size) + tasks = openml.tasks.list_tasks( + task_type=j, offset=i, size=size, output_format="dataframe" + ) self.assertGreaterEqual(size, len(tasks)) - for tid in tasks: - self.assertEqual(j, tasks[tid]["ttid"]) - self._check_task(tasks[tid]) + for task in tasks.to_dict(orient="index").values(): + self.assertEqual(j, task["ttid"]) + self._check_task(task) def test__get_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index d22b6a2a9..4f15ccce2 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -17,15 +17,15 @@ def tearDown(self): def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation tag = "testing_tag_{}_{}".format(self.id(), time()) - task_list = openml.tasks.list_tasks(tag=tag) - self.assertEqual(len(task_list), 0) + tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + self.assertEqual(len(tasks), 0) task.push_tag(tag) - task_list 
= openml.tasks.list_tasks(tag=tag) - self.assertEqual(len(task_list), 1) - self.assertIn(1, task_list) + tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + self.assertEqual(len(tasks), 1) + self.assertIn(1, tasks["tid"]) task.remove_tag(tag) - task_list = openml.tasks.list_tasks(tag=tag) - self.assertEqual(len(task_list), 0) + tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe") + self.assertEqual(len(tasks), 0) def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 8558d27c8..ace12c8e9 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -2,8 +2,6 @@ import tempfile import unittest.mock -import numpy as np - import openml from openml.testing import TestBase @@ -42,40 +40,34 @@ def test_list_all_few_results_available(self, _perform_api_call): # Although we have multiple versions of the iris dataset, there is only # one with this name/version combination - datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) + datasets = openml.datasets.list_datasets( + size=1000, data_name="iris", data_version=1, output_format="dataframe" + ) self.assertEqual(len(datasets), 1) self.assertEqual(_perform_api_call.call_count, 1) def test_list_all_for_datasets(self): required_size = 127 # default test server reset value - datasets = openml.datasets.list_datasets(batch_size=100, size=required_size) + datasets = openml.datasets.list_datasets( + batch_size=100, size=required_size, output_format="dataframe" + ) self.assertEqual(len(datasets), required_size) - for did in datasets: - self._check_dataset(datasets[did]) - - def test_list_datasets_with_high_size_parameter(self): - # Testing on prod since concurrent deletion of uploded datasets make the test fail - openml.config.server = self.production_server - - datasets_a = openml.datasets.list_datasets() - datasets_b = openml.datasets.list_datasets(size=np.inf) - - # Reverting to test server - openml.config.server = self.test_server - - self.assertEqual(len(datasets_a), len(datasets_b)) + for dataset in datasets.to_dict(orient="index").values(): + self._check_dataset(dataset) def test_list_all_for_tasks(self): required_size = 1068 # default test server reset value - tasks = openml.tasks.list_tasks(batch_size=1000, size=required_size) - + tasks = openml.tasks.list_tasks( + batch_size=1000, size=required_size, output_format="dataframe" + ) self.assertEqual(len(tasks), required_size) def test_list_all_for_flows(self): required_size = 15 # default test server reset value - flows = openml.flows.list_flows(batch_size=25, size=required_size) - + flows = openml.flows.list_flows( + batch_size=25, size=required_size, output_format="dataframe" + ) self.assertEqual(len(flows), required_size) def test_list_all_for_setups(self):
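
A minimal usage sketch (not part of the patch itself; it assumes openml-python with these changes applied and network access to an OpenML server) showing the dataframe-based listing convention this diff migrates to, and the FutureWarning that `output_format='dict'` now emits:

import warnings

import openml

# Preferred pattern after this change: request a pandas DataFrame explicitly.
datasets = openml.datasets.list_datasets(
    size=10, status="active", output_format="dataframe"
)
print(datasets[["did", "name", "status"]].head())

# The legacy dict output keeps working until 0.15, but now raises a FutureWarning,
# matching the warnings added to the list_* functions in this patch.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    openml.datasets.list_datasets(size=10, output_format="dict")
assert any(issubclass(w.category, FutureWarning) for w in caught)

The same `output_format="dataframe"` argument applies to the other listing calls touched here (`list_tasks`, `list_flows`, `list_runs`, `list_setups`, `list_evaluations`, `list_suites`, `list_studies`); per the deprecation messages added above, the dataframe output is planned to become the default in 0.15.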