openml · mfeurer · Jun 16, 2023 · Jun 15, 2023 · Jun 15, 2023 · Jun 15, 2023
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -21,10 +21,9 @@
 #   * Use the output_format parameter to select output type
 #   * Default gives 'dict' (other option: 'dataframe', see below)
 #
-openml_list = openml.datasets.list_datasets()  # returns a dict
-
-# Show a nice table with some key data properties
-datalist = pd.DataFrame.from_dict(openml_list, orient="index")
+# Note: `list_datasets` will return a pandas dataframe by default from 0.15. In
+# openml-python 0.14, the default 'dict' output warns you to use output_format='dataframe'.
+datalist = openml.datasets.list_datasets(output_format="dataframe")
 datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
 
 print(f"First 10 of {len(datalist)} datasets...")

diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py
@@ -75,7 +75,7 @@
 
 # We'll take a random subset of at least ten tasks of all available tasks on
 # the test server:
-all_tasks = list(openml.tasks.list_tasks().keys())
+all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
 task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
 
 # The study needs a machine-readable and unique alias. To obtain this,

diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py
@@ -29,28 +29,19 @@
 # Listing tasks
 # ^^^^^^^^^^^^^
 #
-# We will start by simply listing only *supervised classification* tasks:
-
-tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
-
-############################################################################
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert
-# into a
+# We will start by simply listing only *supervised classification* tasks.
+# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
+# request a
 # `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
-# to have better visualization capabilities and easier access:
+# instead to have better visualization capabilities and easier access:
 
-tasks = pd.DataFrame.from_dict(tasks, orient="index")
+tasks = openml.tasks.list_tasks(
+    task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
+)
 print(tasks.columns)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
-# As conversion to a pandas dataframe is a common task, we have added this functionality to the
-# OpenML-Python library which can be used by passing ``output_format='dataframe'``:
-tasks_df = openml.tasks.list_tasks(
-    task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
-)
-print(tasks_df.head())
-
 ############################################################################
 # We can filter the list of tasks to only contain datasets with more than
 # 500 samples, but less than 1000 samples:

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -128,6 +128,15 @@ def list_datasets(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
 
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     return openml.utils._list_all(
         data_id=data_id,
         output_format=output_format,
@@ -241,7 +250,8 @@ def check_datasets_active(
     Check if the dataset ids provided are active.
 
     Raises an error if a dataset_id in the given list
-    of dataset_ids does not exist on the server.
+    of dataset_ids does not exist on the server and
+    `raise_error_if_not_exist` is set to True (default).
 
     Parameters
     ----------
@@ -256,18 +266,12 @@ def check_datasets_active(
     dict
         A dictionary with items {did: bool}
     """
-    dataset_list = list_datasets(status="all", data_id=dataset_ids)
-    active = {}
-
-    for did in dataset_ids:
-        dataset = dataset_list.get(did, None)
-        if dataset is None:
-            if raise_error_if_not_exist:
-                raise ValueError(f"Could not find dataset {did} in OpenML dataset list.")
-        else:
-            active[did] = dataset["status"] == "active"
-
-    return active
+    datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
+    missing = set(dataset_ids) - set(datasets.get("did", []))
+    if raise_error_if_not_exist and missing:
+        missing_str = ", ".join(str(did) for did in missing)
+        raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.")
+    return dict(datasets["status"] == "active")
 
 
 def _name_to_id(
@@ -285,7 +289,7 @@ def _name_to_id(
     ----------
     dataset_name : str
         The name of the dataset for which to find its id.
-    version : int
+    version : int, optional
         Version to retrieve. If not specified, the oldest active version is returned.
     error_if_multiple : bool (default=False)
         If `False`, if multiple datasets match, return the least recent active dataset.
@@ -299,16 +303,22 @@ def _name_to_id(
        The id of the dataset.
     """
     status = None if version is not None else "active"
-    candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
+    candidates = cast(
+        pd.DataFrame,
+        list_datasets(
+            data_name=dataset_name, status=status, data_version=version, output_format="dataframe"
+        ),
+    )
     if error_if_multiple and len(candidates) > 1:
-        raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
-    if len(candidates) == 0:
-        no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
-        and_version = " and version {}".format(version) if version is not None else ""
+        msg = f"Multiple active datasets exist with name '{dataset_name}'."
+        raise ValueError(msg)
+    if candidates.empty:
+        no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
+        and_version = f" and version '{version}'." if version is not None else "."
         raise RuntimeError(no_dataset_for_name + and_version)
 
     # Dataset ids are chronological so we can just sort based on ids (instead of version)
-    return sorted(candidates)[0]
+    return candidates["did"].min()
 
 
 def get_datasets(

diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
@@ -1,6 +1,8 @@
 # License: BSD 3-Clause
 
 import json
+import warnings
+
 import xmltodict
 import pandas as pd
 import numpy as np
@@ -77,6 +79,15 @@ def list_evaluations(
             "Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable."
         )
 
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15. "
+            "To ensure your code will continue to work, "
+            "use `output_format`='dataframe' or `output_format`='object'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     per_fold_str = None
     if per_fold is not None:
         per_fold_str = str(per_fold).lower()

diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -1,4 +1,5 @@
 # License: BSD 3-Clause
+import warnings
 
 import dateutil.parser
 from collections import OrderedDict
@@ -188,6 +189,15 @@ def list_flows(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
 
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     return openml.utils._list_all(
         output_format=output_format,
         listing_call=_list_flows,

diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -5,7 +5,7 @@
 import itertools
 import os
 import time
-from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING  # noqa F401
+from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING, cast  # noqa F401
 import warnings
 
 import sklearn.metrics
@@ -103,7 +103,7 @@ def run_model_on_task(
             "avoid_duplicate_runs is set to True, but no API key is set. "
             "Please set your API key in the OpenML configuration file, see"
             "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial"
-            + ".html#authentication for more information on authentication.",
+            ".html#authentication for more information on authentication.",
         )
 
     # TODO: At some point in the future do not allow for arguments in old order (6-2018).
@@ -428,11 +428,10 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]:
         return set()
 
     try:
-        result = list_runs(task=[task_id], setup=[setup_id])
-        if len(result) > 0:
-            return set(result.keys())
-        else:
-            return set()
+        result = cast(
+            pd.DataFrame, list_runs(task=[task_id], setup=[setup_id], output_format="dataframe")
+        )
+        return set() if result.empty else set(result["run_id"])
     except OpenMLServerException as exception:
         # error code 512 implies no results. The run does not exist yet
         assert exception.code == 512
@@ -1012,6 +1011,14 @@ def list_runs(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
     if id is not None and (not isinstance(id, list)):
         raise TypeError("id must be of type list.")

diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -1,5 +1,5 @@
 # License: BSD 3-Clause
-
+import warnings
 from collections import OrderedDict
 import io
 import os
@@ -140,6 +140,15 @@ def list_setups(
             "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable."
         )
 
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15. "
+            "To ensure your code will continue to work, "
+            "use `output_format`='dataframe' or `output_format`='object'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     batch_size = 1000  # batch size for setups is lower
     return openml.utils._list_all(
         output_format=output_format,

diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -459,6 +459,14 @@ def list_suites(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
     return openml.utils._list_all(
         output_format=output_format,
@@ -532,6 +540,14 @@ def list_studies(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
     return openml.utils._list_all(
         output_format=output_format,

diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -176,6 +176,14 @@ def list_tasks(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    # TODO: [0.15] Drop the 'dict' output format and remove this deprecation warning.
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
     return openml.utils._list_all(
         output_format=output_format,
         listing_call=_list_tasks,

diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -302,15 +302,15 @@ def setUp(self):
 
     def test_tagging(self):
         tag = "testing_tag_{}_{}".format(self.id(), time())
-        ds_list = openml.datasets.list_datasets(tag=tag)
-        self.assertEqual(len(ds_list), 0)
+        datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
+        self.assertTrue(datasets.empty)
         self.dataset.push_tag(tag)
-        ds_list = openml.datasets.list_datasets(tag=tag)
-        self.assertEqual(len(ds_list), 1)
-        self.assertIn(125, ds_list)
+        datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
+        self.assertEqual(len(datasets), 1)
+        self.assertIn(125, datasets["did"].tolist())  # Series `in` tests the index, not values
         self.dataset.remove_tag(tag)
-        ds_list = openml.datasets.list_datasets(tag=tag)
-        self.assertEqual(len(ds_list), 0)
+        datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
+        self.assertTrue(datasets.empty)
 
 
 class OpenMLDatasetTestSparse(TestBase):