Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
30dd55f
Add deprecation warning for retrieving dict
PGijsbers Jun 15, 2023
b502312
Refactor check_datasets_active to work with dataframe
PGijsbers Jun 15, 2023
357bb7d
Update unit tests to use list_datasets with output_format dataframe
PGijsbers Jun 15, 2023
29bbb57
Move list_datasets test to proper file
PGijsbers Jun 15, 2023
464e5dd
Remove list_datasets test, duplicate in test_datasets_functions
PGijsbers Jun 15, 2023
aaad25f
Update list_flows calls to use output_format='dataframe'
PGijsbers Jun 15, 2023
cf9dd7b
Update list_runs calls to require dataframe output
PGijsbers Jun 15, 2023
13f2fb5
Update list_setup calls for deprecation
PGijsbers Jun 15, 2023
d3342a1
Update list_study calls
PGijsbers Jun 15, 2023
b8a915b
Update list_tasks to specify output_format dataframe
PGijsbers Jun 15, 2023
3361b15
Add `output_format` to `list_datasets` call
PGijsbers Jun 15, 2023
be16355
Add TODO markers for removing `dict` support of `list_*` functions
PGijsbers Jun 15, 2023
5cc1287
Make status check less strict, call list_dataset with output_format
PGijsbers Jun 15, 2023
576e09c
Change index on id to did, since that's the dataset id's column name
PGijsbers Jun 15, 2023
b82febe
Update test to reflect new error message
PGijsbers Jun 15, 2023
cc944b5
Fix bug introduced by refactor
PGijsbers Jun 15, 2023
dca2590
Fix minor oversights of refactoring
PGijsbers Jun 15, 2023
5240504
Merge branch 'develop' into pandas_default
PGijsbers Jun 15, 2023
3cff453
Rename variables to reflect they are no longer lists
PGijsbers Jun 16, 2023
c130c41
Fix unsafe indexing on dataframe and remaining unit tests
PGijsbers Jun 16, 2023
22a6dd3
Perform safer check for integer dtypes
PGijsbers Jun 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)
#
openml_list = openml.datasets.list_datasets() # returns a dict

# Show a nice table with some key data properties
datalist = pd.DataFrame.from_dict(openml_list, orient="index")
# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
datalist = openml.datasets.list_datasets(output_format="dataframe")
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]

print(f"First 10 of {len(datalist)} datasets...")
Expand Down
2 changes: 1 addition & 1 deletion examples/30_extended/suites_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@

# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
all_tasks = list(openml.tasks.list_tasks().keys())
all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))

# The study needs a machine-readable and unique alias. To obtain this,
Expand Down
23 changes: 7 additions & 16 deletions examples/30_extended/tasks_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,28 +29,19 @@
# Listing tasks
# ^^^^^^^^^^^^^
#
# We will start by simply listing only *supervised classification* tasks:

tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)

############################################################################
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert
# into a
# We will start by simply listing only *supervised classification* tasks.
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
# request a
# `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
# to have better visualization capabilities and easier access:
# instead to have better visualization capabilities and easier access:

tasks = pd.DataFrame.from_dict(tasks, orient="index")
tasks = openml.tasks.list_tasks(
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

# As conversion to a pandas dataframe is a common task, we have added this functionality to the
# OpenML-Python library which can be used by passing ``output_format='dataframe'``:
tasks_df = openml.tasks.list_tasks(
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
print(tasks_df.head())

############################################################################
# We can filter the list of tasks to only contain datasets with more than
# 500 samples, but fewer than 1000 samples:
Expand Down
50 changes: 30 additions & 20 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ def list_datasets(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)

# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
data_id=data_id,
output_format=output_format,
Expand Down Expand Up @@ -241,7 +250,8 @@ def check_datasets_active(
Check if the dataset ids provided are active.

Raises an error if a dataset_id in the given list
of dataset_ids does not exist on the server.
of dataset_ids does not exist on the server and
`raise_error_if_not_exist` is set to True (default).

Parameters
----------
Expand All @@ -256,18 +266,12 @@ def check_datasets_active(
dict
A dictionary with items {did: bool}
"""
dataset_list = list_datasets(status="all", data_id=dataset_ids)
active = {}

for did in dataset_ids:
dataset = dataset_list.get(did, None)
if dataset is None:
if raise_error_if_not_exist:
raise ValueError(f"Could not find dataset {did} in OpenML dataset list.")
else:
active[did] = dataset["status"] == "active"

return active
datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
missing = set(dataset_ids) - set(datasets.get("did", []))
if raise_error_if_not_exist and missing:
missing_str = ", ".join(str(did) for did in missing)
raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.")
return dict(datasets["status"] == "active")


def _name_to_id(
Expand All @@ -285,7 +289,7 @@ def _name_to_id(
----------
dataset_name : str
The name of the dataset for which to find its id.
version : int
version : int, optional
Version to retrieve. If not specified, the oldest active version is returned.
error_if_multiple : bool (default=False)
If `False` and multiple datasets match, return the least recent active dataset.
Expand All @@ -299,16 +303,22 @@ def _name_to_id(
The id of the dataset.
"""
status = None if version is not None else "active"
candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
candidates = cast(
pd.DataFrame,
list_datasets(
data_name=dataset_name, status=status, data_version=version, output_format="dataframe"
),
)
if error_if_multiple and len(candidates) > 1:
raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
if len(candidates) == 0:
no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
and_version = " and version {}".format(version) if version is not None else ""
msg = f"Multiple active datasets exist with name '{dataset_name}'."
raise ValueError(msg)
if candidates.empty:
no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
and_version = f" and version '{version}'." if version is not None else "."
raise RuntimeError(no_dataset_for_name + and_version)

# Dataset ids are chronological so we can just sort based on ids (instead of version)
return sorted(candidates)[0]
return candidates["did"].min()


def get_datasets(
Expand Down
11 changes: 11 additions & 0 deletions openml/evaluations/functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# License: BSD 3-Clause

import json
import warnings

import xmltodict
import pandas as pd
import numpy as np
Expand Down Expand Up @@ -77,6 +79,15 @@ def list_evaluations(
"Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable."
)

# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15. "
"To ensure your code will continue to work, "
"use `output_format`='dataframe' or `output_format`='object'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

per_fold_str = None
if per_fold is not None:
per_fold_str = str(per_fold).lower()
Expand Down
10 changes: 10 additions & 0 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# License: BSD 3-Clause
import warnings

import dateutil.parser
from collections import OrderedDict
Expand Down Expand Up @@ -188,6 +189,15 @@ def list_flows(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)

# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
output_format=output_format,
listing_call=_list_flows,
Expand Down
21 changes: 14 additions & 7 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import itertools
import os
import time
from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401
from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING, cast # noqa F401
import warnings

import sklearn.metrics
Expand Down Expand Up @@ -103,7 +103,7 @@ def run_model_on_task(
"avoid_duplicate_runs is set to True, but no API key is set. "
"Please set your API key in the OpenML configuration file, see"
"https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial"
+ ".html#authentication for more information on authentication.",
".html#authentication for more information on authentication.",
)

# TODO: At some point in the future do not allow for arguments in old order (6-2018).
Expand Down Expand Up @@ -428,11 +428,10 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]:
return set()

try:
result = list_runs(task=[task_id], setup=[setup_id])
if len(result) > 0:
return set(result.keys())
else:
return set()
result = cast(
pd.DataFrame, list_runs(task=[task_id], setup=[setup_id], output_format="dataframe")
)
return set() if result.empty else set(result["run_id"])
except OpenMLServerException as exception:
# error code 512 implies no results. The run does not exist yet
assert exception.code == 512
Expand Down Expand Up @@ -1012,6 +1011,14 @@ def list_runs(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

if id is not None and (not isinstance(id, list)):
raise TypeError("id must be of type list.")
Expand Down
11 changes: 10 additions & 1 deletion openml/setups/functions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# License: BSD 3-Clause

import warnings
from collections import OrderedDict
import io
import os
Expand Down Expand Up @@ -140,6 +140,15 @@ def list_setups(
"Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable."
)

# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15. "
"To ensure your code will continue to work, "
"use `output_format`='dataframe' or `output_format`='object'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

batch_size = 1000 # batch size for setups is lower
return openml.utils._list_all(
output_format=output_format,
Expand Down
16 changes: 16 additions & 0 deletions openml/study/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,14 @@ def list_suites(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
output_format=output_format,
Expand Down Expand Up @@ -532,6 +540,14 @@ def list_studies(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
output_format=output_format,
Expand Down
8 changes: 8 additions & 0 deletions openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@ def list_tasks(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
# TODO: [0.15]
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)
return openml.utils._list_all(
output_format=output_format,
listing_call=_list_tasks,
Expand Down
14 changes: 7 additions & 7 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,15 +302,15 @@ def setUp(self):

def test_tagging(self):
tag = "testing_tag_{}_{}".format(self.id(), time())
ds_list = openml.datasets.list_datasets(tag=tag)
self.assertEqual(len(ds_list), 0)
datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
self.assertTrue(datasets.empty)
self.dataset.push_tag(tag)
ds_list = openml.datasets.list_datasets(tag=tag)
self.assertEqual(len(ds_list), 1)
self.assertIn(125, ds_list)
datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
self.assertEqual(len(datasets), 1)
self.assertIn(125, datasets["did"])
self.dataset.remove_tag(tag)
ds_list = openml.datasets.list_datasets(tag=tag)
self.assertEqual(len(ds_list), 0)
datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
self.assertTrue(datasets.empty)


class OpenMLDatasetTestSparse(TestBase):
Expand Down
Loading