diff --git a/README.md b/README.md
index 0bad7ac66..081bf7923 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
-
+
OpenML-Python
diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py
index b90d53660..9b18aab14 100644
--- a/examples/20_basic/simple_datasets_tutorial.py
+++ b/examples/20_basic/simple_datasets_tutorial.py
@@ -19,7 +19,7 @@
# List datasets
# =============
-datasets_df = openml.datasets.list_datasets(output_format="dataframe")
+datasets_df = openml.datasets.list_datasets()
print(datasets_df.head(n=10))
############################################################################
@@ -48,7 +48,7 @@
# attribute_names - the names of the features for the examples (X) and
# target feature (y)
X, y, categorical_indicator, attribute_names = dataset.get_data(
- dataset_format="dataframe", target=dataset.default_target_attribute
+ target=dataset.default_target_attribute
)
############################################################################
@@ -63,9 +63,9 @@
# Visualize the dataset
# =====================
+import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
-import matplotlib.pyplot as plt
sns.set_style("darkgrid")
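# A minimal sketch of the updated API used in this tutorial: list_datasets()
# and get_data() now return pandas objects by default, so the removed
# output_format= / dataset_format= arguments are no longer needed. Dataset
# id 61 ("iris") is only an illustrative choice.
import openml

datasets_df = openml.datasets.list_datasets()  # pandas DataFrame
dataset = openml.datasets.get_dataset(61)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
)
print(type(X).__name__, type(y).__name__)  # DataFrame, Series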
diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py
index eec6d7e8b..f7d7a49d1 100644
--- a/examples/20_basic/simple_flows_and_runs_tutorial.py
+++ b/examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -7,9 +7,9 @@
# License: BSD 3-Clause
-import openml
from sklearn import ensemble, neighbors
+import openml
############################################################################
# .. warning::
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
index 606455dd8..77a46d8b0 100644
--- a/examples/30_extended/datasets_tutorial.py
+++ b/examples/30_extended/datasets_tutorial.py
@@ -8,29 +8,24 @@
# License: BSD 3-Clauses
-import openml
import pandas as pd
+
+import openml
from openml.datasets import edit_dataset, fork_dataset, get_dataset
############################################################################
# Exercise 0
# **********
#
-# * List datasets
-#
-# * Use the output_format parameter to select output type
-# * Default gives 'dict' (other option: 'dataframe', see below)
-#
-# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
-# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
-datalist = openml.datasets.list_datasets(output_format="dataframe")
+# * List datasets and return a dataframe
+datalist = openml.datasets.list_datasets()
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
print(f"First 10 of {len(datalist)} datasets...")
datalist.head(n=10)
# The same can be done with lesser lines of code
-openml_df = openml.datasets.list_datasets(output_format="dataframe")
+openml_df = openml.datasets.list_datasets()
openml_df.head(n=10)
############################################################################
diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py
index 86302e2d1..6c8a88ec8 100644
--- a/examples/30_extended/fetch_evaluations_tutorial.py
+++ b/examples/30_extended/fetch_evaluations_tutorial.py
@@ -32,9 +32,7 @@
# Required filters can be applied to retrieve results from runs as required.
# We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
-openml.evaluations.list_evaluations(
- function="predictive_accuracy", size=10, output_format="dataframe"
-)
+openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)
# Using other evaluation metrics, 'precision' in this case
evals = openml.evaluations.list_evaluations(
@@ -94,7 +92,7 @@ def plot_cdf(values, metric="predictive_accuracy"):
plt.minorticks_on()
plt.grid(visible=True, which="minor", linestyle="--")
plt.axvline(max_val, linestyle="--", color="gray")
- plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
+ plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9)
plt.show()
@@ -162,7 +160,10 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
# List evaluations in descending order based on predictive_accuracy with
# hyperparameters
evals_setups = openml.evaluations.list_evaluations_setups(
- function="predictive_accuracy", tasks=[31], size=100, sort_order="desc"
+ function="predictive_accuracy",
+ tasks=[31],
+ size=100,
+ sort_order="desc",
)
""
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
index b7c000101..afd398feb 100644
--- a/examples/30_extended/flows_and_runs_tutorial.py
+++ b/examples/30_extended/flows_and_runs_tutorial.py
@@ -7,9 +7,9 @@
# License: BSD 3-Clause
-import openml
-from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
+from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
+import openml
############################################################################
# We'll use the test server for the rest of this tutorial.
diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
index e366c56df..491507d16 100644
--- a/examples/30_extended/plot_svm_hyperparameters_tutorial.py
+++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
@@ -6,9 +6,10 @@
# License: BSD 3-Clause
-import openml
import numpy as np
+import openml
+
####################################################################################################
# First step - obtaining the data
# ===============================
@@ -22,7 +23,6 @@
function="predictive_accuracy",
flows=[8353],
tasks=[6],
- output_format="dataframe",
# Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
# the dataframe would contain a field ``paramaters`` containing an unparsed dictionary.
parameters_in_separate_columns=True,
diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py
index 8715dfb4a..c0874b944 100644
--- a/examples/30_extended/study_tutorial.py
+++ b/examples/30_extended/study_tutorial.py
@@ -17,16 +17,11 @@
import openml
-
############################################################################
# Listing studies
# ***************
-#
-# * Use the output_format parameter to select output type
-# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
-# easier-to-work-with data structure
-studies = openml.study.list_studies(output_format="dataframe", status="all")
+studies = openml.study.list_studies(status="all")
print(studies.head(n=10))
@@ -52,8 +47,8 @@
# the evaluations available for the conducted runs:
evaluations = openml.evaluations.list_evaluations(
function="predictive_accuracy",
- output_format="dataframe",
study=study.study_id,
+ output_format="dataframe",
)
print(evaluations.head())
@@ -81,7 +76,7 @@
# To verify
# https://test.openml.org/api/v1/study/1
suite = openml.study.get_suite("OpenML100")
-print(all([t_id in suite.tasks for t_id in tasks]))
+print(all(t_id in suite.tasks for t_id in tasks))
run_ids = []
for task_id in tasks:
diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py
index 935d4c529..19f5cdc1a 100644
--- a/examples/30_extended/suites_tutorial.py
+++ b/examples/30_extended/suites_tutorial.py
@@ -19,16 +19,11 @@
import openml
-
############################################################################
# Listing suites
# **************
-#
-# * Use the output_format parameter to select output type
-# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
-# easier-to-work-with data structure
-suites = openml.study.list_suites(output_format="dataframe", status="all")
+suites = openml.study.list_suites(status="all")
print(suites.head(n=10))
############################################################################
@@ -51,7 +46,7 @@
############################################################################
# And we can use the task listing functionality to learn more about them:
-tasks = openml.tasks.list_tasks(output_format="dataframe")
+tasks = openml.tasks.list_tasks()
# Using ``@`` in `pd.DataFrame.query <
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
@@ -76,7 +71,7 @@
# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
-all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
+all_tasks = list(openml.tasks.list_tasks()["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
# The study needs a machine-readable and unique alias. To obtain this,
diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py
index 676a742a1..dda40de50 100644
--- a/examples/30_extended/task_manual_iteration_tutorial.py
+++ b/examples/30_extended/task_manual_iteration_tutorial.py
@@ -68,7 +68,7 @@
####################################################################################################
# And then split the data based on this:
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
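# A minimal sketch of the dataframe-based splitting pattern shown above
# (task id and split arguments are illustrative): get_X_and_y() now returns
# a (DataFrame, Series) pair, so the per-fold indices are applied with .iloc.
task = openml.tasks.get_task(3)
X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(
    repeat=0, fold=0, sample=0
)
X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]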
@@ -88,7 +88,7 @@
task_id = 3
task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -132,7 +132,7 @@
task_id = 1767
task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -176,7 +176,7 @@
task_id = 1702
task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py
index 19a7e542c..63821c7a2 100644
--- a/examples/30_extended/tasks_tutorial.py
+++ b/examples/30_extended/tasks_tutorial.py
@@ -9,7 +9,6 @@
import openml
from openml.tasks import TaskType
-import pandas as pd
############################################################################
#
@@ -30,14 +29,11 @@
# ^^^^^^^^^^^^^
#
# We will start by simply listing only *supervised classification* tasks.
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
-# request a
+# **openml.tasks.list_tasks()** returns a
# `pandas dataframe `_
-# instead to have better visualization capabilities and easier access:
+# which offers better visualization capabilities and easier access:
-tasks = openml.tasks.list_tasks(
- task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
-)
+tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
@@ -71,21 +67,21 @@
#
# Similar to listing tasks by task type, we can list tasks by tags:
-tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
+tasks = openml.tasks.list_tasks(tag="OpenML100")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
############################################################################
# Furthermore, we can list tasks based on the dataset id:
-tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe")
+tasks = openml.tasks.list_tasks(data_id=1471)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
############################################################################
# In addition, a size limit and an offset can be applied both separately and simultaneously:
-tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
+tasks = openml.tasks.list_tasks(size=10, offset=50)
print(tasks)
############################################################################
@@ -101,7 +97,7 @@
# Finally, it is also possible to list all tasks on OpenML with:
############################################################################
-tasks = openml.tasks.list_tasks(output_format="dataframe")
+tasks = openml.tasks.list_tasks()
print(len(tasks))
############################################################################
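# A minimal sketch combining the listing calls above: list_tasks() now
# returns a pandas DataFrame directly, so standard DataFrame filtering
# applies. The query reuses column names that appear later in this tutorial
# and is illustrative only.
tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
ten_fold_cv = tasks.query('estimation_procedure == "10-fold Crossvalidation"')
print(ten_fold_cv[["tid", "did", "name"]].head())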
@@ -195,7 +191,7 @@
# Error code for 'task already exists'
if e.code == 614:
# Lookup task
- tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
+ tasks = openml.tasks.list_tasks(data_id=128)
tasks = tasks.query(
'task_type == "Supervised Classification" '
'and estimation_procedure == "10-fold Crossvalidation" '
diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py
index ae59c9ced..28015557b 100644
--- a/examples/40_paper/2015_neurips_feurer_example.py
+++ b/examples/40_paper/2015_neurips_feurer_example.py
@@ -13,12 +13,10 @@
| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
| In *Advances in Neural Information Processing Systems 28*, 2015
| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-""" # noqa F401
+"""
# License: BSD 3-Clause
-import pandas as pd
-
import openml
####################################################################################################
@@ -60,7 +58,6 @@
tasks = openml.tasks.list_tasks(
task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
status="all",
- output_format="dataframe",
)
# Query only those with holdout as the resampling startegy.
@@ -68,7 +65,7 @@
task_ids = []
for did in dataset_ids:
- tasks_ = list(tasks.query("did == {}".format(did)).tid)
+ tasks_ = list(tasks.query(f"did == {did}").tid)
if len(tasks_) >= 1: # if there are multiple task, take the one with lowest ID (oldest).
task_id = min(tasks_)
else:
diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py
index 8b225125b..d9fdc78a7 100644
--- a/examples/40_paper/2018_ida_strang_example.py
+++ b/examples/40_paper/2018_ida_strang_example.py
@@ -17,8 +17,8 @@
# License: BSD 3-Clause
import matplotlib.pyplot as plt
+
import openml
-import pandas as pd
##############################################################################
# A basic step for each data-mining or machine learning task is to determine
@@ -47,13 +47,17 @@
# Downloads all evaluation records related to this study
evaluations = openml.evaluations.list_evaluations(
- measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
+ measure,
+ size=None,
+ flows=flow_ids,
+ study=study_id,
+ output_format="dataframe",
)
# gives us a table with columns data_id, flow1_value, flow2_value
evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
# downloads all data qualities (for scatter plot)
data_qualities = openml.datasets.list_datasets(
- data_id=list(evaluations.index.values), output_format="dataframe"
+ data_id=list(evaluations.index.values),
)
# removes irrelevant data qualities
data_qualities = data_qualities[meta_features]
@@ -86,10 +90,9 @@
def determine_class(val_lin, val_nonlin):
if val_lin < val_nonlin:
return class_values[0]
- elif val_nonlin < val_lin:
+ if val_nonlin < val_lin:
return class_values[1]
- else:
- return class_values[2]
+ return class_values[2]
evaluations["class"] = evaluations.apply(
diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py
index 6522013e3..751f53470 100644
--- a/examples/40_paper/2018_kdd_rijn_example.py
+++ b/examples/40_paper/2018_kdd_rijn_example.py
@@ -1,4 +1,7 @@
"""
+This example is deprecated! You will need to adapt this code manually to make it run.
+We deprecated this example in our CI because it requires fanova as a dependency, and fanova is not supported on all Python versions used in our CI/CD.
+
van Rijn and Hutter (2018)
==========================
@@ -29,147 +32,144 @@
"""
# License: BSD 3-Clause
-
+run_code = False
import sys
-
-if sys.platform == "win32": # noqa
+# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
+print("This example is deprecated, remove this code to use it manually.")
+if not run_code:
+ print("Exiting...")
+ sys.exit()
+
+import json
+
+import fanova
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+import openml
+
+##############################################################################
+# With the advent of automated machine learning, automated hyperparameter
+# optimization methods are by now routinely used in data mining. However, this
+# progress is not yet matched by equal progress on automatic analyses that
+# yield information beyond performance-optimizing hyperparameter settings.
+# In this example, we aim to answer the following two questions: Given an
+# algorithm, what are generally its most important hyperparameters?
+#
+# This work is carried out on the OpenML-100 benchmark suite, which can be
+# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we
+# conduct the experiment on the Support Vector Machine (``flow_id=7707``)
+# with specific kernel (we will perform a post-process filter operation for
+# this). We should set some other experimental parameters (number of results
+# per task, evaluation measure and the number of trees of the internal
+# functional Anova) before the fun can begin.
+#
+# Note that we simplify the example in several ways:
+#
+# 1) We only consider numerical hyperparameters
+# 2) We consider all hyperparameters that are numerical (in reality, some
+# hyperparameters might be inactive (e.g., ``degree``) or irrelevant
+# (e.g., ``random_state``)
+# 3) We assume all hyperparameters to be on uniform scale
+#
+# Any difference in conclusion between the actual paper and the presented
+# results is most likely due to one of these simplifications. For example,
+# the hyperparameter C looks rather insignificant, whereas it is quite
+# important when it is put on a log-scale. All these simplifications can be
+# addressed by defining a ConfigSpace. For a more elaborated example that uses
+# this, please see:
+# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py
+
+suite = openml.study.get_suite("OpenML100")
+flow_id = 7707
+parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"}
+evaluation_measure = "predictive_accuracy"
+limit_per_task = 500
+limit_nr_tasks = 15
+n_trees = 16
+
+fanova_results = []
+# We obtain all results from OpenML per task. Practice has shown that the bottleneck is the
+# communication with OpenML, so for repeated experiments it is better to cache the results in a local file.
+for idx, task_id in enumerate(suite.tasks):
+ if limit_nr_tasks is not None and idx >= limit_nr_tasks:
+ continue
print(
- "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
+ "Starting with task %d (%d/%d)"
+ % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
+ )
+ # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
+ evals = openml.evaluations.list_evaluations_setups(
+ evaluation_measure,
+ flows=[flow_id],
+ tasks=[task_id],
+ size=limit_per_task,
)
- exit()
-
-# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
-print("This example is deprecated, remove the `if False` in this code to use it manually.")
-if False:
- import json
- import fanova
- import matplotlib.pyplot as plt
- import pandas as pd
- import seaborn as sns
-
- import openml
-
- ##############################################################################
- # With the advent of automated machine learning, automated hyperparameter
- # optimization methods are by now routinely used in data mining. However, this
- # progress is not yet matched by equal progress on automatic analyses that
- # yield information beyond performance-optimizing hyperparameter settings.
- # In this example, we aim to answer the following two questions: Given an
- # algorithm, what are generally its most important hyperparameters?
- #
- # This work is carried out on the OpenML-100 benchmark suite, which can be
- # obtained by ``openml.study.get_suite('OpenML100')``. In this example, we
- # conduct the experiment on the Support Vector Machine (``flow_id=7707``)
- # with specific kernel (we will perform a post-process filter operation for
- # this). We should set some other experimental parameters (number of results
- # per task, evaluation measure and the number of trees of the internal
- # functional Anova) before the fun can begin.
- #
- # Note that we simplify the example in several ways:
- #
- # 1) We only consider numerical hyperparameters
- # 2) We consider all hyperparameters that are numerical (in reality, some
- # hyperparameters might be inactive (e.g., ``degree``) or irrelevant
- # (e.g., ``random_state``)
- # 3) We assume all hyperparameters to be on uniform scale
- #
- # Any difference in conclusion between the actual paper and the presented
- # results is most likely due to one of these simplifications. For example,
- # the hyperparameter C looks rather insignificant, whereas it is quite
- # important when it is put on a log-scale. All these simplifications can be
- # addressed by defining a ConfigSpace. For a more elaborated example that uses
- # this, please see:
- # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401
-
- suite = openml.study.get_suite("OpenML100")
- flow_id = 7707
- parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"}
- evaluation_measure = "predictive_accuracy"
- limit_per_task = 500
- limit_nr_tasks = 15
- n_trees = 16
-
- fanova_results = []
- # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the
- # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file.
- for idx, task_id in enumerate(suite.tasks):
- if limit_nr_tasks is not None and idx >= limit_nr_tasks:
- continue
- print(
- "Starting with task %d (%d/%d)"
- % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
- )
- # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
- evals = openml.evaluations.list_evaluations_setups(
- evaluation_measure,
- flows=[flow_id],
- tasks=[task_id],
- size=limit_per_task,
- output_format="dataframe",
+ performance_column = "value"
+ # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance
+ # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine
+ # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format
+ # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for
+ # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the
+ # setups that belong to the flows embedded in this example though.
+ try:
+ setups_evals = pd.DataFrame(
+ [
+ dict(
+ **{name: json.loads(value) for name, value in setup["parameters"].items()},
+ **{performance_column: setup[performance_column]},
+ )
+ for _, setup in evals.iterrows()
+ ]
)
-
- performance_column = "value"
- # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance
- # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine
- # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format
- # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for
- # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the
- # setups that belong to the flows embedded in this example though.
+ except json.decoder.JSONDecodeError as e:
+ print("Task %d error: %s" % (task_id, e))
+ continue
+ # apply our filters, to have only the setups that comply to the hyperparameters we want
+ for filter_key, filter_value in parameter_filters.items():
+ setups_evals = setups_evals[setups_evals[filter_key] == filter_value]
+ # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters,
+ # the fanova library needs to be informed by using a configspace object.
+ setups_evals = setups_evals.select_dtypes(include=["int64", "float64"])
+ # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``,
+ # ``verbose``.
+ setups_evals = setups_evals[
+ [
+ c
+ for c in list(setups_evals)
+ if len(setups_evals[c].unique()) > 1 or c == performance_column
+ ]
+ ]
+ # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g.,
+ # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out:
+
+ # determine x values to pass to fanova library
+ parameter_names = [
+ pname for pname in setups_evals.columns.to_numpy() if pname != performance_column
+ ]
+ evaluator = fanova.fanova.fANOVA(
+ X=setups_evals[parameter_names].to_numpy(),
+ Y=setups_evals[performance_column].to_numpy(),
+ n_trees=n_trees,
+ )
+ for idx, pname in enumerate(parameter_names):
try:
- setups_evals = pd.DataFrame(
- [
- dict(
- **{name: json.loads(value) for name, value in setup["parameters"].items()},
- **{performance_column: setup[performance_column]}
- )
- for _, setup in evals.iterrows()
- ]
+ fanova_results.append(
+ {
+ "hyperparameter": pname.split(".")[-1],
+ "fanova": evaluator.quantify_importance([idx])[(idx,)][
+ "individual importance"
+ ],
+ }
)
- except json.decoder.JSONDecodeError as e:
+ except RuntimeError as e:
+ # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
+ # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
+ # paper).
print("Task %d error: %s" % (task_id, e))
continue
- # apply our filters, to have only the setups that comply to the hyperparameters we want
- for filter_key, filter_value in parameter_filters.items():
- setups_evals = setups_evals[setups_evals[filter_key] == filter_value]
- # in this simplified example, we only display numerical and float hyperparameters. For categorical hyperparameters,
- # the fanova library needs to be informed by using a configspace object.
- setups_evals = setups_evals.select_dtypes(include=["int64", "float64"])
- # drop rows with unique values. These are by definition not an interesting hyperparameter, e.g., ``axis``,
- # ``verbose``.
- setups_evals = setups_evals[
- [
- c
- for c in list(setups_evals)
- if len(setups_evals[c].unique()) > 1 or c == performance_column
- ]
- ]
- # We are done with processing ``setups_evals``. Note that we still might have some irrelevant hyperparameters, e.g.,
- # ``random_state``. We have dropped some relevant hyperparameters, i.e., several categoricals. Let's check it out:
-
- # determine x values to pass to fanova library
- parameter_names = [
- pname for pname in setups_evals.columns.to_numpy() if pname != performance_column
- ]
- evaluator = fanova.fanova.fANOVA(
- X=setups_evals[parameter_names].to_numpy(),
- Y=setups_evals[performance_column].to_numpy(),
- n_trees=n_trees,
- )
- for idx, pname in enumerate(parameter_names):
- try:
- fanova_results.append(
- {
- "hyperparameter": pname.split(".")[-1],
- "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"],
- }
- )
- except RuntimeError as e:
- # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
- # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
- # paper).
- print("Task %d error: %s" % (task_id, e))
- continue
# transform ``fanova_results`` from a list of dicts into a DataFrame
fanova_results = pd.DataFrame(fanova_results)
diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py
index 0d72846ac..91768e010 100644
--- a/examples/40_paper/2018_neurips_perrone_example.py
+++ b/examples/40_paper/2018_neurips_perrone_example.py
@@ -27,16 +27,17 @@
# License: BSD 3-Clause
-import openml
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
+from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
-from sklearn.ensemble import RandomForestRegressor
+
+import openml
flow_type = "svm" # this example will use the smaller svm flow evaluations
############################################################################
@@ -94,7 +95,6 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu
tasks=task_ids,
flows=[flow_id],
uploaders=[2702],
- output_format="dataframe",
parameters_in_separate_columns=True,
)
return eval_df, task_ids, flow_id
@@ -181,8 +181,18 @@ def list_categorical_attributes(flow_type="svm"):
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
# Creating the one-hot encoder for numerical representation of categorical columns
-enc = OneHotEncoder(handle_unknown="ignore")
-
+enc = Pipeline(
+ [
+ (
+ "cat_si",
+ SimpleImputer(
+ strategy="constant",
+ fill_value="missing",
+ ),
+ ),
+ ("cat_ohe", OneHotEncoder(handle_unknown="ignore")),
+ ],
+)
# Combining column transformers
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
@@ -206,7 +216,7 @@ def list_categorical_attributes(flow_type="svm"):
model.fit(X, y)
y_pred = model.predict(X)
-print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred)))
+print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")
#############################################################################
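# A self-contained toy sketch (illustrative data only) of what the imputer +
# encoder pipeline above does: the constant "missing" placeholder gives NaN
# cells an explicit category that OneHotEncoder encodes alongside the
# observed values.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({"kernel": ["rbf", np.nan, "poly"]})
cat_pipeline = Pipeline(
    [
        ("cat_si", SimpleImputer(strategy="constant", fill_value="missing")),
        ("cat_ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
print(cat_pipeline.fit_transform(toy).toarray())  # 3 rows, 3 one-hot columns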
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 5190ac522..fa83d2b8a 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -19,13 +19,28 @@
from openml.base import OpenMLBase
from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
-from openml.exceptions import PyOpenMLError
from .data_feature import OpenMLDataFeature
logger = logging.getLogger(__name__)
+def _ensure_dataframe(
+ data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix,
+ attribute_names: list | None = None,
+) -> pd.DataFrame:
+ if isinstance(data, pd.DataFrame):
+ return data
+ if scipy.sparse.issparse(data):
+ return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
+ if isinstance(data, np.ndarray):
+ return pd.DataFrame(data, columns=attribute_names) # type: ignore
+ if isinstance(data, pd.Series):
+ return data.to_frame()
+
+ raise TypeError(f"Data type {type(data)} not supported.")
+
+
class OpenMLDataset(OpenMLBase):
"""Dataset object.
@@ -448,7 +463,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915
data = self._get_arff(self.format)
except OSError as e:
logger.critical(
- f"Please check that the data file {arff_file_path} is " "there and can be read.",
+ f"Please check that the data file {arff_file_path} is there and can be read.",
)
raise e
@@ -579,13 +594,17 @@ def _cache_compressed_file_from_file(
return data, categorical, attribute_names
- def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
+ def _parse_data_from_file(
+ self,
+ data_file: Path,
+ ) -> tuple[list[str], list[bool], pd.DataFrame | scipy.sparse.csr_matrix]:
if data_file.suffix == ".arff":
data, categorical, attribute_names = self._parse_data_from_arff(data_file)
elif data_file.suffix == ".pq":
attribute_names, categorical, data = self._parse_data_from_pq(data_file)
else:
raise ValueError(f"Unknown file type for file '{data_file}'.")
+
return attribute_names, categorical, data
def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
@@ -597,7 +616,7 @@ def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], p
attribute_names = list(data.columns)
return attribute_names, categorical, data
- def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool], list[str]]: # noqa: PLR0912, C901
+ def _load_data(self) -> tuple[pd.DataFrame, list[bool], list[str]]: # noqa: PLR0912, C901, PLR0915
"""Load data from compressed format or arff. Download data if not present on disk."""
need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None
need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None
@@ -608,7 +627,8 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool]
file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
assert file_to_load is not None
- return self._cache_compressed_file_from_file(Path(file_to_load))
+ data, cats, attrs = self._cache_compressed_file_from_file(Path(file_to_load))
+ return _ensure_dataframe(data, attrs), cats, attrs
# helper variable to help identify where errors occur
fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file
@@ -620,12 +640,13 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool]
data = pd.read_feather(self.data_feather_file)
fpath = self.feather_attribute_file
- with open(self.feather_attribute_file, "rb") as fh: # noqa: PTH123
+ with self.feather_attribute_file.open("rb") as fh:
categorical, attribute_names = pickle.load(fh) # noqa: S301
else:
assert self.data_pickle_file is not None
- with open(self.data_pickle_file, "rb") as fh: # noqa: PTH123
+ with self.data_pickle_file.open("rb") as fh:
data, categorical, attribute_names = pickle.load(fh) # noqa: S301
+
except FileNotFoundError as e:
raise ValueError(
f"Cannot find file for dataset {self.name} at location '{fpath}'."
@@ -664,7 +685,7 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool]
file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
assert file_to_load is not None
attr, cat, df = self._parse_data_from_file(Path(file_to_load))
- return df, cat, attr
+            return _ensure_dataframe(df, attr), cat, attr
data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data)
if self.cache_format == "pickle" and not data_up_to_date:
@@ -672,79 +693,9 @@ def _load_data(self) -> tuple[pd.DataFrame | scipy.sparse.csr_matrix, list[bool]
file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
assert file_to_load is not None
- return self._cache_compressed_file_from_file(Path(file_to_load))
- return data, categorical, attribute_names
-
- # TODO(eddiebergman): Can type this better with overload
- # TODO(eddiebergman): Could also techinically use scipy.sparse.sparray
- @staticmethod
- def _convert_array_format(
- data: pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix,
- array_format: Literal["array", "dataframe"],
- attribute_names: list | None = None,
- ) -> pd.DataFrame | pd.Series | np.ndarray | scipy.sparse.spmatrix:
- """Convert a dataset to a given array format.
-
- Converts to numpy array if data is non-sparse.
- Converts to a sparse dataframe if data is sparse.
+            data, categorical, attribute_names = self._cache_compressed_file_from_file(
+                Path(file_to_load),
+            )
- Parameters
- ----------
- array_format : str {'array', 'dataframe'}
- Desired data type of the output
- - If array_format='array'
- If data is non-sparse
- Converts to numpy-array
- Enforces numeric encoding of categorical columns
- Missing values are represented as NaN in the numpy-array
- else returns data as is
- - If array_format='dataframe'
- If data is sparse
- Works only on sparse data
- Converts sparse data to sparse dataframe
- else returns data as is
-
- """
- if array_format == "array" and not isinstance(data, scipy.sparse.spmatrix):
- # We encode the categories such that they are integer to be able
- # to make a conversion to numeric for backward compatibility
- def _encode_if_category(column: pd.Series | np.ndarray) -> pd.Series | np.ndarray:
- if column.dtype.name == "category":
- column = column.cat.codes.astype(np.float32)
- mask_nan = column == -1
- column[mask_nan] = np.nan
- return column
-
- if isinstance(data, pd.DataFrame):
- columns = {
- column_name: _encode_if_category(data.loc[:, column_name])
- for column_name in data.columns
- }
- data = pd.DataFrame(columns)
- else:
- data = _encode_if_category(data)
-
- try:
- # TODO(eddiebergman): float32?
- return_array = np.asarray(data, dtype=np.float32)
- except ValueError as e:
- raise PyOpenMLError(
- "PyOpenML cannot handle string when returning numpy"
- ' arrays. Use dataset_format="dataframe".',
- ) from e
-
- return return_array
-
- if array_format == "dataframe":
- if scipy.sparse.issparse(data):
- data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
- else:
- data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
- logger.warning(
- f"Cannot convert {data_type} ({type(data)}) to '{array_format}'."
- " Returning input data.",
- )
- return data
+ return _ensure_dataframe(data, attribute_names), categorical, attribute_names
@staticmethod
def _unpack_categories(series: pd.Series, categories: list) -> pd.Series:
@@ -765,19 +716,13 @@ def valid_category(cat: Any) -> bool:
raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories)
return pd.Series(raw_cat, index=series.index, name=series.name)
- def get_data( # noqa: C901, PLR0912, PLR0915
+ def get_data( # noqa: C901
self,
target: list[str] | str | None = None,
include_row_id: bool = False, # noqa: FBT001, FBT002
include_ignore_attribute: bool = False, # noqa: FBT001, FBT002
- dataset_format: Literal["array", "dataframe"] = "dataframe",
- ) -> tuple[
- np.ndarray | pd.DataFrame | scipy.sparse.csr_matrix,
- np.ndarray | pd.DataFrame | None,
- list[bool],
- list[str],
- ]:
- """Returns dataset content as dataframes or sparse matrices.
+ ) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]:
+ """Returns dataset content as dataframes.
Parameters
----------
@@ -789,35 +734,20 @@ def get_data( # noqa: C901, PLR0912, PLR0915
include_ignore_attribute : boolean (default=False)
Whether to include columns that are marked as "ignore"
on the server in the dataset.
- dataset_format : string (default='dataframe')
- The format of returned dataset.
- If ``array``, the returned dataset will be a NumPy array or a SciPy sparse
- matrix. Support for ``array`` will be removed in 0.15.
- If ``dataframe``, the returned dataset will be a Pandas DataFrame.
Returns
-------
- X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
- Dataset
- y : ndarray or pd.Series, shape (n_samples, ) or None
+ X : dataframe, shape (n_samples, n_columns)
+ Dataset, may have sparse dtypes in the columns if required.
+ y : pd.Series, shape (n_samples, ) or None
Target column
- categorical_indicator : boolean ndarray
+ categorical_indicator : list[bool]
Mask that indicate categorical features.
- attribute_names : List[str]
+ attribute_names : list[str]
List of attribute names.
"""
- # TODO: [0.15]
- if dataset_format == "array":
- warnings.warn(
- "Support for `dataset_format='array'` will be removed in 0.15,"
- "start using `dataset_format='dataframe' to ensure your code "
- "will continue to work. You can use the dataframe's `to_numpy` "
- "function to continue using numpy arrays.",
- category=FutureWarning,
- stacklevel=2,
- )
- data, categorical, attribute_names = self._load_data()
+ data, categorical_mask, attribute_names = self._load_data()
to_exclude = []
if not include_row_id and self.row_id_attribute is not None:
@@ -835,54 +765,34 @@ def get_data( # noqa: C901, PLR0912, PLR0915
if len(to_exclude) > 0:
logger.info(f"Going to remove the following attributes: {to_exclude}")
keep = np.array([column not in to_exclude for column in attribute_names])
- data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep]
-
- categorical = [cat for cat, k in zip(categorical, keep) if k]
+ data = data.drop(columns=to_exclude)
+ categorical_mask = [cat for cat, k in zip(categorical_mask, keep) if k]
attribute_names = [att for att, k in zip(attribute_names, keep) if k]
if target is None:
- data = self._convert_array_format(data, dataset_format, attribute_names) # type: ignore
- targets = None
+ return data, None, categorical_mask, attribute_names
+
+ if isinstance(target, str):
+ target_names = target.split(",") if "," in target else [target]
else:
- if isinstance(target, str):
- target = target.split(",") if "," in target else [target]
- targets = np.array([column in target for column in attribute_names])
- target_names = [column for column in attribute_names if column in target]
- if np.sum(targets) > 1:
- raise NotImplementedError(
- "Number of requested targets %d is not implemented." % np.sum(targets),
- )
- target_categorical = [
- cat for cat, column in zip(categorical, attribute_names) if column in target
- ]
- target_dtype = int if target_categorical[0] else float
-
- if isinstance(data, pd.DataFrame):
- x = data.iloc[:, ~targets]
- y = data.iloc[:, targets]
- else:
- x = data[:, ~targets]
- y = data[:, targets].astype(target_dtype) # type: ignore
-
- categorical = [cat for cat, t in zip(categorical, targets) if not t]
- attribute_names = [att for att, k in zip(attribute_names, targets) if not k]
-
- x = self._convert_array_format(x, dataset_format, attribute_names) # type: ignore
- if dataset_format == "array" and scipy.sparse.issparse(y):
- # scikit-learn requires dense representation of targets
- y = np.asarray(y.todense()).astype(target_dtype)
- # dense representation of single column sparse arrays become a 2-d array
- # need to flatten it to a 1-d array for _convert_array_format()
- y = y.squeeze()
- y = self._convert_array_format(y, dataset_format, target_names)
- y = y.astype(target_dtype) if isinstance(y, np.ndarray) else y
- if len(y.shape) > 1 and y.shape[1] == 1:
- # single column targets should be 1-d for both `array` and `dataframe` formats
- assert isinstance(y, (np.ndarray, pd.DataFrame, pd.Series))
- y = y.squeeze()
- data, targets = x, y
-
- return data, targets, categorical, attribute_names # type: ignore
+ target_names = target
+
+    # All the assumptions below for the target depend on there being exactly one target
+ n_targets = len(target_names)
+ if n_targets > 1:
+ raise NotImplementedError(f"Number of targets {n_targets} not implemented.")
+
+ target_name = target_names[0]
+ x = data.drop(columns=[target_name])
+ y = data[target_name].squeeze()
+
+ # Finally, remove the target from the list of attributes and categorical mask
+ target_index = attribute_names.index(target_name)
+ categorical_mask.pop(target_index)
+ attribute_names.remove(target_name)
+
+ assert isinstance(y, pd.Series)
+ return x, y, categorical_mask, attribute_names
def _load_features(self) -> None:
"""Load the features metadata from the server and store it in the dataset object."""
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 026515e5d..59f1da521 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -6,9 +6,10 @@
import os
import warnings
from collections import OrderedDict
+from functools import partial
from pathlib import Path
from pyexpat import ExpatError
-from typing import TYPE_CHECKING, Any, overload
+from typing import TYPE_CHECKING, Any
from typing_extensions import Literal
import arff
@@ -72,59 +73,26 @@ def list_qualities() -> list[str]:
raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"')
if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list):
- raise TypeError("Error in return XML, does not contain " '"oml:quality" as a list')
+ raise TypeError('Error in return XML, does not contain "oml:quality" as a list')
return qualities["oml:data_qualities_list"]["oml:quality"]
-@overload
-def list_datasets(
- data_id: list[int] | None = ...,
- offset: int | None = ...,
- size: int | None = ...,
- status: str | None = ...,
- tag: str | None = ...,
- *,
- output_format: Literal["dataframe"],
- **kwargs: Any,
-) -> pd.DataFrame: ...
-
-
-@overload
-def list_datasets(
- data_id: list[int] | None,
- offset: int | None,
- size: int | None,
- status: str | None,
- tag: str | None,
- output_format: Literal["dataframe"],
- **kwargs: Any,
-) -> pd.DataFrame: ...
-
-
-@overload
-def list_datasets(
- data_id: list[int] | None = ...,
- offset: int | None = ...,
- size: int | None = ...,
- status: str | None = ...,
- tag: str | None = ...,
- output_format: Literal["dict"] = "dict",
- **kwargs: Any,
-) -> pd.DataFrame: ...
-
-
def list_datasets(
data_id: list[int] | None = None,
offset: int | None = None,
size: int | None = None,
status: str | None = None,
tag: str | None = None,
- output_format: Literal["dataframe", "dict"] = "dict",
- **kwargs: Any,
-) -> dict | pd.DataFrame:
- """
- Return a list of all dataset which are on OpenML.
+ data_name: str | None = None,
+ data_version: int | None = None,
+ number_instances: int | str | None = None,
+ number_features: int | str | None = None,
+ number_classes: int | str | None = None,
+ number_missing_values: int | str | None = None,
+) -> pd.DataFrame:
+ """Return a dataframe of all dataset which are on OpenML.
+
+    Supports a large number of results.
Parameters
@@ -141,87 +109,51 @@ def list_datasets(
default active datasets are returned, but also datasets
from another status can be requested.
tag : str, optional
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
- kwargs : dict, optional
- Legal filter operators (keys in the dict):
- data_name, data_version, number_instances,
- number_features, number_classes, number_missing_values.
+ data_name : str, optional
+ data_version : int, optional
+ number_instances : int | str, optional
+ number_features : int | str, optional
+ number_classes : int | str, optional
+ number_missing_values : int | str, optional
Returns
-------
- datasets : dict of dicts, or dataframe
- - If output_format='dict'
- A mapping from dataset ID to dict.
-
- Every dataset is represented by a dictionary containing
- the following information:
- - dataset id
- - name
- - format
- - status
- If qualities are calculated for the dataset, some of
- these are also returned.
-
- - If output_format='dataframe'
- Each row maps to a dataset
- Each column contains the following information:
- - dataset id
- - name
- - format
- - status
- If qualities are calculated for the dataset, some of
- these are also included as columns.
+ datasets: dataframe
+ Each row maps to a dataset
+ Each column contains the following information:
+ - dataset id
+ - name
+ - format
+ - status
+ If qualities are calculated for the dataset, some of
+ these are also included as columns.
"""
- if output_format not in ["dataframe", "dict"]:
- raise ValueError(
- "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.",
- )
-
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15 "
- "and pandas dataframes will be returned instead. To ensure your code "
- "will continue to work, use `output_format`='dataframe'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
-
- return openml.utils._list_all( # type: ignore
+ listing_call = partial(
+ _list_datasets,
data_id=data_id,
- list_output_format=output_format, # type: ignore
- listing_call=_list_datasets,
- offset=offset,
- size=size,
status=status,
tag=tag,
- **kwargs,
+ data_name=data_name,
+ data_version=data_version,
+ number_instances=number_instances,
+ number_features=number_features,
+ number_classes=number_classes,
+ number_missing_values=number_missing_values,
)
+ batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+ if len(batches) == 0:
+ return pd.DataFrame()
-
-@overload
-def _list_datasets(
- data_id: list | None = ...,
- output_format: Literal["dict"] = "dict",
- **kwargs: Any,
-) -> dict: ...
+ return pd.concat(batches)
-@overload
def _list_datasets(
- data_id: list | None = ...,
- output_format: Literal["dataframe"] = "dataframe",
- **kwargs: Any,
-) -> pd.DataFrame: ...
-
-
-def _list_datasets(
- data_id: list | None = None,
- output_format: Literal["dict", "dataframe"] = "dict",
+ limit: int,
+ offset: int,
+ *,
+ data_id: list[int] | None = None,
**kwargs: Any,
-) -> dict | pd.DataFrame:
+) -> pd.DataFrame:
"""
Perform api call to return a list of all datasets.
@@ -232,12 +164,12 @@ def _list_datasets(
display_errors is also separated from the kwargs since it has a
default value.
+ limit : int
+ The maximum number of datasets to show.
+ offset : int
+ The number of datasets to skip, starting from the first.
data_id : list, optional
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
kwargs : dict, optional
Legal filter operators (keys in the dict):
tag, status, limit, offset, data_name, data_version, number_instances,
@@ -245,30 +177,25 @@ def _list_datasets(
Returns
-------
- datasets : dict of dicts, or dataframe
+ datasets : dataframe
"""
api_call = "data/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
+
if kwargs is not None:
for operator, value in kwargs.items():
- api_call += f"/{operator}/{value}"
+ if value is not None:
+ api_call += f"/{operator}/{value}"
if data_id is not None:
api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id]))
- return __list_datasets(api_call=api_call, output_format=output_format)
-
-
-@overload
-def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ...
-
-
-@overload
-def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ...
+ return __list_datasets(api_call=api_call)
-def __list_datasets(
- api_call: str,
- output_format: Literal["dict", "dataframe"] = "dict",
-) -> dict | pd.DataFrame:
+def __list_datasets(api_call: str) -> pd.DataFrame:
xml_string = openml._api_calls._perform_api_call(api_call, "get")
datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
@@ -297,10 +224,13 @@ def __list_datasets(
dataset[quality["@name"]] = float(quality["#text"])
datasets[dataset["did"]] = dataset
- if output_format == "dataframe":
- datasets = pd.DataFrame.from_dict(datasets, orient="index")
-
- return datasets
+ return pd.DataFrame.from_dict(datasets, orient="index").astype(
+ {
+ "did": int,
+ "version": int,
+ "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]),
+ }
+ )
def _expand_parameter(parameter: str | list[str] | None) -> list[str]:
@@ -351,12 +281,13 @@ def check_datasets_active(
dict
A dictionary with items {did: bool}
"""
- datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
- missing = set(dataset_ids) - set(datasets.get("did", []))
+ datasets = list_datasets(status="all", data_id=dataset_ids)
+ missing = set(dataset_ids) - set(datasets.index)
if raise_error_if_not_exist and missing:
missing_str = ", ".join(str(did) for did in missing)
raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.")
- return dict(datasets["status"] == "active")
+ mask = datasets["status"] == "active"
+ return dict(mask)
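# A minimal usage sketch of the refactored list_datasets(): the old
# output_format / **kwargs interface is gone and the server-side filters are
# explicit keyword arguments. The int | str typing of the number_* filters
# suggests range strings such as "100..1000" are passed through to the API
# unchanged; all values below are illustrative.
binary_datasets = openml.datasets.list_datasets(
    status="active",
    number_classes=2,
    number_instances="100..1000",
    size=10,
)
print(binary_datasets[["did", "name", "NumberOfInstances"]])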
def _name_to_id(
@@ -394,7 +325,6 @@ def _name_to_id(
data_name=dataset_name,
status=status,
data_version=version,
- output_format="dataframe",
)
if error_if_multiple and len(candidates) > 1:
msg = f"Multiple active datasets exist with name '{dataset_name}'."
@@ -1497,8 +1427,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
def _get_online_dataset_format(dataset_id: int) -> str:
- """Get the dataset format for a given dataset id
- from the OpenML website.
+ """Get the dataset format for a given dataset id from the OpenML website.
Parameters
----------
diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py
index 3cf732f25..70fab9f28 100644
--- a/openml/evaluations/evaluation.py
+++ b/openml/evaluations/evaluation.py
@@ -8,6 +8,8 @@
import openml.tasks
+# TODO(eddiebergman): Much of this class could be generated automatically
+# by converting it to a dataclass
class OpenMLEvaluation:
"""
Contains all meta-information about a run / evaluation combination,
@@ -78,6 +80,24 @@ def __init__( # noqa: PLR0913
self.values = values
self.array_data = array_data
+ def _to_dict(self) -> dict:
+ return {
+ "run_id": self.run_id,
+ "task_id": self.task_id,
+ "setup_id": self.setup_id,
+ "flow_id": self.flow_id,
+ "flow_name": self.flow_name,
+ "data_id": self.data_id,
+ "data_name": self.data_name,
+ "function": self.function,
+ "upload_time": self.upload_time,
+ "uploader": self.uploader,
+ "uploader_name": self.uploader_name,
+ "value": self.value,
+ "values": self.values,
+ "array_data": self.array_data,
+ }
+
def __repr__(self) -> str:
header = "OpenML Evaluation"
header = "{}\n{}\n".format(header, "=" * len(header))
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index a39096a58..f44fe3a93 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -3,7 +3,8 @@
from __future__ import annotations
import json
-import warnings
+from functools import partial
+from itertools import chain
from typing import Any
from typing_extensions import Literal, overload
@@ -20,43 +21,43 @@
@overload
def list_evaluations(
function: str,
- offset: int | None = ...,
- size: int | None = ...,
- tasks: list[str | int] | None = ...,
- setups: list[str | int] | None = ...,
- flows: list[str | int] | None = ...,
- runs: list[str | int] | None = ...,
- uploaders: list[str | int] | None = ...,
- tag: str | None = ...,
- study: int | None = ...,
- per_fold: bool | None = ...,
- sort_order: str | None = ...,
- output_format: Literal["dict", "object"] = "dict",
-) -> dict: ...
+ offset: int | None = None,
+ size: int | None = None,
+ tasks: list[str | int] | None = None,
+ setups: list[str | int] | None = None,
+ flows: list[str | int] | None = None,
+ runs: list[str | int] | None = None,
+ uploaders: list[str | int] | None = None,
+ tag: str | None = None,
+ study: int | None = None,
+ per_fold: bool | None = None,
+ sort_order: str | None = None,
+ output_format: Literal["dataframe"] = ...,
+) -> pd.DataFrame: ...
@overload
def list_evaluations(
function: str,
- offset: int | None = ...,
- size: int | None = ...,
- tasks: list[str | int] | None = ...,
- setups: list[str | int] | None = ...,
- flows: list[str | int] | None = ...,
- runs: list[str | int] | None = ...,
- uploaders: list[str | int] | None = ...,
- tag: str | None = ...,
- study: int | None = ...,
- per_fold: bool | None = ...,
- sort_order: str | None = ...,
- output_format: Literal["dataframe"] = ...,
-) -> pd.DataFrame: ...
+ offset: int | None = None,
+ size: int | None = None,
+ tasks: list[str | int] | None = None,
+ setups: list[str | int] | None = None,
+ flows: list[str | int] | None = None,
+ runs: list[str | int] | None = None,
+ uploaders: list[str | int] | None = None,
+ tag: str | None = None,
+ study: int | None = None,
+ per_fold: bool | None = None,
+ sort_order: str | None = None,
+ output_format: Literal["object"] = "object",
+) -> dict[int, OpenMLEvaluation]: ...
def list_evaluations(
function: str,
offset: int | None = None,
- size: int | None = 10000,
+ size: int | None = None,
tasks: list[str | int] | None = None,
setups: list[str | int] | None = None,
flows: list[str | int] | None = None,
@@ -66,10 +67,10 @@ def list_evaluations(
study: int | None = None,
per_fold: bool | None = None,
sort_order: str | None = None,
- output_format: Literal["object", "dict", "dataframe"] = "object",
-) -> dict | pd.DataFrame:
- """
- List all run-evaluation pairs matching all of the given filters.
+ output_format: Literal["object", "dataframe"] = "object",
+) -> dict[int, OpenMLEvaluation] | pd.DataFrame:
+ """List all run-evaluation pairs matching all of the given filters.
+
(Supports large amount of results)
Parameters
@@ -105,37 +106,22 @@ def list_evaluations(
output_format: str, optional (default='object')
The parameter decides the format of the output.
- If 'object' the output is a dict of OpenMLEvaluation objects
- - If 'dict' the output is a dict of dict
- If 'dataframe' the output is a pandas DataFrame
Returns
-------
dict or dataframe
"""
- if output_format not in ["dataframe", "dict", "object"]:
- raise ValueError(
- "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.",
- )
-
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15. "
- "To ensure your code will continue to work, "
- "use `output_format`='dataframe' or `output_format`='object'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
+ if output_format not in ("dataframe", "object"):
+ raise ValueError("Invalid output format. Only 'object', 'dataframe'.")
per_fold_str = None
if per_fold is not None:
per_fold_str = str(per_fold).lower()
- return openml.utils._list_all( # type: ignore
- list_output_format=output_format, # type: ignore
- listing_call=_list_evaluations,
+ listing_call = partial(
+ _list_evaluations,
function=function,
- offset=offset,
- size=size,
tasks=tasks,
setups=setups,
flows=flows,
@@ -146,9 +132,20 @@ def list_evaluations(
sort_order=sort_order,
per_fold=per_fold_str,
)
+ eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size)
+
+ flattened = list(chain.from_iterable(eval_collection))
+ if output_format == "dataframe":
+ records = [item._to_dict() for item in flattened]
+        return pd.DataFrame.from_records(records)  # default RangeIndex; not indexed by run_id
+ return {e.run_id: e for e in flattened}
-def _list_evaluations(
+
+def _list_evaluations( # noqa: C901
+ limit: int,
+ offset: int,
+ *,
function: str,
tasks: list | None = None,
setups: list | None = None,
@@ -157,9 +154,8 @@ def _list_evaluations(
uploaders: list | None = None,
study: int | None = None,
sort_order: str | None = None,
- output_format: Literal["object", "dict", "dataframe"] = "object",
**kwargs: Any,
-) -> dict | pd.DataFrame:
+) -> list[OpenMLEvaluation]:
"""
Perform API call ``/evaluation/function{function}/{filters}``
@@ -168,6 +164,10 @@ def _list_evaluations(
The arguments that are lists are separated from the single value
ones which are put into the kwargs.
+ limit : int
+ the number of evaluations to return
+ offset : int
+ the number of evaluations to skip, starting from the first
function : str
the evaluation function. e.g., predictive_accuracy
@@ -185,27 +185,24 @@ def _list_evaluations(
study : int, optional
kwargs: dict, optional
- Legal filter operators: tag, limit, offset.
+ Legal filter operators: tag, per_fold
sort_order : str, optional
order of sorting evaluations, ascending ("asc") or descending ("desc")
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
- - If 'dataframe' the output is a pandas DataFrame
-
Returns
-------
- dict of objects, or dataframe
+ list of OpenMLEvaluation objects
"""
api_call = f"evaluation/list/function/{function}"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
if kwargs is not None:
for operator, value in kwargs.items():
- api_call += f"/{operator}/{value}"
+ if value is not None:
+ api_call += f"/{operator}/{value}"
if tasks is not None:
api_call += "/task/{}".format(",".join([str(int(i)) for i in tasks]))
if setups is not None:
@@ -217,17 +214,14 @@ def _list_evaluations(
if uploaders is not None:
api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders]))
if study is not None:
- api_call += "/study/%d" % study
+ api_call += f"/study/{study}"
if sort_order is not None:
api_call += f"/sort_order/{sort_order}"
- return __list_evaluations(api_call, output_format=output_format)
+ return __list_evaluations(api_call)
-def __list_evaluations(
- api_call: str,
- output_format: Literal["object", "dict", "dataframe"] = "object",
-) -> dict | pd.DataFrame:
+def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]:
"""Helper function to parse API calls which are lists of runs"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",))
@@ -241,29 +235,24 @@ def __list_evaluations(
evals_dict["oml:evaluations"],
)
- evals: dict[int, dict | OpenMLEvaluation] = {}
uploader_ids = list(
{eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]},
)
api_users = "user/list/user_id/" + ",".join(uploader_ids)
xml_string_user = openml._api_calls._perform_api_call(api_users, "get")
+
users = xmltodict.parse(xml_string_user, force_list=("oml:user",))
user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]}
+
+ evals = []
for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]:
run_id = int(eval_["oml:run_id"])
-
- value = None
- if "oml:value" in eval_:
- value = float(eval_["oml:value"])
-
- values = None
- if "oml:values" in eval_:
- values = json.loads(eval_["oml:values"])
-
+ value = float(eval_["oml:value"]) if "oml:value" in eval_ else None
+ values = json.loads(eval_["oml:values"]) if eval_.get("oml:values", None) else None
array_data = eval_.get("oml:array_data")
- if output_format == "object":
- evals[run_id] = OpenMLEvaluation(
+ evals.append(
+ OpenMLEvaluation(
run_id=run_id,
task_id=int(eval_["oml:task_id"]),
setup_id=int(eval_["oml:setup_id"]),
@@ -279,28 +268,7 @@ def __list_evaluations(
values=values,
array_data=array_data,
)
- else:
- # for output_format in ['dict', 'dataframe']
- evals[run_id] = {
- "run_id": int(eval_["oml:run_id"]),
- "task_id": int(eval_["oml:task_id"]),
- "setup_id": int(eval_["oml:setup_id"]),
- "flow_id": int(eval_["oml:flow_id"]),
- "flow_name": eval_["oml:flow_name"],
- "data_id": int(eval_["oml:data_id"]),
- "data_name": eval_["oml:data_name"],
- "function": eval_["oml:function"],
- "upload_time": eval_["oml:upload_time"],
- "uploader": int(eval_["oml:uploader"]),
- "uploader_name": user_dict[eval_["oml:uploader"]],
- "value": value,
- "values": values,
- "array_data": array_data,
- }
-
- if output_format == "dataframe":
- rows = list(evals.values())
- return pd.DataFrame.from_records(rows, columns=rows[0].keys()) # type: ignore
+ )
return evals
@@ -321,9 +289,11 @@ def list_evaluation_measures() -> list[str]:
qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
# Minimalistic check if the XML is useful
if "oml:evaluation_measures" not in qualities:
- raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"')
+ raise ValueError('Error in return XML, does not contain "oml:evaluation_measures"')
+
if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list):
- raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list')
+ raise TypeError('Error in return XML, does not contain "oml:measure" as a list')
+
return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"]
@@ -343,14 +313,13 @@ def list_estimation_procedures() -> list[str]:
# Minimalistic check if the XML is useful
if "oml:estimationprocedures" not in api_results:
- raise ValueError("Error in return XML, does not contain " '"oml:estimationprocedures"')
+ raise ValueError('Error in return XML, does not contain "oml:estimationprocedures"')
+
if "oml:estimationprocedure" not in api_results["oml:estimationprocedures"]:
- raise ValueError("Error in return XML, does not contain " '"oml:estimationprocedure"')
+ raise ValueError('Error in return XML, does not contain "oml:estimationprocedure"')
if not isinstance(api_results["oml:estimationprocedures"]["oml:estimationprocedure"], list):
- raise TypeError(
- "Error in return XML, does not contain " '"oml:estimationprocedure" as a list',
- )
+ raise TypeError('Error in return XML, does not contain "oml:estimationprocedure" as a list')
return [
prod["oml:name"]
@@ -370,11 +339,9 @@ def list_evaluations_setups(
tag: str | None = None,
per_fold: bool | None = None,
sort_order: str | None = None,
- output_format: str = "dataframe",
parameters_in_separate_columns: bool = False, # noqa: FBT001, FBT002
-) -> dict | pd.DataFrame:
- """
- List all run-evaluation pairs matching all of the given filters
+) -> pd.DataFrame:
+ """List all run-evaluation pairs matching all of the given filters
and their hyperparameter settings.
Parameters
@@ -400,23 +367,16 @@ def list_evaluations_setups(
per_fold : bool, optional
sort_order : str, optional
order of sorting evaluations, ascending ("asc") or descending ("desc")
- output_format: str, optional (default='dataframe')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
parameters_in_separate_columns: bool, optional (default= False)
Returns hyperparameters in separate columns if set to True.
Valid only for a single flow
-
Returns
-------
- dict or dataframe with hyperparameter settings as a list of tuples.
+    dataframe with hyperparameter settings as a dict of {hyperparameter: value} per run.
"""
if parameters_in_separate_columns and (flows is None or len(flows) != 1):
- raise ValueError(
- "Can set parameters_in_separate_columns to true " "only for single flow_id",
- )
+ raise ValueError("Can set parameters_in_separate_columns to true only for single flow_id")
# List evaluations
evals = list_evaluations(
@@ -439,21 +399,24 @@ def list_evaluations_setups(
_df = pd.DataFrame()
if len(evals) != 0:
N = 100 # size of section
- length = len(evals["setup_id"].unique()) # length of the array we want to split
+ uniq = np.asarray(evals["setup_id"].unique())
+ length = len(uniq)
+
# array_split - allows indices_or_sections to not equally divide the array
# array_split -length % N sub-arrays of size length//N + 1 and the rest of size length//N.
- uniq = np.asarray(evals["setup_id"].unique())
- setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1)
+    n_chunks = ((length - 1) // N) + 1
+    setup_chunks = np.array_split(uniq, n_chunks)
+
setup_data = pd.DataFrame()
for _setups in setup_chunks:
result = openml.setups.list_setups(setup=_setups, output_format="dataframe")
assert isinstance(result, pd.DataFrame)
result = result.drop("flow_id", axis=1)
# concat resulting setup chunks into single datframe
- setup_data = pd.concat([setup_data, result], ignore_index=True)
+ setup_data = pd.concat([setup_data, result])
parameters = []
- # Convert parameters of setup into list of tuples of (hyperparameter, value)
+        # Convert the parameters of each setup into a dict of {hyperparameter: value}
for parameter_dict in setup_data["parameters"]:
if parameter_dict is not None:
parameters.append(
@@ -471,7 +434,4 @@ def list_evaluations_setups(
axis=1,
)
- if output_format == "dataframe":
- return _df
-
- return _df.to_dict(orient="index")
+ return _df
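A hedged usage sketch of the updated listing API after this change (the metric name is a real OpenML evaluation function; the size is arbitrary):

    import openml

    # Default 'object' format: dict mapping run_id -> OpenMLEvaluation
    evals = openml.evaluations.list_evaluations("predictive_accuracy", size=10)

    # 'dataframe' format: one row per evaluation, built from OpenMLEvaluation._to_dict()
    evals_df = openml.evaluations.list_evaluations(
        "predictive_accuracy", size=10, output_format="dataframe"
    )
    print(type(evals), evals_df.shape)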
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index 2d40d03b8..fc8697e84 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -1144,7 +1144,7 @@ def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set]
optional_params[param] = default_val
return optional_params, required_params
- def _deserialize_model(
+ def _deserialize_model( # noqa: C901
self,
flow: OpenMLFlow,
keep_defaults: bool, # noqa: FBT001
@@ -1219,6 +1219,20 @@ def _deserialize_model(
if param not in components:
del parameter_dict[param]
+ if not strict_version:
+            # Drop parameters the installed version of the estimator does not accept
+            allowed_parameters = list(inspect.signature(model_class.__init__).parameters)
+            for p in list(parameter_dict.keys()):
+                if p not in allowed_parameters:
+                    warnings.warn(
+                        f"While deserializing in a non-strict way, parameter {p} is not "
+                        f"accepted by {model_class.__name__}, likely due to a version mismatch. "
+                        "The parameter is ignored.",
+ UserWarning,
+ stacklevel=2,
+ )
+ del parameter_dict[p]
+
return model_class(**parameter_dict)
def _check_dependencies(
@@ -1254,8 +1268,7 @@ def _check_dependencies(
else:
raise NotImplementedError(f"operation '{operation}' is not supported")
message = (
- "Trying to deserialize a model with dependency "
- f"{dependency_string} not satisfied."
+ f"Trying to deserialize a model with dependency {dependency_string} not satisfied."
)
if not check:
if strict_version:
@@ -1497,7 +1510,7 @@ def _prevent_optimize_n_jobs(self, model):
)
if len(n_jobs_vals) > 0:
raise PyOpenMLError(
- "openml-python should not be used to " "optimize the n_jobs parameter.",
+ "openml-python should not be used to optimize the n_jobs parameter.",
)
################################################################################################
@@ -1555,7 +1568,7 @@ def _seed_current_object(current_value):
if current_value is not None:
raise ValueError(
- "Models should be seeded with int or None (this should never " "happen). ",
+ "Models should be seeded with int or None (this should never happen). ",
)
return True
@@ -1780,10 +1793,10 @@ def _prediction_to_probabilities(
# to handle the case when dataset is numpy and categories are encoded
# however the class labels stored in task are still categories
if isinstance(y_train, np.ndarray) and isinstance(
- cast(List, task.class_labels)[0],
+ cast("List", task.class_labels)[0],
str,
):
- model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes]
+ model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes]
modelpredict_start_cputime = time.process_time()
modelpredict_start_walltime = time.time()
@@ -2006,7 +2019,7 @@ def is_subcomponent_specification(values):
# (mixed)). OpenML replaces the subcomponent by an
# OpenMLFlow object.
if len(subcomponent) < 2 or len(subcomponent) > 3:
- raise ValueError("Component reference should be " "size {2,3}. ")
+ raise ValueError("Component reference should be size {2,3}. ")
subcomponent_identifier = subcomponent[0]
subcomponent_flow = subcomponent[1]
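The non-strict deserialization branch above drops keyword arguments the locally installed estimator no longer accepts. A standalone sketch of that signature-based filtering, assuming a parameter (`presort`) that newer scikit-learn versions removed:

    import inspect
    import warnings

    from sklearn.tree import DecisionTreeClassifier

    # Parameters recorded by an older flow; 'presort' no longer exists in recent scikit-learn.
    parameter_dict = {"max_depth": 3, "presort": False}

    allowed = set(inspect.signature(DecisionTreeClassifier.__init__).parameters)
    for name in list(parameter_dict):
        if name not in allowed:
            warnings.warn(f"Ignoring unsupported parameter {name!r}", UserWarning, stacklevel=2)
            del parameter_dict[name]

    model = DecisionTreeClassifier(**parameter_dict)  # constructed without the stale parameter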
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index 3d056ac60..9906958e5 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -3,10 +3,9 @@
import os
import re
-import warnings
from collections import OrderedDict
-from typing import Any, Dict, overload
-from typing_extensions import Literal
+from functools import partial
+from typing import Any, Dict
import dateutil.parser
import pandas as pd
@@ -67,7 +66,7 @@ def _get_cached_flow(fid: int) -> OpenMLFlow:
return _create_flow_from_xml(fh.read())
except OSError as e:
openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir)
- raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) from e
+ raise OpenMLCacheException("Flow file for fid %d not cached" % fid) from e
@openml.utils.thread_safe_if_oslo_installed
@@ -133,44 +132,12 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow:
return _create_flow_from_xml(flow_xml)
-@overload
-def list_flows(
- offset: int | None = ...,
- size: int | None = ...,
- tag: str | None = ...,
- output_format: Literal["dict"] = "dict",
- **kwargs: Any,
-) -> dict: ...
-
-
-@overload
-def list_flows(
- offset: int | None = ...,
- size: int | None = ...,
- tag: str | None = ...,
- *,
- output_format: Literal["dataframe"],
- **kwargs: Any,
-) -> pd.DataFrame: ...
-
-
-@overload
-def list_flows(
- offset: int | None,
- size: int | None,
- tag: str | None,
- output_format: Literal["dataframe"],
- **kwargs: Any,
-) -> pd.DataFrame: ...
-
-
def list_flows(
offset: int | None = None,
size: int | None = None,
tag: str | None = None,
- output_format: Literal["dict", "dataframe"] = "dict",
- **kwargs: Any,
-) -> dict | pd.DataFrame:
+ uploader: str | None = None,
+) -> pd.DataFrame:
"""
Return a list of all flows which are on OpenML.
(Supports large amount of results)
@@ -183,29 +150,12 @@ def list_flows(
the maximum number of flows to return
tag : str, optional
the tag to include
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
-    kwargs: dict, optional
-        Legal filter operators: uploader.
+    uploader : str, optional
+        the uploader to filter on
Returns
-------
- flows : dict of dicts, or dataframe
- - If output_format='dict'
- A mapping from flow_id to a dict giving a brief overview of the
- respective flow.
- Every flow is represented by a dictionary containing
- the following information:
- - flow id
- - full name
- - name
- - version
- - external version
- - uploader
-
- - If output_format='dataframe'
+ flows : dataframe
-        Each row maps to a dataset
+        Each row maps to a flow
Each column contains the following information:
- flow id
@@ -215,69 +165,44 @@ def list_flows(
- external version
- uploader
"""
- if output_format not in ["dataframe", "dict"]:
- raise ValueError(
- "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.",
- )
-
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15 "
- "and pandas dataframes will be returned instead. To ensure your code "
- "will continue to work, use `output_format`='dataframe'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
-
- return openml.utils._list_all(
- list_output_format=output_format,
- listing_call=_list_flows,
- offset=offset,
- size=size,
- tag=tag,
- **kwargs,
- )
-
-
-@overload
-def _list_flows(output_format: Literal["dict"] = ..., **kwargs: Any) -> dict: ...
+ listing_call = partial(_list_flows, tag=tag, uploader=uploader)
+ batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+ if len(batches) == 0:
+ return pd.DataFrame()
+ return pd.concat(batches)
-@overload
-def _list_flows(*, output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ...
-
-@overload
-def _list_flows(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ...
-
-
-def _list_flows(
- output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any
-) -> dict | pd.DataFrame:
+def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame:
"""
Perform the api call that return a list of all flows.
Parameters
----------
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
-
+ limit : int
+ the maximum number of flows to return
+ offset : int
+ the number of flows to skip, starting from the first
kwargs: dict, optional
- Legal filter operators: uploader, tag, limit, offset.
+ Legal filter operators: uploader, tag
Returns
-------
- flows : dict, or dataframe
+ flows : dataframe
"""
api_call = "flow/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
+
if kwargs is not None:
for operator, value in kwargs.items():
- api_call += f"/{operator}/{value}"
+ if value is not None:
+ api_call += f"/{operator}/{value}"
- return __list_flows(api_call=api_call, output_format=output_format)
+ return __list_flows(api_call=api_call)
def flow_exists(name: str, external_version: str) -> int | bool:
@@ -378,23 +303,12 @@ def get_flow_id(
raise ValueError("exact_version should be False if model is None!")
return flow_exists(name=flow_name, external_version=external_version)
- flows = list_flows(output_format="dataframe")
- assert isinstance(flows, pd.DataFrame) # Make mypy happy
+ flows = list_flows()
flows = flows.query(f'name == "{flow_name}"')
return flows["id"].to_list() # type: ignore[no-any-return]
-@overload
-def __list_flows(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ...
-
-
-@overload
-def __list_flows(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ...
-
-
-def __list_flows(
- api_call: str, output_format: Literal["dict", "dataframe"] = "dict"
-) -> dict | pd.DataFrame:
+def __list_flows(api_call: str) -> pd.DataFrame:
"""Retrieve information about flows from OpenML API
-    and parse it to a dictionary or a Pandas DataFrame.
+    and parse it to a Pandas DataFrame.
@@ -402,8 +316,6 @@ def __list_flows(
----------
api_call: str
Retrieves the information about flows.
- output_format: str in {"dict", "dataframe"}
- The output format.
Returns
-------
@@ -431,10 +343,7 @@ def __list_flows(
}
flows[fid] = flow
- if output_format == "dataframe":
- flows = pd.DataFrame.from_dict(flows, orient="index")
-
- return flows
+ return pd.DataFrame.from_dict(flows, orient="index")
def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
@@ -514,11 +423,11 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915
for name in set(attr1.keys()).union(attr2.keys()):
if name not in attr1:
raise ValueError(
- f"Component {name} only available in " "argument2, but not in argument1.",
+ f"Component {name} only available in argument2, but not in argument1.",
)
if name not in attr2:
raise ValueError(
- f"Component {name} only available in " "argument2, but not in argument1.",
+ f"Component {name} only available in argument2, but not in argument1.",
)
assert_flows_equal(
attr1[name],
@@ -579,7 +488,7 @@ def assert_flows_equal( # noqa: C901, PLR0912, PLR0913, PLR0915
params2 = set(flow2.parameters_meta_info)
if params1 != params2:
raise ValueError(
- "Parameter list in meta info for parameters differ " "in the two flows.",
+ "Parameter list in meta info for parameters differ in the two flows.",
)
# iterating over the parameter's meta info list
for param in params1:
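A short usage sketch of `list_flows` after this change: the result is always a pandas DataFrame, and the former `**kwargs` uploader filter is now an explicit argument (the uploader id below is illustrative):

    import openml

    flows = openml.flows.list_flows(size=100)          # always a pandas DataFrame
    mine = openml.flows.list_flows(uploader="86")      # explicit filter instead of **kwargs
    print(flows[["id", "name", "version"]].head())     # columns as documented in the docstring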
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index b6f950020..e66af7b15 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -5,9 +5,9 @@
import time
import warnings
from collections import OrderedDict
+from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any
-from typing_extensions import Literal
import numpy as np
import pandas as pd
@@ -65,7 +65,6 @@ def run_model_on_task( # noqa: PLR0913
add_local_measures: bool = True, # noqa: FBT001, FBT002
upload_flow: bool = False, # noqa: FBT001, FBT002
return_flow: bool = False, # noqa: FBT001, FBT002
- dataset_format: Literal["array", "dataframe"] = "dataframe",
n_jobs: int | None = None,
) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]:
"""Run the model on the dataset defined by the task.
@@ -93,9 +92,6 @@ def run_model_on_task( # noqa: PLR0913
If False, do not upload the flow to OpenML.
return_flow : bool (default=False)
If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun.
- dataset_format : str (default='dataframe')
- If 'array', the dataset is passed to the model as a numpy array.
- If 'dataframe', the dataset is passed to the model as a pandas dataframe.
n_jobs : int (default=None)
The number of processes/threads to distribute the evaluation asynchronously.
If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
@@ -169,7 +165,6 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask:
seed=seed,
add_local_measures=add_local_measures,
upload_flow=upload_flow,
- dataset_format=dataset_format,
n_jobs=n_jobs,
)
if return_flow:
@@ -185,7 +180,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
seed: int | None = None,
add_local_measures: bool = True, # noqa: FBT001, FBT002
upload_flow: bool = False, # noqa: FBT001, FBT002
- dataset_format: Literal["array", "dataframe"] = "dataframe",
n_jobs: int | None = None,
) -> OpenMLRun:
"""Run the model provided by the flow on the dataset defined by task.
@@ -214,9 +208,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
upload_flow : bool (default=False)
If True, upload the flow to OpenML if it does not exist yet.
If False, do not upload the flow to OpenML.
- dataset_format : str (default='dataframe')
- If 'array', the dataset is passed to the model as a numpy array.
- If 'dataframe', the dataset is passed to the model as a pandas dataframe.
n_jobs : int (default=None)
The number of processes/threads to distribute the evaluation asynchronously.
If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
@@ -259,8 +250,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
if isinstance(flow.flow_id, int) and flow_id != flow.flow_id:
if flow_id is not False:
raise PyOpenMLError(
- "Local flow_id does not match server flow_id: "
- f"'{flow.flow_id}' vs '{flow_id}'",
+ f"Local flow_id does not match server flow_id: '{flow.flow_id}' vs '{flow_id}'",
)
raise PyOpenMLError(
"Flow does not exist on the server, but 'flow.flow_id' is not None."
@@ -292,8 +282,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
if flow.extension.check_if_model_fitted(flow.model):
warnings.warn(
- "The model is already fitted!"
- " This might cause inconsistency in comparison of results.",
+ "The model is already fitted! This might cause inconsistency in comparison of results.",
RuntimeWarning,
stacklevel=2,
)
@@ -304,7 +293,6 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913
task=task,
extension=flow.extension,
add_local_measures=add_local_measures,
- dataset_format=dataset_format,
n_jobs=n_jobs,
)
@@ -458,8 +446,7 @@ def run_exists(task_id: int, setup_id: int) -> set[int]:
return set()
try:
- result = list_runs(task=[task_id], setup=[setup_id], output_format="dataframe")
- assert isinstance(result, pd.DataFrame) # TODO(eddiebergman): Remove once #1299
+ result = list_runs(task=[task_id], setup=[setup_id])
return set() if result.empty else set(result["run_id"])
except OpenMLServerException as exception:
# error code implies no results. The run does not exist yet
@@ -468,13 +455,12 @@ def run_exists(task_id: int, setup_id: int) -> set[int]:
return set()
-def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901
+def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, C901
*,
model: Any,
task: OpenMLTask,
extension: Extension,
add_local_measures: bool,
- dataset_format: Literal["array", "dataframe"],
n_jobs: int | None = None,
) -> tuple[
list[list],
@@ -495,8 +481,6 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901
The OpenML extension object.
add_local_measures : bool
Whether to compute additional local evaluation measures.
- dataset_format : str
- The format in which to download the dataset.
n_jobs : int
Number of jobs to run in parallel.
If None, use 1 core by default. If -1, use all available cores.
@@ -560,7 +544,6 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, PLR0913, C901
rep_no=rep_no,
sample_no=sample_no,
task=task,
- dataset_format=dataset_format,
configuration=_config,
)
for _n_fit, rep_no, fold_no, sample_no in jobs
@@ -704,7 +687,6 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913
rep_no: int,
sample_no: int,
task: OpenMLTask,
- dataset_format: Literal["array", "dataframe"],
configuration: _Config | None = None,
) -> tuple[
np.ndarray,
@@ -730,8 +712,6 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913
Sample number to be run.
task : OpenMLTask
The task object from OpenML.
- dataset_format : str
- The dataset format to be used.
configuration : _Config
Hyperparameters to configure the model.
@@ -755,24 +735,15 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913
)
if isinstance(task, OpenMLSupervisedTask):
- x, y = task.get_X_and_y(dataset_format=dataset_format)
- if isinstance(x, pd.DataFrame):
- assert isinstance(y, (pd.Series, pd.DataFrame))
- train_x = x.iloc[train_indices]
- train_y = y.iloc[train_indices]
- test_x = x.iloc[test_indices]
- test_y = y.iloc[test_indices]
- else:
- # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing
- assert y is not None
- train_x = x[train_indices] # type: ignore
- train_y = y[train_indices]
- test_x = x[test_indices] # type: ignore
- test_y = y[test_indices]
+ x, y = task.get_X_and_y()
+ assert isinstance(y, (pd.Series, pd.DataFrame))
+ train_x = x.iloc[train_indices]
+ train_y = y.iloc[train_indices]
+ test_x = x.iloc[test_indices]
+ test_y = y.iloc[test_indices]
elif isinstance(task, OpenMLClusteringTask):
- x = task.get_X(dataset_format=dataset_format)
- # TODO(eddiebergman): Complains spmatrix doesn't support __getitem__ for typing
- train_x = x.iloc[train_indices] if isinstance(x, pd.DataFrame) else x[train_indices] # type: ignore
+ x = task.get_X()
+ train_x = x.iloc[train_indices]
train_y = None
test_x = None
test_y = None
@@ -793,8 +764,7 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913
model=model,
task=task,
X_train=train_x,
- # TODO(eddiebergman): Likely should not be ignored
- y_train=train_y, # type: ignore
+ y_train=train_y,
rep_no=rep_no,
fold_no=fold_no,
X_test=test_x,
@@ -888,7 +858,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore
if not from_server:
return None
- raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname)
+ raise AttributeError("Run XML does not contain required (server) field: ", fieldname)
run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[
"oml:run"
@@ -948,7 +918,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore
sample_evaluations: dict[str, dict[int, dict[int, dict[int, float | Any]]]] = {}
if "oml:output_data" not in run:
if from_server:
- raise ValueError("Run does not contain output_data " "(OpenML server error?)")
+ raise ValueError("Run does not contain output_data (OpenML server error?)")
predictions_url = None
else:
output_data = run["oml:output_data"]
@@ -1000,7 +970,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore
evaluations[key] = value
if "description" not in files and from_server is True:
- raise ValueError("No description file for run %d in run " "description XML" % run_id)
+ raise ValueError("No description file for run %d in run description XML" % run_id)
if "predictions" not in files and from_server is True:
task = openml.tasks.get_task(task_id)
@@ -1050,8 +1020,6 @@ def _get_cached_run(run_id: int) -> OpenMLRun:
raise OpenMLCacheException(f"Run file for run id {run_id} not cached") from e
-# TODO(eddiebergman): Could overload, likely too large an annoying to do
-# nvm, will be deprecated in 0.15
def list_runs( # noqa: PLR0913
offset: int | None = None,
size: int | None = None,
@@ -1063,9 +1031,8 @@ def list_runs( # noqa: PLR0913
tag: str | None = None,
study: int | None = None,
display_errors: bool = False, # noqa: FBT001, FBT002
- output_format: Literal["dict", "dataframe"] = "dict",
- **kwargs: Any,
-) -> dict | pd.DataFrame:
+ task_type: TaskType | int | None = None,
+) -> pd.DataFrame:
"""
List all runs matching all of the given filters.
(Supports large amount of results)
@@ -1095,31 +1062,12 @@ def list_runs( # noqa: PLR0913
Whether to list runs which have an error (for example a missing
prediction file).
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
-
- kwargs : dict, optional
- Legal filter operators: task_type.
+    task_type : TaskType or int, optional
Returns
-------
- dict of dicts, or dataframe
+ dataframe
"""
- if output_format not in ["dataframe", "dict"]:
- raise ValueError("Invalid output format selected. Only 'dict' or 'dataframe' applicable.")
-
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15 "
- "and pandas dataframes will be returned instead. To ensure your code "
- "will continue to work, use `output_format`='dataframe'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
-
- # TODO(eddiebergman): Do we really need this runtime type validation?
if id is not None and (not isinstance(id, list)):
raise TypeError("id must be of type list.")
if task is not None and (not isinstance(task, list)):
@@ -1131,11 +1079,8 @@ def list_runs( # noqa: PLR0913
if uploader is not None and (not isinstance(uploader, list)):
raise TypeError("uploader must be of type list.")
- return openml.utils._list_all( # type: ignore
- list_output_format=output_format, # type: ignore
- listing_call=_list_runs,
- offset=offset,
- size=size,
+ listing_call = partial(
+ _list_runs,
id=id,
task=task,
setup=setup,
@@ -1144,21 +1089,29 @@ def list_runs( # noqa: PLR0913
tag=tag,
study=study,
display_errors=display_errors,
- **kwargs,
+ task_type=task_type,
)
+ batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+ if len(batches) == 0:
+ return pd.DataFrame()
+
+ return pd.concat(batches)
-def _list_runs( # noqa: PLR0913
+def _list_runs( # noqa: PLR0913, C901
+ limit: int,
+ offset: int,
+ *,
id: list | None = None, # noqa: A002
task: list | None = None,
setup: list | None = None,
flow: list | None = None,
uploader: list | None = None,
study: int | None = None,
- display_errors: bool = False, # noqa: FBT002, FBT001
- output_format: Literal["dict", "dataframe"] = "dict",
- **kwargs: Any,
-) -> dict | pd.DataFrame:
+ tag: str | None = None,
+ display_errors: bool = False,
+ task_type: TaskType | int | None = None,
+) -> pd.DataFrame:
"""
-    Perform API call `/run/list/{filters}'
-    `
+    Perform API call `/run/list/{filters}`
@@ -1178,6 +1131,8 @@ def _list_runs( # noqa: PLR0913
flow : list, optional
+    tag : str, optional
+
uploader : list, optional
study : int, optional
@@ -1186,13 +1141,7 @@ def _list_runs( # noqa: PLR0913
Whether to list runs which have an error (for example a missing
prediction file).
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
-
- kwargs : dict, optional
- Legal filter operators: task_type.
+    task_type : TaskType or int, optional
Returns
-------
@@ -1200,9 +1149,10 @@ def _list_runs( # noqa: PLR0913
List of found runs.
"""
api_call = "run/list"
- if kwargs is not None:
- for operator, value in kwargs.items():
- api_call += f"/{operator}/{value}"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
if id is not None:
api_call += "/run/{}".format(",".join([str(int(i)) for i in id]))
if task is not None:
@@ -1217,12 +1167,15 @@ def _list_runs( # noqa: PLR0913
api_call += "/study/%d" % study
if display_errors:
api_call += "/show_errors/true"
- return __list_runs(api_call=api_call, output_format=output_format)
+ if tag is not None:
+ api_call += f"/tag/{tag}"
+ if task_type is not None:
+ tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+ api_call += f"/task_type/{tvalue}"
+ return __list_runs(api_call=api_call)
-def __list_runs(
- api_call: str, output_format: Literal["dict", "dataframe"] = "dict"
-) -> dict | pd.DataFrame:
+def __list_runs(api_call: str) -> pd.DataFrame:
"""Helper function to parse API calls which are lists of runs"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",))
@@ -1257,11 +1210,7 @@ def __list_runs(
}
for r in runs_dict["oml:runs"]["oml:run"]
}
-
- if output_format == "dataframe":
- runs = pd.DataFrame.from_dict(runs, orient="index")
-
- return runs
+ return pd.DataFrame.from_dict(runs, orient="index")
def format_prediction( # noqa: PLR0913
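A hedged sketch of the reworked `list_runs`: `task_type` is now an explicit parameter instead of a `**kwargs` filter, and the result is always a DataFrame (the task id below is illustrative):

    import openml
    from openml.tasks import TaskType

    runs = openml.runs.list_runs(
        task=[115],                                    # illustrative task id
        task_type=TaskType.SUPERVISED_CLASSIFICATION,  # enum value is translated to the API filter
        size=50,
    )
    print(runs.shape)  # pandas DataFrame, possibly empty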
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 877384636..cc71418df 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -1,8 +1,9 @@
# License: BSD 3-Clause
from __future__ import annotations
-import warnings
from collections import OrderedDict
+from functools import partial
+from itertools import chain
from pathlib import Path
from typing import Any, Iterable
from typing_extensions import Literal
@@ -89,7 +90,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup:
setup_file = setup_cache_dir / "description.xml"
with setup_file.open(encoding="utf8") as fh:
setup_xml = xmltodict.parse(fh.read())
- return _create_setup_from_xml(setup_xml, output_format="object") # type: ignore
+ return _create_setup_from_xml(setup_xml)
except OSError as e:
raise openml.exceptions.OpenMLCacheException(
@@ -119,13 +120,13 @@ def get_setup(setup_id: int) -> OpenMLSetup:
try:
return _get_cached_setup(setup_id)
except openml.exceptions.OpenMLCacheException:
- url_suffix = "/setup/%d" % setup_id
+ url_suffix = f"/setup/{setup_id}"
setup_xml = openml._api_calls._perform_api_call(url_suffix, "get")
with setup_file.open("w", encoding="utf8") as fh:
fh.write(setup_xml)
result_dict = xmltodict.parse(setup_xml)
- return _create_setup_from_xml(result_dict, output_format="object") # type: ignore
+ return _create_setup_from_xml(result_dict)
def list_setups( # noqa: PLR0913
@@ -134,8 +135,8 @@ def list_setups( # noqa: PLR0913
flow: int | None = None,
tag: str | None = None,
setup: Iterable[int] | None = None,
- output_format: Literal["object", "dict", "dataframe"] = "object",
-) -> dict | pd.DataFrame:
+ output_format: Literal["object", "dataframe"] = "object",
+) -> dict[int, OpenMLSetup] | pd.DataFrame:
"""
List all setups matching all of the given filters.
@@ -148,81 +149,74 @@ def list_setups( # noqa: PLR0913
setup : Iterable[int], optional
output_format: str, optional (default='object')
The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- If 'dataframe' the output is a pandas DataFrame
+ - If 'object' the output is a dictionary of OpenMLSetup objects
Returns
-------
dict or dataframe
"""
- if output_format not in ["dataframe", "dict", "object"]:
+ if output_format not in ["dataframe", "object"]:
raise ValueError(
- "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable.",
+ "Invalid output format selected. Only 'object', or 'dataframe' applicable.",
)
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15. "
- "To ensure your code will continue to work, "
- "use `output_format`='dataframe' or `output_format`='object'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
-
- batch_size = 1000 # batch size for setups is lower
- return openml.utils._list_all( # type: ignore
- list_output_format=output_format, # type: ignore
- listing_call=_list_setups,
+ listing_call = partial(_list_setups, flow=flow, tag=tag, setup=setup)
+ batches = openml.utils._list_all(
+ listing_call,
+ batch_size=1_000, # batch size for setups is lower
offset=offset,
- size=size,
- flow=flow,
- tag=tag,
- setup=setup,
- batch_size=batch_size,
+ limit=size,
)
+ flattened = list(chain.from_iterable(batches))
+ if output_format == "object":
+ return {setup.setup_id: setup for setup in flattened}
+
+ records = [setup._to_dict() for setup in flattened]
+ return pd.DataFrame.from_records(records, index="setup_id")
def _list_setups(
+ limit: int,
+ offset: int,
+ *,
setup: Iterable[int] | None = None,
- output_format: Literal["dict", "dataframe", "object"] = "object",
- **kwargs: Any,
-) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]:
- """
- Perform API call `/setup/list/{filters}`
+ flow: int | None = None,
+ tag: str | None = None,
+) -> list[OpenMLSetup]:
+ """Perform API call `/setup/list/{filters}`
Parameters
----------
-    The setup argument that is a list is separated from the single value
-    filters which are put into the kwargs.
+    limit : int
+        the maximum number of setups to return
+    offset : int
+        the number of setups to skip, starting from the first
setup : list(int), optional
-
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
- - If 'object' the output is a dict of OpenMLSetup objects
-
- kwargs: dict, optional
- Legal filter operators: flow, setup, limit, offset, tag.
+ flow : int, optional
+ tag : str, optional
Returns
-------
- dict or dataframe or list[OpenMLSetup]
+        A list of OpenMLSetup objects matching the given filters.
"""
api_call = "setup/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
if setup is not None:
api_call += "/setup/{}".format(",".join([str(int(i)) for i in setup]))
- if kwargs is not None:
- for operator, value in kwargs.items():
- api_call += f"/{operator}/{value}"
+ if flow is not None:
+ api_call += f"/flow/{flow}"
+ if tag is not None:
+ api_call += f"/tag/{tag}"
- return __list_setups(api_call=api_call, output_format=output_format)
+ return __list_setups(api_call=api_call)
-def __list_setups(
- api_call: str, output_format: Literal["dict", "dataframe", "object"] = "object"
-) -> dict[int, dict] | pd.DataFrame | dict[int, OpenMLSetup]:
+def __list_setups(api_call: str) -> list[OpenMLSetup]:
"""Helper function to parse API calls which are lists of setups"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",))
@@ -230,12 +224,12 @@ def __list_setups(
# Minimalistic check if the XML is useful
if "oml:setups" not in setups_dict:
raise ValueError(
- 'Error in return XML, does not contain "oml:setups":' f" {setups_dict!s}",
+ f'Error in return XML, does not contain "oml:setups": {setups_dict!s}',
)
if "@xmlns:oml" not in setups_dict["oml:setups"]:
raise ValueError(
- "Error in return XML, does not contain " f'"oml:setups"/@xmlns:oml: {setups_dict!s}',
+ f'Error in return XML, does not contain "oml:setups"/@xmlns:oml: {setups_dict!s}',
)
if setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri:
@@ -247,22 +241,10 @@ def __list_setups(
assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"])
- setups = {}
- for setup_ in setups_dict["oml:setups"]["oml:setup"]:
- # making it a dict to give it the right format
- current = _create_setup_from_xml(
- {"oml:setup_parameters": setup_},
- output_format=output_format,
- )
- if output_format == "object":
- setups[current.setup_id] = current # type: ignore
- else:
- setups[current["setup_id"]] = current # type: ignore
-
- if output_format == "dataframe":
- setups = pd.DataFrame.from_dict(setups, orient="index")
-
- return setups
+ return [
+ _create_setup_from_xml({"oml:setup_parameters": setup_})
+ for setup_ in setups_dict["oml:setups"]["oml:setup"]
+ ]
def initialize_model(setup_id: int, *, strict_version: bool = True) -> Any:
@@ -299,9 +281,7 @@ def initialize_model(setup_id: int, *, strict_version: bool = True) -> Any:
return flow.extension.flow_to_model(flow, strict_version=strict_version)
-def _to_dict(
- flow_id: int, openml_parameter_settings: list[OpenMLParameter] | list[dict[str, Any]]
-) -> OrderedDict:
+def _to_dict(flow_id: int, openml_parameter_settings: list[dict[str, Any]]) -> OrderedDict:
"""Convert a flow ID and a list of OpenML parameter settings to
a dictionary representation that can be serialized to XML.
@@ -309,7 +289,7 @@ def _to_dict(
----------
flow_id : int
ID of the flow.
- openml_parameter_settings : List[OpenMLParameter]
+ openml_parameter_settings : list[dict[str, Any]]
A list of OpenML parameter settings.
Returns
@@ -327,77 +307,41 @@ def _to_dict(
return xml
-def _create_setup_from_xml(
- result_dict: dict, output_format: Literal["dict", "dataframe", "object"] = "object"
-) -> OpenMLSetup | dict[str, int | dict[int, Any] | None]:
+def _create_setup_from_xml(result_dict: dict) -> OpenMLSetup:
"""Turns an API xml result into a OpenMLSetup object (or dict)"""
- if output_format in ["dataframe", "dict"]:
- _output_format: Literal["dict", "object"] = "dict"
- elif output_format == "object":
- _output_format = "object"
- else:
- raise ValueError(
- f"Invalid output format selected: {output_format}"
- "Only 'dict', 'object', or 'dataframe' applicable.",
- )
-
setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"])
flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"])
+
if "oml:parameter" not in result_dict["oml:setup_parameters"]:
- parameters = None
+ return OpenMLSetup(setup_id, flow_id, parameters=None)
+
+ xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"]
+ if isinstance(xml_parameters, dict):
+ parameters = {
+ int(xml_parameters["oml:id"]): _create_setup_parameter_from_xml(xml_parameters),
+ }
+ elif isinstance(xml_parameters, list):
+ parameters = {
+ int(xml_parameter["oml:id"]): _create_setup_parameter_from_xml(xml_parameter)
+ for xml_parameter in xml_parameters
+ }
else:
- parameters = {}
- # basically all others
- xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"]
- if isinstance(xml_parameters, dict):
- oml_id = int(xml_parameters["oml:id"])
- parameters[oml_id] = _create_setup_parameter_from_xml(
- result_dict=xml_parameters,
- output_format=_output_format,
- )
- elif isinstance(xml_parameters, list):
- for xml_parameter in xml_parameters:
- oml_id = int(xml_parameter["oml:id"])
- parameters[oml_id] = _create_setup_parameter_from_xml(
- result_dict=xml_parameter,
- output_format=_output_format,
- )
- else:
- raise ValueError(
- "Expected None, list or dict, received "
- f"something else: {type(xml_parameters)!s}",
- )
-
- if _output_format in ["dataframe", "dict"]:
- return {"setup_id": setup_id, "flow_id": flow_id, "parameters": parameters}
+ raise ValueError(
+ f"Expected None, list or dict, received something else: {type(xml_parameters)!s}",
+ )
+
return OpenMLSetup(setup_id, flow_id, parameters)
-def _create_setup_parameter_from_xml(
- result_dict: dict[str, str], output_format: Literal["object", "dict"] = "object"
-) -> dict[str, int | str] | OpenMLParameter:
+def _create_setup_parameter_from_xml(result_dict: dict[str, str]) -> OpenMLParameter:
"""Create an OpenMLParameter object or a dictionary from an API xml result."""
- if output_format == "object":
- return OpenMLParameter(
- input_id=int(result_dict["oml:id"]),
- flow_id=int(result_dict["oml:flow_id"]),
- flow_name=result_dict["oml:flow_name"],
- full_name=result_dict["oml:full_name"],
- parameter_name=result_dict["oml:parameter_name"],
- data_type=result_dict["oml:data_type"],
- default_value=result_dict["oml:default_value"],
- value=result_dict["oml:value"],
- )
-
- # FIXME: likely we want to crash here if unknown output_format but not backwards compatible
- # output_format == "dict" case,
- return {
- "input_id": int(result_dict["oml:id"]),
- "flow_id": int(result_dict["oml:flow_id"]),
- "flow_name": result_dict["oml:flow_name"],
- "full_name": result_dict["oml:full_name"],
- "parameter_name": result_dict["oml:parameter_name"],
- "data_type": result_dict["oml:data_type"],
- "default_value": result_dict["oml:default_value"],
- "value": result_dict["oml:value"],
- }
+ return OpenMLParameter(
+ input_id=int(result_dict["oml:id"]),
+ flow_id=int(result_dict["oml:flow_id"]),
+ flow_name=result_dict["oml:flow_name"],
+ full_name=result_dict["oml:full_name"],
+ parameter_name=result_dict["oml:parameter_name"],
+ data_type=result_dict["oml:data_type"],
+ default_value=result_dict["oml:default_value"],
+ value=result_dict["oml:value"],
+ )
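A usage sketch of `list_setups` after this change; the flow id is illustrative:

    import openml

    # Default 'object' format: dict mapping setup_id -> OpenMLSetup
    setups = openml.setups.list_setups(flow=8353, size=5)

    # 'dataframe' format: rows built from OpenMLSetup._to_dict(), indexed by setup_id
    setups_df = openml.setups.list_setups(flow=8353, size=5, output_format="dataframe")
    print(setups_df.index.name)  # "setup_id"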
diff --git a/openml/setups/setup.py b/openml/setups/setup.py
index e8dc059e7..c3d8149e7 100644
--- a/openml/setups/setup.py
+++ b/openml/setups/setup.py
@@ -34,6 +34,15 @@ def __init__(self, setup_id: int, flow_id: int, parameters: dict[int, Any] | Non
self.flow_id = flow_id
self.parameters = parameters
+ def _to_dict(self) -> dict[str, Any]:
+ return {
+ "setup_id": self.setup_id,
+ "flow_id": self.flow_id,
+ "parameters": {p.id: p._to_dict() for p in self.parameters.values()}
+ if self.parameters is not None
+ else None,
+ }
+
def __repr__(self) -> str:
header = "OpenML Setup"
header = "{}\n{}\n".format(header, "=" * len(header))
@@ -102,6 +111,18 @@ def __init__( # noqa: PLR0913
self.default_value = default_value
self.value = value
+ def _to_dict(self) -> dict[str, Any]:
+ return {
+ "id": self.id,
+ "flow_id": self.flow_id,
+ "flow_name": self.flow_name,
+ "full_name": self.full_name,
+ "parameter_name": self.parameter_name,
+ "data_type": self.data_type,
+ "default_value": self.default_value,
+ "value": self.value,
+ }
+
def __repr__(self) -> str:
header = "OpenML Parameter"
header = "{}\n{}\n".format(header, "=" * len(header))
diff --git a/openml/study/functions.py b/openml/study/functions.py
index 7fdc6f636..4e16879d7 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -3,8 +3,8 @@
from __future__ import annotations
import warnings
-from typing import TYPE_CHECKING, Any, overload
-from typing_extensions import Literal
+from functools import partial
+from typing import TYPE_CHECKING, Any
import pandas as pd
import xmltodict
@@ -298,7 +298,7 @@ def update_study_status(study_id: int, status: str) -> None:
"""
legal_status = {"active", "deactivated"}
if status not in legal_status:
- raise ValueError("Illegal status value. " f"Legal values: {legal_status}")
+ raise ValueError(f"Illegal status value. Legal values: {legal_status}")
data = {"study_id": study_id, "status": status} # type: openml._api_calls.DATA_TYPE
result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data)
result = xmltodict.parse(result_xml)
@@ -433,33 +433,12 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int:
return int(result["oml:linked_entities"])
-@overload
-def list_suites(
- offset: int | None = ...,
- size: int | None = ...,
- status: str | None = ...,
- uploader: list[int] | None = ...,
- output_format: Literal["dict"] = "dict",
-) -> dict: ...
-
-
-@overload
-def list_suites(
- offset: int | None = ...,
- size: int | None = ...,
- status: str | None = ...,
- uploader: list[int] | None = ...,
- output_format: Literal["dataframe"] = "dataframe",
-) -> pd.DataFrame: ...
-
-
def list_suites(
offset: int | None = None,
size: int | None = None,
status: str | None = None,
uploader: list[int] | None = None,
- output_format: Literal["dict", "dataframe"] = "dict",
-) -> dict | pd.DataFrame:
+) -> pd.DataFrame:
"""
Return a list of all suites which are on OpenML.
@@ -474,78 +453,30 @@ def list_suites(
suites are returned.
uploader : list (int), optional
Result filter. Will only return suites created by these users.
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
Returns
-------
- datasets : dict of dicts, or dataframe
- - If output_format='dict'
- Every suite is represented by a dictionary containing the following information:
- - id
- - alias (optional)
- - name
- - main_entity_type
- - status
- - creator
- - creation_date
-
- - If output_format='dataframe'
- Every row is represented by a dictionary containing the following information:
- - id
- - alias (optional)
- - name
- - main_entity_type
- - status
- - creator
- - creation_date
+    suites : dataframe
+        Every row corresponds to a suite and contains the following information:
+ - id
+ - alias (optional)
+ - name
+ - main_entity_type
+ - status
+ - creator
+ - creation_date
"""
- if output_format not in ["dataframe", "dict"]:
- raise ValueError(
- "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.",
- )
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15 "
- "and pandas dataframes will be returned instead. To ensure your code "
- "will continue to work, use `output_format`='dataframe'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
-
- return openml.utils._list_all( # type: ignore
- list_output_format=output_format, # type: ignore
- listing_call=_list_studies,
- offset=offset,
- size=size,
+ listing_call = partial(
+ _list_studies,
main_entity_type="task",
status=status,
uploader=uploader,
)
+ batches = openml.utils._list_all(listing_call, limit=size, offset=offset)
+ if len(batches) == 0:
+ return pd.DataFrame()
-
-@overload
-def list_studies(
- offset: int | None = ...,
- size: int | None = ...,
- status: str | None = ...,
- uploader: list[str] | None = ...,
- benchmark_suite: int | None = ...,
- output_format: Literal["dict"] = "dict",
-) -> dict: ...
-
-
-@overload
-def list_studies(
- offset: int | None = ...,
- size: int | None = ...,
- status: str | None = ...,
- uploader: list[str] | None = ...,
- benchmark_suite: int | None = ...,
- output_format: Literal["dataframe"] = "dataframe",
-) -> pd.DataFrame: ...
+ return pd.concat(batches)
def list_studies(
@@ -554,8 +485,7 @@ def list_studies(
status: str | None = None,
uploader: list[str] | None = None,
benchmark_suite: int | None = None,
- output_format: Literal["dict", "dataframe"] = "dict",
-) -> dict | pd.DataFrame:
+) -> pd.DataFrame:
"""
Return a list of all studies which are on OpenML.
@@ -571,111 +501,66 @@ def list_studies(
uploader : list (int), optional
Result filter. Will only return studies created by these users.
benchmark_suite : int, optional
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
Returns
-------
- datasets : dict of dicts, or dataframe
- - If output_format='dict'
- Every dataset is represented by a dictionary containing
- the following information:
- - id
- - alias (optional)
- - name
- - benchmark_suite (optional)
- - status
- - creator
- - creation_date
- If qualities are calculated for the dataset, some of
- these are also returned.
-
- - If output_format='dataframe'
- Every dataset is represented by a dictionary containing
- the following information:
- - id
- - alias (optional)
- - name
- - benchmark_suite (optional)
- - status
- - creator
- - creation_date
- If qualities are calculated for the dataset, some of
- these are also returned.
+    studies : dataframe
+        Every row corresponds to a study and contains
+        the following information:
+ - id
+ - alias (optional)
+ - name
+ - benchmark_suite (optional)
+ - status
+ - creator
+ - creation_date
+ If qualities are calculated for the dataset, some of
+ these are also returned.
"""
- if output_format not in ["dataframe", "dict"]:
- raise ValueError(
- "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.",
- )
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15 "
- "and pandas dataframes will be returned instead. To ensure your code "
- "will continue to work, use `output_format`='dataframe'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
-
- return openml.utils._list_all( # type: ignore
- list_output_format=output_format, # type: ignore
- listing_call=_list_studies,
- offset=offset,
- size=size,
+ listing_call = partial(
+ _list_studies,
main_entity_type="run",
status=status,
uploader=uploader,
benchmark_suite=benchmark_suite,
)
+ batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+ if len(batches) == 0:
+ return pd.DataFrame()
+ return pd.concat(batches)
-@overload
-def _list_studies(output_format: Literal["dict"] = "dict", **kwargs: Any) -> dict: ...
-
-
-@overload
-def _list_studies(output_format: Literal["dataframe"], **kwargs: Any) -> pd.DataFrame: ...
-
-def _list_studies(
- output_format: Literal["dict", "dataframe"] = "dict", **kwargs: Any
-) -> dict | pd.DataFrame:
- """
- Perform api call to return a list of studies.
+def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame:
+ """Perform api call to return a list of studies.
Parameters
----------
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
+    limit : int
+        The maximum number of studies to return.
+    offset : int
+        The number of studies to skip, starting from the first.
kwargs : dict, optional
Legal filter operators (keys in the dict):
- status, limit, offset, main_entity_type, uploader
+ status, main_entity_type, uploader, benchmark_suite
Returns
-------
- studies : dict of dicts
+ studies : dataframe
"""
api_call = "study/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
if kwargs is not None:
for operator, value in kwargs.items():
- api_call += f"/{operator}/{value}"
- return __list_studies(api_call=api_call, output_format=output_format)
-
-
-@overload
-def __list_studies(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ...
-
-
-@overload
-def __list_studies(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ...
+ if value is not None:
+ api_call += f"/{operator}/{value}"
+ return __list_studies(api_call=api_call)
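+# For illustration: with the filters bound by `list_studies` above, a call such as
+# `_list_studies(limit=100, offset=0, main_entity_type="run", status="active")`
+# builds the REST path
+# "study/list/limit/100/offset/0/main_entity_type/run/status/active"
+# (keyword filters whose value is None are skipped; segment order follows the kwargs).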
-def __list_studies(
- api_call: str, output_format: Literal["dict", "dataframe"] = "dict"
-) -> dict | pd.DataFrame:
+def __list_studies(api_call: str) -> pd.DataFrame:
"""Retrieves the list of OpenML studies and
returns it in a dictionary or a Pandas DataFrame.
@@ -683,15 +568,11 @@ def __list_studies(
----------
api_call : str
The API call for retrieving the list of OpenML studies.
- output_format : str in {"dict", "dataframe"}
- Format of the output, either 'object' for a dictionary
- or 'dataframe' for a Pandas DataFrame.
Returns
-------
- Union[Dict, pd.DataFrame]
- A dictionary or Pandas DataFrame of OpenML studies,
- depending on the value of 'output_format'.
+ pd.DataFrame
+ A Pandas DataFrame of OpenML studies
"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
@@ -725,6 +606,4 @@ def __list_studies(
current_study["id"] = int(current_study["id"])
studies[study_id] = current_study
- if output_format == "dataframe":
- studies = pd.DataFrame.from_dict(studies, orient="index")
- return studies
+ return pd.DataFrame.from_dict(studies, orient="index")
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index 54030422d..25156f2e5 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -4,8 +4,8 @@
import os
import re
import warnings
+from functools import partial
from typing import Any
-from typing_extensions import Literal
import pandas as pd
import xmltodict
@@ -126,14 +126,20 @@ def _get_estimation_procedure_list() -> list[dict[str, Any]]:
return procs
-def list_tasks(
+def list_tasks( # noqa: PLR0913
task_type: TaskType | None = None,
offset: int | None = None,
size: int | None = None,
tag: str | None = None,
- output_format: Literal["dict", "dataframe"] = "dict",
- **kwargs: Any,
-) -> dict | pd.DataFrame:
+ data_tag: str | None = None,
+ status: str | None = None,
+ data_name: str | None = None,
+ data_id: int | None = None,
+ number_instances: int | None = None,
+ number_features: int | None = None,
+ number_classes: int | None = None,
+ number_missing_values: int | None = None,
+) -> pd.DataFrame:
"""
Return a number of tasks having the given tag and task_type
@@ -142,64 +148,58 @@ def list_tasks(
Filter task_type is separated from the other filters because
it is used as task_type in the task description, but it is named
type when used as a filter in list tasks call.
- task_type : TaskType, optional
- Refers to the type of task.
offset : int, optional
the number of tasks to skip, starting from the first
+ task_type : TaskType, optional
+ Refers to the type of task.
size : int, optional
the maximum number of tasks to show
tag : str, optional
the tag to include
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
- kwargs: dict, optional
- Legal filter operators: data_tag, status, data_id, data_name,
- number_instances, number_features,
- number_classes, number_missing_values.
+    data_tag : str, optional
+        the tag of the dataset
+    status : str, optional
+        the status of the task, e.g. 'active'
+    data_name : str, optional
+        the name of the dataset
+    data_id : int, optional
+        the id of the dataset
+    number_instances : int, optional
+    number_features : int, optional
+    number_classes : int, optional
+    number_missing_values : int, optional
Returns
-------
- dict
- All tasks having the given task_type and the give tag. Every task is
- represented by a dictionary containing the following information:
- task id, dataset id, task_type and status. If qualities are calculated
- for the associated dataset, some of these are also returned.
dataframe
    All tasks having the given task_type and the given tag. Every task is
represented by a row in the data frame containing the following information
as columns: task id, dataset id, task_type and status. If qualities are
calculated for the associated dataset, some of these are also returned.
"""
- if output_format not in ["dataframe", "dict"]:
- raise ValueError(
- "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.",
- )
- # TODO: [0.15]
- if output_format == "dict":
- msg = (
- "Support for `output_format` of 'dict' will be removed in 0.15 "
- "and pandas dataframes will be returned instead. To ensure your code "
- "will continue to work, use `output_format`='dataframe'."
- )
- warnings.warn(msg, category=FutureWarning, stacklevel=2)
- return openml.utils._list_all( # type: ignore
- list_output_format=output_format, # type: ignore
- listing_call=_list_tasks,
+ listing_call = partial(
+ _list_tasks,
task_type=task_type,
- offset=offset,
- size=size,
tag=tag,
- **kwargs,
+ data_tag=data_tag,
+ status=status,
+ data_id=data_id,
+ data_name=data_name,
+ number_instances=number_instances,
+ number_features=number_features,
+ number_classes=number_classes,
+ number_missing_values=number_missing_values,
)
+ batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
+ if len(batches) == 0:
+ return pd.DataFrame()
+
+ return pd.concat(batches)
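+# A minimal usage sketch of the explicit filter parameters (assumes a reachable server):
+#
+#     import openml
+#     from openml.tasks import TaskType
+#
+#     clf_tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION, size=20)
+#     print(clf_tasks.head())  # per the docstring: task id, dataset id, task_type and status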
def _list_tasks(
- task_type: TaskType | None = None,
- output_format: Literal["dict", "dataframe"] = "dict",
+ limit: int,
+ offset: int,
+ task_type: TaskType | int | None = None,
**kwargs: Any,
-) -> dict | pd.DataFrame:
+) -> pd.DataFrame:
"""
Perform the api call to return a number of tasks having the given filters.
@@ -208,12 +208,10 @@ def _list_tasks(
Filter task_type is separated from the other filters because
it is used as task_type in the task description, but it is named
type when used as a filter in list tasks call.
+    limit: int
+        The maximum number of tasks to return.
+    offset: int
+        The number of tasks to skip, starting from the first.
task_type : TaskType, optional
Refers to the type of task.
- output_format: str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
kwargs: dict, optional
-        Legal filter operators: tag, task_id (list), data_tag, status, limit,
-        offset, data_id, data_name, number_instances, number_features,
+        Legal filter operators: tag, task_id (list), data_tag, status,
+        data_id, data_name, number_instances, number_features,
@@ -221,38 +219,37 @@ def _list_tasks(
Returns
-------
- dict or dataframe
+ dataframe
"""
api_call = "task/list"
+ if limit is not None:
+ api_call += f"/limit/{limit}"
+ if offset is not None:
+ api_call += f"/offset/{offset}"
if task_type is not None:
- api_call += "/type/%d" % task_type.value
+ tvalue = task_type.value if isinstance(task_type, TaskType) else task_type
+ api_call += f"/type/{tvalue}"
if kwargs is not None:
for operator, value in kwargs.items():
- if operator == "task_id":
- value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901
- api_call += f"/{operator}/{value}"
+ if value is not None:
+ if operator == "task_id":
+ value = ",".join([str(int(i)) for i in value]) # noqa: PLW2901
+ api_call += f"/{operator}/{value}"
- return __list_tasks(api_call=api_call, output_format=output_format)
+ return __list_tasks(api_call=api_call)
-# TODO(eddiebergman): overload todefine type returned
-def __list_tasks( # noqa: PLR0912, C901
- api_call: str,
- output_format: Literal["dict", "dataframe"] = "dict",
-) -> dict | pd.DataFrame:
- """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks.
+def __list_tasks(api_call: str) -> pd.DataFrame: # noqa: C901, PLR0912
+ """Returns a Pandas DataFrame with information about OpenML tasks.
Parameters
----------
api_call : str
The API call specifying which tasks to return.
- output_format : str in {"dict", "dataframe"}
- Output format for the returned object.
Returns
-------
- Union[Dict, pd.DataFrame]
- A dictionary or a Pandas DataFrame with information about OpenML tasks.
+ A Pandas DataFrame with information about OpenML tasks.
Raises
------
@@ -339,13 +336,9 @@ def __list_tasks( # noqa: PLR0912, C901
else:
warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)
- if output_format == "dataframe":
- tasks = pd.DataFrame.from_dict(tasks, orient="index")
-
- return tasks
+ return pd.DataFrame.from_dict(tasks, orient="index")
-# TODO(eddiebergman): Maybe since this isn't public api, we can make it keyword only?
def get_tasks(
task_ids: list[int],
download_data: bool | None = None,
@@ -590,7 +583,7 @@ def create_task(
task_type_id=task_type,
task_type="None", # TODO: refactor to get task type string from ID.
data_set_id=dataset_id,
- target_name=target_name,
+ target_name=target_name, # type: ignore
estimation_procedure_id=estimation_procedure_id,
evaluation_measure=evaluation_measure,
**kwargs,
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index e7d19bdce..395b52482 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -8,7 +8,7 @@
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Sequence
-from typing_extensions import Literal, TypedDict, overload
+from typing_extensions import TypedDict
import openml._api_calls
import openml.config
@@ -21,7 +21,6 @@
if TYPE_CHECKING:
import numpy as np
import pandas as pd
- import scipy.sparse
# TODO(eddiebergman): Should use `auto()` but might be too late if these numbers are used
@@ -277,52 +276,14 @@ def __init__( # noqa: PLR0913
self.target_name = target_name
- @overload
- def get_X_and_y(
- self, dataset_format: Literal["array"] = "array"
- ) -> tuple[
- np.ndarray | scipy.sparse.spmatrix,
- np.ndarray | None,
- ]: ...
-
- @overload
- def get_X_and_y(
- self, dataset_format: Literal["dataframe"]
- ) -> tuple[
- pd.DataFrame,
- pd.Series | pd.DataFrame | None,
- ]: ...
-
- # TODO(eddiebergman): Do all OpenMLSupervisedTask have a `y`?
- def get_X_and_y(
- self, dataset_format: Literal["dataframe", "array"] = "array"
- ) -> tuple[
- np.ndarray | pd.DataFrame | scipy.sparse.spmatrix,
- np.ndarray | pd.Series | pd.DataFrame | None,
- ]:
+ def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
"""Get data associated with the current task.
- Parameters
- ----------
- dataset_format : str
- Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
- for possible options.
-
Returns
-------
tuple - X and y
"""
- # TODO: [0.15]
- if dataset_format == "array":
- warnings.warn(
- "Support for `dataset_format='array'` will be removed in 0.15,"
- "start using `dataset_format='dataframe' to ensure your code "
- "will continue to work. You can use the dataframe's `to_numpy` "
- "function to continue using numpy arrays.",
- category=FutureWarning,
- stacklevel=2,
- )
dataset = self.get_dataset()
if self.task_type_id not in (
TaskType.SUPERVISED_CLASSIFICATION,
@@ -331,10 +292,7 @@ def get_X_and_y(
):
raise NotImplementedError(self.task_type)
- X, y, _, _ = dataset.get_data(
- dataset_format=dataset_format,
- target=self.target_name,
- )
+ X, y, _, _ = dataset.get_data(target=self.target_name)
return X, y
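+    # With the "array" format removed, callers always receive pandas objects; where a
+    # numpy array is still needed, converting explicitly keeps old code working, e.g.
+    # for a supervised task `task` (sketch):
+    #
+    #     X, y = task.get_X_and_y()   # X: pd.DataFrame, y: pd.Series/DataFrame or None
+    #     X_arr = X.to_numpy()
+    #     y_arr = y.to_numpy() if y is not None else None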
def _to_dict(self) -> dict[str, dict]:
@@ -536,34 +494,15 @@ def __init__( # noqa: PLR0913
self.target_name = target_name
- @overload
- def get_X(
- self,
- dataset_format: Literal["array"] = "array",
- ) -> np.ndarray | scipy.sparse.spmatrix: ...
-
- @overload
- def get_X(self, dataset_format: Literal["dataframe"]) -> pd.DataFrame: ...
-
- def get_X(
- self,
- dataset_format: Literal["array", "dataframe"] = "array",
- ) -> np.ndarray | pd.DataFrame | scipy.sparse.spmatrix:
+ def get_X(self) -> pd.DataFrame:
"""Get data associated with the current task.
- Parameters
- ----------
- dataset_format : str
- Data structure of the returned data. See :meth:`openml.datasets.OpenMLDataset.get_data`
- for possible options.
-
Returns
-------
- tuple - X and y
-
+ The X data as a dataframe
"""
dataset = self.get_dataset()
- data, *_ = dataset.get_data(dataset_format=dataset_format, target=None)
+ data, *_ = dataset.get_data(target=None)
return data
def _to_dict(self) -> dict[str, dict[str, int | str | list[dict[str, Any]]]]:
diff --git a/openml/testing.py b/openml/testing.py
index 5d547f482..a3a5806e8 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -12,7 +12,6 @@
from pathlib import Path
from typing import ClassVar
-import pandas as pd
import requests
import openml
@@ -286,8 +285,7 @@ def check_task_existence(
int, None
"""
return_val = None
- tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe")
- assert isinstance(tasks, pd.DataFrame)
+ tasks = openml.tasks.list_tasks(task_type=task_type)
if len(tasks) == 0:
return None
tasks = tasks.loc[tasks["did"] == dataset_id]
diff --git a/openml/utils.py b/openml/utils.py
index 82859fd40..7e72e7aee 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -6,11 +6,10 @@
import warnings
from functools import wraps
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Mapping, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload
from typing_extensions import Literal, ParamSpec
import numpy as np
-import pandas as pd
import xmltodict
from minio.helpers import ProgressType
from tqdm import tqdm
@@ -27,6 +26,7 @@
P = ParamSpec("P")
R = TypeVar("R")
+ _SizedT = TypeVar("_SizedT", bound=Sized)
@overload
@@ -237,39 +237,13 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool:
raise e
-@overload
-def _list_all(
- listing_call: Callable[P, Any],
- list_output_format: Literal["dict"] = ...,
- *args: P.args,
- **filters: P.kwargs,
-) -> dict: ...
-
-
-@overload
-def _list_all(
- listing_call: Callable[P, Any],
- list_output_format: Literal["object"],
- *args: P.args,
- **filters: P.kwargs,
-) -> dict: ...
-
-
-@overload
-def _list_all(
- listing_call: Callable[P, Any],
- list_output_format: Literal["dataframe"],
- *args: P.args,
- **filters: P.kwargs,
-) -> pd.DataFrame: ...
-
-
-def _list_all( # noqa: C901, PLR0912
- listing_call: Callable[P, Any],
- list_output_format: Literal["dict", "dataframe", "object"] = "dict",
- *args: P.args,
- **filters: P.kwargs,
-) -> dict | pd.DataFrame:
+def _list_all( # noqa: C901
+ listing_call: Callable[[int, int], _SizedT],
+ *,
+ limit: int | None = None,
+ offset: int | None = None,
+ batch_size: int | None = 10_000,
+) -> list[_SizedT]:
"""Helper to handle paged listing requests.
Example usage:
@@ -279,44 +253,44 @@ def _list_all( # noqa: C901, PLR0912
Parameters
----------
listing_call : callable
- Call listing, e.g. list_evaluations.
- list_output_format : str, optional (default='dict')
- The parameter decides the format of the output.
- - If 'dict' the output is a dict of dict
- - If 'dataframe' the output is a pandas DataFrame
- - If 'object' the output is a dict of objects (only for some `listing_call`)
- *args : Variable length argument list
- Any required arguments for the listing call.
- **filters : Arbitrary keyword arguments
- Any filters that can be applied to the listing function.
- additionally, the batch_size can be specified. This is
- useful for testing purposes.
+        Call listing, e.g. list_evaluations. Takes two positional
+        arguments: a batch size (the per-request limit) and an offset.
+    limit : int, optional
+        The maximum total number of results to fetch. If not provided, the
+        function will request the first batch and then continue until no
+        more results are returned.
+    offset : int, optional
+        The initial offset to use for the listing call.
+    batch_size : int, optional
+        The number of results to request per call (default 10_000).
Returns
-------
- dict or dataframe
+        A list of batches, each of the type returned by the listing call.
"""
- # eliminate filters that have a None value
- active_filters = {key: value for key, value in filters.items() if value is not None}
page = 0
- result = pd.DataFrame() if list_output_format == "dataframe" else {}
+ results: list[_SizedT] = []
+
+ offset = offset if offset is not None else 0
+ batch_size = batch_size if batch_size is not None else 10_000
+
+ LIMIT = limit
+ BATCH_SIZE_ORIG = batch_size
-    # Default batch size per paging.
-    # This one can be set in filters (batch_size), but should not be
-    # changed afterwards. The derived batch_size can be changed.
+    # Default batch size per page. BATCH_SIZE_ORIG should not be changed after
+    # this point; only the derived batch_size may be adjusted below.
- BATCH_SIZE_ORIG = active_filters.pop("batch_size", 10000)
if not isinstance(BATCH_SIZE_ORIG, int):
raise ValueError(f"'batch_size' should be an integer but got {BATCH_SIZE_ORIG}")
- # max number of results to be shown
- LIMIT: int | float | None = active_filters.pop("size", None) # type: ignore
if (LIMIT is not None) and (not isinstance(LIMIT, int)) and (not np.isinf(LIMIT)):
raise ValueError(f"'limit' should be an integer or inf but got {LIMIT}")
+    # If our batch size is larger than the limit, we should only
+    # request one batch of size LIMIT.
if LIMIT is not None and BATCH_SIZE_ORIG > LIMIT:
BATCH_SIZE_ORIG = LIMIT
- offset = active_filters.pop("offset", 0)
if not isinstance(offset, int):
raise ValueError(f"'offset' should be an integer but got {offset}")
@@ -324,26 +298,16 @@ def _list_all( # noqa: C901, PLR0912
while True:
try:
current_offset = offset + BATCH_SIZE_ORIG * page
- new_batch = listing_call(
- *args,
- output_format=list_output_format, # type: ignore
- **{**active_filters, "limit": batch_size, "offset": current_offset}, # type: ignore
- )
+ new_batch = listing_call(batch_size, current_offset)
except openml.exceptions.OpenMLServerNoResult:
- # we want to return an empty dict in this case
-            # NOTE: This above statement may not actually happen, but we could just return here
-            # to enforce it...
+            # NOTE: This exception may never actually be raised, but we could just
+            # return here to enforce an empty result.
break
- if list_output_format == "dataframe":
- if len(result) == 0:
- result = new_batch
- else:
- result = pd.concat([result, new_batch], ignore_index=True)
- else:
- # For output_format = 'dict' (or catch all)
- result.update(new_batch)
+ results.append(new_batch)
+        # If the batch has fewer results than the requested batch_size, it is the
+        # last batch and we can stop.
if len(new_batch) < batch_size:
break
@@ -352,14 +316,15 @@ def _list_all( # noqa: C901, PLR0912
# check if the number of required results has been achieved
# always do a 'bigger than' check,
# in case of bugs to prevent infinite loops
- if len(result) >= LIMIT:
+ n_received = sum(len(result) for result in results)
+ if n_received >= LIMIT:
break
# check if there are enough results to fulfill a batch
- if LIMIT - len(result) < BATCH_SIZE_ORIG:
- batch_size = LIMIT - len(result)
+ if LIMIT - n_received < BATCH_SIZE_ORIG:
+ batch_size = LIMIT - n_received
- return result
+ return results
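+# A sketch of the calling convention used by the public listing functions: bind the
+# entity-specific filters with functools.partial so that this helper only has to pass
+# (batch_size, offset), then concatenate the returned batches, e.g. for studies:
+#
+#     listing_call = partial(_list_studies, main_entity_type="run", status="active",
+#                            uploader=None, benchmark_suite=None)
+#     batches = _list_all(listing_call, offset=0, limit=250, batch_size=100)
+#     studies = pd.concat(batches) if batches else pd.DataFrame()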
def _get_cache_dir_for_key(key: str) -> Path:
diff --git a/tests/conftest.py b/tests/conftest.py
index 79ee2bbd3..b523117c1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,6 +23,10 @@
# License: BSD 3-Clause
from __future__ import annotations
+import multiprocessing
+
+multiprocessing.set_start_method("spawn", force=True)
+
from collections.abc import Iterator
import logging
import os
@@ -33,6 +37,7 @@
import openml
from openml.testing import TestBase
+
# creating logger for unit test file deletion status
logger = logging.getLogger("unit_tests")
logger.setLevel(logging.DEBUG)
@@ -170,7 +175,7 @@ def pytest_sessionfinish() -> None:
# Delete any test dirs that remain
# In edge cases due to a mixture of pytest parametrization and oslo concurrency,
# some file lock are created after leaving the test. This removes these files!
- test_files_dir=Path(__file__).parent.parent / "openml"
+ test_files_dir = Path(__file__).parent.parent / "openml"
for f in test_files_dir.glob("tests.*"):
if f.is_dir():
shutil.rmtree(f)
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 4598b8985..d132c4233 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -95,25 +95,8 @@ def test__unpack_categories_with_nan_likes(self):
self.assertListEqual(list(clean_series.values), expected_values)
self.assertListEqual(list(clean_series.cat.categories.values), list("ab"))
- def test_get_data_array(self):
- # Basic usage
- rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array")
- assert isinstance(rval, np.ndarray)
- assert rval.dtype == np.float32
- assert rval.shape == (898, 39)
- assert len(categorical) == 39
- assert all(isinstance(cat, bool) for cat in categorical)
- assert len(attribute_names) == 39
- assert all(isinstance(att, str) for att in attribute_names)
- assert _ is None
-
- # check that an error is raised when the dataset contains string
- err_msg = "PyOpenML cannot handle string when returning numpy arrays"
- with pytest.raises(PyOpenMLError, match=err_msg):
- self.titanic.get_data(dataset_format="array")
-
def test_get_data_pandas(self):
- data, _, _, _ = self.titanic.get_data(dataset_format="dataframe")
+ data, _, _, _ = self.titanic.get_data()
assert isinstance(data, pd.DataFrame)
assert data.shape[1] == len(self.titanic.features)
assert data.shape[0] == 1309
@@ -137,7 +120,6 @@ def test_get_data_pandas(self):
assert data[col_name].dtype.name == col_dtype[col_name]
X, y, _, _ = self.titanic.get_data(
- dataset_format="dataframe",
target=self.titanic.default_target_attribute,
)
assert isinstance(X, pd.DataFrame)
@@ -160,12 +142,6 @@ def test_get_data_boolean_pandas(self):
assert data["c"].dtype.name == "category"
assert set(data["c"].cat.categories) == {True, False}
- def test_get_data_no_str_data_for_nparrays(self):
- # check that an error is raised when the dataset contains string
- err_msg = "PyOpenML cannot handle string when returning numpy arrays"
- with pytest.raises(PyOpenMLError, match=err_msg):
- self.titanic.get_data(dataset_format="array")
-
def _check_expected_type(self, dtype, is_cat, col):
if is_cat:
expected_type = "category"
@@ -193,16 +169,6 @@ def test_get_data_with_rowid(self):
assert rval.shape == (898, 38)
assert len(categorical) == 38
- def test_get_data_with_target_array(self):
- X, y, _, attribute_names = self.dataset.get_data(dataset_format="array", target="class")
- assert isinstance(X, np.ndarray)
- assert X.dtype == np.float32
- assert X.shape == (898, 38)
- assert y.dtype in [np.int32, np.int64]
- assert y.shape == (898,)
- assert len(attribute_names) == 38
- assert "class" not in attribute_names
-
@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157")
def test_get_data_with_target_pandas(self):
X, y, categorical, attribute_names = self.dataset.get_data(target="class")
@@ -247,13 +213,8 @@ def test_get_data_with_nonexisting_class(self):
# This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
# label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
# indices 4 and 5, and that nothing is mapped to index 3.
- _, y, _, _ = self.dataset.get_data("class", dataset_format="dataframe")
+ _, y, _, _ = self.dataset.get_data("class")
assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"]
- _, y, _, _ = self.dataset.get_data("class", dataset_format="array")
- assert np.min(y) == 0
- assert np.max(y) == 5
- # Check that no label is mapped to 3, since it is reserved for label '4'.
- assert np.sum(y == 3) == 0
def test_get_data_corrupt_pickle(self):
# Lazy loaded dataset, populate cache.
@@ -312,7 +273,8 @@ def test_lazy_loading_metadata(self):
def test_equality_comparison(self):
self.assertEqual(self.iris, self.iris)
self.assertNotEqual(self.iris, self.titanic)
- self.assertNotEqual(self.titanic, 'Wrong_object')
+ self.assertNotEqual(self.titanic, "Wrong_object")
+
class OpenMLDatasetTestOnTestServer(TestBase):
def setUp(self):
@@ -324,14 +286,14 @@ def test_tagging(self):
# tags can be at most 64 alphanumeric (+ underscore) chars
unique_indicator = str(time()).replace(".", "")
tag = f"test_tag_OpenMLDatasetTestOnTestServer_{unique_indicator}"
- datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
+ datasets = openml.datasets.list_datasets(tag=tag)
assert datasets.empty
self.dataset.push_tag(tag)
- datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
+ datasets = openml.datasets.list_datasets(tag=tag)
assert len(datasets) == 1
assert 125 in datasets["did"]
self.dataset.remove_tag(tag)
- datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
+ datasets = openml.datasets.list_datasets(tag=tag)
assert datasets.empty
def test_get_feature_with_ontology_data_id_11(self):
@@ -345,21 +307,20 @@ def test_get_feature_with_ontology_data_id_11(self):
def test_add_remove_ontology_to_dataset(self):
did = 1
feature_index = 1
- ontology = 'https://www.openml.org/unittest/' + str(time())
+ ontology = "https://www.openml.org/unittest/" + str(time())
openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology)
openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology)
def test_add_same_ontology_multiple_features(self):
did = 1
- ontology = 'https://www.openml.org/unittest/' + str(time())
+ ontology = "https://www.openml.org/unittest/" + str(time())
for i in range(3):
openml.datasets.functions.data_feature_add_ontology(did, i, ontology)
-
def test_add_illegal_long_ontology(self):
did = 1
- ontology = 'http://www.google.com/' + ('a' * 257)
+ ontology = "http://www.google.com/" + ("a" * 257)
try:
openml.datasets.functions.data_feature_add_ontology(did, 1, ontology)
assert False
@@ -368,13 +329,14 @@ def test_add_illegal_long_ontology(self):
def test_add_illegal_url_ontology(self):
did = 1
- ontology = 'not_a_url' + str(time())
+ ontology = "not_a_url" + str(time())
try:
openml.datasets.functions.data_feature_add_ontology(did, 1, ontology)
assert False
except openml.exceptions.OpenMLServerException as e:
assert e.code == 1106
+
@pytest.mark.production()
class OpenMLDatasetTestSparse(TestBase):
_multiprocess_can_split_ = True
@@ -385,28 +347,8 @@ def setUp(self):
self.sparse_dataset = openml.datasets.get_dataset(4136, download_data=False)
- def test_get_sparse_dataset_array_with_target(self):
- X, y, _, attribute_names = self.sparse_dataset.get_data(
- dataset_format="array",
- target="class",
- )
-
- assert sparse.issparse(X)
- assert X.dtype == np.float32
- assert X.shape == (600, 20000)
-
- assert isinstance(y, np.ndarray)
- assert y.dtype in [np.int32, np.int64]
- assert y.shape == (600,)
-
- assert len(attribute_names) == 20000
- assert "class" not in attribute_names
-
def test_get_sparse_dataset_dataframe_with_target(self):
- X, y, _, attribute_names = self.sparse_dataset.get_data(
- dataset_format="dataframe",
- target="class",
- )
+ X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
assert isinstance(X, pd.DataFrame)
assert isinstance(X.dtypes[0], pd.SparseDtype)
assert X.shape == (600, 20000)
@@ -418,18 +360,6 @@ def test_get_sparse_dataset_dataframe_with_target(self):
assert len(attribute_names) == 20000
assert "class" not in attribute_names
- def test_get_sparse_dataset_array(self):
- rval, _, categorical, attribute_names = self.sparse_dataset.get_data(dataset_format="array")
- assert sparse.issparse(rval)
- assert rval.dtype == np.float32
- assert rval.shape == (600, 20001)
-
- assert len(categorical) == 20001
- assert all(isinstance(cat, bool) for cat in categorical)
-
- assert len(attribute_names) == 20001
- assert all(isinstance(att, str) for att in attribute_names)
-
def test_get_sparse_dataset_dataframe(self):
rval, *_ = self.sparse_dataset.get_data()
assert isinstance(rval, pd.DataFrame)
@@ -439,59 +369,18 @@ def test_get_sparse_dataset_dataframe(self):
)
assert rval.shape == (600, 20001)
- def test_get_sparse_dataset_with_rowid(self):
- self.sparse_dataset.row_id_attribute = ["V256"]
- rval, _, categorical, _ = self.sparse_dataset.get_data(
- dataset_format="array",
- include_row_id=True,
- )
- assert sparse.issparse(rval)
- assert rval.dtype == np.float32
- assert rval.shape == (600, 20001)
- assert len(categorical) == 20001
-
- rval, _, categorical, _ = self.sparse_dataset.get_data(
- dataset_format="array",
- include_row_id=False,
- )
- assert sparse.issparse(rval)
- assert rval.dtype == np.float32
- assert rval.shape == (600, 20000)
- assert len(categorical) == 20000
-
- def test_get_sparse_dataset_with_ignore_attributes(self):
- self.sparse_dataset.ignore_attribute = ["V256"]
- rval, _, categorical, _ = self.sparse_dataset.get_data(
- dataset_format="array",
- include_ignore_attribute=True,
- )
- assert sparse.issparse(rval)
- assert rval.dtype == np.float32
- assert rval.shape == (600, 20001)
-
- assert len(categorical) == 20001
- rval, _, categorical, _ = self.sparse_dataset.get_data(
- dataset_format="array",
- include_ignore_attribute=False,
- )
- assert sparse.issparse(rval)
- assert rval.dtype == np.float32
- assert rval.shape == (600, 20000)
- assert len(categorical) == 20000
-
def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
# TODO: re-add row_id and ignore attributes
self.sparse_dataset.ignore_attribute = ["V256"]
self.sparse_dataset.row_id_attribute = ["V512"]
X, y, categorical, _ = self.sparse_dataset.get_data(
- dataset_format="array",
target="class",
include_row_id=False,
include_ignore_attribute=False,
)
- assert sparse.issparse(X)
- assert X.dtype == np.float32
- assert y.dtype in [np.int32, np.int64]
+ assert all(dtype == pd.SparseDtype(np.float32, fill_value=0.0) for dtype in X.dtypes)
+        # The removed array format returned dense values here; now sparse data is
+        # returned as-is and conversion is left to the user.
+ assert isinstance(y.dtypes, pd.SparseDtype)
assert X.shape == (600, 19998)
assert len(categorical) == 19998
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 1dc9daab1..fb29009a3 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -113,9 +113,8 @@ def test_tag_untag_dataset(self):
all_tags = _tag_entity("data", 1, tag, untag=True)
assert tag not in all_tags
- def test_list_datasets_output_format(self):
- datasets = openml.datasets.list_datasets(output_format="dataframe")
- assert isinstance(datasets, pd.DataFrame)
+ def test_list_datasets_length(self):
+ datasets = openml.datasets.list_datasets()
assert len(datasets) >= 100
def test_list_datasets_paginate(self):
@@ -123,14 +122,17 @@ def test_list_datasets_paginate(self):
max = 100
for i in range(0, max, size):
datasets = openml.datasets.list_datasets(offset=i, size=size)
- assert size == len(datasets)
- self._check_datasets(datasets)
+ assert len(datasets) == size
+ assert len(datasets.columns) >= 2
+ assert "did" in datasets.columns
+ assert datasets["did"].dtype == int
+ assert "status" in datasets.columns
+ assert datasets["status"].dtype == pd.CategoricalDtype(
+ categories=["in_preparation", "active", "deactivated"],
+ )
def test_list_datasets_empty(self):
- datasets = openml.datasets.list_datasets(
- tag="NoOneWouldUseThisTagAnyway",
- output_format="dataframe",
- )
+ datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway")
assert datasets.empty
@pytest.mark.production()
@@ -308,8 +310,9 @@ def ensure_absence_of_real_data():
def test_get_dataset_sparse(self):
dataset = openml.datasets.get_dataset(102)
- X, *_ = dataset.get_data(dataset_format="array")
- assert isinstance(X, scipy.sparse.csr_matrix)
+ X, *_ = dataset.get_data()
+ assert isinstance(X, pd.DataFrame)
+ assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes)
def test_download_rowid(self):
# Smoke test which checks that the dataset has the row-id set correctly
@@ -569,11 +572,7 @@ def test_upload_dataset_with_url(self):
def _assert_status_of_dataset(self, *, did: int, status: str):
"""Asserts there is exactly one dataset with id `did` and its current status is `status`"""
# need to use listing fn, as this is immune to cache
- result = openml.datasets.list_datasets(
- data_id=[did],
- status="all",
- output_format="dataframe",
- )
+ result = openml.datasets.list_datasets(data_id=[did], status="all")
result = result.to_dict(orient="index")
# I think we should drop the test that one result is returned,
# the server should never return multiple results?
@@ -1521,8 +1520,8 @@ def test_list_datasets_with_high_size_parameter(self):
# Testing on prod since concurrent deletion of uploded datasets make the test fail
openml.config.server = self.production_server
- datasets_a = openml.datasets.list_datasets(output_format="dataframe")
- datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf)
+ datasets_a = openml.datasets.list_datasets()
+ datasets_b = openml.datasets.list_datasets(size=np.inf)
# Reverting to test server
openml.config.server = self.test_server
@@ -1791,7 +1790,7 @@ def _assert_datasets_have_id_and_valid_status(datasets: pd.DataFrame):
@pytest.fixture(scope="module")
def all_datasets():
- return openml.datasets.list_datasets(output_format="dataframe")
+ return openml.datasets.list_datasets()
def test_list_datasets(all_datasets: pd.DataFrame):
@@ -1803,49 +1802,37 @@ def test_list_datasets(all_datasets: pd.DataFrame):
def test_list_datasets_by_tag(all_datasets: pd.DataFrame):
- tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe")
+ tag_datasets = openml.datasets.list_datasets(tag="study_14")
assert 0 < len(tag_datasets) < len(all_datasets)
_assert_datasets_have_id_and_valid_status(tag_datasets)
def test_list_datasets_by_size():
- datasets = openml.datasets.list_datasets(size=5, output_format="dataframe")
+ datasets = openml.datasets.list_datasets(size=5)
assert len(datasets) == 5
_assert_datasets_have_id_and_valid_status(datasets)
def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame):
- small_datasets = openml.datasets.list_datasets(
- number_instances="5..100",
- output_format="dataframe",
- )
+ small_datasets = openml.datasets.list_datasets(number_instances="5..100")
assert 0 < len(small_datasets) <= len(all_datasets)
_assert_datasets_have_id_and_valid_status(small_datasets)
def test_list_datasets_by_number_features(all_datasets: pd.DataFrame):
- wide_datasets = openml.datasets.list_datasets(
- number_features="50..100",
- output_format="dataframe",
- )
+ wide_datasets = openml.datasets.list_datasets(number_features="50..100")
assert 8 <= len(wide_datasets) < len(all_datasets)
_assert_datasets_have_id_and_valid_status(wide_datasets)
def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame):
- five_class_datasets = openml.datasets.list_datasets(
- number_classes="5",
- output_format="dataframe",
- )
+ five_class_datasets = openml.datasets.list_datasets(number_classes="5")
assert 3 <= len(five_class_datasets) < len(all_datasets)
_assert_datasets_have_id_and_valid_status(five_class_datasets)
def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame):
- na_datasets = openml.datasets.list_datasets(
- number_missing_values="5..100",
- output_format="dataframe",
- )
+ na_datasets = openml.datasets.list_datasets(number_missing_values="5..100")
assert 5 <= len(na_datasets) < len(all_datasets)
_assert_datasets_have_id_and_valid_status(na_datasets)
@@ -1855,7 +1842,6 @@ def test_list_datasets_combined_filters(all_datasets: pd.DataFrame):
tag="study_14",
number_instances="100..1000",
number_missing_values="800..1000",
- output_format="dataframe",
)
assert 1 <= len(combined_filter_datasets) < len(all_datasets)
_assert_datasets_have_id_and_valid_status(combined_filter_datasets)
@@ -1955,8 +1941,12 @@ def test_get_dataset_with_invalid_id() -> None:
openml.datasets.get_dataset(INVALID_ID)
assert e.value.code == 111
+
def test_read_features_from_xml_with_whitespace() -> None:
from openml.datasets.dataset import _read_features
- features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
+
+ features_file = (
+ Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
+ )
dict = _read_features(features_file)
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index 7af01384f..37b0ce7c8 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -17,7 +17,6 @@ def _check_list_evaluation_setups(self, **kwargs):
"predictive_accuracy",
**kwargs,
sort_order="desc",
- output_format="dataframe",
)
evals = openml.evaluations.list_evaluations(
"predictive_accuracy",
@@ -250,7 +249,6 @@ def test_list_evaluations_setups_filter_flow(self):
flows=flow_id,
size=size,
sort_order="desc",
- output_format="dataframe",
parameters_in_separate_columns=True,
)
columns = list(evals_cols.columns)
diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py
index a0980f5f9..a9ad7e8c1 100644
--- a/tests/test_evaluations/test_evaluations_example.py
+++ b/tests/test_evaluations/test_evaluations_example.py
@@ -18,14 +18,12 @@ def test_example_python_paper(self):
):
import matplotlib.pyplot as plt
import numpy as np
-
import openml
df = openml.evaluations.list_evaluations_setups(
"predictive_accuracy",
flows=[8353],
tasks=[6],
- output_format="dataframe",
parameters_in_separate_columns=True,
) # Choose an SVM flow, for example 8353, and a task.
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
index e181aaa15..706a67aa6 100644
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
@@ -62,6 +62,42 @@ def fit(self, X, y):
pass
+def _cat_col_selector(X):
+ return X.select_dtypes(include=["object", "category"]).columns
+
+
+def _get_sklearn_preprocessing():
+ from sklearn.compose import ColumnTransformer
+
+ return [
+ (
+ "cat_handling",
+ ColumnTransformer(
+ transformers=[
+ (
+ "cat",
+ sklearn.pipeline.Pipeline(
+ [
+ (
+ "cat_si",
+ SimpleImputer(
+ strategy="constant",
+ fill_value="missing",
+ ),
+ ),
+ ("cat_ohe", OneHotEncoder(handle_unknown="ignore")),
+ ],
+ ),
+ _cat_col_selector,
+ )
+ ],
+ remainder="passthrough",
+ ),
+ ),
+ ("imp", SimpleImputer()),
+ ]
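+# The helper above mirrors the preprocessing used throughout these tests: impute and
+# one-hot encode the categorical columns picked by `_cat_col_selector`, pass the
+# remaining columns through, then impute any leftover missing values, e.g.:
+#
+#     pipeline = sklearn.pipeline.Pipeline(
+#         steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())],
+#     )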
+
+
class TestSklearnExtensionFlowFunctions(TestBase):
# Splitting not helpful, these test's don't rely on the server and take less
# than 1 seconds
@@ -261,7 +297,7 @@ def test_serialize_model(self):
("min_samples_split", "2"),
("min_weight_fraction_leaf", "0.0"),
("presort", presort_val),
- ('monotonic_cst', 'null'),
+ ("monotonic_cst", "null"),
("random_state", "null"),
("splitter", '"best"'),
),
@@ -331,21 +367,23 @@ def test_serialize_model_clustering(self):
n_init = '"auto"'
algorithm = '"auto"' if sklearn_version < Version("1.1") else '"lloyd"'
- fixture_parameters = OrderedDict([
- ("algorithm", algorithm),
- ("copy_x", "true"),
- ("init", '"k-means++"'),
- ("max_iter", "300"),
- ("n_clusters", "8"),
- ("n_init", n_init),
- ("n_jobs", n_jobs_val),
- ("precompute_distances", precomp_val),
- ("random_state", "null"),
- ("tol", "0.0001"),
- ("verbose", "0"),
- ])
-
- if sklearn_version >= Version("1.0" ):
+ fixture_parameters = OrderedDict(
+ [
+ ("algorithm", algorithm),
+ ("copy_x", "true"),
+ ("init", '"k-means++"'),
+ ("max_iter", "300"),
+ ("n_clusters", "8"),
+ ("n_init", n_init),
+ ("n_jobs", n_jobs_val),
+ ("precompute_distances", precomp_val),
+ ("random_state", "null"),
+ ("tol", "0.0001"),
+ ("verbose", "0"),
+ ]
+ )
+
+ if sklearn_version >= Version("1.0"):
fixture_parameters.pop("n_jobs")
fixture_parameters.pop("precompute_distances")
@@ -369,7 +407,9 @@ def test_serialize_model_clustering(self):
@pytest.mark.sklearn()
def test_serialize_model_with_subcomponent(self):
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
estimator_param = {estimator_name: sklearn.tree.DecisionTreeClassifier()}
model = sklearn.ensemble.AdaBoostClassifier(
n_estimators=100,
@@ -428,8 +468,7 @@ def test_serialize_model_with_subcomponent(self):
serialization.components[estimator_name].class_name == fixture_subcomponent_class_name
)
assert (
- serialization.components[estimator_name].description
- == fixture_subcomponent_description
+ serialization.components[estimator_name].description == fixture_subcomponent_description
)
self.assertDictEqual(structure, fixture_structure)
@@ -702,7 +741,9 @@ def test_serialize_column_transformer_pipeline(self):
reason="Pipeline processing behaviour updated",
)
def test_serialize_feature_union(self):
- sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+ sparse_parameter = (
+ "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+ )
ohe_params = {sparse_parameter: False}
if Version(sklearn.__version__) >= Version("0.20"):
ohe_params["categories"] = "auto"
@@ -719,7 +760,9 @@ def test_serialize_feature_union(self):
)
structure = serialization.get_structure("name")
# OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ module_name_encoder = (
+ "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ )
scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
fixture_name = (
"sklearn.pipeline.FeatureUnion("
@@ -728,7 +771,7 @@ def test_serialize_feature_union(self):
)
fixture_structure = {
fixture_name: [],
- f"sklearn.preprocessing.{module_name_encoder}." "OneHotEncoder": ["ohe"],
+ f"sklearn.preprocessing.{module_name_encoder}.OneHotEncoder": ["ohe"],
f"sklearn.preprocessing.{scaler_name}.StandardScaler": ["scaler"],
}
assert serialization.name == fixture_name
@@ -765,7 +808,9 @@ def test_serialize_feature_union(self):
@pytest.mark.sklearn()
def test_serialize_feature_union_switched_names(self):
- ohe_params = {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {}
+ ohe_params = (
+ {"categories": "auto"} if Version(sklearn.__version__) >= Version("0.20") else {}
+ )
ohe = sklearn.preprocessing.OneHotEncoder(**ohe_params)
scaler = sklearn.preprocessing.StandardScaler()
fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)])
@@ -787,7 +832,9 @@ def test_serialize_feature_union_switched_names(self):
)
# OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ module_name_encoder = (
+ "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ )
scaler_name = "data" if Version(sklearn.__version__) < Version("0.22") else "_data"
assert (
fu1_serialization.name == "sklearn.pipeline.FeatureUnion("
@@ -836,7 +883,9 @@ def test_serialize_complex_flow(self):
)
structure = serialized.get_structure("name")
# OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ module_name_encoder = (
+ "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ )
ohe_name = "sklearn.preprocessing.%s.OneHotEncoder" % module_name_encoder
scaler_name = "sklearn.preprocessing.{}.StandardScaler".format(
"data" if Version(sklearn.__version__) < Version("0.22") else "_data",
@@ -849,13 +898,13 @@ def test_serialize_complex_flow(self):
weight_name,
tree_name,
)
- pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={}," "boosting={})".format(
+ pipeline_name = "sklearn.pipeline.Pipeline(ohe={},scaler={},boosting={})".format(
ohe_name,
scaler_name,
boosting_name,
)
fixture_name = (
- "sklearn.model_selection._search.RandomizedSearchCV" "(estimator=%s)" % pipeline_name
+ "sklearn.model_selection._search.RandomizedSearchCV(estimator=%s)" % pipeline_name
)
fixture_structure = {
ohe_name: ["estimator", "ohe"],
@@ -1222,7 +1271,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
fu = sklearn.pipeline.FeatureUnion((("pca1", pca), ("pca2", pca2)))
fixture = (
- "Found a second occurence of component .*.PCA when trying " "to serialize FeatureUnion"
+ "Found a second occurence of component .*.PCA when trying to serialize FeatureUnion"
)
with pytest.raises(ValueError, match=fixture):
self.extension.model_to_flow(fu)
@@ -1294,7 +1343,9 @@ def test_paralizable_check(self):
# using this param distribution should not raise an exception
legal_param_dist = {"n_estimators": [2, 3, 4]}
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
legal_models = [
sklearn.ensemble.RandomForestClassifier(),
sklearn.ensemble.RandomForestClassifier(n_jobs=5),
@@ -1506,7 +1557,9 @@ def test_deserialize_complex_with_defaults(self):
pipe_adjusted = sklearn.clone(pipe_orig)
impute_strategy = "median" if Version(sklearn.__version__) < Version("0.23") else "mean"
sparse = Version(sklearn.__version__) >= Version("0.23")
- sparse_parameter = "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+ sparse_parameter = (
+ "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output"
+ )
estimator_name = (
"base_estimator" if Version(sklearn.__version__) < Version("1.2") else "estimator"
)
@@ -1532,7 +1585,9 @@ def test_deserialize_complex_with_defaults(self):
@pytest.mark.sklearn()
def test_openml_param_name_to_sklearn(self):
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
@@ -1569,17 +1624,21 @@ def test_openml_param_name_to_sklearn(self):
def test_obtain_parameter_values_flow_not_from_server(self):
model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
flow = self.extension.model_to_flow(model)
- logistic_name = "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic"
+ logistic_name = (
+ "logistic" if Version(sklearn.__version__) < Version("0.22") else "_logistic"
+ )
msg = f"Flow sklearn.linear_model.{logistic_name}.LogisticRegression has no flow_id!"
with pytest.raises(ValueError, match=msg):
self.extension.obtain_parameter_values(flow)
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
model = sklearn.ensemble.AdaBoostClassifier(
**{
estimator_name: sklearn.linear_model.LogisticRegression(
- solver="lbfgs",
+ solver="lbfgs",
),
}
)
@@ -1650,7 +1709,7 @@ def test_run_model_on_task(self):
("dummy", sklearn.dummy.DummyClassifier()),
],
)
- openml.runs.run_model_on_task(pipe, task, dataset_format="array")
+ openml.runs.run_model_on_task(pipe, task)
@pytest.mark.sklearn()
def test_seed_model(self):
@@ -1714,13 +1773,13 @@ def test_run_model_on_fold_classification_1_array(self):
X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X[train_indices]
- y_train = y[train_indices]
- X_test = X[test_indices]
- y_test = y[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
+ y_test = y.iloc[test_indices]
pipeline = sklearn.pipeline.Pipeline(
- steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeClassifier())],
+ steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.tree.DecisionTreeClassifier())],
)
# TODO add some mocking here to actually test the innards of this function, too!
res = self.extension._run_model_on_fold(
@@ -1751,7 +1810,9 @@ def test_run_model_on_fold_classification_1_array(self):
assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))
# check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict))
+ fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
+ lambda: collections.defaultdict(dict)
+ )
for measure in user_defined_measures:
fold_evaluations[measure][0][0] = user_defined_measures[measure]
@@ -1778,7 +1839,7 @@ def test_run_model_on_fold_classification_1_dataframe(self):
task = openml.tasks.get_task(1) # anneal; crossvalidation
# diff test_run_model_on_fold_classification_1_array()
- X, y = task.get_X_and_y(dataset_format="dataframe")
+ X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
@@ -1786,7 +1847,9 @@ def test_run_model_on_fold_classification_1_dataframe(self):
y_test = y.iloc[test_indices]
# Helper functions to return required columns for ColumnTransformer
- sparse = {"sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False}
+ sparse = {
+ "sparse" if Version(sklearn.__version__) < Version("1.4") else "sparse_output": False
+ }
cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder(handle_unknown="ignore", **sparse),
@@ -1825,7 +1888,9 @@ def test_run_model_on_fold_classification_1_dataframe(self):
assert np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))
# check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict))
+ fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
+ lambda: collections.defaultdict(dict)
+ )
for measure in user_defined_measures:
fold_evaluations[measure][0][0] = user_defined_measures[measure]
@@ -1846,14 +1911,19 @@ def test_run_model_on_fold_classification_2(self):
X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X[train_indices]
- y_train = y[train_indices]
- X_test = X[test_indices]
- y_test = y[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
+ y_test = y.iloc[test_indices]
pipeline = sklearn.model_selection.GridSearchCV(
- sklearn.tree.DecisionTreeClassifier(),
- {"max_depth": [1, 2]},
+ sklearn.pipeline.Pipeline(
+ steps=[
+ *_get_sklearn_preprocessing(),
+ ("clf", sklearn.tree.DecisionTreeClassifier()),
+ ],
+ ),
+ {"clf__max_depth": [1, 2]},
)
# TODO add some mocking here to actually test the innards of this function, too!
res = self.extension._run_model_on_fold(
@@ -1878,7 +1948,9 @@ def test_run_model_on_fold_classification_2(self):
assert np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape))
# check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict))
+ fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
+ lambda: collections.defaultdict(dict)
+ )
for measure in user_defined_measures:
fold_evaluations[measure][0][0] = user_defined_measures[measure]
@@ -1900,7 +1972,7 @@ class HardNaiveBayes(sklearn.naive_bayes.GaussianNB):
# class for testing a naive bayes classifier that does not allow soft
# predictions
def predict_proba(*args, **kwargs):
- raise AttributeError("predict_proba is not available when " "probability=False")
+ raise AttributeError("predict_proba is not available when probability=False")
# task 1 (test server) is important: it is a task with an unused class
tasks = [
@@ -1919,17 +1991,17 @@ def predict_proba(*args, **kwargs):
fold=0,
sample=0,
)
- X_train = X[train_indices]
- y_train = y[train_indices]
- X_test = X[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
clf1 = sklearn.pipeline.Pipeline(
steps=[
- ("imputer", SimpleImputer()),
+ *_get_sklearn_preprocessing(),
("estimator", sklearn.naive_bayes.GaussianNB()),
],
)
clf2 = sklearn.pipeline.Pipeline(
- steps=[("imputer", SimpleImputer()), ("estimator", HardNaiveBayes())],
+ steps=[*_get_sklearn_preprocessing(), ("estimator", HardNaiveBayes())],
)
pred_1, proba_1, _, _ = self.extension._run_model_on_fold(
@@ -1974,10 +2046,10 @@ def test_run_model_on_fold_regression(self):
X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
- X_train = X[train_indices]
- y_train = y[train_indices]
- X_test = X[test_indices]
- y_test = y[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
+ y_test = y.iloc[test_indices]
pipeline = sklearn.pipeline.Pipeline(
steps=[("imp", SimpleImputer()), ("clf", sklearn.tree.DecisionTreeRegressor())],
@@ -2001,7 +2073,9 @@ def test_run_model_on_fold_regression(self):
assert y_hat_proba is None
# check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict))
+ fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
+ lambda: collections.defaultdict(dict)
+ )
for measure in user_defined_measures:
fold_evaluations[measure][0][0] = user_defined_measures[measure]
@@ -2023,10 +2097,10 @@ def test_run_model_on_fold_clustering(self):
openml.config.server = self.production_server
task = openml.tasks.get_task(126033)
- X = task.get_X(dataset_format="array")
+ X = task.get_X()
pipeline = sklearn.pipeline.Pipeline(
- steps=[("imp", SimpleImputer()), ("clf", sklearn.cluster.KMeans())],
+ steps=[*_get_sklearn_preprocessing(), ("clf", sklearn.cluster.KMeans())],
)
# TODO add some mocking here to actually test the innards of this function, too!
res = self.extension._run_model_on_fold(
@@ -2045,7 +2119,9 @@ def test_run_model_on_fold_clustering(self):
assert y_hat_proba is None
# check user defined measures
- fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(lambda: collections.defaultdict(dict))
+ fold_evaluations: dict[str, dict[int, dict[int, float]]] = collections.defaultdict(
+ lambda: collections.defaultdict(dict)
+ )
for measure in user_defined_measures:
fold_evaluations[measure][0][0] = user_defined_measures[measure]
@@ -2080,7 +2156,7 @@ def test__extract_trace_data(self):
X, y = task.get_X_and_y()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
- clf.fit(X[train], y[train])
+ clf.fit(X.iloc[train], y.iloc[train])
# check num layers of MLP
assert clf.best_estimator_.hidden_layer_sizes in param_grid["hidden_layer_sizes"]
@@ -2186,7 +2262,6 @@ def test_run_on_model_with_empty_steps(self):
X, y, categorical_ind, feature_names = dataset.get_data(
target=dataset.default_target_attribute,
- dataset_format="array",
)
categorical_ind = np.array(categorical_ind)
(cat_idx,) = np.where(categorical_ind)
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
index dafbeaf3c..dcf074c8f 100644
--- a/tests/test_flows/test_flow.py
+++ b/tests/test_flows/test_flow.py
@@ -101,20 +101,20 @@ def test_get_structure(self):
assert subflow.flow_id == sub_flow_id
def test_tagging(self):
- flows = openml.flows.list_flows(size=1, output_format="dataframe")
+ flows = openml.flows.list_flows(size=1)
flow_id = flows["id"].iloc[0]
flow = openml.flows.get_flow(flow_id)
# tags can be at most 64 alphanumeric (+ underscore) chars
unique_indicator = str(time.time()).replace(".", "")
tag = f"test_tag_TestFlow_{unique_indicator}"
- flows = openml.flows.list_flows(tag=tag, output_format="dataframe")
+ flows = openml.flows.list_flows(tag=tag)
assert len(flows) == 0
flow.push_tag(tag)
- flows = openml.flows.list_flows(tag=tag, output_format="dataframe")
+ flows = openml.flows.list_flows(tag=tag)
assert len(flows) == 1
assert flow_id in flows["id"]
flow.remove_tag(tag)
- flows = openml.flows.list_flows(tag=tag, output_format="dataframe")
+ flows = openml.flows.list_flows(tag=tag)
assert len(flows) == 0
def test_from_xml_to_xml(self):
@@ -156,7 +156,9 @@ def test_from_xml_to_xml(self):
@pytest.mark.sklearn()
def test_to_xml_from_xml(self):
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
@@ -269,12 +271,14 @@ def test_semi_legal_flow(self):
# TODO: Test if parameters are set correctly!
# should not throw error as it contains two differentiable forms of
# Bagging i.e., Bagging(Bagging(J48)) and Bagging(J48)
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
semi_legal = sklearn.ensemble.BaggingClassifier(
**{
estimator_name: sklearn.ensemble.BaggingClassifier(
**{
- estimator_name:sklearn.tree.DecisionTreeClassifier(),
+ estimator_name: sklearn.tree.DecisionTreeClassifier(),
}
)
}
@@ -428,7 +432,9 @@ def test_sklearn_to_upload_to_flow(self):
percentile=30,
)
fu = sklearn.pipeline.FeatureUnion(transformer_list=[("pca", pca), ("fs", fs)])
- estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ estimator_name = (
+ "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+ )
boosting = sklearn.ensemble.AdaBoostClassifier(
**{estimator_name: sklearn.tree.DecisionTreeClassifier()},
)
@@ -499,7 +505,9 @@ def test_sklearn_to_upload_to_flow(self):
assert new_flow is not flow
# OneHotEncoder was moved to _encoders module in 0.20
- module_name_encoder = "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ module_name_encoder = (
+ "_encoders" if Version(sklearn.__version__) >= Version("0.20") else "data"
+ )
if Version(sklearn.__version__) < Version("0.22"):
fixture_name = (
f"{sentinel}sklearn.model_selection._search.RandomizedSearchCV("
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index b3d5be1a6..a25c2d740 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -51,7 +51,7 @@ def test_list_flows(self):
openml.config.server = self.production_server
# We can only perform a smoke test here because we test on dynamic
# data from the internet...
- flows = openml.flows.list_flows(output_format="dataframe")
+ flows = openml.flows.list_flows()
# openml.org lists over 3000 flows; assert a conservative lower bound of 1500
assert len(flows) >= 1500
for flow in flows.to_dict(orient="index").values():
@@ -62,20 +62,20 @@ def test_list_flows_output_format(self):
openml.config.server = self.production_server
# We can only perform a smoke test here because we test on dynamic
# data from the internet...
- flows = openml.flows.list_flows(output_format="dataframe")
+ flows = openml.flows.list_flows()
assert isinstance(flows, pd.DataFrame)
assert len(flows) >= 1500
@pytest.mark.production()
def test_list_flows_empty(self):
openml.config.server = self.production_server
- flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123", output_format="dataframe")
+ flows = openml.flows.list_flows(tag="NoOneEverUsesThisTag123")
assert flows.empty
@pytest.mark.production()
def test_list_flows_by_tag(self):
openml.config.server = self.production_server
- flows = openml.flows.list_flows(tag="weka", output_format="dataframe")
+ flows = openml.flows.list_flows(tag="weka")
assert len(flows) >= 5
for flow in flows.to_dict(orient="index").values():
self._check_flow(flow)
@@ -86,7 +86,7 @@ def test_list_flows_paginate(self):
size = 10
maximum = 100
for i in range(0, maximum, size):
- flows = openml.flows.list_flows(offset=i, size=size, output_format="dataframe")
+ flows = openml.flows.list_flows(offset=i, size=size)
assert size >= len(flows)
for flow in flows.to_dict(orient="index").values():
self._check_flow(flow)
@@ -199,14 +199,18 @@ def test_are_flows_equal_ignore_parameter_values(self):
new_flow.parameters["a"] = 7
with pytest.raises(ValueError) as excinfo:
openml.flows.functions.assert_flows_equal(flow, new_flow)
- assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(excinfo.value)
+ assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
+ excinfo.value
+ )
openml.flows.functions.assert_flows_equal(flow, new_flow, ignore_parameter_values=True)
del new_flow.parameters["a"]
with pytest.raises(ValueError) as excinfo:
openml.flows.functions.assert_flows_equal(flow, new_flow)
- assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(excinfo.value)
+ assert str(paramaters) in str(excinfo.value) and str(new_flow.parameters) in str(
+ excinfo.value
+ )
self.assertRaisesRegex(
ValueError,
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index 51123b0d8..da6857b6e 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -17,7 +17,7 @@
class TestConfig(openml.testing.TestBase):
def test_too_long_uri(self):
with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"):
- openml.datasets.list_datasets(data_id=list(range(10000)), output_format="dataframe")
+ openml.datasets.list_datasets(data_id=list(range(10000)))
@unittest.mock.patch("time.sleep")
@unittest.mock.patch("requests.Session")
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index ce46b6548..58a0dddf5 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -26,21 +26,21 @@ class TestRun(TestBase):
# less than 1 second
def test_tagging(self):
- runs = openml.runs.list_runs(size=1, output_format="dataframe")
+ runs = openml.runs.list_runs(size=1)
assert not runs.empty, "Test server state is incorrect"
run_id = runs["run_id"].iloc[0]
run = openml.runs.get_run(run_id)
# tags can be at most 64 alphanumeric (+ underscore) chars
unique_indicator = str(time()).replace(".", "")
tag = f"test_tag_TestRun_{unique_indicator}"
- runs = openml.runs.list_runs(tag=tag, output_format="dataframe")
+ runs = openml.runs.list_runs(tag=tag)
assert len(runs) == 0
run.push_tag(tag)
- runs = openml.runs.list_runs(tag=tag, output_format="dataframe")
+ runs = openml.runs.list_runs(tag=tag)
assert len(runs) == 1
assert run_id in runs["run_id"]
run.remove_tag(tag)
- runs = openml.runs.list_runs(tag=tag, output_format="dataframe")
+ runs = openml.runs.list_runs(tag=tag)
assert len(runs) == 0
@staticmethod
@@ -204,17 +204,40 @@ def test_to_from_filesystem_no_model(self):
with self.assertRaises(ValueError, msg="Could not find model.pkl"):
openml.runs.OpenMLRun.from_filesystem(cache_path)
+ @staticmethod
+ def _cat_col_selector(X):
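+ # Callable column selector for ColumnTransformer: resolves the categorical/object
+ # columns of whatever frame is passed in at fit time.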
+ return X.select_dtypes(include=["object", "category"]).columns
+
@staticmethod
def _get_models_tasks_for_tests():
+ from sklearn.compose import ColumnTransformer
+ from sklearn.preprocessing import OneHotEncoder
+
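+ # Shared preprocessing: one-hot encode the categorical columns (selected at fit
+ # time via _cat_col_selector), pass everything else through, then impute.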
+ basic_preprocessing = [
+ (
+ "cat_handling",
+ ColumnTransformer(
+ transformers=[
+ (
+ "cat",
+ OneHotEncoder(handle_unknown="ignore"),
+ TestRun._cat_col_selector,
+ )
+ ],
+ remainder="passthrough",
+ ),
+ ),
+ ("imp", SimpleImputer()),
+ ]
model_clf = Pipeline(
[
- ("imputer", SimpleImputer(strategy="mean")),
+ *basic_preprocessing,
("classifier", DummyClassifier(strategy="prior")),
],
)
model_reg = Pipeline(
[
- ("imputer", SimpleImputer(strategy="mean")),
+ *basic_preprocessing,
(
"regressor",
# LR because dummy does not produce enough float-like values
@@ -263,9 +286,8 @@ def assert_run_prediction_data(task, run, model):
assert_method = np.testing.assert_array_almost_equal
if task.task_type == "Supervised Classification":
- y_pred = np.take(task.class_labels, y_pred)
- y_test = np.take(task.class_labels, y_test)
assert_method = np.testing.assert_array_equal
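+ # y_test comes back as a pandas Series now; compare its raw values below.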
+ y_test = y_test.values
# Assert correctness
assert_method(y_pred, saved_y_pred)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 2bd9ee0ed..7235075c0 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -29,6 +29,7 @@
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
+from sklearn.compose import ColumnTransformer
import openml
import openml._api_calls
@@ -130,9 +131,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
time.sleep(10)
continue
- assert (
- len(run.evaluations) > 0
- ), "Expect not-None evaluations to always contain elements."
+ assert len(run.evaluations) > 0, (
+ "Expect not-None evaluations to always contain elements."
+ )
return
raise RuntimeError(
@@ -271,7 +272,7 @@ def _remove_random_state(flow):
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
- assert np.count_nonzero(np.isnan(X)) == n_missing_vals
+ assert X.isna().sum().sum() == n_missing_vals
run = openml.runs.run_flow_on_task(
flow=flow,
task=task,
@@ -306,7 +307,7 @@ def _remove_random_state(flow):
flow_server = self.extension.model_to_flow(clf_server)
if flow.class_name not in classes_without_random_state:
- error_msg = "Flow class %s (id=%d) does not have a random " "state parameter" % (
+ error_msg = "Flow class %s (id=%d) does not have a random state parameter" % (
flow.class_name,
flow.flow_id,
)
@@ -400,7 +401,7 @@ def _check_sample_evaluations(
@pytest.mark.sklearn()
def test_run_regression_on_classif_task(self):
- task_id = 115 # diabetes; crossvalidation
+ task_id = 259 # collins; crossvalidation; has numeric targets
clf = LinearRegression()
task = openml.tasks.get_task(task_id)
@@ -413,7 +414,6 @@ def test_run_regression_on_classif_task(self):
model=clf,
task=task,
avoid_duplicate_runs=False,
- dataset_format="array",
)
@pytest.mark.sklearn()
@@ -480,7 +480,7 @@ def determine_grid_size(param_grid):
grid_iterations += determine_grid_size(sub_grid)
return grid_iterations
else:
- raise TypeError("Param Grid should be of type list " "(GridSearch only) or dict")
+ raise TypeError("Param Grid should be of type list (GridSearch only) or dict")
run = self._perform_run(
task_id,
@@ -1287,7 +1287,7 @@ def test_run_with_illegal_flow_id_1(self):
flow_new = self.extension.model_to_flow(clf)
flow_new.flow_id = -1
- expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'"
+ expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
with pytest.raises(openml.exceptions.PyOpenMLError, match=expected_message_regex):
openml.runs.run_flow_on_task(
task=task,
@@ -1327,7 +1327,7 @@ def test_run_with_illegal_flow_id_1_after_load(self):
run.to_filesystem(cache_path)
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
- expected_message_regex = "Local flow_id does not match server flow_id: " "'-1' vs '[0-9]+'"
+ expected_message_regex = "Local flow_id does not match server flow_id: '-1' vs '[0-9]+'"
self.assertRaisesRegex(
openml.exceptions.PyOpenMLError,
expected_message_regex,
@@ -1355,7 +1355,6 @@ def test__run_task_get_arffcontent(self):
model=clf,
task=task,
add_local_measures=True,
- dataset_format="dataframe",
)
arff_datacontent, trace, fold_evaluations, _ = res
# predictions
@@ -1436,25 +1435,21 @@ def _check_run(self, run):
def test_get_runs_list(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
- runs = openml.runs.list_runs(id=[2], show_errors=True, output_format="dataframe")
+ runs = openml.runs.list_runs(id=[2], display_errors=True)
assert len(runs) == 1
for run in runs.to_dict(orient="index").values():
self._check_run(run)
def test_list_runs_empty(self):
- runs = openml.runs.list_runs(task=[0], output_format="dataframe")
+ runs = openml.runs.list_runs(task=[0])
assert runs.empty
- def test_list_runs_output_format(self):
- runs = openml.runs.list_runs(size=1000, output_format="dataframe")
- assert isinstance(runs, pd.DataFrame)
-
@pytest.mark.production()
def test_get_runs_list_by_task(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
task_ids = [20]
- runs = openml.runs.list_runs(task=task_ids, output_format="dataframe")
+ runs = openml.runs.list_runs(task=task_ids)
assert len(runs) >= 590
for run in runs.to_dict(orient="index").values():
assert run["task_id"] in task_ids
@@ -1462,7 +1457,7 @@ def test_get_runs_list_by_task(self):
num_runs = len(runs)
task_ids.append(21)
- runs = openml.runs.list_runs(task=task_ids, output_format="dataframe")
+ runs = openml.runs.list_runs(task=task_ids)
assert len(runs) >= num_runs + 1
for run in runs.to_dict(orient="index").values():
assert run["task_id"] in task_ids
@@ -1475,7 +1470,7 @@ def test_get_runs_list_by_uploader(self):
# 29 is Dominik Kirchhoff
uploader_ids = [29]
- runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe")
+ runs = openml.runs.list_runs(uploader=uploader_ids)
assert len(runs) >= 2
for run in runs.to_dict(orient="index").values():
assert run["uploader"] in uploader_ids
@@ -1484,7 +1479,7 @@ def test_get_runs_list_by_uploader(self):
uploader_ids.append(274)
- runs = openml.runs.list_runs(uploader=uploader_ids, output_format="dataframe")
+ runs = openml.runs.list_runs(uploader=uploader_ids)
assert len(runs) >= num_runs + 1
for run in runs.to_dict(orient="index").values():
assert run["uploader"] in uploader_ids
@@ -1495,7 +1490,7 @@ def test_get_runs_list_by_flow(self):
# TODO: comes from live, no such lists on test
openml.config.server = self.production_server
flow_ids = [1154]
- runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe")
+ runs = openml.runs.list_runs(flow=flow_ids)
assert len(runs) >= 1
for run in runs.to_dict(orient="index").values():
assert run["flow_id"] in flow_ids
@@ -1503,7 +1498,7 @@ def test_get_runs_list_by_flow(self):
num_runs = len(runs)
flow_ids.append(1069)
- runs = openml.runs.list_runs(flow=flow_ids, output_format="dataframe")
+ runs = openml.runs.list_runs(flow=flow_ids)
assert len(runs) >= num_runs + 1
for run in runs.to_dict(orient="index").values():
assert run["flow_id"] in flow_ids
@@ -1517,12 +1512,7 @@ def test_get_runs_pagination(self):
size = 10
max = 100
for i in range(0, max, size):
- runs = openml.runs.list_runs(
- offset=i,
- size=size,
- uploader=uploader_ids,
- output_format="dataframe",
- )
+ runs = openml.runs.list_runs(offset=i, size=size, uploader=uploader_ids)
assert size >= len(runs)
for run in runs.to_dict(orient="index").values():
assert run["uploader"] in uploader_ids
@@ -1545,23 +1535,22 @@ def test_get_runs_list_by_filters(self):
# self.assertRaises(openml.exceptions.OpenMLServerError,
# openml.runs.list_runs)
- runs = openml.runs.list_runs(id=ids, output_format="dataframe")
+ runs = openml.runs.list_runs(id=ids)
assert len(runs) == 2
- runs = openml.runs.list_runs(task=tasks, output_format="dataframe")
+ runs = openml.runs.list_runs(task=tasks)
assert len(runs) >= 2
- runs = openml.runs.list_runs(uploader=uploaders_2, output_format="dataframe")
+ runs = openml.runs.list_runs(uploader=uploaders_2)
assert len(runs) >= 10
- runs = openml.runs.list_runs(flow=flows, output_format="dataframe")
+ runs = openml.runs.list_runs(flow=flows)
assert len(runs) >= 100
runs = openml.runs.list_runs(
id=ids,
task=tasks,
uploader=uploaders_1,
- output_format="dataframe",
)
assert len(runs) == 2
@@ -1570,7 +1559,7 @@ def test_get_runs_list_by_tag(self):
# TODO: comes from live, no such lists on test
# Unit test works on production server only
openml.config.server = self.production_server
- runs = openml.runs.list_runs(tag="curves", output_format="dataframe")
+ runs = openml.runs.list_runs(tag="curves")
assert len(runs) >= 1
@pytest.mark.sklearn()
@@ -1601,7 +1590,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
task=task,
extension=self.extension,
add_local_measures=True,
- dataset_format="dataframe",
)
# 2 folds, 5 repeats; keep in mind that this task comes from the test
# server, the task on the live server is different
@@ -1645,7 +1633,6 @@ def test_run_on_dataset_with_missing_labels_array(self):
task=task,
extension=self.extension,
add_local_measures=True,
- dataset_format="array", # diff test_run_on_dataset_with_missing_labels_dataframe()
)
# 2 folds, 5 repeats; keep in mind that this task comes from the test
# server, the task on the live server is different
@@ -1767,11 +1754,28 @@ def test_format_prediction_task_regression(self):
def test__run_task_get_arffcontent_2(self, parallel_mock):
"""Tests if a run executed in parallel is collated correctly."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
- x, y = task.get_X_and_y(dataset_format="dataframe")
+ x, y = task.get_X_and_y()
num_instances = x.shape[0]
line_length = 6 + len(task.class_labels)
loss = "log" if Version(sklearn.__version__) < Version("1.3") else "log_loss"
- clf = SGDClassifier(loss=loss, random_state=1)
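+ # kr-vs-kp now arrives as a dataframe with categorical features, so one-hot
+ # encode them before the SGD classifier rather than requesting array output.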
+ clf = sklearn.pipeline.Pipeline(
+ [
+ (
+ "cat_handling",
+ ColumnTransformer(
+ transformers=[
+ (
+ "cat",
+ OneHotEncoder(handle_unknown="ignore"),
+ x.select_dtypes(include=["object", "category"]).columns,
+ )
+ ],
+ remainder="passthrough",
+ ),
+ ),
+ ("clf", SGDClassifier(loss=loss, random_state=1)),
+ ]
+ )
n_jobs = 2
backend = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
with parallel_backend(backend, n_jobs=n_jobs):
@@ -1780,7 +1784,6 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
model=clf,
task=task,
add_local_measures=True,
- dataset_format="array", # "dataframe" would require handling of categoricals
n_jobs=n_jobs,
)
# This unit test will fail if joblib is unable to distribute successfully since the
@@ -1797,16 +1800,16 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
assert len(res[2]) == 7
assert len(res[3]) == 7
expected_scores = [
- 0.965625,
- 0.94375,
- 0.946875,
+ 0.9625,
0.953125,
- 0.96875,
0.965625,
- 0.9435736677115988,
- 0.9467084639498433,
- 0.9749216300940439,
- 0.9655172413793104,
+ 0.9125,
+ 0.98125,
+ 0.975,
+ 0.9247648902821317,
+ 0.9404388714733543,
+ 0.9780564263322884,
+ 0.9623824451410659,
]
scores = [v for k, v in res[2]["predictive_accuracy"][0].items()]
np.testing.assert_array_almost_equal(
@@ -1825,7 +1828,7 @@ def test__run_task_get_arffcontent_2(self, parallel_mock):
def test_joblib_backends(self, parallel_mock):
"""Tests evaluation of a run using various joblib backends and n_jobs."""
task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
- x, y = task.get_X_and_y(dataset_format="dataframe")
+ x, y = task.get_X_and_y()
num_instances = x.shape[0]
line_length = 6 + len(task.class_labels)
@@ -1841,14 +1844,31 @@ def test_joblib_backends(self, parallel_mock):
(1, "sequential", 40),
]:
clf = sklearn.model_selection.RandomizedSearchCV(
- estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5),
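+ # Same idea as above: wrap the forest in a pipeline that one-hot encodes the
+ # categorical columns; the search space below uses the "clf__" prefix accordingly.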
+ estimator=sklearn.pipeline.Pipeline(
+ [
+ (
+ "cat_handling",
+ ColumnTransformer(
+ transformers=[
+ (
+ "cat",
+ OneHotEncoder(handle_unknown="ignore"),
+ x.select_dtypes(include=["object", "category"]).columns,
+ )
+ ],
+ remainder="passthrough",
+ ),
+ ),
+ ("clf", sklearn.ensemble.RandomForestClassifier(n_estimators=5)),
+ ]
+ ),
param_distributions={
- "max_depth": [3, None],
- "max_features": [1, 2, 3, 4],
- "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
- "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
- "bootstrap": [True, False],
- "criterion": ["gini", "entropy"],
+ "clf__max_depth": [3, None],
+ "clf__max_features": [1, 2, 3, 4],
+ "clf__min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "clf__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "clf__bootstrap": [True, False],
+ "clf__criterion": ["gini", "entropy"],
},
random_state=1,
cv=sklearn.model_selection.StratifiedKFold(
@@ -1865,7 +1885,6 @@ def test_joblib_backends(self, parallel_mock):
model=clf,
task=task,
add_local_measures=True,
- dataset_format="array", # "dataframe" would require handling of categoricals
n_jobs=n_jobs,
)
assert type(res[0]) == list
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index 259cb98b4..b17d876b9 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -4,7 +4,6 @@
import hashlib
import time
import unittest.mock
-from typing import Dict
import pandas as pd
import pytest
@@ -156,22 +155,15 @@ def test_list_setups_empty(self):
def test_list_setups_output_format(self):
openml.config.server = self.production_server
flow_id = 6794
- setups = openml.setups.list_setups(flow=flow_id, output_format="object", size=10)
- assert isinstance(setups, Dict)
+ setups = openml.setups.list_setups(flow=flow_id, size=10)
+ assert isinstance(setups, dict)
assert isinstance(setups[next(iter(setups.keys()))], openml.setups.setup.OpenMLSetup)
assert len(setups) == 10
- setups = openml.setups.list_setups(flow=flow_id, output_format="dataframe", size=10)
+ setups = openml.setups.list_setups(flow=flow_id, size=10, output_format="dataframe")
assert isinstance(setups, pd.DataFrame)
assert len(setups) == 10
- # TODO: [0.15] Remove section as `dict` is no longer supported.
- with pytest.warns(FutureWarning):
- setups = openml.setups.list_setups(flow=flow_id, output_format="dict", size=10)
- assert isinstance(setups, Dict)
- assert isinstance(setups[next(iter(setups.keys()))], Dict)
- assert len(setups) == 10
-
def test_setuplist_offset(self):
size = 10
setups = openml.setups.list_setups(offset=0, size=size)
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index d01a1dcf4..8652d5547 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -1,7 +1,6 @@
# License: BSD 3-Clause
from __future__ import annotations
-import pandas as pd
import pytest
import unittest
@@ -184,20 +183,21 @@ def test_publish_study(self):
self.assertSetEqual(set(study_downloaded.tasks), set(fixt_task_ids))
# test whether the list run function also handles study data fine
- run_ids = openml.runs.list_runs(study=study.id)
- self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
+ run_ids = openml.runs.list_runs(study=study.id)  # returns a DataFrame
+ self.assertSetEqual(set(run_ids["run_id"]), set(study_downloaded.runs))
# test whether the list evaluation function also handles study data fine
- run_ids = openml.evaluations.list_evaluations(
+ run_ids = openml.evaluations.list_evaluations(  # returns a dict keyed by run id
"predictive_accuracy",
size=None,
study=study.id,
+ output_format="object", # making the default explicit
)
self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
# attach more runs, since we fetch 11 here, at least one is non-overlapping
run_list_additional = openml.runs.list_runs(size=11, offset=10)
- run_list_additional = set(run_list_additional) - set(run_ids)
+ run_list_additional = set(run_list_additional["run_id"]) - set(run_ids)
openml.study.attach_to_study(study.id, list(run_list_additional))
study_downloaded = openml.study.get_study(study.id)
# verify again
@@ -228,7 +228,7 @@ def test_study_attach_illegal(self):
benchmark_suite=None,
name="study with illegal runs",
description="none",
- run_ids=list(run_list.keys()),
+ run_ids=list(run_list["run_id"]),
)
study.publish()
TestBase._mark_entity_for_removal("study", study.id)
@@ -236,26 +236,23 @@ def test_study_attach_illegal(self):
study_original = openml.study.get_study(study.id)
with pytest.raises(
- openml.exceptions.OpenMLServerException, match="Problem attaching entities."
+ openml.exceptions.OpenMLServerException,
+ match="Problem attaching entities.",
):
# run id does not exist
openml.study.attach_to_study(study.id, [0])
with pytest.raises(
- openml.exceptions.OpenMLServerException, match="Problem attaching entities."
+ openml.exceptions.OpenMLServerException,
+ match="Problem attaching entities.",
):
# some runs already attached
- openml.study.attach_to_study(study.id, list(run_list_more.keys()))
+ openml.study.attach_to_study(study.id, list(run_list_more["run_id"]))
study_downloaded = openml.study.get_study(study.id)
self.assertListEqual(study_original.runs, study_downloaded.runs)
@unittest.skip("It is unclear when we can expect the test to pass or fail.")
def test_study_list(self):
- study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe")
+ study_list = openml.study.list_studies(status="in_preparation")
# might fail if the server was recently reset
assert len(study_list) >= 2
-
- @unittest.skip("It is unclear when we can expect the test to pass or fail.")
- def test_study_list_output_format(self):
- study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe")
- assert isinstance(study_list, pd.DataFrame)
diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py
index 661e8eced..bb4545154 100644
--- a/tests/test_tasks/test_classification_task.py
+++ b/tests/test_tasks/test_classification_task.py
@@ -1,7 +1,7 @@
# License: BSD 3-Clause
from __future__ import annotations
-import numpy as np
+import pandas as pd
from openml.tasks import TaskType, get_task
@@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1):
def test_get_X_and_Y(self):
X, Y = super().test_get_X_and_Y()
assert X.shape == (768, 8)
- assert isinstance(X, np.ndarray)
+ assert isinstance(X, pd.DataFrame)
assert Y.shape == (768,)
- assert isinstance(Y, np.ndarray)
- assert Y.dtype == int
+ assert isinstance(Y, pd.Series)
+ assert isinstance(Y.dtype, pd.CategoricalDtype)
def test_download_task(self):
task = super().test_download_task()
diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py
index 0e781c8ff..885f80a27 100644
--- a/tests/test_tasks/test_learning_curve_task.py
+++ b/tests/test_tasks/test_learning_curve_task.py
@@ -1,7 +1,7 @@
# License: BSD 3-Clause
from __future__ import annotations
-import numpy as np
+import pandas as pd
from openml.tasks import TaskType, get_task
@@ -20,10 +20,10 @@ def setUp(self, n_levels: int = 1):
def test_get_X_and_Y(self):
X, Y = super().test_get_X_and_Y()
assert X.shape == (768, 8)
- assert isinstance(X, np.ndarray)
+ assert isinstance(X, pd.DataFrame)
assert Y.shape == (768,)
- assert isinstance(Y, np.ndarray)
- assert Y.dtype == int
+ assert isinstance(Y, pd.Series)
+ assert isinstance(Y.dtype, pd.CategoricalDtype)
def test_download_task(self):
task = super().test_download_task()
diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py
index 29a8254df..36decc534 100644
--- a/tests/test_tasks/test_regression_task.py
+++ b/tests/test_tasks/test_regression_task.py
@@ -3,7 +3,7 @@
import ast
-import numpy as np
+import pandas as pd
import openml
from openml.exceptions import OpenMLServerException
@@ -51,10 +51,10 @@ def setUp(self, n_levels: int = 1):
def test_get_X_and_Y(self):
X, Y = super().test_get_X_and_Y()
assert X.shape == (194, 32)
- assert isinstance(X, np.ndarray)
+ assert isinstance(X, pd.DataFrame)
assert Y.shape == (194,)
- assert isinstance(Y, np.ndarray)
- assert Y.dtype == float
+ assert isinstance(Y, pd.Series)
+ assert pd.api.types.is_numeric_dtype(Y)
def test_download_task(self):
task = super().test_download_task()
diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py
index 00ce1f276..9c90b7e03 100644
--- a/tests/test_tasks/test_supervised_task.py
+++ b/tests/test_tasks/test_supervised_task.py
@@ -3,7 +3,7 @@
import unittest
-import numpy as np
+import pandas as pd
from openml.tasks import get_task
@@ -27,7 +27,7 @@ def setUpClass(cls):
def setUp(self, n_levels: int = 1):
super().setUp()
- def test_get_X_and_Y(self) -> tuple[np.ndarray, np.ndarray]:
+ def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]:
task = get_task(self.task_id)
X, Y = task.get_X_and_y()
return X, Y
diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py
index ec5a8caf5..311ffd365 100644
--- a/tests/test_tasks/test_task.py
+++ b/tests/test_tasks/test_task.py
@@ -71,7 +71,7 @@ def test_upload_task(self):
)
def _get_compatible_rand_dataset(self) -> list:
- active_datasets = list_datasets(status="active", output_format="dataframe")
+ active_datasets = list_datasets(status="active")
# depending on the task type, find either datasets
# with only symbolic features or datasets with only
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index 046184791..856352ac2 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -57,7 +57,7 @@ def test__get_estimation_procedure_list(self):
def test_list_clustering_task(self):
# as shown by #383, clustering tasks can give list/dict casting problems
openml.config.server = self.production_server
- openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10, output_format="dataframe")
+ openml.tasks.list_tasks(task_type=TaskType.CLUSTERING, size=10)
# the expected outcome is that it doesn't crash. No assertions.
def _check_task(self, task):
@@ -72,34 +72,30 @@ def _check_task(self, task):
def test_list_tasks_by_type(self):
num_curves_tasks = 198 # number is flexible, check server if fails
ttid = TaskType.LEARNING_CURVE
- tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe")
+ tasks = openml.tasks.list_tasks(task_type=ttid)
assert len(tasks) >= num_curves_tasks
for task in tasks.to_dict(orient="index").values():
assert ttid == task["ttid"]
self._check_task(task)
- def test_list_tasks_output_format(self):
+ def test_list_tasks_length(self):
ttid = TaskType.LEARNING_CURVE
- tasks = openml.tasks.list_tasks(task_type=ttid, output_format="dataframe")
- assert isinstance(tasks, pd.DataFrame)
+ tasks = openml.tasks.list_tasks(task_type=ttid)
assert len(tasks) > 100
def test_list_tasks_empty(self):
- tasks = cast(
- pd.DataFrame,
- openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag", output_format="dataframe"),
- )
+ tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag")
assert tasks.empty
def test_list_tasks_by_tag(self):
num_basic_tasks = 100 # number is flexible, check server if fails
- tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
+ tasks = openml.tasks.list_tasks(tag="OpenML100")
assert len(tasks) >= num_basic_tasks
for task in tasks.to_dict(orient="index").values():
self._check_task(task)
def test_list_tasks(self):
- tasks = openml.tasks.list_tasks(output_format="dataframe")
+ tasks = openml.tasks.list_tasks()
assert len(tasks) >= 900
for task in tasks.to_dict(orient="index").values():
self._check_task(task)
@@ -108,7 +104,7 @@ def test_list_tasks_paginate(self):
size = 10
max = 100
for i in range(0, max, size):
- tasks = openml.tasks.list_tasks(offset=i, size=size, output_format="dataframe")
+ tasks = openml.tasks.list_tasks(offset=i, size=size)
assert size >= len(tasks)
for task in tasks.to_dict(orient="index").values():
self._check_task(task)
@@ -123,12 +119,7 @@ def test_list_tasks_per_type_paginate(self):
]
for j in task_types:
for i in range(0, max, size):
- tasks = openml.tasks.list_tasks(
- task_type=j,
- offset=i,
- size=size,
- output_format="dataframe",
- )
+ tasks = openml.tasks.list_tasks(task_type=j, offset=i, size=size)
assert size >= len(tasks)
for task in tasks.to_dict(orient="index").values():
assert j == task["ttid"]
diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 552fbe949..4480c2cbc 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -20,14 +20,14 @@ def test_tagging(self):
# tags can be at most 64 alphanumeric (+ underscore) chars
unique_indicator = str(time()).replace(".", "")
tag = f"test_tag_OpenMLTaskMethodsTest_{unique_indicator}"
- tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe")
+ tasks = openml.tasks.list_tasks(tag=tag)
assert len(tasks) == 0
task.push_tag(tag)
- tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe")
+ tasks = openml.tasks.list_tasks(tag=tag)
assert len(tasks) == 1
assert 1 in tasks["tid"]
task.remove_tag(tag)
- tasks = openml.tasks.list_tasks(tag=tag, output_format="dataframe")
+ tasks = openml.tasks.list_tasks(tag=tag)
assert len(tasks) == 0
def test_get_train_and_test_split_indices(self):
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index d900671b7..3b4a34b57 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -3,7 +3,6 @@
import os
import unittest.mock
import pytest
-import shutil
import openml
from openml.testing import _check_dataset
@@ -34,7 +33,7 @@ def min_number_setups_on_test_server() -> int:
@pytest.fixture()
def min_number_runs_on_test_server() -> int:
- """After a reset at least 50 runs are on the test server"""
+ """After a reset at least 21 runs are on the test server"""
return 21
@@ -52,19 +51,11 @@ def _mocked_perform_api_call(call, request_method):
@pytest.mark.server()
def test_list_all():
openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
- openml.utils._list_all(
- listing_call=openml.tasks.functions._list_tasks,
- list_output_format="dataframe",
- )
@pytest.mark.server()
def test_list_all_for_tasks(min_number_tasks_on_test_server):
- tasks = openml.tasks.list_tasks(
- batch_size=1000,
- size=min_number_tasks_on_test_server,
- output_format="dataframe",
- )
+ tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server)
assert min_number_tasks_on_test_server == len(tasks)
@@ -73,20 +64,18 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
# By setting the batch size one lower than the minimum we guarantee at least two
# batches and at the same time do as few batches (roundtrips) as possible.
batch_size = min_number_tasks_on_test_server - 1
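+ # _list_all is expected to return the fetched batches here; ensure pagination
+ # produced more than one and that they jointly cover the minimum task count.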
- res = openml.utils._list_all(
+ batches = openml.utils._list_all(
listing_call=openml.tasks.functions._list_tasks,
- list_output_format="dataframe",
batch_size=batch_size,
)
- assert min_number_tasks_on_test_server <= len(res)
+ assert len(batches) >= 2
+ assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches)
@pytest.mark.server()
def test_list_all_for_datasets(min_number_datasets_on_test_server):
datasets = openml.datasets.list_datasets(
- batch_size=100,
size=min_number_datasets_on_test_server,
- output_format="dataframe",
)
assert min_number_datasets_on_test_server == len(datasets)
@@ -96,11 +85,7 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server):
@pytest.mark.server()
def test_list_all_for_flows(min_number_flows_on_test_server):
- flows = openml.flows.list_flows(
- batch_size=25,
- size=min_number_flows_on_test_server,
- output_format="dataframe",
- )
+ flows = openml.flows.list_flows(size=min_number_flows_on_test_server)
assert min_number_flows_on_test_server == len(flows)
@@ -115,7 +100,7 @@ def test_list_all_for_setups(min_number_setups_on_test_server):
@pytest.mark.server()
@pytest.mark.flaky() # Other tests might need to upload runs first
def test_list_all_for_runs(min_number_runs_on_test_server):
- runs = openml.runs.list_runs(batch_size=25, size=min_number_runs_on_test_server)
+ runs = openml.runs.list_runs(size=min_number_runs_on_test_server)
assert min_number_runs_on_test_server == len(runs)
@@ -133,12 +118,7 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server):
@pytest.mark.server()
@unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call)
def test_list_all_few_results_available(_perform_api_call):
- datasets = openml.datasets.list_datasets(
- size=1000,
- data_name="iris",
- data_version=1,
- output_format="dataframe",
- )
+ datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1)
assert len(datasets) == 1, "only one iris dataset version 1 should be present"
assert _perform_api_call.call_count == 1, "expect just one call to get one dataset"
@@ -171,4 +151,4 @@ def test_correct_test_server_download_state():
"""
task = openml.tasks.get_task(119)
dataset = task.get_dataset()
- assert len(dataset.features) == dataset.get_data(dataset_format="dataframe")[0].shape[1]
\ No newline at end of file
+ assert len(dataset.features) == dataset.get_data()[0].shape[1]