diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py
index 1d3bb5d6f..0176328b6 100644
--- a/examples/20_basic/simple_flows_and_runs_tutorial.py
+++ b/examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -23,7 +23,7 @@
 # NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
 dataset = openml.datasets.get_dataset(20)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=3)
 clf.fit(X, y)
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
index 78ada4fde..764cb8f36 100644
--- a/examples/30_extended/datasets_tutorial.py
+++ b/examples/30_extended/datasets_tutorial.py
@@ -64,23 +64,16 @@
 ############################################################################
 # Get the actual data.
 #
-# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
-# sparse matrix, or as a Pandas DataFrame. The format is
-# controlled with the parameter ``dataset_format`` which can be either 'array'
-# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
-# and manually create a dataframe.
-X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
-)
-eeg = pd.DataFrame(X, columns=attribute_names)
-eeg["class"] = y
-print(eeg[:10])
+# openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
+# along with some additional metadata that we do not need right now.
+eeg, *_ = dataset.get_data()

 ############################################################################
-# Instead of manually creating the dataframe, you can already request a
-# dataframe with the correct dtypes.
+# You can optionally choose to have openml separate out a column from the
+# dataset. In particular, many datasets for supervised problems have a
+# `default_target_attribute` set, which may help identify the target variable.
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute, dataset_format="dataframe"
+    target=dataset.default_target_attribute
 )
 print(X.head())
 print(X.info())
@@ -91,6 +84,9 @@
 # data file. The dataset object can be used as normal.
 # Whenever you use any functionality that requires the data,
 # such as `get_data`, the data will be downloaded.
+# Starting from 0.15, not downloading the data will be the default behavior.
+# The data will be downloaded automatically when you try to access it through
+# openml objects, e.g., using `dataset.features`.
 dataset = openml.datasets.get_dataset(1471, download_data=False)

 ############################################################################
@@ -99,8 +95,8 @@
 # * Explore the data visually.
 eegs = eeg.sample(n=1000)
 _ = pd.plotting.scatter_matrix(
-    eegs.iloc[:100, :4],
-    c=eegs[:100]["class"],
+    X.iloc[:100, :4],
+    c=y[:100],
     figsize=(10, 10),
     marker="o",
     hist_kwds={"bins": 20},
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
index 05b8c8cce..38b0d23cf 100644
--- a/examples/30_extended/flows_and_runs_tutorial.py
+++ b/examples/30_extended/flows_and_runs_tutorial.py
@@ -27,7 +27,7 @@
 # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
 dataset = openml.datasets.get_dataset(68)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=1)
 clf.fit(X, y)
@@ -38,7 +38,7 @@
 # * e.g. categorical features -> do feature encoding
 dataset = openml.datasets.get_dataset(17)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 print(f"Categorical features: {categorical_indicator}")
 transformer = compose.ColumnTransformer(
@@ -160,7 +160,7 @@
     ]
 )

-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
 myrun = run.publish()
 print(f"Uploaded to {myrun.openml_url}")

@@ -172,7 +172,7 @@

 # To perform the following line offline, it is required to have been called before
 # such that the task is cached on the local openml cache directory:
-task = openml.tasks.get_task(6)
+task = openml.tasks.get_task(96)

 # The following lines can then be executed offline:
 run = openml.runs.run_model_on_task(
@@ -180,7 +180,6 @@
     task,
     avoid_duplicate_runs=False,
     upload_flow=False,
-    dataset_format="array",
 )

 # The run may be stored offline, and the flow will be stored along with it:
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index ce6a53bb1..dcdef162d 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -716,9 +716,11 @@ def get_data(
             on the server in the dataset.
         dataset_format : string (default='dataframe')
             The format of returned dataset.
-            If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
+            If ``array``, the returned dataset will be a NumPy array or a SciPy sparse
+            matrix. Support for ``array`` will be removed in 0.15.
             If ``dataframe``, the returned dataset will be a Pandas DataFrame.
+

         Returns
         -------
         X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
@@ -730,6 +732,16 @@ def get_data(
         attribute_names : List[str]
             List of attribute names.
         """
+        # TODO: [0.15]
+        if dataset_format == "array":
+            warnings.warn(
+                "Support for `dataset_format='array'` will be removed in 0.15, "
+                "start using `dataset_format='dataframe'` to ensure your code "
+                "will continue to work. You can use the dataframe's `to_numpy` "
+                "function to continue using numpy arrays.",
+                category=FutureWarning,
+                stacklevel=2,
+            )
         data, categorical, attribute_names = self._load_data()

         to_exclude = []
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
index 944c75b80..36e0ada1c 100644
--- a/openml/tasks/task.py
+++ b/openml/tasks/task.py
@@ -1,5 +1,5 @@
 # License: BSD 3-Clause
-
+import warnings
 from abc import ABC
 from collections import OrderedDict
 from enum import Enum
@@ -256,6 +256,16 @@ def get_X_and_y(
         tuple - X and y
         """
+        # TODO: [0.15]
+        if dataset_format == "array":
+            warnings.warn(
+                "Support for `dataset_format='array'` will be removed in 0.15, "
+                "start using `dataset_format='dataframe'` to ensure your code "
+                "will continue to work. You can use the dataframe's `to_numpy` "
+                "function to continue using numpy arrays.",
+                category=FutureWarning,
+                stacklevel=2,
+            )
         dataset = self.get_dataset()
         if self.task_type_id not in (
             TaskType.SUPERVISED_CLASSIFICATION,
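Migration note (not part of the diff): a minimal sketch of how downstream code can adapt once `dataset_format="array"` is deprecated. The dataset id below is illustrative, and it assumes the dataset defines a `default_target_attribute`.

import openml

# get_data() returns pandas objects by default, so the deprecated
# dataset_format="array" argument can simply be dropped.
dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Code that still expects NumPy input can convert explicitly, as the
# FutureWarning added in this diff suggests.
X_array = X.to_numpy()
y_array = y.to_numpy()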