2 changes: 1 addition & 1 deletion examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -23,7 +23,7 @@
# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)
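For orientation, the updated tutorial snippet roughly reduces to the sketch below (dataset 20 and the 3-nearest-neighbour classifier are taken from the example above, which points at the test server for this id). `get_data` now returns pandas objects by default, and `.to_numpy()` remains available if plain arrays are still needed.

import openml
from sklearn import neighbors

# Dataset 20, as referenced in the tutorial above (hosted on the test server there).
dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# X is a pandas DataFrame and y a pandas Series; scikit-learn accepts both directly.
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)

# If a plain array is still required somewhere, convert explicitly.
X_array = X.to_numpy()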
28 changes: 12 additions & 16 deletions examples/30_extended/datasets_tutorial.py
@@ -64,23 +64,16 @@
############################################################################
# Get the actual data.
#
# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
# sparse matrix, or as a Pandas DataFrame. The format is
# controlled with the parameter ``dataset_format`` which can be either 'array'
# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
# and manually create a dataframe.
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
)
eeg = pd.DataFrame(X, columns=attribute_names)
eeg["class"] = y
print(eeg[:10])
# openml-python returns the data as a pandas dataframe (stored in the `eeg` variable below),
# along with some additional metadata that we do not need right now.
eeg, *_ = dataset.get_data()

############################################################################
# Instead of manually creating the dataframe, you can already request a
# dataframe with the correct dtypes.
# You can optionally have openml separate out a column from the dataset.
# In particular, many datasets for supervised problems have a
# `default_target_attribute` set, which can help identify the target variable.
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute, dataset_format="dataframe"
target=dataset.default_target_attribute
)
print(X.head())
print(X.info())
@@ -91,6 +84,9 @@
# data file. The dataset object can be used as normal.
# Whenever you use any functionality that requires the data,
# such as `get_data`, the data will be downloaded.
# Starting from 0.15, not downloading data will be the default behavior.
# The data will be downloaded automatically when you try to access it through
# openml objects, e.g., using `dataset.features`.
dataset = openml.datasets.get_dataset(1471, download_data=False)

############################################################################
@@ -99,8 +95,8 @@
# * Explore the data visually.
eegs = eeg.sample(n=1000)
_ = pd.plotting.scatter_matrix(
eegs.iloc[:100, :4],
c=eegs[:100]["class"],
X.iloc[:100, :4],
c=y[:100],
figsize=(10, 10),
marker="o",
hist_kwds={"bins": 20},
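A minimal sketch of the metadata-only workflow that the new comments describe, using dataset 1471 as in the tutorial: the dataset object is created without fetching the data file, and the data is downloaded lazily the first time it is actually needed.

import openml

# Fetch only the dataset description and metadata, not the data file itself.
dataset = openml.datasets.get_dataset(1471, download_data=False)

# Any call that needs the data, such as get_data, triggers the download.
eeg, *_ = dataset.get_data()
print(eeg.shape)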
9 changes: 4 additions & 5 deletions examples/30_extended/flows_and_runs_tutorial.py
@@ -27,7 +27,7 @@
# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
@@ -38,7 +38,7 @@
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer(
@@ -160,7 +160,7 @@
]
)

run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
myrun = run.publish()
print(f"Uploaded to {myrun.openml_url}")

@@ -172,15 +172,14 @@

# To run the following line offline, the task must have been fetched beforehand,
# so that it is cached in the local openml cache directory:
task = openml.tasks.get_task(6)
task = openml.tasks.get_task(96)

# The following lines can then be executed offline:
run = openml.runs.run_model_on_task(
pipe,
task,
avoid_duplicate_runs=False,
upload_flow=False,
dataset_format="array",
)

# The run may be stored offline, and the flow will be stored along with it:
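As a rough illustration of the updated call (the `dataset_format` argument is simply dropped), a model can be run on a cached task offline as sketched here; a plain decision tree stands in for the tutorial's preprocessing pipeline, and task 96 follows the example above.

import openml
from sklearn import tree

# The task must have been fetched once before, so that it sits in the local cache.
task = openml.tasks.get_task(96)

clf = tree.DecisionTreeClassifier()
run = openml.runs.run_model_on_task(
    clf,
    task,
    avoid_duplicate_runs=False,
    upload_flow=False,  # keep everything offline; publish later if desired
)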
14 changes: 13 additions & 1 deletion openml/datasets/dataset.py
@@ -716,9 +716,11 @@ def get_data(
on the server in the dataset.
dataset_format : string (default='dataframe')
The format of the returned dataset.
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse
matrix. Support for ``array`` will be removed in 0.15.
If ``dataframe``, the returned dataset will be a Pandas DataFrame.


Returns
-------
X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
@@ -730,6 +732,16 @@
attribute_names : List[str]
List of attribute names.
"""
# TODO: [0.15]
if dataset_format == "array":
warnings.warn(
"Support for `dataset_format='array'` will be removed in 0.15,"
"start using `dataset_format='dataframe' to ensure your code "
"will continue to work. You can use the dataframe's `to_numpy` "
"function to continue using numpy arrays.",
category=FutureWarning,
stacklevel=2,
)
data, categorical, attribute_names = self._load_data()

to_exclude = []
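From the caller's perspective, the deprecation added here roughly behaves as sketched below (any dataset id would do; dataset 20 is reused from the examples above): requesting `dataset_format='array'` still works but emits a `FutureWarning`, and the suggested migration is to take the default dataframe and convert it explicitly.

import warnings

import openml

dataset = openml.datasets.get_dataset(20)

# Old style: still returns arrays for now, but triggers the FutureWarning above.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    X_old, y_old, _, _ = dataset.get_data(
        dataset_format="array", target=dataset.default_target_attribute
    )
print([str(w.message) for w in caught])

# New style: work with the dataframe, converting only where arrays are required.
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
X_array, y_array = X.to_numpy(), y.to_numpy()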
12 changes: 11 additions & 1 deletion openml/tasks/task.py
@@ -1,5 +1,5 @@
# License: BSD 3-Clause

import warnings
from abc import ABC
from collections import OrderedDict
from enum import Enum
@@ -256,6 +256,16 @@ def get_X_and_y(
tuple - X and y

"""
# TODO: [0.15]
if dataset_format == "array":
warnings.warn(
"Support for `dataset_format='array'` will be removed in 0.15,"
"start using `dataset_format='dataframe' to ensure your code "
"will continue to work. You can use the dataframe's `to_numpy` "
"function to continue using numpy arrays.",
category=FutureWarning,
stacklevel=2,
)
dataset = self.get_dataset()
if self.task_type_id not in (
TaskType.SUPERVISED_CLASSIFICATION,
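The same pattern applies on the task side. A sketch of how `get_X_and_y` might be called going forward, reusing task 96 from the tutorial above; passing `dataset_format='dataframe'` explicitly steers clear of the deprecated path.

import openml

task = openml.tasks.get_task(96)

# Explicitly request dataframes to avoid the FutureWarning introduced above.
X, y = task.get_X_and_y(dataset_format="dataframe")
print(type(X), type(y))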