2 changes: 1 addition & 1 deletion .github/workflows/ubuntu-test.yml
@@ -51,7 +51,7 @@ jobs:
- name: Run tests
run: |
if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi
-pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv $codecov
+pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv $codecov
- name: Check for files left behind by test
if: ${{ always() }}
run: |
2 changes: 1 addition & 1 deletion appveyor.yml
@@ -45,4 +45,4 @@ build: false

test_script:
- "cd C:\\projects\\openml-python"
- "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv"
- "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread --dist load -sv"
21 changes: 5 additions & 16 deletions openml/datasets/dataset.py
@@ -13,7 +13,6 @@
import numpy as np
import pandas as pd
import scipy.sparse
-from warnings import warn

from openml.base import OpenMLBase
from .data_feature import OpenMLDataFeature
@@ -34,7 +33,7 @@ class OpenMLDataset(OpenMLBase):
Name of the dataset.
description : str
Description of the dataset.
-format : str
+data_format : str
Format of the dataset which can be either 'arff' or 'sparse_arff'.
cache_format : str
Format for caching the dataset which can be either 'feather' or 'pickle'.
@@ -103,7 +102,6 @@ def __init__(
self,
name,
description,
-format=None,
data_format="arff",
cache_format="pickle",
dataset_id=None,
@@ -178,16 +176,8 @@ def find_invalid_characters(string, pattern):
)

self.cache_format = cache_format
-if format is None:
-self.format = data_format
-else:
-warn(
-"The format parameter in the init will be deprecated "
-"in the future."
-"Please use data_format instead",
-DeprecationWarning,
-)
-self.format = format
+# Has to be called format, otherwise there will be an XML upload error
+self.format = data_format
self.creator = creator
self.contributor = contributor
self.collection_date = collection_date
@@ -456,12 +446,11 @@ def _parse_data_from_arff(
col.append(
self._unpack_categories(X[column_name], categories_names[column_name])
)
-elif attribute_dtype[column_name] in ('floating',
-'integer'):
+elif attribute_dtype[column_name] in ("floating", "integer"):
X_col = X[column_name]
if X_col.min() >= 0 and X_col.max() <= 255:
try:
-X_col_uint = X_col.astype('uint8')
+X_col_uint = X_col.astype("uint8")
if (X_col == X_col_uint).all():
col.append(X_col_uint)
continue
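The `_parse_data_from_arff` hunk above keeps the lossless uint8 downcast while switching to double-quoted strings. A standalone sketch of that downcast logic; the helper name `maybe_downcast_to_uint8` is hypothetical and not part of openml-python:

```python
import pandas as pd

def maybe_downcast_to_uint8(col: pd.Series) -> pd.Series:
    """Return a uint8 copy of `col` only if the conversion loses no information."""
    if col.min() >= 0 and col.max() <= 255:
        try:
            col_uint = col.astype("uint8")
        except ValueError:
            # e.g. NaN present, which uint8 cannot represent
            return col
        if (col == col_uint).all():
            return col_uint
    return col

print(maybe_downcast_to_uint8(pd.Series([0, 128, 255])).dtype)  # uint8
print(maybe_downcast_to_uint8(pd.Series([0.5, 2.0])).dtype)     # float64
```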
2 changes: 1 addition & 1 deletion openml/extensions/sklearn/extension.py
@@ -1748,7 +1748,7 @@ def _prediction_to_probabilities(
proba_y.shape[1], len(task.class_labels),
)
warnings.warn(message)
-openml.config.logger.warn(message)
+openml.config.logger.warning(message)

for i, col in enumerate(task.class_labels):
# adding missing columns with 0 probability
2 changes: 1 addition & 1 deletion openml/flows/flow.py
@@ -229,7 +229,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":

if not self.description:
logger = logging.getLogger(__name__)
logger.warn("Flow % has empty description", self.name)
logger.warning("Flow % has empty description", self.name)

flow_parameters = []
for key in self.parameters:
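`Logger.warn` is a long-deprecated alias of `Logger.warning`, which this diff replaces throughout. Note also that lazy `%`-style log formatting needs a real placeholder such as `%s`, as in the corrected line above. A minimal standalone sketch:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

flow_name = "example_flow"  # illustrative name, not taken from the diff
# The message is only interpolated if the record is actually emitted.
logger.warning("Flow %s has empty description", flow_name)
```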
2 changes: 1 addition & 1 deletion openml/study/functions.py
@@ -58,7 +58,7 @@ def get_study(
"of things have changed since then. Please use `get_suite('OpenML100')` instead."
)
warnings.warn(message, DeprecationWarning)
-openml.config.logger.warn(message)
+openml.config.logger.warning(message)
study = _get_study(study_id, entity_type="task")
return cast(OpenMLBenchmarkSuite, study) # type: ignore
else:
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -126,7 +126,7 @@ def delete_remote_files(tracker) -> None:
openml.utils._delete_entity(entity_type, entity)
logger.info("Deleted ({}, {})".format(entity_type, entity))
except Exception as e:
logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e))
logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e))


def pytest_sessionstart() -> None:
17 changes: 4 additions & 13 deletions tests/test_datasets/test_dataset.py
@@ -1,7 +1,6 @@
# License: BSD 3-Clause

from time import time
-from warnings import filterwarnings, catch_warnings

import numpy as np
import pandas as pd
@@ -120,11 +119,11 @@ def test_get_data_no_str_data_for_nparrays(self):

def _check_expected_type(self, dtype, is_cat, col):
if is_cat:
-expected_type = 'category'
-elif not col.isna().any() and (col.astype('uint8') == col).all():
-expected_type = 'uint8'
+expected_type = "category"
+elif not col.isna().any() and (col.astype("uint8") == col).all():
+expected_type = "uint8"
else:
-expected_type = 'float64'
+expected_type = "float64"

self.assertEqual(dtype.name, expected_type)

@@ -192,14 +191,6 @@ def test_get_data_with_ignore_attributes(self):
self.assertEqual(rval.shape, (898, 38))
self.assertEqual(len(categorical), 38)

-def test_dataset_format_constructor(self):
-
-with catch_warnings():
-filterwarnings("error")
-self.assertRaises(
-DeprecationWarning, openml.OpenMLDataset, "Test", "Test", format="arff"
-)
-
def test_get_data_with_nonexisting_class(self):
# This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
# label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
48 changes: 35 additions & 13 deletions tests/test_datasets/test_dataset_functions.py
@@ -4,6 +4,7 @@
import random
from itertools import product
from unittest import mock
+import shutil

import arff
import time
@@ -373,9 +374,9 @@ def test_get_dataset_by_name(self):
def test_get_dataset_uint8_dtype(self):
dataset = openml.datasets.get_dataset(1)
self.assertEqual(type(dataset), OpenMLDataset)
-self.assertEqual(dataset.name, 'anneal')
+self.assertEqual(dataset.name, "anneal")
df, _, _, _ = dataset.get_data()
-self.assertEqual(df['carbon'].dtype, 'uint8')
+self.assertEqual(df["carbon"].dtype, "uint8")

def test_get_dataset(self):
# This is the only non-lazy load to ensure default behaviour works.
@@ -1154,27 +1155,31 @@ def test_publish_fetch_ignore_attribute(self):
# test if publish was successful
self.assertIsInstance(dataset.id, int)

+downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id)
+self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute)
+
+def _wait_for_dataset_being_processed(self, dataset_id):
downloaded_dataset = None
# fetching from server
# loop till timeout or fetch not successful
-max_waiting_time_seconds = 400
+max_waiting_time_seconds = 600
# time.time() works in seconds
start_time = time.time()
while time.time() - start_time < max_waiting_time_seconds:
try:
-downloaded_dataset = openml.datasets.get_dataset(dataset.id)
+downloaded_dataset = openml.datasets.get_dataset(dataset_id)
break
except Exception as e:
# returned code 273: Dataset not processed yet
# returned code 362: No qualities found
TestBase.logger.error(
"Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e))
"Failed to fetch dataset:{} with '{}'.".format(dataset_id, str(e))
)
time.sleep(10)
continue
if downloaded_dataset is None:
raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset.id))
self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute)
raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset_id))
return downloaded_dataset

def test_create_dataset_row_id_attribute_error(self):
# meta-information
@@ -1347,7 +1352,7 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(len(categorical), X.shape[1])
self.assertEqual(len(attribute_names), X.shape[1])

-def test_data_edit(self):
+def test_data_edit_non_critical_field(self):
# Case 1
# All users can edit non-critical fields of datasets
desc = (
@@ -1368,14 +1373,31 @@ def test_data_edit(self):
edited_dataset = openml.datasets.get_dataset(did)
self.assertEqual(edited_dataset.description, desc)

+def test_data_edit_critical_field(self):
# Case 2
# only owners (or admin) can edit all critical fields of datasets
# this is a dataset created by CI, so it is editable by this test
-did = 315
-result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2")
+# for this, we need to first clone a dataset to do changes
+did = fork_dataset(1)
+self._wait_for_dataset_being_processed(did)
+result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
self.assertEqual(did, result)
-edited_dataset = openml.datasets.get_dataset(did)
-self.assertEqual(edited_dataset.ignore_attribute, ["col_2"])

+n_tries = 10
+# we need to wait for the edit to be reflected on the server
+for i in range(n_tries):
+edited_dataset = openml.datasets.get_dataset(did)
+try:
+self.assertEqual(edited_dataset.default_target_attribute, "shape", edited_dataset)
+self.assertEqual(edited_dataset.ignore_attribute, ["oil"], edited_dataset)
+break
+except AssertionError as e:
+if i == n_tries - 1:
+raise e
+time.sleep(10)
+# Delete the cache dir to get the newer version of the dataset
+shutil.rmtree(
+os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did))
+)

def test_data_edit_errors(self):
# Check server exception when no field to edit is provided
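Both `_wait_for_dataset_being_processed` and the edit check above poll the test server until the change is visible. A generic sketch of that poll-until-ready pattern; `wait_until_processed`, `fetch`, and the timing constants are illustrative, not part of the openml test suite:

```python
import time

def wait_until_processed(fetch, max_waiting_time_seconds=600, poll_interval=10):
    """Call `fetch` until it stops raising or the timeout elapses."""
    start_time = time.time()
    while time.time() - start_time < max_waiting_time_seconds:
        try:
            return fetch()
        except Exception as e:
            # e.g. "Dataset not processed yet" or "No qualities found"
            print("Not ready yet: {}".format(e))
            time.sleep(poll_interval)
    raise TimeoutError("Timed out waiting for the server to process the entity")
```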
14 changes: 11 additions & 3 deletions tests/test_runs/test_run_functions.py
@@ -442,7 +442,7 @@ def determine_grid_size(param_grid):
# suboptimal (slow), and not guaranteed to work if evaluation
# engine is behind.
# TODO: mock this? We have the arff already on the server
-self._wait_for_processed_run(run.run_id, 400)
+self._wait_for_processed_run(run.run_id, 600)
try:
model_prime = openml.runs.initialize_model_from_trace(
run_id=run.run_id, repeat=0, fold=0,
@@ -519,7 +519,7 @@ def _run_and_upload_regression(
)

def test_run_and_upload_logistic_regression(self):
lr = LogisticRegression(solver="lbfgs")
lr = LogisticRegression(solver="lbfgs", max_iter=1000)
task_id = self.TEST_SERVER_TASK_SIMPLE[0]
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
@@ -605,7 +605,8 @@ def get_ct_cf(nominal_indices, numeric_indices):
LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0",
)
-def test_run_and_upload_knn_pipeline(self):
+@unittest.mock.patch("warnings.warn")
+def test_run_and_upload_knn_pipeline(self, warnings_mock):

cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
@@ -635,11 +636,18 @@ def test_run_and_upload_knn_pipeline(self):
n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1]
n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2]
self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501")
+# The warning raised is:
+# "The total space of parameters 8 is smaller than n_iter=10.
+# Running 8 iterations. For exhaustive searches, use GridSearchCV."
+# It is raised three times because we run the model once to upload something and then
+# run it again twice to check that the predictions are reproducible.
+self.assertEqual(warnings_mock.call_count, 3)

def test_run_and_upload_gridsearch(self):
gridsearch = GridSearchCV(
BaggingClassifier(base_estimator=SVC()),
{"base_estimator__C": [0.01, 0.1, 10], "base_estimator__gamma": [0.01, 0.1, 10]},
+cv=3,
)
task_id = self.TEST_SERVER_TASK_SIMPLE[0]
n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
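Patching `warnings.warn` with a mock, as the knn-pipeline test above now does, lets a test assert exactly how often a warning fires. A minimal standalone sketch of the same pattern:

```python
import unittest
import warnings
from unittest import mock

class WarnCountTest(unittest.TestCase):
    @mock.patch("warnings.warn")
    def test_warning_fires_three_times(self, warnings_mock):
        for _ in range(3):
            # Inside the patched scope this calls the mock, not the real warn.
            warnings.warn("The total space of parameters is small")
        self.assertEqual(warnings_mock.call_count, 3)

if __name__ == "__main__":
    unittest.main()
```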
4 changes: 2 additions & 2 deletions tests/test_tasks/test_task_methods.py
@@ -40,9 +40,9 @@ def test_get_train_and_test_split_indices(self):
self.assertEqual(681, train_indices[-1])
self.assertEqual(583, test_indices[0])
self.assertEqual(24, test_indices[-1])
-self.assertRaisesRegexp(
+self.assertRaisesRegex(
ValueError, "Fold 10 not known", task.get_train_test_split_indices, 10, 0
)
-self.assertRaisesRegexp(
+self.assertRaisesRegex(
ValueError, "Repeat 10 not known", task.get_train_test_split_indices, 0, 10
)
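`assertRaisesRegexp` is the deprecated Python 2-era spelling (removed in Python 3.12); `assertRaisesRegex` is the current name, and it also works as a context manager. A minimal sketch:

```python
import unittest

class SplitTest(unittest.TestCase):
    def test_unknown_fold_raises(self):
        # Context-manager form of the assertion used above.
        with self.assertRaisesRegex(ValueError, "Fold 10 not known"):
            raise ValueError("Fold 10 not known")

if __name__ == "__main__":
    unittest.main()
```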