diff --git a/doc/progress.rst b/doc/progress.rst index c3aaf8d14..7dc633342 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -8,6 +8,7 @@ Changelog 0.11.1 ~~~~~~ +* MAINT #671: Improved the performance of ``check_datasets_active`` by querying only the given datasets instead of all datasets. Modified the corresponding unit test. 0.11.0 ~~~~~~ diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 28bde17f6..c2eb8ee75 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -333,14 +333,23 @@ def _load_features_from_file(features_file: str) -> Dict: return xml_dict["oml:data_features"] -def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]: +def check_datasets_active( + dataset_ids: List[int], + raise_error_if_not_exist: bool = True, +) -> Dict[int, bool]: """ Check if the dataset ids provided are active. + Raises a ValueError if a dataset_id in the given list does not exist + on the server and ``raise_error_if_not_exist`` is True. + Parameters ---------- dataset_ids : List[int] A list of integers representing dataset ids. + raise_error_if_not_exist : bool (default=True) + If True, raise a ValueError when one or more of the given dataset ids + do not exist on the server; if False, such ids are left out of the result. 
Returns ------- @@ -353,7 +362,8 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]: for did in dataset_ids: dataset = dataset_list.get(did, None) if dataset is None: - raise ValueError("Could not find dataset {} in OpenML dataset list.".format(did)) + if raise_error_if_not_exist: + raise ValueError(f'Could not find dataset {did} in OpenML dataset list.') else: active[did] = dataset["status"] == "active" diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c6e6f78f8..707b6f9c5 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -227,9 +227,13 @@ def test_list_datasets_empty(self): def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. openml.config.server = self.production_server - active = openml.datasets.check_datasets_active([2, 17]) + active = openml.datasets.check_datasets_active( + [2, 17, 79], + raise_error_if_not_exist=False, + ) self.assertTrue(active[2]) self.assertFalse(active[17]) + self.assertIsNone(active.get(79)) self.assertRaisesRegex( ValueError, "Could not find dataset 79 in OpenML dataset list.",