From 99679d988c9cc24a459dd6a8e7ee46e12a7541a1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 16 Sep 2021 18:13:26 +0200 Subject: [PATCH 1/4] Update function signatures for create_study|suite Publish explicitly empty suites and downloading any empty suites not currently supported. --- doc/progress.rst | 2 +- openml/study/functions.py | 24 ++++++++++++--------- tests/test_study/test_study_functions.py | 27 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/doc/progress.rst b/doc/progress.rst index 937c60eb2..110eb664c 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -8,7 +8,7 @@ Changelog 0.13.0 ~~~~~~ - + * FIX#1110: Make arguments to ``create_study`` and ``create_suite`` that are defined as optional actually optional. * MAIN#1088: Do CI for Windows on Github Actions instead of Appveyor. diff --git a/openml/study/functions.py b/openml/study/functions.py index ee877ddf2..2e852e1b8 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -177,9 +177,9 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: def create_study( name: str, description: str, - run_ids: List[int], - alias: Optional[str], - benchmark_suite: Optional[int], + run_ids: Optional[List[int]] = None, + alias: Optional[str] = None, + benchmark_suite: Optional[int] = None, ) -> OpenMLStudy: """ Creates an OpenML study (collection of data, tasks, flows, setups and run), @@ -188,16 +188,19 @@ def create_study( Parameters ---------- - alias : str (optional) - a string ID, unique on server (url-friendly) benchmark_suite : int (optional) the benchmark suite (another study) upon which this study is ran. name : str the name of the study (meta-info) description : str brief description (meta-info) - run_ids : list - a list of run ids associated with this study + run_ids : list, optional + a list of run ids associated with this study, + these can also be added later with ``attach_to_study``. + alias : str (optional) + a string ID, unique on server (url-friendly) + benchmark_suite: int (optional) + the ID of the suite for which this study contains run results Returns ------- @@ -223,7 +226,7 @@ def create_study( def create_benchmark_suite( - name: str, description: str, task_ids: List[int], alias: Optional[str], + name: str, description: str, task_ids: List[int], alias: Optional[str] = None, ) -> OpenMLBenchmarkSuite: """ Creates an OpenML benchmark suite (collection of entity types, where @@ -231,14 +234,15 @@ def create_benchmark_suite( Parameters ---------- - alias : str (optional) - a string ID, unique on server (url-friendly) name : str the name of the study (meta-info) description : str brief description (meta-info) task_ids : list a list of task ids associated with this study + more can be added later with ``attach_to_suite``. + alias : str (optional) + a string ID, unique on server (url-friendly) Returns ------- diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index e028ba2bd..7be1c71c0 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -1,4 +1,5 @@ # License: BSD 3-Clause +from typing import Optional, List import openml import openml.study @@ -114,6 +115,32 @@ def test_publish_benchmark_suite(self): self.assertEqual(study_downloaded.status, "deactivated") # can't delete study, now it's not longer in preparation + def _test_publish_empty_study_is_allowed(self, explicit: bool): + runs: Optional[List[int]] = [] if explicit else None + kind = "explicit" if explicit else "implicit" + + study = openml.study.create_study( + name=f"empty-study-{kind}", + description=f"a study with no runs attached {kind}ly", + run_ids=runs, + ) + + study.publish() + TestBase._mark_entity_for_removal("study", study.id) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) + + self.assertGreater(study.id, 0) + study_downloaded = openml.study.get_study(study.id) + self.assertEqual(study_downloaded.main_entity_type, "run") + self.assertListEqual(study_downloaded.runs, []) + openml.study.delete_study(study.id) + + def test_publish_empty_study_explicit(self): + self._test_publish_empty_study_is_allowed(explicit=True) + + def test_publish_empty_study_implicit(self): + self._test_publish_empty_study_is_allowed(explicit=False) + @pytest.mark.flaky() def test_publish_study(self): # get some random runs to attach From b6784bee3205f8609d12e96e0496a613f867d3bd Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 13 Oct 2021 17:04:39 +0200 Subject: [PATCH 2/4] Refactor _get_study --- openml/study/functions.py | 41 +++++++----------------- tests/test_study/test_study_functions.py | 4 +-- 2 files changed, 13 insertions(+), 32 deletions(-) diff --git a/openml/study/functions.py b/openml/study/functions.py index 2e852e1b8..144c089b3 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -3,7 +3,6 @@ from typing import cast, Dict, List, Optional, Union import warnings -import dateutil.parser import xmltodict import pandas as pd @@ -94,7 +93,6 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: description = result_dict["oml:description"] status = result_dict["oml:status"] creation_date = result_dict["oml:creation_date"] - creation_date_as_date = dateutil.parser.parse(creation_date) creator = result_dict["oml:creator"] # tags is legacy. remove once no longer needed. @@ -106,35 +104,18 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: current_tag["window_start"] = tag["oml:window_start"] tags.append(current_tag) - if "oml:data" in result_dict: - datasets = [int(x) for x in result_dict["oml:data"]["oml:data_id"]] - else: - raise ValueError("No datasets attached to study {}!".format(id_)) - if "oml:tasks" in result_dict: - tasks = [int(x) for x in result_dict["oml:tasks"]["oml:task_id"]] - else: - raise ValueError("No tasks attached to study {}!".format(id_)) + def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: + if result_dict.get(key) is not None: + return [int(oml_id) for oml_id in result_dict[key][subkey]] + return None - if main_entity_type in ["runs", "run"]: + datasets = get_nested_ids_from_result_dict("oml:data", "oml:data_id") + tasks = get_nested_ids_from_result_dict("oml:tasks", "oml:task_id") - if "oml:flows" in result_dict: - flows = [int(x) for x in result_dict["oml:flows"]["oml:flow_id"]] - else: - raise ValueError("No flows attached to study {}!".format(id_)) - if "oml:setups" in result_dict: - setups = [int(x) for x in result_dict["oml:setups"]["oml:setup_id"]] - else: - raise ValueError("No setups attached to study {}!".format(id_)) - if "oml:runs" in result_dict: - runs = [ - int(x) for x in result_dict["oml:runs"]["oml:run_id"] - ] # type: Optional[List[int]] - else: - if creation_date_as_date < dateutil.parser.parse("2019-01-01"): - # Legacy studies did not require runs - runs = None - else: - raise ValueError("No runs attached to study {}!".format(id_)) + if main_entity_type in ["runs", "run"]: + flows = get_nested_ids_from_result_dict("oml:flows", "oml:flow_id") + setups = get_nested_ids_from_result_dict("oml:setups", "oml:setup_id") + runs = get_nested_ids_from_result_dict("oml:runs", "oml:run_id") study = OpenMLStudy( study_id=study_id, @@ -220,7 +201,7 @@ def create_study( data=None, tasks=None, flows=None, - runs=run_ids, + runs=run_ids if run_ids != [] else None, setups=None, ) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 7be1c71c0..adb46db81 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -132,7 +132,7 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): self.assertGreater(study.id, 0) study_downloaded = openml.study.get_study(study.id) self.assertEqual(study_downloaded.main_entity_type, "run") - self.assertListEqual(study_downloaded.runs, []) + self.assertIsNone(study_downloaded.runs) openml.study.delete_study(study.id) def test_publish_empty_study_explicit(self): @@ -241,7 +241,7 @@ def test_study_attach_illegal(self): def test_study_list(self): study_list = openml.study.list_studies(status="in_preparation") - # might fail if server is recently resetted + # might fail if server is recently reset self.assertGreaterEqual(len(study_list), 2) def test_study_list_output_format(self): From 47043cd4d0f7ac830d8f23ec793b194efea55e73 Mon Sep 17 00:00:00 2001 From: Pieter Gijsbers Date: Wed, 27 Oct 2021 14:55:10 +0200 Subject: [PATCH 3/4] Update doc/progress.rst Co-authored-by: Matthias Feurer --- doc/progress.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/progress.rst b/doc/progress.rst index 110eb664c..401550a4d 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -8,7 +8,7 @@ Changelog 0.13.0 ~~~~~~ - * FIX#1110: Make arguments to ``create_study`` and ``create_suite`` that are defined as optional actually optional. + * FIX#1110: Make arguments to ``create_study`` and ``create_suite`` that are defined as optional by the OpenML XSD actually optional. * MAIN#1088: Do CI for Windows on Github Actions instead of Appveyor. From 870e2b76ec2bf1c6e7b52cd803b37359abbc1cf0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 27 Oct 2021 15:13:14 +0200 Subject: [PATCH 4/4] Remove explicit delete study call Since unit test cleanup mechanism should take care of that. --- tests/test_study/test_study_functions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index adb46db81..904df4d3a 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -133,7 +133,6 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): study_downloaded = openml.study.get_study(study.id) self.assertEqual(study_downloaded.main_entity_type, "run") self.assertIsNone(study_downloaded.runs) - openml.study.delete_study(study.id) def test_publish_empty_study_explicit(self): self._test_publish_empty_study_is_allowed(explicit=True)