diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index b74b50cb4..093b7c22d 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -208,6 +208,8 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
     for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
         if file_object.object_name is None:
             raise ValueError(f"Object name is None for object {file_object!r}")
+        if file_object.etag is None:
+            raise ValueError(f"Object etag is None for object {file_object!r}")
 
         marker = destination / file_object.etag
         if marker.exists():
diff --git a/openml/config.py b/openml/config.py
index b21c981e2..d6841896e 100644
--- a/openml/config.py
+++ b/openml/config.py
@@ -251,7 +251,10 @@ def _setup(config: _Config | None = None) -> None:
         if not config_dir.exists():
             config_dir.mkdir(exist_ok=True, parents=True)
     except PermissionError:
-        pass
+        openml_logger.warning(
+            f"No permission to create OpenML directory at {config_dir}!"
+            " This can result in OpenML-Python not working properly."
+        )
 
     if config is None:
         config = _parse_config(config_file)
@@ -269,27 +272,16 @@ def _setup(config: _Config | None = None) -> None:
 
     try:
         cache_exists = _root_cache_directory.exists()
-    except PermissionError:
-        cache_exists = False
-
-    # create the cache subdirectory
-    try:
-        if not _root_cache_directory.exists():
+        # create the cache subdirectory
+        if not cache_exists:
             _root_cache_directory.mkdir(exist_ok=True, parents=True)
+        _create_log_handlers()
     except PermissionError:
         openml_logger.warning(
-            f"No permission to create openml cache directory at {_root_cache_directory}!"
-            " This can result in OpenML-Python not working properly.",
+            f"No permission to create OpenML directory at {_root_cache_directory}!"
+            " This can result in OpenML-Python not working properly."
         )
-
-    if cache_exists:
-        _create_log_handlers()
-    else:
         _create_log_handlers(create_file_handler=False)
-        openml_logger.warning(
-            f"No permission to create OpenML directory at {config_dir}! This can result in "
-            " OpenML-Python not working properly.",
-        )
 
 
 def set_field_in_config_file(field: str, value: Any) -> None:
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 510f767d5..c6af4a481 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -206,9 +206,6 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     avoid_duplicate_runs : bool, optional (default=True)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
-    avoid_duplicate_runs : bool, optional (default=True)
-        If True, the run will throw an error if the setup/task combination is already present on
-        the server. This feature requires an internet connection.
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
diff --git a/openml/study/functions.py b/openml/study/functions.py
index 7fdc6f636..d01df78c2 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -78,7 +78,7 @@ def get_study(
     return study
 
 
-def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
+def _get_study(id_: int | str, entity_type: str) -> BaseStudy:  # noqa: C901
     xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
     force_list_tags = (
         "oml:data_id",
@@ -93,6 +93,12 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
     alias = result_dict.get("oml:alias", None)
     main_entity_type = result_dict["oml:main_entity_type"]
 
+    # Handle the edge case where the server returns a string with a newline character for empty values.
+    none_value_indicator = "\n "
+    for key in result_dict:
+        if result_dict[key] == none_value_indicator:
+            result_dict[key] = None
+
     if entity_type != main_entity_type:
         raise ValueError(
             f"Unexpected entity type '{main_entity_type}' reported by the server"
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index c6df73e0a..37cf6591d 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -36,8 +36,12 @@ def test_retry_on_database_error(self, Session_class_mock, _):
         assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
 
 
+
 class FakeObject(NamedTuple):
     object_name: str
+    etag: str
+    """We use the etag of a Minio object as the name of a marker if we already downloaded it."""
+
 
 class FakeMinio:
     def __init__(self, objects: Iterable[FakeObject] | None = None):
@@ -60,7 +64,7 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
     some_url = f"https://not.real.com/bucket/{some_object_path}"
     mock_minio.return_value = FakeMinio(
         objects=[
-            FakeObject(some_object_path),
+            FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))),
         ],
     )
 
@@ -71,3 +75,27 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
 
     time_modified = (tmp_path / some_filename).stat().st_mtime
     assert time_created == time_modified
+
+
+@mock.patch.object(minio, "Minio")
+def test_download_minio_failure(mock_minio, tmp_path: Path) -> None:
+    some_prefix, some_filename = "some/prefix", "dataset.arff"
+    some_object_path = f"{some_prefix}/{some_filename}"
+    some_url = f"https://not.real.com/bucket/{some_object_path}"
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name=None, etag="tmp"),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name="tmp", etag=None),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
index a92cd0cfd..bb596aded 100644
--- a/tests/test_openml/test_config.py
+++ b/tests/test_openml/test_config.py
@@ -6,6 +6,7 @@
 import unittest.mock
 from copy import copy
 from pathlib import Path
+import platform
 
 import pytest
 
@@ -17,6 +18,10 @@ class TestConfig(openml.testing.TestBase):
     @unittest.mock.patch("openml.config.openml_logger.warning")
     @unittest.mock.patch("openml.config._create_log_handlers")
     @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+    @unittest.skipIf(
+        platform.uname().release.endswith(("-Microsoft", "microsoft-standard-WSL2")),
+        "WSL does not support chmod as we would need here, see https://github.com/microsoft/WSL/issues/81",
+    )
     def test_non_writable_home(self, log_handler_mock, warnings_mock):
         with tempfile.TemporaryDirectory(dir=self.workdir) as td:
             os.chmod(td, 0o444)
@@ -24,7 +29,7 @@ def test_non_writable_home(self, log_handler_mock, warnings_mock):
             _dd["cachedir"] = Path(td) / "something-else"
             openml.config._setup(_dd)
 
-        assert warnings_mock.call_count == 2
+        assert warnings_mock.call_count == 1
         assert log_handler_mock.call_count == 1
         assert not log_handler_mock.call_args_list[0][1]["create_file_handler"]
         assert openml.config._root_cache_directory == Path(td) / "something-else"
@@ -121,7 +126,7 @@ def test_example_configuration_start_twice(self):
 
 
 def test_configuration_file_not_overwritten_on_load():
-    """ Regression test for #1337 """
+    """Regression test for #1337"""
     config_file_content = "apikey = abcd"
     with tempfile.TemporaryDirectory() as tmpdir:
         config_file_path = Path(tmpdir) / "config"
@@ -136,9 +141,10 @@ def test_configuration_file_not_overwritten_on_load():
     assert config_file_content == new_file_content
     assert "abcd" == read_config["apikey"]
 
+
 def test_configuration_loads_booleans(tmp_path):
     config_file_content = "avoid_duplicate_runs=true\nshow_progress=false"
-    with (tmp_path/"config").open("w") as config_file:
+    with (tmp_path / "config").open("w") as config_file:
         config_file.write(config_file_content)
 
     read_config = openml.config._parse_config(tmp_path)
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 40a778d8b..55a53fc40 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -119,7 +119,6 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
         # time.time() works in seconds
         start_time = time.time()
         while time.time() - start_time < max_waiting_time_seconds:
-
             try:
                 openml.runs.get_run_trace(run_id)
             except openml.exceptions.OpenMLServerException:
@@ -131,7 +130,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
                 time.sleep(10)
                 continue
 
-            assert len(run.evaluations) > 0, "Expect not-None evaluations to always contain elements."
+            assert (
+                len(run.evaluations) > 0
+            ), "Expect not-None evaluations to always contain elements."
             return
 
         raise RuntimeError(
@@ -557,7 +558,7 @@ def determine_grid_size(param_grid):
             fold_evaluations=run.fold_evaluations,
             num_repeats=1,
             num_folds=num_folds,
-            task_type=task_type
+            task_type=task_type,
         )
 
         # Check if run string and print representation do not run into an error
@@ -796,7 +797,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
 
     @pytest.mark.sklearn()
     def test_run_and_upload_gridsearch(self):
-        estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
         gridsearch = GridSearchCV(
             BaggingClassifier(**{estimator_name: SVC()}),
             {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
@@ -1826,7 +1829,9 @@ def test_joblib_backends(self, parallel_mock):
         num_instances = x.shape[0]
         line_length = 6 + len(task.class_labels)
 
-        backend_choice = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        backend_choice = (
+            "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        )
         for n_jobs, backend, call_count in [
             (1, backend_choice, 10),
             (2, backend_choice, 10),
@@ -1877,14 +1882,23 @@ def test_joblib_backends(self, parallel_mock):
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
     def test_delete_run(self):
-        rs = 1
+        rs = np.random.randint(1, 2**32 - 1)
         clf = sklearn.pipeline.Pipeline(
-            steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())],
+            steps=[
+                (f"test_server_imputer_{rs}", SimpleImputer()),
+                ("estimator", DecisionTreeClassifier()),
+            ],
         )
 
         task = openml.tasks.get_task(32)  # diabetes; crossvalidation
-        run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs)
+        run = openml.runs.run_model_on_task(
+            model=clf, task=task, seed=rs, avoid_duplicate_runs=False
+        )
         run.publish()
+
+        with pytest.raises(openml.exceptions.OpenMLRunsExistError):
+            openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+
         TestBase._mark_entity_for_removal("run", run.run_id)
         TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
 