diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index f81ddd23a..107827238 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -908,8 +908,18 @@ def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]: list """ for feature in self.features.values(): - if (feature.name == target_name) and (feature.data_type == "nominal"): - return feature.nominal_values + if feature.name == target_name: + if feature.data_type == "nominal": + return feature.nominal_values + + if feature.data_type == "string": + # Rel.: #1311 + # The target is invalid for a classification task if the feature type is string + # and not nominal. For such misconfigured tasks, we silently fix it here as + # we can safely interpret string as nominal. + df, *_ = self.get_data() + return list(df[feature.name].unique()) + return None def get_features_by_type( # noqa: C901 diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 9fbb9259a..f3d269dc1 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -626,11 +626,18 @@ def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels() assert labels == ["1", "2", "3", "4", "5", "U"] + labels = openml.datasets.get_dataset(2, download_data=False).retrieve_class_labels( target_name="product-type", ) assert labels == ["C", "H", "G"] + # Test workaround for string-typed class labels + custom_ds = openml.datasets.get_dataset(2, download_data=False) + custom_ds.features[31].data_type = "string" + labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) + assert labels == ["COIL", "SHEET"] + def test_upload_dataset_with_url(self): dataset = OpenMLDataset( "%s-UploadTestWithURL" % self._get_sentinel(),