diff --git a/.travis.yml b/.travis.yml index 80f3bda42..9fd33403c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,20 +15,21 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" - - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" - # Checks for older scikit-learn versions (which also don't nicely work with - # Python3.7) - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" COVERAGE="true" DOCPUSH="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.2" + # Checks for older scikit-learn versions (which also don't nicely work with + # Python3.7) + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 # Travis issue # https://github.com/travis-ci/travis-ci/issues/8920 diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index e129b7718..b15260fb4 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -66,7 +66,7 @@ ############################################################################ # Get the actual data. # -# The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy +# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy # sparse matrix, or as a Pandas DataFrame. The format is # controlled with the parameter ``dataset_format`` which can be either 'array' # (default) or 'dataframe'. 
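As an aside, a minimal sketch of the two ``dataset_format`` values described above (the dataset id below, 61 for iris, is only an assumed example):

import openml

dataset = openml.datasets.get_dataset(61)  # assumed example dataset id (iris)
# 'dataframe' returns a pandas DataFrame (and a pandas Series for the target)
X_df, y_df, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)
# 'array' returns a NumPy array; sparse datasets come back as a SciPy sparse matrix
X_arr, y_arr, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="array"
)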
Let's first build our dataset from a NumPy array diff --git a/examples/30_extended/flow_id_tutorial.py b/examples/30_extended/flow_id_tutorial.py index ef3689ea1..e77df8d1a 100644 --- a/examples/30_extended/flow_id_tutorial.py +++ b/examples/30_extended/flow_id_tutorial.py @@ -15,6 +15,11 @@ import openml + +# Activating test server +openml.config.start_using_configuration_for_example() + + clf = sklearn.tree.DecisionTreeClassifier() #################################################################################################### @@ -69,3 +74,6 @@ # This also works with the actual model (generalizing the first part of this example): flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) print(flow_ids) + +# Deactivating test server +openml.config.stop_using_configuration_for_example() diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index be438e728..a46bf9699 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -37,6 +37,11 @@ import sklearn.ensemble import sklearn.impute import sklearn.preprocessing +from sklearn.pipeline import make_pipeline, Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer +from sklearn.experimental import enable_hist_gradient_boosting openml.config.start_using_configuration_for_example() @@ -52,22 +57,39 @@ # we will create a fairly complex model, with many preprocessing components and # many potential hyperparameters. Of course, the model can be as complex and as # easy as you want it to be -model_original = sklearn.pipeline.make_pipeline( - sklearn.impute.SimpleImputer(), sklearn.ensemble.RandomForestClassifier() -) +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.decomposition import TruncatedSVD + + +# Helper functions to return required columns for ColumnTransformer +def cont(X): + return X.dtypes != "category" + + +def cat(X): + return X.dtypes == "category" + + +cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore", sparse=False), + TruncatedSVD(), +) +ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)]) +model_original = sklearn.pipeline.Pipeline( + steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),] +) # Let's change some hyperparameters. 
Of course, in any good application we # would tune them using, e.g., Random Search or Bayesian Optimization, but for # the purpose of this tutorial we set them to some specific values that might # or might not be optimal hyperparameters_original = { - "simpleimputer__strategy": "median", - "randomforestclassifier__criterion": "entropy", - "randomforestclassifier__max_features": 0.2, - "randomforestclassifier__min_samples_leaf": 1, - "randomforestclassifier__n_estimators": 16, - "randomforestclassifier__random_state": 42, + "estimator__loss": "auto", + "estimator__learning_rate": 0.15, + "estimator__max_iter": 50, + "estimator__min_samples_leaf": 1, } model_original.set_params(**hyperparameters_original) diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index b9202d7ce..c02a5c038 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -17,8 +17,11 @@ import numpy as np import sklearn.tree -import sklearn.pipeline -import sklearn.impute +from sklearn.pipeline import make_pipeline, Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.decomposition import TruncatedSVD +from sklearn.preprocessing import OneHotEncoder, FunctionTransformer import openml @@ -68,7 +71,7 @@ ) print(evaluations.head()) ############################################################ # Uploading studies # ================= # @@ -78,12 +81,30 @@ openml.config.start_using_configuration_for_example() -# Very simple classifier which ignores the feature type +# Model that can handle missing values +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier + + +# Helper functions to return required columns for ColumnTransformer +def cont(X): + return X.dtypes != "category" + + +def cat(X): + return X.dtypes == "category" + + +cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore", sparse=False), + TruncatedSVD(), +) +ct = ColumnTransformer( + [("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)] +) clf = sklearn.pipeline.Pipeline( - steps=[ - ("imputer", sklearn.impute.SimpleImputer()), - ("estimator", sklearn.tree.DecisionTreeClassifier(max_depth=5)), - ] + steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),] ) suite = openml.study.get_suite(1) diff --git a/openml/__init__.py b/openml/__init__.py index 621703332..0bab3b1d5 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -113,6 +113,7 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None) "study", "utils", "_api_calls", + "__version__", ] # Load the scikit-learn extension by default diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 0f3037a74..550747eac 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -815,12 +815,12 @@ def edit_dataset( ) -> int: """ Edits an OpenMLDataset. - Specify atleast one field to edit, apart from data_id + Specify at least one field to edit, apart from data_id - For certain fields, a new dataset version is created : attributes, data, default_target_attribute, ignore_attribute, row_id_attribute. - - For other fields, the uploader can edit the exisiting version.
- Noone except the uploader can edit the exisitng version. + - For other fields, the uploader can edit the existing version. + No one except the uploader can edit the existing version. Parameters ---------- diff --git a/openml/exceptions.py b/openml/exceptions.py index 07eb64e6c..781784ee2 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -27,7 +27,7 @@ def __init__(self, message: str, code: int = None, url: str = None): self.url = url super().__init__(message) - def __repr__(self): + def __str__(self): return "%s returned code %s: %s" % (self.url, self.code, self.message,) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 2b94d2cfd..edb14487b 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -11,7 +11,7 @@ from re import IGNORECASE import sys import time -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast import warnings import numpy as np @@ -1546,7 +1546,7 @@ def _run_model_on_fold( fold_no: int, y_train: Optional[np.ndarray] = None, X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None, - ) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]: + ) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]: """Run a model on a repeat,fold,subsample triplet of the task and return prediction information. @@ -1579,24 +1579,21 @@ def _run_model_on_fold( Returns ------- - arff_datacontent : List[List] - Arff representation (list of lists) of the predictions that were - generated by this fold (required to populate predictions.arff) - arff_tracecontent : List[List] - Arff representation (list of lists) of the trace data that was generated by this - fold - (will be used to populate trace.arff, leave it empty if the model did not perform - any - hyperparameter optimization). + pred_y : np.ndarray + Predictions on the training/test set, depending on the task type. + For supervised tasks, predictions are on the test set. + For unsupervised tasks, predictions are on the training set. + proba_y : pd.DataFrame + Predicted probabilities for the test set. + None if the task is not a classification or learning curve task. user_defined_measures : OrderedDict[str, float] User defined measures that were generated on this fold - model : Any - The model trained on this repeat,fold,subsample triple. Will be used to generate - trace - information later on (in ``obtain_arff_trace``). + trace : Optional[OpenMLRunTrace] + ARFF trace object from the fitted model, or None if the model did not + perform any hyperparameter optimization. """ - def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarray: + def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame: """Transforms predicted probabilities to match with OpenML class indices.
Parameters @@ -1609,16 +1606,31 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra Returns ------- - np.ndarray + pd.DataFrame """ + + if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + if task.class_labels is not None: + if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str): + # mapping (decoding) the predictions to the categories + # creating a separate copy to not change the expected pred_y type + y = [task.class_labels[pred] for pred in y] + else: + raise ValueError("The task has no class labels") + else: + return None + # y: list or numpy array of predictions # model_classes: sklearn classifier mapping from original array id to # prediction index id - if not isinstance(classes, list): - raise ValueError("please convert model classes to list prior to " "calling this fn") - result = np.zeros((len(y), len(classes)), dtype=np.float32) - for obs, prediction_idx in enumerate(y): - result[obs][prediction_idx] = 1.0 + if not isinstance(model_classes, list): + raise ValueError("please convert model classes to list prior to calling this fn") + # DataFrame allows more accurate mapping of classes as column names + result = pd.DataFrame( + 0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32 + ) + for obs, prediction in enumerate(y): + result.loc[obs, prediction] = 1.0 return result if isinstance(task, OpenMLSupervisedTask): @@ -1677,6 +1689,16 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra else: model_classes = used_estimator.classes_ + if not isinstance(model_classes, list): + model_classes = model_classes.tolist() + + # to handle the case when dataset is numpy and categories are encoded + # however the class labels stored in task are still categories + if isinstance(y_train, np.ndarray) and isinstance( + cast(List, task.class_labels)[0], str + ): + model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes] + modelpredict_start_cputime = time.process_time() modelpredict_start_walltime = time.time() @@ -1708,9 +1730,10 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra try: proba_y = model_copy.predict_proba(X_test) - except AttributeError: + proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy + except AttributeError: # predict_proba is not available when probability=False if task.class_labels is not None: - proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels)) + proba_y = _prediction_to_probabilities(pred_y, model_classes) else: raise ValueError("The task has no class labels") @@ -1726,20 +1749,24 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra # then we need to add a column full of zeros into the probabilities # for class 3 because the rest of the library expects that the # probabilities are ordered the same way as the classes are ordered). 
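# Editorial aside (not part of the patch): a minimal, self-contained sketch of the
# zero-padding described in the comment above, using assumed class labels.
import pandas as pd

task_class_labels = ["0", "1", "2", "3", "4", "5"]  # labels declared by the task (assumed)
model_classes = ["0", "1", "2", "4", "5"]  # labels seen at training time (assumed, '3' missing)
proba_y = pd.DataFrame(
    [[0.5, 0.2, 0.1, 0.1, 0.1], [0.1, 0.6, 0.1, 0.1, 0.1]], columns=model_classes
)
# add a zero column for every class the model never saw, then order the columns
# exactly as the task declares its class labels
for col in task_class_labels:
    if col not in proba_y.columns:
        proba_y[col] = 0.0
proba_y = proba_y[task_class_labels]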
- proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels))) - for idx, model_class in enumerate(model_classes): - proba_y_new[:, model_class] = proba_y[:, idx] - proba_y = proba_y_new - - if proba_y.shape[1] != len(task.class_labels): message = "Estimator only predicted for {}/{} classes!".format( proba_y.shape[1], len(task.class_labels), ) warnings.warn(message) openml.config.logger.warn(message) + + for i, col in enumerate(task.class_labels): + # adding missing columns with 0 probability + if col not in model_classes: + proba_y[col] = 0 + proba_y = proba_y[task.class_labels] else: raise ValueError("The task has no class labels") + if not np.all(set(proba_y.columns) == set(task.class_labels)): + missing_cols = list(set(task.class_labels) - set(proba_y.columns)) + raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) + elif isinstance(task, OpenMLRegressionTask): proba_y = None diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 47939c867..5aaf70a9d 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -263,7 +263,13 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": for key in self.components: component_dict = OrderedDict() # type: 'OrderedDict[str, Dict]' component_dict["oml:identifier"] = key - component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"] + if self.components[key] in ["passthrough", "drop"]: + component_dict["oml:flow"] = { + "oml-python:serialized_object": "component_reference", + "value": {"key": self.components[key], "step_name": self.components[key]}, + } + else: + component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"] for key_ in component_dict: # We only need to check if the key is a string, because the diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 2b767eaa1..99007aa2a 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -52,6 +52,7 @@ def run_model_on_task( add_local_measures: bool = True, upload_flow: bool = False, return_flow: bool = False, + dataset_format: str = "dataframe", ) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]: """Run the model on the dataset defined by the task. @@ -79,6 +80,9 @@ def run_model_on_task( If False, do not upload the flow to OpenML. return_flow : bool (default=False) If True, returns the OpenMLFlow generated from the model in addition to the OpenMLRun. + dataset_format : str (default='dataframe') + If 'array', the dataset is passed to the model as a numpy array. + If 'dataframe', the dataset is passed to the model as a pandas dataframe. Returns ------- @@ -125,6 +129,7 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas seed=seed, add_local_measures=add_local_measures, upload_flow=upload_flow, + dataset_format=dataset_format, ) if return_flow: return run, flow @@ -139,6 +144,7 @@ def run_flow_on_task( seed: int = None, add_local_measures: bool = True, upload_flow: bool = False, + dataset_format: str = "dataframe", ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -171,6 +177,9 @@ def run_flow_on_task( upload_flow : bool (default=False) If True, upload the flow to OpenML if it does not exist yet. If False, do not upload the flow to OpenML. + dataset_format : str (default='dataframe') + If 'array', the dataset is passed to the model as a numpy array. + If 'dataframe', the dataset is passed to the model as a pandas dataframe. 
Returns ------- @@ -248,6 +257,7 @@ def run_flow_on_task( task=task, extension=flow.extension, add_local_measures=add_local_measures, + dataset_format=dataset_format, ) data_content, trace, fold_evaluations, sample_evaluations = res @@ -407,6 +417,7 @@ def _run_task_get_arffcontent( task: OpenMLTask, extension: "Extension", add_local_measures: bool, + dataset_format: str, ) -> Tuple[ List[List], Optional[OpenMLRunTrace], @@ -437,14 +448,23 @@ def _run_task_get_arffcontent( repeat=rep_no, fold=fold_no, sample=sample_no ) if isinstance(task, OpenMLSupervisedTask): - x, y = task.get_X_and_y(dataset_format="array") - train_x = x[train_indices] - train_y = y[train_indices] - test_x = x[test_indices] - test_y = y[test_indices] + x, y = task.get_X_and_y(dataset_format=dataset_format) + if dataset_format == "dataframe": + train_x = x.iloc[train_indices] + train_y = y.iloc[train_indices] + test_x = x.iloc[test_indices] + test_y = y.iloc[test_indices] + else: + train_x = x[train_indices] + train_y = y[train_indices] + test_x = x[test_indices] + test_y = y[test_indices] elif isinstance(task, OpenMLClusteringTask): - x = task.get_X(dataset_format="array") - train_x = x[train_indices] + x = task.get_X(dataset_format=dataset_format) + if dataset_format == "dataframe": + train_x = x.iloc[train_indices] + else: + train_x = x[train_indices] train_y = None test_x = None test_y = None @@ -480,17 +500,33 @@ def _calculate_local_measure(sklearn_fn, openml_name): if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): for i, tst_idx in enumerate(test_indices): - if task.class_labels is not None: + prediction = ( + task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i] + ) + if isinstance(test_y, pd.Series): + test_prediction = ( + task.class_labels[test_y.iloc[i]] + if isinstance(test_y.iloc[i], int) + else test_y.iloc[i] + ) + else: + test_prediction = ( + task.class_labels[test_y[i]] + if isinstance(test_y[i], int) + else test_y[i] + ) + pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i] + arff_line = format_prediction( task=task, repeat=rep_no, fold=fold_no, sample=sample_no, index=tst_idx, - prediction=task.class_labels[pred_y[i]], - truth=task.class_labels[test_y[i]], - proba=dict(zip(task.class_labels, proba_y[i])), + prediction=prediction, + truth=test_prediction, + proba=dict(zip(task.class_labels, pred_prob)), ) else: raise ValueError("The task has no class labels") @@ -504,14 +540,15 @@ def _calculate_local_measure(sklearn_fn, openml_name): elif isinstance(task, OpenMLRegressionTask): - for i in range(0, len(test_indices)): + for i, _ in enumerate(test_indices): + test_prediction = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i] arff_line = format_prediction( task=task, repeat=rep_no, fold=fold_no, index=test_indices[i], prediction=pred_y[i], - truth=test_y[i], + truth=test_prediction, ) arff_datacontent.append(arff_line) @@ -522,7 +559,8 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) elif isinstance(task, OpenMLClusteringTask): - for i in range(0, len(test_indices)): + + for i, _ in enumerate(test_indices): arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID arff_datacontent.append(arff_line) diff --git a/openml/testing.py b/openml/testing.py index 0b4c50972..da07b0ed7 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -258,4 +258,21 @@ def _check_fold_timing_evaluations( from sklearn.preprocessing import Imputer as SimpleImputer -__all__ = ["TestBase", "SimpleImputer"] +class 
CustomImputer(SimpleImputer): + """Duplicate class alias for sklearn's SimpleImputer + + Helps bypass the sklearn extension duplicate operation check + """ + + pass + + +def cont(X): + return X.dtypes != "category" + + +def cat(X): + return X.dtypes == "category" + + +__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"] diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 90f69df17..d34dc2ad3 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -13,6 +13,7 @@ from packaging import version import numpy as np +import pandas as pd import scipy.optimize import scipy.stats import sklearn.base @@ -39,7 +40,7 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -537,8 +538,7 @@ def test_serialize_column_transformer(self): fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" "numeric=sklearn.preprocessing.{}.StandardScaler," - "nominal=sklearn.preprocessing._encoders.OneHotEncoder," - "drop=drop)".format(scaler_name) + "nominal=sklearn.preprocessing._encoders.OneHotEncoder,drop=drop)".format(scaler_name) ) fixture_short_name = "sklearn.ColumnTransformer" @@ -564,13 +564,7 @@ def test_serialize_column_transformer(self): "drop": ["drop"], } - serialization, new_model = self._serialization_test_helper( - model, - X=None, - y=None, - subcomponent_parameters=["transformers", "numeric", "nominal"], - dependencies_mock_call_count=(4, 8), - ) + serialization = self.extension.model_to_flow(model) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture) self.assertEqual(serialization.custom_name, fixture_short_name) @@ -1566,12 +1560,15 @@ def setUp(self): # Test methods for performing runs with this extension module def test_run_model_on_task(self): - class MyPipe(sklearn.pipeline.Pipeline): - pass - task = openml.tasks.get_task(1) - pipe = MyPipe([("imp", SimpleImputer()), ("dummy", sklearn.dummy.DummyClassifier())]) - openml.runs.run_model_on_task(pipe, task) + # using most_frequent imputer since dataset has mixed types and to keep things simple + pipe = sklearn.pipeline.Pipeline( + [ + ("imp", SimpleImputer(strategy="most_frequent")), + ("dummy", sklearn.dummy.DummyClassifier()), + ] + ) + openml.runs.run_model_on_task(pipe, task, dataset_format="array") def test_seed_model(self): # randomized models that are initialized without seeds, can be seeded @@ -1627,7 +1624,7 @@ def test_seed_model_raises(self): with self.assertRaises(ValueError): self.extension.seed_model(model=clf, seed=42) - def test_run_model_on_fold_classification_1(self): + def test_run_model_on_fold_classification_1_array(self): task = openml.tasks.get_task(1) X, y = task.get_X_and_y() @@ -1656,14 +1653,87 @@ def test_run_model_on_fold_classification_1(self): # predictions self.assertIsInstance(y_hat, np.ndarray) self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, np.ndarray) + self.assertIsInstance(y_hat_proba, pd.DataFrame) self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), 
np.ones(y_test.shape)) # The class '4' (at index 3) is not present in the training data. We check that the # predicted probabilities for that class are zero! - np.testing.assert_array_almost_equal(y_hat_proba[:, 3], np.zeros(y_test.shape)) + np.testing.assert_array_almost_equal( + y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + ) for i in (0, 1, 2, 4, 5): - self.assertTrue(np.any(y_hat_proba[:, i] != np.zeros(y_test.shape))) + self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) + + # check user defined measures + fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) + for measure in user_defined_measures: + fold_evaluations[measure][0][0] = user_defined_measures[measure] + + # trace. SGD does not produce any + self.assertIsNone(trace) + + self._check_fold_timing_evaluations( + fold_evaluations, + num_repeats=1, + num_folds=1, + task_type=task.task_type_id, + check_scores=False, + ) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="SimpleImputer, ColumnTransformer available only after 0.19 and " + "Pipeline till 0.20 doesn't support indexing and 'passthrough'", + ) + def test_run_model_on_fold_classification_1_dataframe(self): + from sklearn.compose import ColumnTransformer + + task = openml.tasks.get_task(1) + + # diff test_run_model_on_fold_classification_1_array() + X, y = task.get_X_and_y(dataset_format="dataframe") + train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] + + # Helper functions to return required columns for ColumnTransformer + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore", sparse=False), + ) + cont_imp = make_pipeline(CustomImputer(strategy="mean"), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + pipeline = sklearn.pipeline.Pipeline( + steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) + # TODO add some mocking here to actually test the innards of this function, too! + res = self.extension._run_model_on_fold( + model=pipeline, + task=task, + fold_no=0, + rep_no=0, + X_train=X_train, + y_train=y_train, + X_test=X_test, + ) + + y_hat, y_hat_proba, user_defined_measures, trace = res + + # predictions + self.assertIsInstance(y_hat, np.ndarray) + self.assertEqual(y_hat.shape, y_test.shape) + self.assertIsInstance(y_hat_proba, pd.DataFrame) + self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 6)) + np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) + # The class '4' (at index 3) is not present in the training data. We check that the + # predicted probabilities for that class are zero! 
+ np.testing.assert_array_almost_equal( + y_hat_proba.iloc[:, 3].to_numpy(), np.zeros(y_test.shape) + ) + for i in (0, 1, 2, 4, 5): + self.assertTrue(np.any(y_hat_proba.iloc[:, i].to_numpy() != np.zeros(y_test.shape))) # check user defined measures fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) @@ -1710,11 +1780,11 @@ def test_run_model_on_fold_classification_2(self): # predictions self.assertIsInstance(y_hat, np.ndarray) self.assertEqual(y_hat.shape, y_test.shape) - self.assertIsInstance(y_hat_proba, np.ndarray) + self.assertIsInstance(y_hat_proba, pd.DataFrame) self.assertEqual(y_hat_proba.shape, (y_test.shape[0], 2)) np.testing.assert_array_almost_equal(np.sum(y_hat_proba, axis=1), np.ones(y_test.shape)) for i in (0, 1): - self.assertTrue(np.any(y_hat_proba[:, i] != np.zeros(y_test.shape))) + self.assertTrue(np.any(y_hat_proba.to_numpy()[:, i] != np.zeros(y_test.shape))) # check user defined measures fold_evaluations = collections.defaultdict(lambda: collections.defaultdict(dict)) @@ -1791,14 +1861,14 @@ def predict_proba(*args, **kwargs): np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0])) # Test that there are predictions other than ones and zeros self.assertLess( - np.sum(proba_1 == 0) + np.sum(proba_1 == 1), + np.sum(proba_1.to_numpy() == 0) + np.sum(proba_1.to_numpy() == 1), X_test.shape[0] * len(task.class_labels), ) np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0])) # Test that there are only ones and zeros predicted self.assertEqual( - np.sum(proba_2 == 0) + np.sum(proba_2 == 1), + np.sum(proba_2.to_numpy() == 0) + np.sum(proba_2.to_numpy() == 1), X_test.shape[0] * len(task.class_labels), ) @@ -2099,3 +2169,49 @@ def test_sklearn_serialization_with_none_step(self): ) with self.assertRaisesRegex(ValueError, msg): self.extension.model_to_flow(clf) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_failed_serialization_of_custom_class(self): + """Test to check if any custom class inherited from sklearn expectedly fails serialization + """ + try: + from sklearn.impute import SimpleImputer + except ImportError: + # for lower versions + from sklearn.preprocessing import Imputer as SimpleImputer + + class CustomImputer(SimpleImputer): + pass + + def cont(X): + return X.dtypes != "category" + + def cat(X): + return X.dtypes == "category" + + import sklearn.metrics + import sklearn.tree + from sklearn.pipeline import Pipeline, make_pipeline + from sklearn.compose import ColumnTransformer + from sklearn.preprocessing import OneHotEncoder, StandardScaler + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + clf = Pipeline( + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) # build a sklearn classifier + + task = openml.tasks.get_task(253) # data with mixed types from test server + try: + _ = openml.runs.run_model_on_task(clf, task) + except AttributeError as e: + if e.args[0] == "module '__main__' has no attribute '__version__'": + raise AttributeError(e) + else: + raise Exception(e) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index dcc7b0b96..89f01c72e 100644 --- a/tests/test_runs/test_run_functions.py +++ 
b/tests/test_runs/test_run_functions.py @@ -21,7 +21,7 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskType @@ -31,13 +31,13 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.dummy import DummyClassifier -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.feature_selection import VarianceThreshold from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression from sklearn.ensemble import RandomForestClassifier, BaggingClassifier from sklearn.svm import SVC from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold -from sklearn.pipeline import Pipeline +from sklearn.pipeline import Pipeline, make_pipeline class TestRun(TestBase): @@ -348,9 +348,13 @@ def test_run_regression_on_classif_task(self): clf = LinearRegression() task = openml.tasks.get_task(task_id) - with self.assertRaises(AttributeError): + # internally dataframe is loaded and targets are categorical + # which LinearRegression() cannot handle + with self.assertRaisesRegex( + AttributeError, "'LinearRegression' object has no attribute 'classes_'" + ): openml.runs.run_model_on_task( - model=clf, task=task, avoid_duplicate_runs=False, + model=clf, task=task, avoid_duplicate_runs=False, dataset_format="array", ) def test_check_erronous_sklearn_flow_fails(self): @@ -553,18 +557,26 @@ def test_run_and_upload_column_transformer_pipeline(self): def get_ct_cf(nominal_indices, numeric_indices): inner = sklearn.compose.ColumnTransformer( transformers=[ - ("numeric", sklearn.preprocessing.StandardScaler(), nominal_indices), ( - "nominal", - sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), + "numeric", + make_pipeline( + SimpleImputer(strategy="mean"), sklearn.preprocessing.StandardScaler() + ), numeric_indices, ), + ( + "nominal", + make_pipeline( + CustomImputer(strategy="most_frequent"), + sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), + ), + nominal_indices, + ), ], remainder="passthrough", ) return sklearn.pipeline.Pipeline( steps=[ - ("imputer", sklearn.impute.SimpleImputer(strategy="constant", fill_value=-1)), ("transformer", inner), ("classifier", sklearn.tree.DecisionTreeClassifier()), ] @@ -590,25 +602,36 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel=sentinel, ) - def test_run_and_upload_decision_tree_pipeline(self): + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_run_and_upload_knn_pipeline(self): + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + from sklearn.compose import ColumnTransformer + from sklearn.neighbors import KNeighborsClassifier + + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) pipeline2 = Pipeline( steps=[ - ("Imputer", SimpleImputer(strategy="median")), + ("Imputer", ct), ("VarianceThreshold", VarianceThreshold()), ( "Estimator", RandomizedSearchCV( - DecisionTreeClassifier(), - { - "min_samples_split": [2 ** x for x in range(1, 8)], - "min_samples_leaf": [2 ** x for x in range(0, 7)], - }, + 
KNeighborsClassifier(), + {"n_neighbors": [x for x in range(2, 10)]}, cv=3, n_iter=10, ), ), ] ) + task_id = self.TEST_SERVER_TASK_MISSING_VALS[0] n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1] n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2] @@ -732,19 +755,31 @@ def test_learning_curve_task_2(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="Pipelines don't support indexing (used for the assert check)", + ) def test_initialize_cv_from_run(self): - randomsearch = RandomizedSearchCV( - RandomForestClassifier(n_estimators=5), - { - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], - "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], - }, - cv=StratifiedKFold(n_splits=2, shuffle=True), - n_iter=2, + randomsearch = Pipeline( + [ + ("enc", OneHotEncoder(handle_unknown="ignore")), + ( + "rs", + RandomizedSearchCV( + RandomForestClassifier(n_estimators=5), + { + "max_depth": [3, None], + "max_features": [1, 2, 3, 4], + "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], + "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"], + }, + cv=StratifiedKFold(n_splits=2, shuffle=True), + n_iter=2, + ), + ), + ] ) task = openml.tasks.get_task(11) @@ -759,8 +794,8 @@ def test_initialize_cv_from_run(self): modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) modelS = openml.setups.initialize_model(setup_id=run.setup_id) - self.assertEqual(modelS.cv.random_state, 62501) - self.assertEqual(modelR.cv.random_state, 62501) + self.assertEqual(modelS[-1].cv.random_state, 62501) + self.assertEqual(modelR[-1].cv.random_state, 62501) def _test_local_evaluations(self, run): @@ -793,12 +828,23 @@ def _test_local_evaluations(self, run): self.assertGreaterEqual(alt_scores[idx], 0) self.assertLessEqual(alt_scores[idx], 1) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_local_run_swapped_parameter_order_model(self): # construct sci-kit learn classifier clf = Pipeline( steps=[ - ("imputer", SimpleImputer(strategy="median")), + ( + "imputer", + make_pipeline( + SimpleImputer(strategy="most_frequent"), + OneHotEncoder(handle_unknown="ignore"), + ), + ), + # random forest doesn't take categoricals ("estimator", RandomForestClassifier()), ] ) @@ -813,13 +859,18 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier clf = Pipeline( steps=[ - ("imputer", SimpleImputer(strategy="median")), - ("estimator", RandomForestClassifier()), + ("imputer", SimpleImputer(strategy="most_frequent")), + ("encoder", OneHotEncoder(handle_unknown="ignore")), + ("estimator", RandomForestClassifier(n_estimators=10)), ] ) @@ -834,13 +885,18 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_local_run_metric_score(self): # construct sci-kit learn classifier clf = 
Pipeline( steps=[ - ("imputer", SimpleImputer(strategy="median")), - ("estimator", RandomForestClassifier()), + ("imputer", SimpleImputer(strategy="most_frequent")), + ("encoder", OneHotEncoder(handle_unknown="ignore")), + ("estimator", RandomForestClassifier(n_estimators=10)), ] ) @@ -863,15 +919,19 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline( steps=[ - ("Imputer", SimpleImputer(strategy="median")), + ("Imputer", SimpleImputer(strategy="most_frequent")), ("VarianceThreshold", VarianceThreshold(threshold=0.05)), ("Estimator", GaussianNB()), ] ) - task = openml.tasks.get_task(11) + task = openml.tasks.get_task(1198) run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=False,) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) @@ -887,7 +947,7 @@ def test_initialize_model_from_run(self): openml.flows.assert_flows_equal(flowR, flowL) openml.flows.assert_flows_equal(flowS, flowL) - self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"median"') + self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"') self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05") @pytest.mark.flaky() @@ -939,6 +999,10 @@ def test_get_run_trace(self): run_trace = openml.runs.get_run_trace(run_id) self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="SimpleImputer doesn't handle mixed type DataFrame as input", + ) def test__run_exists(self): # would be better to not sentinel these clfs, # so we do not have to perform the actual runs @@ -1080,6 +1144,10 @@ def test_run_with_illegal_flow_id_1_after_load(self): openml.exceptions.PyOpenMLError, expected_message_regex, loaded_run.publish ) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="OneHotEncoder cannot handle mixed type DataFrame as input", + ) def test__run_task_get_arffcontent(self): task = openml.tasks.get_task(7) num_instances = 3196 @@ -1088,9 +1156,16 @@ def test__run_task_get_arffcontent(self): flow = unittest.mock.Mock() flow.name = "dummy" - clf = SGDClassifier(loss="log", random_state=1) + clf = make_pipeline( + OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1) + ) res = openml.runs.functions._run_task_get_arffcontent( - flow=flow, extension=self.extension, model=clf, task=task, add_local_measures=True, + flow=flow, + extension=self.extension, + model=clf, + task=task, + add_local_measures=True, + dataset_format="dataframe", ) arff_datacontent, trace, fold_evaluations, _ = res # predictions @@ -1288,24 +1363,81 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves") self.assertGreaterEqual(len(runs), 1) - def test_run_on_dataset_with_missing_labels(self): + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the # actual data - flow = unittest.mock.Mock() flow.name = "dummy" task = openml.tasks.get_task(2) + from sklearn.compose import 
ColumnTransformer + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model = Pipeline( - steps=[ - ("Imputer", SimpleImputer(strategy="median")), - ("Estimator", DecisionTreeClassifier()), - ] + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) # build a sklearn classifier + + data_content, _, _, _ = _run_task_get_arffcontent( + flow=flow, + model=model, + task=task, + extension=self.extension, + add_local_measures=True, + dataset_format="dataframe", ) + # 2 folds, 5 repeats; keep in mind that this task comes from the test + # server, the task on the live server is different + self.assertEqual(len(data_content), 4490) + for row in data_content: + # repeat, fold, row_id, 6 confidences, prediction and correct label + self.assertEqual(len(row), 12) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_run_on_dataset_with_missing_labels_array(self): + # Check that _run_task_get_arffcontent works when one of the class + # labels only declared in the arff file, but is not present in the + # actual data + flow = unittest.mock.Mock() + flow.name = "dummy" + task = openml.tasks.get_task(2) + # task_id=2 on test server has 38 columns with 6 numeric columns + cont_idx = [3, 4, 8, 32, 33, 34] + cat_idx = list(set(np.arange(38)) - set(cont_idx)) + cont = np.array([False] * 38) + cat = np.array([False] * 38) + cont[cont_idx] = True + cat[cat_idx] = True + + from sklearn.compose import ColumnTransformer + + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + model = Pipeline( + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] + ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( - flow=flow, model=model, task=task, extension=self.extension, add_local_measures=True, + flow=flow, + model=model, + task=task, + extension=self.extension, + add_local_measures=True, + dataset_format="array", # diff test_run_on_dataset_with_missing_labels_dataframe() ) # 2 folds, 5 repeats; keep in mind that this task comes from the test # server, the task on the live server is different diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index 14e2405f2..fdb2747ec 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,12 +1,20 @@ # License: BSD 3-Clause -from openml.testing import TestBase, SimpleImputer +from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont + +import sklearn +import unittest +from distutils.version import LooseVersion class TestStudyFunctions(TestBase): _multiprocess_can_split_ = True """Test the example code of Bischl et al. (2018)""" + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) def test_Figure1a(self): """Test listing in Figure 1a on a single task and the old OpenML100 study. 
@@ -29,16 +37,19 @@ def test_Figure1a(self): """ # noqa: E501 import openml import sklearn.metrics - import sklearn.pipeline - import sklearn.preprocessing import sklearn.tree + from sklearn.pipeline import Pipeline, make_pipeline + from sklearn.compose import ColumnTransformer + from sklearn.preprocessing import OneHotEncoder, StandardScaler benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite - clf = sklearn.pipeline.Pipeline( - steps=[ - ("imputer", SimpleImputer()), - ("estimator", sklearn.tree.DecisionTreeClassifier()), - ] + cat_imp = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") + ) + cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) + clf = Pipeline( + steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] ) # build a sklearn classifier for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks task = openml.tasks.get_task(task_id) # download the OpenML task
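As a closing illustration, a hedged end-to-end sketch of the pattern this diff introduces throughout the examples and tests: a ColumnTransformer that selects categorical and continuous columns via callables, run on a task with the new ``dataset_format`` argument (the task id is only an assumed example):

import sklearn.tree
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import openml


# column selectors operating on a pandas DataFrame with 'category' dtypes
def cont(X):
    return X.dtypes != "category"


def cat(X):
    return X.dtypes == "category"


cat_imp = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)
cont_imp = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
clf = Pipeline(
    steps=[("transform", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]
)

task = openml.tasks.get_task(1)  # assumed example task id
# 'dataframe' (the new default) hands the model a pandas DataFrame;
# dataset_format="array" restores the previous NumPy behaviour
run = openml.runs.run_model_on_task(
    clf, task, avoid_duplicate_runs=False, dataset_format="dataframe"
)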