diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 4a3015bdc..2b94d2cfd 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -56,6 +56,10 @@ ] SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES) +SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") +COMPONENT_REFERENCE = "component_reference" +COMPOSITION_STEP_CONSTANT = "composition_step_constant" + class SklearnExtension(Extension): """Connect scikit-learn to OpenML-Python.""" @@ -249,8 +253,11 @@ def _deserialize_sklearn( ) -> Any: """Recursive function to deserialize a scikit-learn flow. - This function delegates all work to the respective functions to deserialize special data - structures etc. + This function inspects an object to deserialize and decides how to do so. This function + delegates all work to the respective functions to deserialize special data structures etc. + This function works on everything that has been serialized to OpenML: OpenMLFlow, + components (which are flows themselves), functions, hyperparameter distributions (for + random search) and the actual hyperparameter values themselves. Parameters ---------- @@ -258,8 +265,9 @@ def _deserialize_sklearn( the object to deserialize (can be flow object, or any serialized parameter value that is accepted by) - components : dict - + components : Optional[dict] + Components of the current flow being de-serialized. These will not be used when + de-serializing the actual flow, but when de-serializing a component reference. initialize_with_defaults : bool, optional (default=False) If this flag is set, the hyperparameter values of flows will be @@ -307,11 +315,16 @@ def _deserialize_sklearn( rval = self._deserialize_rv_frozen(value) elif serialized_type == "function": rval = self._deserialize_function(value) - elif serialized_type == "component_reference": + elif serialized_type in (COMPOSITION_STEP_CONSTANT, COMPONENT_REFERENCE): + if serialized_type == COMPOSITION_STEP_CONSTANT: + pass + elif serialized_type == COMPONENT_REFERENCE: + value = self._deserialize_sklearn( + value, recursion_depth=depth_pp, strict_version=strict_version + ) + else: + raise NotImplementedError(serialized_type) assert components is not None # Necessary for mypy - value = self._deserialize_sklearn( - value, recursion_depth=depth_pp, strict_version=strict_version - ) step_name = value["step_name"] key = value["key"] component = self._deserialize_sklearn( @@ -407,6 +420,13 @@ def _serialize_sklearn(self, o: Any, parent_model: Optional[Any] = None) -> Any: if self.is_estimator(o): # is the main model or a submodel rval = self._serialize_model(o) + elif ( + isinstance(o, (list, tuple)) + and len(o) == 2 + and o[1] in SKLEARN_PIPELINE_STRING_COMPONENTS + and isinstance(parent_model, sklearn.pipeline._BaseComposition) + ): + rval = o elif isinstance(o, (list, tuple)): # TODO: explain what type of parameter is here rval = [self._serialize_sklearn(element, parent_model) for element in o] @@ -711,8 +731,13 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: for key in subcomponents: if isinstance(subcomponents[key], OpenMLFlow): name = subcomponents[key].name - elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed + elif ( + isinstance(subcomponents[key], str) + and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS + ): name = subcomponents[key] + else: + raise TypeError(type(subcomponents[key])) if key in subcomponents_explicit: sub_components_names += "," + key + "=" + name else: @@ -727,17 +752,8 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # Get the external versions of all sub-components external_version = self._get_external_version_string(model, subcomponents) - - dependencies = "\n".join( - [ - self._format_external_version("sklearn", sklearn.__version__,), - "numpy>=1.6.1", - "scipy>=0.9", - ] - ) - - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) - sklearn_version_formatted = sklearn_version.replace("==", "_") + dependencies = self._get_dependencies() + tags = self._get_tags() sklearn_description = self._get_sklearn_description(model) flow = OpenMLFlow( @@ -750,17 +766,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: parameters=parameters, parameters_meta_info=parameters_meta_info, external_version=external_version, - tags=[ - "openml-python", - "sklearn", - "scikit-learn", - "python", - sklearn_version_formatted, - # TODO: add more tags based on the scikit-learn - # module a flow is in? For example automatically - # annotate a class of sklearn.svm.SVC() with the - # tag svm? - ], + tags=tags, extension=self, language="English", # TODO fill in dependencies! @@ -769,6 +775,31 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: return flow + def _get_dependencies(self) -> str: + dependencies = "\n".join( + [ + self._format_external_version("sklearn", sklearn.__version__,), + "numpy>=1.6.1", + "scipy>=0.9", + ] + ) + return dependencies + + def _get_tags(self) -> List[str]: + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) + sklearn_version_formatted = sklearn_version.replace("==", "_") + return [ + "openml-python", + "sklearn", + "scikit-learn", + "python", + sklearn_version_formatted, + # TODO: add more tags based on the scikit-learn + # module a flow is in? For example automatically + # annotate a class of sklearn.svm.SVC() with the + # tag svm? + ] + def _get_external_version_string( self, model: Any, sub_components: Dict[str, OpenMLFlow], ) -> str: @@ -777,22 +808,25 @@ def _get_external_version_string( # version of all subcomponents, which themselves already contain all # requirements for their subcomponents. The external version string is a # sorted concatenation of all modules which are present in this run. - model_package_name = model.__module__.split(".")[0] - module = importlib.import_module(model_package_name) - model_package_version_number = module.__version__ # type: ignore - external_version = self._format_external_version( - model_package_name, model_package_version_number, - ) - openml_version = self._format_external_version("openml", openml.__version__) - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) external_versions = set() - external_versions.add(external_version) + + # The model is None if the flow is a placeholder flow such as 'passthrough' or 'drop' + if model is not None: + model_package_name = model.__module__.split(".")[0] + module = importlib.import_module(model_package_name) + model_package_version_number = module.__version__ # type: ignore + external_version = self._format_external_version( + model_package_name, model_package_version_number, + ) + external_versions.add(external_version) + + openml_version = self._format_external_version("openml", openml.__version__) + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) external_versions.add(openml_version) external_versions.add(sklearn_version) for visitee in sub_components.values(): - # 'drop', 'passthrough', None can be passed as estimators - if isinstance(visitee, str): + if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: continue for external_version in visitee.external_version.split(","): external_versions.add(external_version) @@ -807,7 +841,7 @@ def _check_multiple_occurence_of_component_in_flow( while len(to_visit_stack) > 0: visitee = to_visit_stack.pop() - if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators + if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: known_sub_components.add(visitee) elif visitee.name in known_sub_components: raise ValueError( @@ -865,8 +899,15 @@ def flatten_all(list_): ) # Check that all list elements are of simple types. - nested_list_of_simple_types = is_non_empty_list_of_lists_with_same_type and all( - [isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)] + nested_list_of_simple_types = ( + is_non_empty_list_of_lists_with_same_type + and all([isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)]) + and all( + [ + len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS + for rv in rval + ] + ) ) if is_non_empty_list_of_lists_with_same_type and not nested_list_of_simple_types: @@ -879,16 +920,18 @@ def flatten_all(list_): for i, sub_component_tuple in enumerate(rval): identifier = sub_component_tuple[0] sub_component = sub_component_tuple[1] - # sub_component_type = type(sub_component_tuple) + sub_component_type = type(sub_component_tuple) if not 2 <= len(sub_component_tuple) <= 3: # length 2 is for {VotingClassifier.estimators, # Pipeline.steps, FeatureUnion.transformer_list} # length 3 is for ColumnTransformer - msg = "Length of tuple does not match assumptions" + msg = "Length of tuple of type {} does not match assumptions".format( + sub_component_type + ) raise ValueError(msg) if isinstance(sub_component, str): - if sub_component != "drop" and sub_component != "passthrough": + if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS: msg = ( "Second item of tuple does not match assumptions. " "If string, can be only 'drop' or 'passthrough' but" @@ -921,15 +964,45 @@ def flatten_all(list_): # when deserializing the parameter sub_components_explicit.add(identifier) - sub_components[identifier] = sub_component - component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] - component_reference["oml-python:serialized_object"] = "component_reference" - cr_value = OrderedDict() # type: Dict[str, Any] - cr_value["key"] = identifier - cr_value["step_name"] = identifier - if len(sub_component_tuple) == 3: - cr_value["argument_1"] = sub_component_tuple[2] - component_reference["value"] = cr_value + if isinstance(sub_component, str): + + external_version = self._get_external_version_string(None, {}) + dependencies = self._get_dependencies() + tags = self._get_tags() + + sub_components[identifier] = OpenMLFlow( + name=sub_component, + description="Placeholder flow for scikit-learn's string pipeline " + "members", + components=OrderedDict(), + parameters=OrderedDict(), + parameters_meta_info=OrderedDict(), + external_version=external_version, + tags=tags, + language="English", + dependencies=dependencies, + model=None, + ) + component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]] + component_reference[ + "oml-python:serialized_object" + ] = COMPOSITION_STEP_CONSTANT + cr_value = OrderedDict() # type: Dict[str, Any] + cr_value["key"] = identifier + cr_value["step_name"] = identifier + if len(sub_component_tuple) == 3: + cr_value["argument_1"] = sub_component_tuple[2] + component_reference["value"] = cr_value + else: + sub_components[identifier] = sub_component + component_reference = OrderedDict() + component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE + cr_value = OrderedDict() + cr_value["key"] = identifier + cr_value["step_name"] = identifier + if len(sub_component_tuple) == 3: + cr_value["argument_1"] = sub_component_tuple[2] + component_reference["value"] = cr_value parameter_value.append(component_reference) # Here (and in the elif and else branch below) are the only @@ -949,7 +1022,7 @@ def flatten_all(list_): sub_components[k] = rval sub_components_explicit.add(k) component_reference = OrderedDict() - component_reference["oml-python:serialized_object"] = "component_reference" + component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE cr_value = OrderedDict() cr_value["key"] = k cr_value["step_name"] = None @@ -1052,25 +1125,28 @@ def _deserialize_model( ) parameter_dict[name] = rval - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - - if keep_defaults: - # obtain all params with a default - param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) - - # delete the params that have a default from the dict, - # so they get initialized with their default value - # except [...] - for param in param_defaults: - # [...] the ones that also have a key in the components dict. - # As OpenML stores different flows for ensembles with different - # (base-)components, in OpenML terms, these are not considered - # hyperparameters but rather constants (i.e., changing them would - # result in a different flow) - if param not in components.keys(): - del parameter_dict[param] - return model_class(**parameter_dict) + if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS: + return flow.name + else: + module_name = model_name.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) + + if keep_defaults: + # obtain all params with a default + param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) + + # delete the params that have a default from the dict, + # so they get initialized with their default value + # except [...] + for param in param_defaults: + # [...] the ones that also have a key in the components dict. + # As OpenML stores different flows for ensembles with different + # (base-)components, in OpenML terms, these are not considered + # hyperparameters but rather constants (i.e., changing them would + # result in a different flow) + if param not in components.keys(): + del parameter_dict[param] + return model_class(**parameter_dict) def _check_dependencies(self, dependencies: str, strict_version: bool = True) -> None: if not dependencies: @@ -1730,8 +1806,14 @@ def is_subcomponent_specification(values): return False if len(item) < 2: return False - if not isinstance(item[1], openml.flows.OpenMLFlow): - return False + if not isinstance(item[1], (openml.flows.OpenMLFlow, str)): + if ( + isinstance(item[1], str) + and item[1] in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + pass + else: + return False return True # _flow is openml flow object, _param dict maps from flow name to flow @@ -1739,10 +1821,15 @@ def is_subcomponent_specification(values): # unit tests / sentinels) this way, for flows without subflows we do # not have to rely on _flow_dict exp_parameters = set(_flow.parameters) - exp_components = set(_flow.components) - model_parameters = set([mp for mp in component_model.get_params() if "__" not in mp]) - if len((exp_parameters | exp_components) ^ model_parameters) != 0: - flow_params = sorted(exp_parameters | exp_components) + if ( + isinstance(component_model, str) + and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + model_parameters = set() + else: + model_parameters = set([mp for mp in component_model.get_params(deep=False)]) + if len(exp_parameters.symmetric_difference(model_parameters)) != 0: + flow_params = sorted(exp_parameters) model_params = sorted(model_parameters) raise ValueError( "Parameters of the model do not match the " @@ -1750,6 +1837,44 @@ def is_subcomponent_specification(values): "flow:\nexpected flow parameters: " "%s\nmodel parameters: %s" % (flow_params, model_params) ) + exp_components = set(_flow.components) + if ( + isinstance(component_model, str) + and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + model_components = set() + else: + _ = set([mp for mp in component_model.get_params(deep=False)]) + model_components = set( + [ + mp + for mp in component_model.get_params(deep=True) + if "__" not in mp and mp not in _ + ] + ) + if len(exp_components.symmetric_difference(model_components)) != 0: + is_problem = True + if len(exp_components - model_components) > 0: + # If an expected component is not returned as a component by get_params(), + # this means that it is also a parameter -> we need to check that this is + # actually the case + difference = exp_components - model_components + component_in_model_parameters = [] + for component in difference: + if component in model_parameters: + component_in_model_parameters.append(True) + else: + component_in_model_parameters.append(False) + is_problem = not all(component_in_model_parameters) + if is_problem: + flow_components = sorted(exp_components) + model_components = sorted(model_components) + raise ValueError( + "Subcomponents of the model do not match the " + "parameters expected by the " + "flow:\nexpected flow subcomponents: " + "%s\nmodel subcomponents: %s" % (flow_components, model_components) + ) _params = [] for _param_name in _flow.parameters: @@ -1778,20 +1903,37 @@ def is_subcomponent_specification(values): subcomponent_identifier = subcomponent[0] subcomponent_flow = subcomponent[1] if not isinstance(subcomponent_identifier, str): - raise TypeError("Subcomponent identifier should be " "string") - if not isinstance(subcomponent_flow, openml.flows.OpenMLFlow): - raise TypeError("Subcomponent flow should be string") + raise TypeError( + "Subcomponent identifier should be of type string, " + "but is {}".format(type(subcomponent_identifier)) + ) + if not isinstance(subcomponent_flow, (openml.flows.OpenMLFlow, str)): + if ( + isinstance(subcomponent_flow, str) + and subcomponent_flow in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + pass + else: + raise TypeError( + "Subcomponent flow should be of type flow, but is {}".format( + type(subcomponent_flow) + ) + ) current = { - "oml-python:serialized_object": "component_reference", + "oml-python:serialized_object": COMPONENT_REFERENCE, "value": { "key": subcomponent_identifier, "step_name": subcomponent_identifier, }, } if len(subcomponent) == 3: - if not isinstance(subcomponent[2], list): - raise TypeError("Subcomponent argument should be" " list") + if not isinstance(subcomponent[2], list) and not isinstance( + subcomponent[2], OrderedDict + ): + raise TypeError( + "Subcomponent argument should be list or OrderedDict" + ) current["value"]["argument_1"] = subcomponent[2] parsed_values.append(current) parsed_values = json.dumps(parsed_values) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index acc93b024..90f69df17 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -71,88 +71,137 @@ def setUp(self): self.extension = SklearnExtension() - def test_serialize_model(self): - with mock.patch.object(self.extension, "_check_dependencies") as check_dependencies_mock: - model = sklearn.tree.DecisionTreeClassifier( - criterion="entropy", max_features="auto", max_leaf_nodes=2000 - ) + def _serialization_test_helper( + self, model, X, y, subcomponent_parameters, dependencies_mock_call_count=(1, 2) + ): - tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" - fixture_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) - fixture_short_name = "sklearn.DecisionTreeClassifier" - # str obtained from self.extension._get_sklearn_description(model) - fixture_description = "A decision tree classifier." - version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ - - presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"' - # min_impurity_decrease has been introduced in 0.20 - # min_impurity_split has been deprecated in 0.20 - if LooseVersion(sklearn.__version__) < "0.19": - fixture_parameters = OrderedDict( - ( - ("class_weight", "null"), - ("criterion", '"entropy"'), - ("max_depth", "null"), - ("max_features", '"auto"'), - ("max_leaf_nodes", "2000"), - ("min_impurity_split", "1e-07"), - ("min_samples_leaf", "1"), - ("min_samples_split", "2"), - ("min_weight_fraction_leaf", "0.0"), - ("presort", "false"), - ("random_state", "null"), - ("splitter", '"best"'), - ) - ) - else: - fixture_parameters = OrderedDict( - ( - ("class_weight", "null"), - ("criterion", '"entropy"'), - ("max_depth", "null"), - ("max_features", '"auto"'), - ("max_leaf_nodes", "2000"), - ("min_impurity_decrease", "0.0"), - ("min_impurity_split", "null"), - ("min_samples_leaf", "1"), - ("min_samples_split", "2"), - ("min_weight_fraction_leaf", "0.0"), - ("presort", presort_val), - ("random_state", "null"), - ("splitter", '"best"'), - ) - ) - if LooseVersion(sklearn.__version__) >= "0.22": - fixture_parameters.update({"ccp_alpha": "0.0"}) - fixture_parameters.move_to_end("ccp_alpha", last=False) - - structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} + # Regex pattern for memory addresses of style 0x7f8e0f31ecf8 + pattern = re.compile("0x[0-9a-f]{12}") + with mock.patch.object(self.extension, "_check_dependencies") as check_dependencies_mock: serialization = self.extension.model_to_flow(model) - structure = serialization.get_structure("name") - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.class_name, fixture_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) - self.assertEqual(serialization.parameters, fixture_parameters) - self.assertEqual(serialization.dependencies, version_fixture) - self.assertDictEqual(structure, structure_fixture) + if X is not None: + model.fit(X, y) new_model = self.extension.flow_to_model(serialization) # compares string representations of the dict, as it potentially # contains complex objects that can not be compared with == op - # Only in Python 3.x, as Python 2 has Unicode issues - if sys.version_info[0] >= 3: - self.assertEqual(str(model.get_params()), str(new_model.get_params())) + self.assertEqual( + re.sub(pattern, str(model.get_params()), ""), + re.sub(pattern, str(new_model.get_params()), ""), + ) self.assertEqual(type(new_model), type(model)) self.assertIsNot(new_model, model) - self.assertEqual(new_model.get_params(), model.get_params()) - new_model.fit(self.X, self.y) + if X is not None: + new_model.fit(self.X, self.y) - self.assertEqual(check_dependencies_mock.call_count, 1) + self.assertEqual(check_dependencies_mock.call_count, dependencies_mock_call_count[0]) + + xml = serialization._to_dict() + new_model2 = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) + self.assertEqual( + re.sub(pattern, str(model.get_params()), ""), + re.sub(pattern, str(new_model2.get_params()), ""), + ) + + self.assertEqual(type(new_model2), type(model)) + self.assertIsNot(new_model2, model) + + if X is not None: + new_model2.fit(self.X, self.y) + + self.assertEqual(check_dependencies_mock.call_count, dependencies_mock_call_count[1]) + + if subcomponent_parameters: + for nm in (new_model, new_model2): + new_model_params = nm.get_params() + model_params = model.get_params() + for subcomponent_parameter in subcomponent_parameters: + self.assertEqual( + type(new_model_params[subcomponent_parameter]), + type(model_params[subcomponent_parameter]), + ) + self.assertIsNot( + new_model_params[subcomponent_parameter], + model_params[subcomponent_parameter], + ) + del new_model_params[subcomponent_parameter] + del model_params[subcomponent_parameter] + self.assertEqual(new_model_params, model_params) + + return serialization, new_model + + def test_serialize_model(self): + model = sklearn.tree.DecisionTreeClassifier( + criterion="entropy", max_features="auto", max_leaf_nodes=2000 + ) + + tree_name = "tree" if LooseVersion(sklearn.__version__) < "0.22" else "_classes" + fixture_name = "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name) + fixture_short_name = "sklearn.DecisionTreeClassifier" + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = "A decision tree classifier." + version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ + + presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"' + # min_impurity_decrease has been introduced in 0.20 + # min_impurity_split has been deprecated in 0.20 + if LooseVersion(sklearn.__version__) < "0.19": + fixture_parameters = OrderedDict( + ( + ("class_weight", "null"), + ("criterion", '"entropy"'), + ("max_depth", "null"), + ("max_features", '"auto"'), + ("max_leaf_nodes", "2000"), + ("min_impurity_split", "1e-07"), + ("min_samples_leaf", "1"), + ("min_samples_split", "2"), + ("min_weight_fraction_leaf", "0.0"), + ("presort", "false"), + ("random_state", "null"), + ("splitter", '"best"'), + ) + ) + else: + fixture_parameters = OrderedDict( + ( + ("class_weight", "null"), + ("criterion", '"entropy"'), + ("max_depth", "null"), + ("max_features", '"auto"'), + ("max_leaf_nodes", "2000"), + ("min_impurity_decrease", "0.0"), + ("min_impurity_split", "null"), + ("min_samples_leaf", "1"), + ("min_samples_split", "2"), + ("min_weight_fraction_leaf", "0.0"), + ("presort", presort_val), + ("random_state", "null"), + ("splitter", '"best"'), + ) + ) + if LooseVersion(sklearn.__version__) >= "0.22": + fixture_parameters.update({"ccp_alpha": "0.0"}) + fixture_parameters.move_to_end("ccp_alpha", last=False) + + structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} + + serialization, _ = self._serialization_test_helper( + model, X=self.X, y=self.y, subcomponent_parameters=None + ) + structure = serialization.get_structure("name") + + self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) + self.assertEqual(serialization.description, fixture_description) + self.assertEqual(serialization.parameters, fixture_parameters) + self.assertEqual(serialization.dependencies, version_fixture) + self.assertDictEqual(structure, structure_fixture) def test_can_handle_flow(self): openml.config.server = self.production_server @@ -165,79 +214,67 @@ def test_can_handle_flow(self): openml.config.server = self.test_server def test_serialize_model_clustering(self): - with mock.patch.object(self.extension, "_check_dependencies") as check_dependencies_mock: - model = sklearn.cluster.KMeans() + model = sklearn.cluster.KMeans() - cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" - fixture_name = "sklearn.cluster.{}.KMeans".format(cluster_name) - fixture_short_name = "sklearn.KMeans" - # str obtained from self.extension._get_sklearn_description(model) - fixture_description = "K-Means clustering{}".format( - "" if LooseVersion(sklearn.__version__) < "0.22" else "." - ) - version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ + cluster_name = "k_means_" if LooseVersion(sklearn.__version__) < "0.22" else "_kmeans" + fixture_name = "sklearn.cluster.{}.KMeans".format(cluster_name) + fixture_short_name = "sklearn.KMeans" + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = "K-Means clustering{}".format( + "" if LooseVersion(sklearn.__version__) < "0.22" else "." + ) + version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ - n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' - precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' + n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' + precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' - # n_jobs default has changed to None in 0.20 - if LooseVersion(sklearn.__version__) < "0.20": - fixture_parameters = OrderedDict( - ( - ("algorithm", '"auto"'), - ("copy_x", "true"), - ("init", '"k-means++"'), - ("max_iter", "300"), - ("n_clusters", "8"), - ("n_init", "10"), - ("n_jobs", "1"), - ("precompute_distances", '"auto"'), - ("random_state", "null"), - ("tol", "0.0001"), - ("verbose", "0"), - ) + # n_jobs default has changed to None in 0.20 + if LooseVersion(sklearn.__version__) < "0.20": + fixture_parameters = OrderedDict( + ( + ("algorithm", '"auto"'), + ("copy_x", "true"), + ("init", '"k-means++"'), + ("max_iter", "300"), + ("n_clusters", "8"), + ("n_init", "10"), + ("n_jobs", "1"), + ("precompute_distances", '"auto"'), + ("random_state", "null"), + ("tol", "0.0001"), + ("verbose", "0"), ) - else: - fixture_parameters = OrderedDict( - ( - ("algorithm", '"auto"'), - ("copy_x", "true"), - ("init", '"k-means++"'), - ("max_iter", "300"), - ("n_clusters", "8"), - ("n_init", "10"), - ("n_jobs", n_jobs_val), - ("precompute_distances", precomp_val), - ("random_state", "null"), - ("tol", "0.0001"), - ("verbose", "0"), - ) + ) + else: + fixture_parameters = OrderedDict( + ( + ("algorithm", '"auto"'), + ("copy_x", "true"), + ("init", '"k-means++"'), + ("max_iter", "300"), + ("n_clusters", "8"), + ("n_init", "10"), + ("n_jobs", n_jobs_val), + ("precompute_distances", precomp_val), + ("random_state", "null"), + ("tol", "0.0001"), + ("verbose", "0"), ) - fixture_structure = {"sklearn.cluster.{}.KMeans".format(cluster_name): []} - - serialization = self.extension.model_to_flow(model) - structure = serialization.get_structure("name") - - self.assertEqual(serialization.name, fixture_name) - self.assertEqual(serialization.class_name, fixture_name) - self.assertEqual(serialization.custom_name, fixture_short_name) - self.assertEqual(serialization.description, fixture_description) - self.assertEqual(serialization.parameters, fixture_parameters) - self.assertEqual(serialization.dependencies, version_fixture) - self.assertDictEqual(structure, fixture_structure) - - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - self.assertEqual(str(model.get_params()), str(new_model.get_params())) - - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) + ) + fixture_structure = {"sklearn.cluster.{}.KMeans".format(cluster_name): []} - self.assertEqual(new_model.get_params(), model.get_params()) - new_model.fit(self.X) + serialization, _ = self._serialization_test_helper( + model, X=None, y=None, subcomponent_parameters=None + ) + structure = serialization.get_structure("name") - self.assertEqual(check_dependencies_mock.call_count, 1) + self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) + self.assertEqual(serialization.description, fixture_description) + self.assertEqual(serialization.parameters, fixture_parameters) + self.assertEqual(serialization.dependencies, version_fixture) + self.assertDictEqual(structure, fixture_structure) def test_serialize_model_with_subcomponent(self): model = sklearn.ensemble.AdaBoostClassifier( @@ -273,7 +310,13 @@ def test_serialize_model_with_subcomponent(self): "sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): ["base_estimator"], } - serialization = self.extension.model_to_flow(model) + serialization, _ = self._serialization_test_helper( + model, + X=self.X, + y=self.y, + subcomponent_parameters=["base_estimator"], + dependencies_mock_call_count=(2, 4), + ) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture_name) @@ -293,24 +336,6 @@ def test_serialize_model_with_subcomponent(self): ) self.assertDictEqual(structure, fixture_structure) - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - self.assertEqual(str(model.get_params()), str(new_model.get_params())) - - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) - - self.assertIsNot(new_model.base_estimator, model.base_estimator) - self.assertEqual(new_model.base_estimator.get_params(), model.base_estimator.get_params()) - new_model_params = new_model.get_params() - del new_model_params["base_estimator"] - model_params = model.get_params() - del model_params["base_estimator"] - - self.assertEqual(new_model_params, model_params) - new_model.fit(self.X, self.y) - def test_serialize_pipeline(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) dummy = sklearn.dummy.DummyClassifier(strategy="prior") @@ -350,7 +375,13 @@ def test_serialize_pipeline(self): "sklearn.dummy.DummyClassifier": ["dummy"], } - serialization = self.extension.model_to_flow(model) + serialization, new_model = self._serialization_test_helper( + model, + X=self.X, + y=self.y, + subcomponent_parameters=["scaler", "dummy", "steps"], + dependencies_mock_call_count=(3, 6), + ) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture_name) @@ -390,32 +421,10 @@ def test_serialize_pipeline(self): self.assertIsInstance(serialization.components["scaler"], OpenMLFlow) self.assertIsInstance(serialization.components["dummy"], OpenMLFlow) - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - # Only in Python 3.x, as Python 2 has Unicode issues - if sys.version_info[0] >= 3: - self.assertEqual(str(model.get_params()), str(new_model.get_params())) - - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) - self.assertEqual([step[0] for step in new_model.steps], [step[0] for step in model.steps]) self.assertIsNot(new_model.steps[0][1], model.steps[0][1]) self.assertIsNot(new_model.steps[1][1], model.steps[1][1]) - new_model_params = new_model.get_params() - del new_model_params["scaler"] - del new_model_params["dummy"] - del new_model_params["steps"] - fu_params = model.get_params() - del fu_params["scaler"] - del fu_params["dummy"] - del fu_params["steps"] - - self.assertEqual(new_model_params, fu_params) - new_model.fit(self.X, self.y) - def test_serialize_pipeline_clustering(self): scaler = sklearn.preprocessing.StandardScaler(with_mean=False) km = sklearn.cluster.KMeans() @@ -454,7 +463,13 @@ def test_serialize_pipeline_clustering(self): "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["scaler"], "sklearn.cluster.{}.KMeans".format(cluster_name): ["clusterer"], } - serialization = self.extension.model_to_flow(model) + serialization, new_model = self._serialization_test_helper( + model, + X=None, + y=None, + subcomponent_parameters=["scaler", "steps", "clusterer"], + dependencies_mock_call_count=(3, 6), + ) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture_name) @@ -493,33 +508,10 @@ def test_serialize_pipeline_clustering(self): self.assertIsInstance(serialization.components["scaler"], OpenMLFlow) self.assertIsInstance(serialization.components["clusterer"], OpenMLFlow) - # del serialization.model - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - # Only in Python 3.x, as Python 2 has Unicode issues - if sys.version_info[0] >= 3: - self.assertEqual(str(model.get_params()), str(new_model.get_params())) - - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) - self.assertEqual([step[0] for step in new_model.steps], [step[0] for step in model.steps]) self.assertIsNot(new_model.steps[0][1], model.steps[0][1]) self.assertIsNot(new_model.steps[1][1], model.steps[1][1]) - new_model_params = new_model.get_params() - del new_model_params["scaler"] - del new_model_params["clusterer"] - del new_model_params["steps"] - fu_params = model.get_params() - del fu_params["scaler"] - del fu_params["clusterer"] - del fu_params["steps"] - - self.assertEqual(new_model_params, fu_params) - new_model.fit(self.X, self.y) - @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", @@ -536,6 +528,7 @@ def test_serialize_column_transformer(self): sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"), [3, 4, 5], ), + ("drop", "drop", [6, 7, 8]), ], remainder="passthrough", ) @@ -544,7 +537,8 @@ def test_serialize_column_transformer(self): fixture = ( "sklearn.compose._column_transformer.ColumnTransformer(" "numeric=sklearn.preprocessing.{}.StandardScaler," - "nominal=sklearn.preprocessing._encoders.OneHotEncoder)".format(scaler_name) + "nominal=sklearn.preprocessing._encoders.OneHotEncoder," + "drop=drop)".format(scaler_name) ) fixture_short_name = "sklearn.ColumnTransformer" @@ -567,25 +561,21 @@ def test_serialize_column_transformer(self): fixture: [], "sklearn.preprocessing.{}.StandardScaler".format(scaler_name): ["numeric"], "sklearn.preprocessing._encoders.OneHotEncoder": ["nominal"], + "drop": ["drop"], } - serialization = self.extension.model_to_flow(model) + serialization, new_model = self._serialization_test_helper( + model, + X=None, + y=None, + subcomponent_parameters=["transformers", "numeric", "nominal"], + dependencies_mock_call_count=(4, 8), + ) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture) self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) - # del serialization.model - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - # Only in Python 3.x, as Python 2 has Unicode issues - if sys.version_info[0] >= 3: - self.assertEqual(str(model.get_params()), str(new_model.get_params())) - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) - serialization2 = self.extension.model_to_flow(new_model) - assert_flows_equal(serialization, serialization2) @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", @@ -653,21 +643,25 @@ def test_serialize_column_transformer_pipeline(self): else: fixture_description = self.extension._get_sklearn_description(model) - serialization = self.extension.model_to_flow(model) + serialization, new_model = self._serialization_test_helper( + model, + X=None, + y=None, + subcomponent_parameters=( + "transformer", + "classifier", + "transformer__transformers", + "steps", + "transformer__nominal", + "transformer__numeric", + ), + dependencies_mock_call_count=(5, 10), + ) structure = serialization.get_structure("name") self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) - # del serialization.model - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - self.assertEqual(str(model.get_params()), str(new_model.get_params())) - self.assertEqual(type(new_model), type(model)) - self.assertIsNot(new_model, model) - serialization2 = self.extension.model_to_flow(new_model) - assert_flows_equal(serialization, serialization2) @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="Pipeline processing behaviour updated" @@ -680,7 +674,13 @@ def test_serialize_feature_union(self): scaler = sklearn.preprocessing.StandardScaler() fu = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) - serialization = self.extension.model_to_flow(fu) + serialization, new_model = self._serialization_test_helper( + fu, + X=self.X, + y=self.y, + subcomponent_parameters=("ohe", "scaler", "transformer_list"), + dependencies_mock_call_count=(3, 6), + ) structure = serialization.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" @@ -699,15 +699,6 @@ def test_serialize_feature_union(self): } self.assertEqual(serialization.name, fixture_name) self.assertDictEqual(structure, fixture_structure) - new_model = self.extension.flow_to_model(serialization) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - # Only in Python 3.x, as Python 2 has Unicode issues - if sys.version_info[0] >= 3: - self.assertEqual(str(fu.get_params()), str(new_model.get_params())) - - self.assertEqual(type(new_model), type(fu)) - self.assertIsNot(new_model, fu) self.assertEqual(new_model.transformer_list[0][0], fu.transformer_list[0][0]) self.assertEqual( new_model.transformer_list[0][1].get_params(), fu.transformer_list[0][1].get_params() @@ -724,29 +715,20 @@ def test_serialize_feature_union(self): self.assertIsNot(new_model.transformer_list[0][1], fu.transformer_list[0][1]) self.assertIsNot(new_model.transformer_list[1][1], fu.transformer_list[1][1]) - new_model_params = new_model.get_params() - del new_model_params["ohe"] - del new_model_params["scaler"] - del new_model_params["transformer_list"] - fu_params = fu.get_params() - del fu_params["ohe"] - del fu_params["scaler"] - del fu_params["transformer_list"] - - self.assertEqual(new_model_params, fu_params) - new_model.fit(self.X, self.y) - fu.set_params(scaler="drop") - serialization = self.extension.model_to_flow(fu) + serialization, new_model = self._serialization_test_helper( + fu, + X=self.X, + y=self.y, + subcomponent_parameters=("ohe", "transformer_list"), + dependencies_mock_call_count=(3, 6), + ) self.assertEqual( serialization.name, "sklearn.pipeline.FeatureUnion(" "ohe=sklearn.preprocessing.{}.OneHotEncoder," "scaler=drop)".format(module_name_encoder), ) - new_model = self.extension.flow_to_model(serialization) - self.assertEqual(type(new_model), type(fu)) - self.assertIsNot(new_model, fu) self.assertIs(new_model.transformer_list[1][1], "drop") def test_serialize_feature_union_switched_names(self): @@ -755,8 +737,14 @@ def test_serialize_feature_union_switched_names(self): scaler = sklearn.preprocessing.StandardScaler() fu1 = sklearn.pipeline.FeatureUnion(transformer_list=[("ohe", ohe), ("scaler", scaler)]) fu2 = sklearn.pipeline.FeatureUnion(transformer_list=[("scaler", ohe), ("ohe", scaler)]) - fu1_serialization = self.extension.model_to_flow(fu1) - fu2_serialization = self.extension.model_to_flow(fu2) + + fu1_serialization, _ = self._serialization_test_helper( + fu1, X=None, y=None, subcomponent_parameters=(), dependencies_mock_call_count=(3, 6), + ) + fu2_serialization, _ = self._serialization_test_helper( + fu2, X=None, y=None, subcomponent_parameters=(), dependencies_mock_call_count=(3, 6), + ) + # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" scaler_name = "data" if LooseVersion(sklearn.__version__) < "0.22" else "_data" @@ -776,7 +764,7 @@ def test_serialize_feature_union_switched_names(self): ) def test_serialize_complex_flow(self): - ohe = sklearn.preprocessing.OneHotEncoder() + ohe = sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore") scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.tree.DecisionTreeClassifier() @@ -785,9 +773,9 @@ def test_serialize_complex_flow(self): steps=[("ohe", ohe), ("scaler", scaler), ("boosting", boosting)] ) parameter_grid = { - "base_estimator__max_depth": scipy.stats.randint(1, 10), - "learning_rate": scipy.stats.uniform(0.01, 0.99), - "n_estimators": [1, 5, 10, 100], + "boosting__base_estimator__max_depth": scipy.stats.randint(1, 10), + "boosting__learning_rate": scipy.stats.uniform(0.01, 0.99), + "boosting__n_estimators": [1, 5, 10, 100], } # convert to ordered dict, sorted by keys) due to param grid check parameter_grid = OrderedDict(sorted(parameter_grid.items())) @@ -795,7 +783,13 @@ def test_serialize_complex_flow(self): rs = sklearn.model_selection.RandomizedSearchCV( estimator=model, param_distributions=parameter_grid, cv=cv ) - serialized = self.extension.model_to_flow(rs) + serialized, new_model = self._serialization_test_helper( + rs, + X=self.X, + y=self.y, + subcomponent_parameters=(), + dependencies_mock_call_count=(6, 12), + ) structure = serialized.get_structure("name") # OneHotEncoder was moved to _encoders module in 0.20 module_name_encoder = "_encoders" if LooseVersion(sklearn.__version__) >= "0.20" else "data" @@ -829,18 +823,100 @@ def test_serialize_complex_flow(self): self.assertEqual(serialized.name, fixture_name) self.assertEqual(structure, fixture_structure) - # now do deserialization - deserialized = self.extension.flow_to_model(serialized) - # compares string representations of the dict, as it potentially - # contains complex objects that can not be compared with == op - # JvR: compare str length, due to memory address of distribution - self.assertEqual(len(str(rs.get_params())), len(str(deserialized.get_params()))) + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="Pipeline till 0.20 doesn't support 'passthrough'", + ) + def test_serialize_strings_as_pipeline_steps(self): + import sklearn.compose - # Checks that sklearn_to_flow is idempotent. - serialized2 = self.extension.model_to_flow(deserialized) - self.assertNotEqual(rs, deserialized) - # Would raise an exception if the flows would be unequal - assert_flows_equal(serialized, serialized2) + # First check: test whether a passthrough in a pipeline is serialized correctly + model = sklearn.pipeline.Pipeline(steps=[("transformer", "passthrough")]) + serialized = self.extension.model_to_flow(model) + self.assertIsInstance(serialized, OpenMLFlow) + self.assertEqual(len(serialized.components), 1) + self.assertEqual(serialized.components["transformer"].name, "passthrough") + serialized = self.extension._serialize_sklearn( + ("transformer", "passthrough"), parent_model=model + ) + self.assertEqual(serialized, ("transformer", "passthrough")) + extracted_info = self.extension._extract_information_from_model(model) + self.assertEqual(len(extracted_info[2]), 1) + self.assertIsInstance(extracted_info[2]["transformer"], OpenMLFlow) + self.assertEqual(extracted_info[2]["transformer"].name, "passthrough") + + # Second check: test whether a lone passthrough in a column transformer is serialized + # correctly + model = sklearn.compose.ColumnTransformer([("passthrough", "passthrough", (0,))]) + serialized = self.extension.model_to_flow(model) + self.assertIsInstance(serialized, OpenMLFlow) + self.assertEqual(len(serialized.components), 1) + self.assertEqual(serialized.components["passthrough"].name, "passthrough") + serialized = self.extension._serialize_sklearn( + ("passthrough", "passthrough"), parent_model=model + ) + self.assertEqual(serialized, ("passthrough", "passthrough")) + extracted_info = self.extension._extract_information_from_model(model) + self.assertEqual(len(extracted_info[2]), 1) + self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) + self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") + + # Third check: passthrough and drop in a column transformer + model = sklearn.compose.ColumnTransformer( + [("passthrough", "passthrough", (0,)), ("drop", "drop", (1,))] + ) + serialized = self.extension.model_to_flow(model) + self.assertIsInstance(serialized, OpenMLFlow) + self.assertEqual(len(serialized.components), 2) + self.assertEqual(serialized.components["passthrough"].name, "passthrough") + self.assertEqual(serialized.components["drop"].name, "drop") + serialized = self.extension._serialize_sklearn( + ("passthrough", "passthrough"), parent_model=model + ) + self.assertEqual(serialized, ("passthrough", "passthrough")) + extracted_info = self.extension._extract_information_from_model(model) + self.assertEqual(len(extracted_info[2]), 2) + self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) + self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) + self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") + self.assertEqual(extracted_info[2]["drop"].name, "drop") + + # Fourth check: having an actual preprocessor in the column transformer, too + model = sklearn.compose.ColumnTransformer( + [ + ("passthrough", "passthrough", (0,)), + ("drop", "drop", (1,)), + ("test", sklearn.preprocessing.StandardScaler(), (2,)), + ] + ) + serialized = self.extension.model_to_flow(model) + self.assertIsInstance(serialized, OpenMLFlow) + self.assertEqual(len(serialized.components), 3) + self.assertEqual(serialized.components["passthrough"].name, "passthrough") + self.assertEqual(serialized.components["drop"].name, "drop") + serialized = self.extension._serialize_sklearn( + ("passthrough", "passthrough"), parent_model=model + ) + self.assertEqual(serialized, ("passthrough", "passthrough")) + extracted_info = self.extension._extract_information_from_model(model) + self.assertEqual(len(extracted_info[2]), 3) + self.assertIsInstance(extracted_info[2]["passthrough"], OpenMLFlow) + self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) + self.assertEqual(extracted_info[2]["passthrough"].name, "passthrough") + self.assertEqual(extracted_info[2]["drop"].name, "drop") + + # Fifth check: test whether a lone drop in a feature union is serialized correctly + model = sklearn.pipeline.FeatureUnion([("drop", "drop")]) + serialized = self.extension.model_to_flow(model) + self.assertIsInstance(serialized, OpenMLFlow) + self.assertEqual(len(serialized.components), 1) + self.assertEqual(serialized.components["drop"].name, "drop") + serialized = self.extension._serialize_sklearn(("drop", "drop"), parent_model=model) + self.assertEqual(serialized, ("drop", "drop")) + extracted_info = self.extension._extract_information_from_model(model) + self.assertEqual(len(extracted_info[2]), 1) + self.assertIsInstance(extracted_info[2]["drop"], OpenMLFlow) + self.assertEqual(extracted_info[2]["drop"].name, "drop") def test_serialize_type(self): supported_types = [float, np.float, np.float32, np.float64, int, np.int, np.int32, np.int64] @@ -1978,14 +2054,21 @@ def test_run_on_model_with_empty_steps(self): run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True) self.assertEqual(len(flow.components), 3) - self.assertEqual(flow.components["dummystep"], "passthrough") - self.assertTrue(isinstance(flow.components["classifier"], OpenMLFlow)) - self.assertTrue(isinstance(flow.components["prep"], OpenMLFlow)) - self.assertTrue( - isinstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) + self.assertIsInstance(flow.components["dummystep"], OpenMLFlow) + self.assertEqual(flow.components["dummystep"].name, "passthrough") + self.assertIsInstance(flow.components["classifier"], OpenMLFlow) + if LooseVersion(sklearn.__version__) < "0.22": + self.assertEqual(flow.components["classifier"].name, "sklearn.svm.classes.SVC") + else: + self.assertEqual(flow.components["classifier"].name, "sklearn.svm._classes.SVC") + self.assertIsInstance(flow.components["prep"], OpenMLFlow) + self.assertEqual(flow.components["prep"].class_name, "sklearn.pipeline.Pipeline") + self.assertIsInstance(flow.components["prep"].components["columntransformer"], OpenMLFlow) + self.assertIsInstance( + flow.components["prep"].components["columntransformer"].components["cat"], OpenMLFlow, ) self.assertEqual( - flow.components["prep"].components["columntransformer"].components["cat"], "drop" + flow.components["prep"].components["columntransformer"].components["cat"].name, "drop" ) # de-serializing flow to a model with non-actionable step @@ -1996,6 +2079,15 @@ def test_run_on_model_with_empty_steps(self): self.assertEqual(len(model.named_steps), 3) self.assertEqual(model.named_steps["dummystep"], "passthrough") + xml = flow._to_dict() + new_model = self.extension.flow_to_model(OpenMLFlow._from_dict(xml)) + + new_model.fit(X, y) + self.assertEqual(type(new_model), type(clf)) + self.assertNotEqual(new_model, clf) + self.assertEqual(len(new_model.named_steps), 3) + self.assertEqual(new_model.named_steps["dummystep"], "passthrough") + def test_sklearn_serialization_with_none_step(self): msg = ( "Cannot serialize objects of None type. Please use a valid "