diff --git a/doc/progress.rst b/doc/progress.rst index f2e0bc90d..5ce263fce 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -17,6 +17,7 @@ Changelog * DOC #639: More descriptive documention for function to convert array format. * ADD #687: Adds a function to retrieve the list of evaluation measures available. * ADD #695: A function to retrieve all the data quality measures available. +* ADD #412: Add a function to trim flow names for scikit-learn flows. * ADD #715: `list_evaluations` now has an option to sort evaluations by score (value). * ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible. * MAINT #726: Update examples to remove deprecation warnings from scikit-learn diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index ce8e4ebf9..5883ed489 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -87,6 +87,122 @@ def can_handle_model(cls, model: Any) -> bool: """ return isinstance(model, sklearn.base.BaseEstimator) + @classmethod + def trim_flow_name( + cls, + long_name: str, + extra_trim_length: int = 100, + _outer: bool = True + ) -> str: + """ Shorten generated sklearn flow name to at most `extra_trim_length` characters. + + Flows are assumed to have the following naming structure: + (model_selection)? (pipeline)? (steps)+ + and will be shortened to: + sklearn.(selection.)?(pipeline.)?(steps)+ + e.g. 
(white spaces and newlines added for readability) + sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + imputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC) + -> + sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC) + + Parameters + ---------- + long_name : str + The full flow name generated by the scikit-learn extension. + extra_trim_length: int (default=100) + If the trimmed name would exceed `extra_trim_length` characters, additional trimming + of the short name is performed. This reduces the produced short name length. + There is no guarantee the end result will not exceed `extra_trim_length`. + _outer : bool (default=True) + For internal use only. Specifies if the function is called recursively. + + Returns + ------- + str + + """ + def remove_all_in_parentheses(string: str) -> str: + string, removals = re.subn(r"\([^()]*\)", "", string) + while removals > 0: + string, removals = re.subn(r"\([^()]*\)", "", string) + return string + + # Generally, we want to trim all hyperparameters, the exception to that is for model + # selection, as the `estimator` hyperparameter is very indicative of what is in the flow. + # So we first trim the name of the `estimator` specified in model selection. 
For reference, in + the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and + keep it in the final trimmed flow name: + # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, + # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= + # sklearn.tree.tree.DecisionTreeClassifier)) + if 'sklearn.model_selection' in long_name: + start_index = long_name.index('sklearn.model_selection') + estimator_start = (start_index + + long_name[start_index:].index('estimator=') + + len('estimator=')) + + model_select_boilerplate = long_name[start_index:estimator_start] + # above is e.g. "sklearn.model_selection._search.RandomizedSearchCV(estimator=" + model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1] + + # Now we want to also find and parse the `estimator`, for this we find the closing + # parenthesis to the model selection technique: + closing_parenthesis_expected = 1 + for i, char in enumerate(long_name[estimator_start:], start=estimator_start): + if char == '(': + closing_parenthesis_expected += 1 + if char == ')': + closing_parenthesis_expected -= 1 + if closing_parenthesis_expected == 0: + break + + model_select_pipeline = long_name[estimator_start:i] + trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) + _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1) # trim module prefix + model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline) + name = long_name[:start_index] + model_select_short + long_name[i + 1:] + else: + name = long_name + + module_name = long_name.split('.')[0] + short_name = module_name + '.{}' + + if name.startswith('sklearn.pipeline'): + full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1) + pipeline_class = full_pipeline_class.split('.')[-1] + # We don't want nested pipelines in the short name, so we trim 
all complicated + # subcomponents, i.e. those with parentheses: + pipeline = remove_all_in_parentheses(pipeline) + + # then the pipeline steps are formatted e.g.: + # step1name=sklearn.submodule.ClassName,step2name... + components = [component.split('.')[-1] for component in pipeline.split(',')] + pipeline = "{}({})".format(pipeline_class, ','.join(components)) + if len(short_name.format(pipeline)) > extra_trim_length: + pipeline = "{}(...,{})".format(pipeline_class, components[-1]) + else: + # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier + pipeline = remove_all_in_parentheses(name).split('.')[-1] + + if not _outer: + # Anything from parenthesis in inner calls should not be culled, so we use brackets + pipeline = pipeline.replace('(', '[').replace(')', ']') + else: + # Square brackets may be introduced with nested model_selection + pipeline = pipeline.replace('[', '(').replace(']', ')') + + return short_name.format(pipeline) + ################################################################################################ # Methods for flow serialization and de-serialization @@ -402,6 +518,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: name = '%s(%s)' % (class_name, sub_components_names[1:]) else: name = class_name + short_name = SklearnExtension.trim_flow_name(name) # Get the external versions of all sub-components external_version = self._get_external_version_string(model, subcomponents) @@ -419,6 +536,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: sklearn_version_formatted = sklearn_version.replace('==', '_') flow = OpenMLFlow(name=name, class_name=class_name, + custom_name=short_name, description='Automatically created scikit-learn flow.', model=model, components=subcomponents, diff --git a/openml/flows/flow.py b/openml/flows/flow.py index bdd4fe6a6..379233208 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -417,14 +417,15 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow': 
_copy_server_fields(flow, self) try: openml.flows.functions.assert_flows_equal( - self, flow, flow.upload_date, ignore_parameter_values=True + self, flow, flow.upload_date, + ignore_parameter_values=True, + ignore_custom_name_if_none=True ) except ValueError as e: message = e.args[0] - raise ValueError("Flow was not stored correctly on the server. " - "New flow ID is %d. Please check manually and " - "remove the flow if necessary! Error is:\n'%s'" % - (flow_id, message)) + raise ValueError("The flow on the server is inconsistent with the local flow. " + "The server flow ID is {}. Please check manually and remove " + "the flow if necessary! Error is:\n'{}'".format(flow_id, message)) return self def get_structure(self, key_item: str) -> Dict[str, List[str]]: diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 53a1fdc0a..d12bcfe91 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -307,7 +307,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None: def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values_on_older_children: str = None, - ignore_parameter_values: bool = False) -> None: + ignore_parameter_values: bool = False, + ignore_custom_name_if_none: bool = False) -> None: """Check equality of two flows. Two flows are equal if their all keys which are not set by the server @@ -325,6 +326,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values : bool Whether to ignore parameter values when comparing flows. + + ignore_custom_name_if_none : bool + Whether to ignore the custom name field if either flow has `custom_name` equal to `None`. """ if not isinstance(flow1, OpenMLFlow): raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' % @@ -358,7 +362,8 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, 'argument2, but not in argument1.' 
% name) assert_flows_equal(attr1[name], attr2[name], ignore_parameter_values_on_older_children, - ignore_parameter_values) + ignore_parameter_values, + ignore_custom_name_if_none) elif key == '_extension': continue else: @@ -385,6 +390,13 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, # Continue needs to be done here as the first if # statement triggers in both special cases continue + elif (key == 'custom_name' + and ignore_custom_name_if_none + and (attr1 is None or attr2 is None)): + # If specified, we allow `custom_name` inequality if one flow's name is None. + # Helps with backwards compatibility as `custom_name` is now auto-generated, but + # before it used to be `None`. + continue if attr1 != attr2: raise ValueError("Flow %s: values for attribute '%s' differ: " diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 2217b332b..2728076fe 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -76,6 +76,7 @@ def test_serialize_model(self): max_leaf_nodes=2000) fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier' + fixture_short_name = 'sklearn.DecisionTreeClassifier' fixture_description = 'Automatically created scikit-learn flow.' 
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ @@ -117,6 +118,7 @@ def test_serialize_model(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters, fixture_parameters) self.assertEqual(serialization.dependencies, version_fixture) @@ -142,6 +144,7 @@ def test_serialize_model_clustering(self): model = sklearn.cluster.KMeans() fixture_name = 'sklearn.cluster.k_means_.KMeans' + fixture_short_name = 'sklearn.KMeans' fixture_description = 'Automatically created scikit-learn flow.' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ @@ -179,6 +182,7 @@ def test_serialize_model_clustering(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters, fixture_parameters) self.assertEqual(serialization.dependencies, version_fixture) @@ -204,6 +208,7 @@ def test_serialize_model_with_subcomponent(self): fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \ '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)' fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' + fixture_short_name = 'sklearn.AdaBoostClassifier' fixture_description = 'Automatically created scikit-learn flow.' 
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier' fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier' @@ -218,6 +223,7 @@ def test_serialize_model_with_subcomponent(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_class_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"') self.assertIsInstance(serialization.parameters['base_estimator'], str) @@ -259,6 +265,7 @@ def test_serialize_pipeline(self): fixture_name = 'sklearn.pipeline.Pipeline(' \ 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'dummy=sklearn.dummy.DummyClassifier)' + fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)' fixture_description = 'Automatically created scikit-learn flow.' fixture_structure = { fixture_name: [], @@ -270,6 +277,7 @@ def test_serialize_pipeline(self): structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) @@ -343,6 +351,7 @@ def test_serialize_pipeline_clustering(self): fixture_name = 'sklearn.pipeline.Pipeline(' \ 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'clusterer=sklearn.cluster.k_means_.KMeans)' + fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)' fixture_description = 'Automatically created scikit-learn flow.' 
fixture_structure = { fixture_name: [], @@ -354,6 +363,7 @@ def test_serialize_pipeline_clustering(self): structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) @@ -431,6 +441,7 @@ def test_serialize_column_transformer(self): fixture = 'sklearn.compose._column_transformer.ColumnTransformer(' \ 'numeric=sklearn.preprocessing.data.StandardScaler,' \ 'nominal=sklearn.preprocessing._encoders.OneHotEncoder)' + fixture_short_name = 'sklearn.ColumnTransformer' fixture_description = 'Automatically created scikit-learn flow.' fixture_structure = { fixture: [], @@ -441,6 +452,7 @@ def test_serialize_column_transformer(self): serialization = self.extension.model_to_flow(model) structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) # del serialization.model @@ -1598,3 +1610,62 @@ def test__extract_trace_data(self): self.assertIn(param_in_trace, trace_iteration.parameters) param_value = json.loads(trace_iteration.parameters[param_in_trace]) self.assertTrue(param_value in param_grid[param]) + + def test_trim_flow_name(self): + import re + long = """sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + imputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + 
svc=sklearn.svm.classes.SVC)""" + short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)" + shorter = "sklearn.Pipeline(...,SVC)" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + self.assertEqual(shorter, + SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50)) + + long = """sklearn.pipeline.Pipeline( + imputation=openmlstudy14.preprocessing.ConditionalImputer, + hotencoding=sklearn.preprocessing.data.OneHotEncoder, + variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + classifier=sklearn.ensemble.forest.RandomForestClassifier)""" + short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501 + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.pipeline.Pipeline( + Imputer=sklearn.preprocessing.imputation.Imputer, + VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 + Estimator=sklearn.model_selection._search.RandomizedSearchCV( + estimator=sklearn.tree.tree.DecisionTreeClassifier))""" + short = "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" # noqa: E501 + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.model_selection._search.RandomizedSearchCV( + estimator=sklearn.pipeline.Pipeline( + Imputer=sklearn.preprocessing.imputation.Imputer, + classifier=sklearn.ensemble.forest.RandomForestClassifier))""" + short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.pipeline.FeatureUnion( + pca=sklearn.decomposition.pca.PCA, + 
svd=sklearn.decomposition.truncated_svd.TruncatedSVD)""" + short = "sklearn.FeatureUnion(PCA,TruncatedSVD)" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = "sklearn.ensemble.forest.RandomForestClassifier" + short = "sklearn.RandomForestClassifier" + self.assertEqual(short, SklearnExtension.trim_flow_name(long)) + + self.assertEqual("weka.IsolationForest", + SklearnExtension.trim_flow_name("weka.IsolationForest")) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 44b649b87..6e7eb7fbb 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -302,8 +302,8 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): flow.flow_id)) fixture = ( - "Flow was not stored correctly on the server. " - "New flow ID is 1. Please check manually and remove " + "The flow on the server is inconsistent with the local flow. " + "The server flow ID is 1. Please check manually and remove " "the flow if necessary! Error is:\n" "'Flow sklearn.ensemble.forest.RandomForestClassifier: " "values for attribute 'name' differ: " diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 02d4b2a7d..de933731a 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -283,9 +283,16 @@ def test_get_flow_reinstantiate_model_no_extension(self): flow_id=10, reinstantiate=True) - @unittest.skipIf(LooseVersion(sklearn.__version__) == "0.20.0", - reason="No non-0.20 scikit-learn flow known.") + @unittest.skipIf(LooseVersion(sklearn.__version__) == "0.19.1", + reason="Target flow is from sklearn 0.19.1") def test_get_flow_reinstantiate_model_wrong_version(self): - # 20 is scikit-learn ==0.20.0 - # I can't find a != 0.20 permanent flow on the test server. 
- self.assertRaises(ValueError, openml.flows.get_flow, flow_id=20, reinstantiate=True) + # Note that CI does not test against 0.19.1. + openml.config.server = self.production_server + _, sklearn_major, _ = LooseVersion(sklearn.__version__).version + flow = 8175 + expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.' + self.assertRaisesRegex(ValueError, + expected, + openml.flows.get_flow, + flow_id=flow, + reinstantiate=True)