Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Changelog
* DOC #639: More descriptive documentation for function to convert array format.
* ADD #687: Adds a function to retrieve the list of evaluation measures available.
* ADD #695: A function to retrieve all the data quality measures available.
* ADD #412: Add a function to trim flow names for scikit-learn flows.
* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible.
* MAINT #726: Update examples to remove deprecation warnings from scikit-learn
Expand Down
118 changes: 118 additions & 0 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,122 @@ def can_handle_model(cls, model: Any) -> bool:
"""
return isinstance(model, sklearn.base.BaseEstimator)

@classmethod
def trim_flow_name(
    cls,
    long_name: str,
    extra_trim_length: int = 100,
    _outer: bool = True
) -> str:
    """ Shorten generated sklearn flow name to a short, readable representation.

    Flows are assumed to have the following naming structure:
    (model_selection)? (pipeline)? (steps)+
    and will be shortened to:
    sklearn.(selection.)?(pipeline.)?(steps)+
    e.g. (white spaces and newlines added for readability)
    sklearn.pipeline.Pipeline(
        columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
            numeric=sklearn.pipeline.Pipeline(
                imputer=sklearn.preprocessing.imputation.Imputer,
                standardscaler=sklearn.preprocessing.data.StandardScaler),
            nominal=sklearn.pipeline.Pipeline(
                simpleimputer=sklearn.impute.SimpleImputer,
                onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
        variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
        svc=sklearn.svm.classes.SVC)
    ->
    sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)

    Parameters
    ----------
    long_name : str
        The full flow name generated by the scikit-learn extension.
    extra_trim_length: int (default=100)
        If the trimmed name would exceed `extra_trim_length` characters, additional trimming
        of the short name is performed. This reduces the produced short name length.
        There is no guarantee the end result will not exceed `extra_trim_length`.
    _outer : bool (default=True)
        For internal use only. Specifies if the function is called recursively.

    Returns
    -------
    str
        The trimmed flow name, e.g. ``sklearn.Pipeline(ColumnTransformer,SVC)``.
    """
    def remove_all_in_parentheses(string: str) -> str:
        # Repeatedly strip innermost "(...)" groups; nested parentheses are
        # removed from the inside out until none remain.
        removals = 1
        while removals > 0:
            string, removals = re.subn(r"\([^()]*\)", "", string)
        return string

    # Generally, we want to trim all hyperparameters, the exception to that is for model
    # selection, as the `estimator` hyperparameter is very indicative of what is in the flow.
    # So we first trim the name of the `estimator` specified in model selection. For
    # reference, in the example below, we want to trim
    # `sklearn.tree.tree.DecisionTreeClassifier` and keep it in the final trimmed flow name:
    # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer,
    #   VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
    #   Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator=
    #   sklearn.tree.tree.DecisionTreeClassifier))
    if 'sklearn.model_selection' in long_name:
        start_index = long_name.index('sklearn.model_selection')
        estimator_start = (start_index
                           + long_name[start_index:].index('estimator=')
                           + len('estimator='))

        model_select_boilerplate = long_name[start_index:estimator_start]
        # above is e.g. "sklearn.model_selection._search.RandomizedSearchCV(estimator="
        model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1]

        # Now we want to also find and parse the `estimator`, for this we find the closing
        # parenthesis to the model selection technique:
        closing_parenthesis_expected = 1
        for i, char in enumerate(long_name[estimator_start:], start=estimator_start):
            if char == '(':
                closing_parenthesis_expected += 1
            if char == ')':
                closing_parenthesis_expected -= 1
            if closing_parenthesis_expected == 0:
                break

        model_select_pipeline = long_name[estimator_start:i]
        trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False)
        _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1)  # trim module prefix
        model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline)
        name = long_name[:start_index] + model_select_short + long_name[i + 1:]
    else:
        name = long_name

    module_name = long_name.split('.')[0]
    short_name = module_name + '.{}'

    if name.startswith('sklearn.pipeline'):
        full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1)
        pipeline_class = full_pipeline_class.split('.')[-1]
        # We don't want nested pipelines in the short name, so we trim all complicated
        # subcomponents, i.e. those with parentheses:
        pipeline = remove_all_in_parentheses(pipeline)

        # then the pipeline steps are formatted e.g.:
        # step1name=sklearn.submodule.ClassName,step2name...
        components = [component.split('.')[-1] for component in pipeline.split(',')]
        pipeline = "{}({})".format(pipeline_class, ','.join(components))
        if len(short_name.format(pipeline)) > extra_trim_length:
            # Best-effort extra trim: keep only the last (usually most telling) step.
            pipeline = "{}(...,{})".format(pipeline_class, components[-1])
    else:
        # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier
        pipeline = remove_all_in_parentheses(name).split('.')[-1]

    if not _outer:
        # Anything from parenthesis in inner calls should not be culled, so we use brackets
        pipeline = pipeline.replace('(', '[').replace(')', ']')
    else:
        # Square brackets may be introduced with nested model_selection
        pipeline = pipeline.replace('[', '(').replace(']', ')')

    return short_name.format(pipeline)

################################################################################################
# Methods for flow serialization and de-serialization

Expand Down Expand Up @@ -402,6 +518,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
name = '%s(%s)' % (class_name, sub_components_names[1:])
else:
name = class_name
short_name = SklearnExtension.trim_flow_name(name)

# Get the external versions of all sub-components
external_version = self._get_external_version_string(model, subcomponents)
Expand All @@ -419,6 +536,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
sklearn_version_formatted = sklearn_version.replace('==', '_')
flow = OpenMLFlow(name=name,
class_name=class_name,
custom_name=short_name,
description='Automatically created scikit-learn flow.',
model=model,
components=subcomponents,
Expand Down
11 changes: 6 additions & 5 deletions openml/flows/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,14 +417,15 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
_copy_server_fields(flow, self)
try:
openml.flows.functions.assert_flows_equal(
self, flow, flow.upload_date, ignore_parameter_values=True
self, flow, flow.upload_date,
ignore_parameter_values=True,
ignore_custom_name_if_none=True
)
except ValueError as e:
message = e.args[0]
raise ValueError("Flow was not stored correctly on the server. "
"New flow ID is %d. Please check manually and "
"remove the flow if necessary! Error is:\n'%s'" %
(flow_id, message))
raise ValueError("The flow on the server is inconsistent with the local flow. "
"The server flow ID is {}. Please check manually and remove "
"the flow if necessary! Error is:\n'{}'".format(flow_id, message))
return self

def get_structure(self, key_item: str) -> Dict[str, List[str]]:
Expand Down
16 changes: 14 additions & 2 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:

def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values_on_older_children: str = None,
ignore_parameter_values: bool = False) -> None:
ignore_parameter_values: bool = False,
ignore_custom_name_if_none: bool = False) -> None:
"""Check equality of two flows.

Two flows are equal if their all keys which are not set by the server
Expand All @@ -325,6 +326,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,

ignore_parameter_values : bool
Whether to ignore parameter values when comparing flows.

ignore_custom_name_if_none : bool
Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.
"""
if not isinstance(flow1, OpenMLFlow):
raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
Expand Down Expand Up @@ -358,7 +362,8 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
'argument2, but not in argument1.' % name)
assert_flows_equal(attr1[name], attr2[name],
ignore_parameter_values_on_older_children,
ignore_parameter_values)
ignore_parameter_values,
ignore_custom_name_if_none)
elif key == '_extension':
continue
else:
Expand All @@ -385,6 +390,13 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
# Continue needs to be done here as the first if
# statement triggers in both special cases
continue
elif (key == 'custom_name'
and ignore_custom_name_if_none
and (attr1 is None or attr2 is None)):
# If specified, we allow `custom_name` inequality if one flow's name is None.
# Helps with backwards compatibility as `custom_name` is now auto-generated, but
# before it used to be `None`.
continue

if attr1 != attr2:
raise ValueError("Flow %s: values for attribute '%s' differ: "
Expand Down
Loading