openml · mfeurer · Sep 13, 2019 · Aug 5, 2019 · Aug 5, 2019 · Aug 6, 2019
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
@@ -6,6 +6,7 @@
 import json
 import logging
 import re
+from re import IGNORECASE
 import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -476,6 +477,167 @@ def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool:
             or ',sklearn==' in flow.external_version
         )
 
+    def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
+        """Fetch the summary part of a model's docstring for the flow description.
+
+        The docstring is obtained with ``inspect.getdoc`` and cut at the first
+        marker that is present, tried in this order:
+
+        * the text 'Read more in the :ref:'
+        * the numpydoc 'Parameters' heading (the word followed by a dashed line)
+
+        When neither marker is present the whole docstring is used. Surrounding
+        whitespace is removed first; a result longer than ``char_lim`` is then cut
+        to ``char_lim - 3`` characters and terminated with '...'.
+
+        Parameters
+        ----------
+        model : sklearn model
+            Object whose docstring is read.
+        char_lim : int
+            Specifying the max length of the returned string.
+            OpenML servers have a constraint of 1024 characters for the 'description' field.
+
+        Returns
+        -------
+        str
+            The trimmed description, or '' when the model has no docstring.
+        """
+        def match_format(s):
+            # numpydoc section heading: title line plus underline of equal length
+            return "{}\n{}\n".format(s, len(s) * '-')
+
+        def limit(text):
+            # Strip before measuring so padding does not count against char_lim.
+            text = text.strip()
+            if len(text) > char_lim:
+                text = "{}...".format(text[:char_lim - 3])
+            return text
+
+        s = inspect.getdoc(model)
+        if s is None:
+            return ''
+        index = s.find("Read more in the :ref:")
+        if index != -1:
+            return limit(s[:index])
+        logging.warning("'Read more' not found in descriptions. "
+                        "Trying to trim till 'Parameters' if available in docstring.")
+        # if 'Read more' doesn't exist, trim till 'Parameters'
+        index = s.find(match_format("Parameters"))
+        if index == -1:
+            # returning full docstring
+            logging.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
+            index = len(s)
+        return limit(s[:index])
+
+    def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
+        """Extract the 'Parameters' section of a model's docstring.
+
+        The section starts at the numpydoc 'Parameters' heading and ends at the
+        nearest following heading among 'Attributes', 'Notes', 'See also',
+        'See Also', 'Note', 'References' and 'Examples', whatever order they are
+        written in. If none of them follows, the section runs to the end of the
+        docstring.
+
+        Parameters
+        ----------
+        model : sklearn model
+            Object whose docstring is read with ``inspect.getdoc``.
+
+        Returns
+        -------
+        str, or None
+            The stripped section including its heading, or None when the model
+            has no docstring or the docstring has no 'Parameters' heading.
+        """
+        def match_format(s):
+            # numpydoc section heading: title line plus underline of equal length
+            return "{}\n{}\n".format(s, len(s) * '-')
+
+        s = inspect.getdoc(model)
+        if s is None:
+            return None
+        try:
+            index1 = s.index(match_format("Parameters"))
+        except ValueError as e:
+            # when sklearn docstring has no 'Parameters' section
+            logging.warning("{} {}".format(match_format("Parameters"), e))
+            return None
+
+        headings = ["Attributes", "Notes", "See also", "See Also", "Note",
+                    "References", "Examples"]
+        # Search only after the 'Parameters' heading and keep the closest hit, so
+        # the result does not depend on the order of the list above.
+        hits = [s.find(match_format(h), index1) for h in headings]
+        found = [pos for pos in hits if pos != -1]
+        # in the case only 'Parameters' exist, trim till end of docstring
+        index2 = min(found) if found else len(s)
+        return s[index1:index2].strip()
+
+    def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
+        """Parse parameter types and descriptions from a model's docstring.
+
+        Each line of the 'Parameters' section that contains text of the form
+        ``name : type`` starts a new parameter; the lines up to the next such
+        line form its description. Name, type and description are collected in
+        one pass so they cannot get out of step, and only the first
+        ``name : type`` occurrence on a line is used.
+
+        Parameters
+        ----------
+        model : sklearn model
+            Must provide ``get_params()``.
+        char_lim : int
+            Specifying the max length of the returned string.
+            OpenML servers have a constraint of 1024 characters string fields.
+
+        Returns
+        -------
+        Dict, or None
+            Maps parameter name to ``[data_type, description]``. Names reported by
+            ``model.get_params()`` but not found in the docstring map to
+            ``[None, None]``. None is returned when there is no 'Parameters' section.
+        """
+        docstring = self._extract_sklearn_parameter_docstring(model)
+        if docstring is None:
+            # when sklearn docstring has no 'Parameters' section
+            return None
+
+        # NOTE(review): besides splitting on newlines this pattern also swallows
+        # any full stops directly in front of a newline - confirm that is intended
+        n = re.compile("[.]*\n", flags=IGNORECASE)
+        lines = n.split(docstring)
+        # The regular expression below is designed to detect sklearn parameter names
+        # and type in the format of [variable_name][space]:[space][type]
+        p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
+
+        # one [name, type, description lines] record per detected parameter
+        records = []  # type: List[List[Any]]
+        for line in lines:
+            found = p.search(line)
+            if found is not None:
+                # the pattern cannot match a ':' other than the ' : ' separator
+                key, value = found.group().split(':')
+                records.append([key.strip(), value.strip(), ['']])
+            elif records:
+                # still inside the description of the most recent parameter
+                records[-1][2].append(line)
+
+        parameter_docs = OrderedDict()  # type: Dict
+        for key, value, text in records:
+            description = '\n'.join(text).strip()
+            # limiting all parameter descriptions to accepted OpenML string length
+            if len(description) > char_lim:
+                description = "{}...".format(description[:char_lim - 3])
+            parameter_docs[key] = [value, description]
+
+        # to avoid KeyError for missing parameters
+        for param in model.get_params():
+            if param not in parameter_docs:
+                parameter_docs[param] = [None, None]
+
+        return parameter_docs
+
     def _serialize_model(self, model: Any) -> OpenMLFlow:
         """Create an OpenMLFlow.
 
@@ -534,10 +696,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
 
         sklearn_version = self._format_external_version('sklearn', sklearn.__version__)
         sklearn_version_formatted = sklearn_version.replace('==', '_')
+
+        sklearn_description = self._get_sklearn_description(model)
         flow = OpenMLFlow(name=name,
                           class_name=class_name,
                           custom_name=short_name,
-                          description='Automatically created scikit-learn flow.',
+                          description=sklearn_description,
                           model=model,
                           components=subcomponents,
                           parameters=parameters,
@@ -623,6 +787,7 @@ def _extract_information_from_model(
         sub_components_explicit = set()
         parameters = OrderedDict()  # type: OrderedDict[str, Optional[str]]
         parameters_meta_info = OrderedDict()  # type: OrderedDict[str, Optional[Dict]]
+        parameters_docs = self._extract_sklearn_param_info(model)
 
         model_parameters = model.get_params(deep=False)
         for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
@@ -743,7 +908,12 @@ def flatten_all(list_):
                 else:
                     parameters[k] = None
 
-            parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))
+            if parameters_docs is not None:
+                data_type, description = parameters_docs[k]
+                parameters_meta_info[k] = OrderedDict((('description', description),
+                                                       ('data_type', data_type)))
+            else:
+                parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))
 
         return parameters, parameters_meta_info, sub_components, sub_components_explicit
 

diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -308,7 +308,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
 def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                        ignore_parameter_values_on_older_children: str = None,
                        ignore_parameter_values: bool = False,
-                       ignore_custom_name_if_none: bool = False) -> None:
+                       ignore_custom_name_if_none: bool = False,
+                       check_description: bool = True) -> None:
     """Check equality of two flows.
 
     Two flows are equal if their all keys which are not set by the server
@@ -327,8 +328,11 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
     ignore_parameter_values : bool
         Whether to ignore parameter values when comparing flows.
 
-   ignore_custom_name_if_none : bool
+    ignore_custom_name_if_none : bool
         Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.
+
+    check_description : bool
+        Whether to ignore matching of flow descriptions.
     """
     if not isinstance(flow1, OpenMLFlow):
         raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
@@ -366,6 +370,10 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                                    ignore_custom_name_if_none)
         elif key == '_extension':
             continue
+        elif check_description and key == 'description':
+            # to ignore matching of descriptions since sklearn based flows may have
+            # altering docstrings and is not guaranteed to be consistent
+            continue
         else:
             if key == 'parameters':
                 if ignore_parameter_values or \
@@ -397,6 +405,35 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                 # Helps with backwards compatibility as `custom_name` is now auto-generated, but
                 # before it used to be `None`.
                 continue
+            elif key == 'parameters_meta_info':
+                # this value is a dictionary where each key is a parameter name, containing another
+                # dictionary with keys specifying the parameter's 'description' and 'data_type'
+                # checking parameter descriptions can be ignored since that might change
+                # data type check can also be ignored if one of them is not defined, i.e., None
+                params1 = set(flow1.parameters_meta_info.keys())
+                params2 = set(flow2.parameters_meta_info.keys())
+                if params1 != params2:
+                    raise ValueError('Parameter list in meta info for parameters differ '
+                                     'in the two flows.')
+                # iterating over the parameter's meta info list
+                for param in params1:
+                    if isinstance(flow1.parameters_meta_info[param], Dict) and \
+                       isinstance(flow2.parameters_meta_info[param], Dict) and \
+                       'data_type' in flow1.parameters_meta_info[param] and \
+                       'data_type' in flow2.parameters_meta_info[param]:
+                        value1 = flow1.parameters_meta_info[param]['data_type']
+                        value2 = flow2.parameters_meta_info[param]['data_type']
+                    else:
+                        value1 = flow1.parameters_meta_info[param]
+                        value2 = flow2.parameters_meta_info[param]
+                    if value1 is None or value2 is None:
+                        continue
+                    elif value1 != value2:
+                        raise ValueError("Flow {}: data type for parameter {} in {} differ "
+                                         "as {}\nvs\n{}".format(flow1.name, param, key,
+                                                                value1, value2))
+                # the continue is to avoid the 'attr != attr2' check at end of function
+                continue
 
             if attr1 != attr2:
                 raise ValueError("Flow %s: values for attribute '%s' differ: "