diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index d44b61ae7..de81d435d 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -6,6 +6,7 @@
 import json
 import logging
 import re
+from re import IGNORECASE
 import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -476,6 +477,167 @@ def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool:
             or ',sklearn==' in flow.external_version
         )

+    def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
+        '''Fetches the sklearn docstring of the model to use as the flow description.
+
+        Retrieves the available sklearn docstring and does the following:
+        * If the length of the docstring is <= char_lim, returns the complete docstring
+        * Else, trims the docstring at the first occurrence of 'Read more in the :ref:'
+        * Or, failing that, at the start of the 'Parameters\n----------\n' section
+        The returned string is at most char_lim characters long, with leading and
+        trailing whitespace removed.
+
+        Parameters
+        ----------
+        model : sklearn model
+        char_lim : int
+            The maximum length of the returned string.
+            OpenML servers allow at most 1024 characters for the 'description' field.
+
+        Returns
+        -------
+        str
+        '''
+        def match_format(s):
+            return "{}\n{}\n".format(s, len(s) * '-')
+
+        s = inspect.getdoc(model)
+        if s is None:
+            return ''
+        try:
+            # trim at 'Read more'
+            pattern = "Read more in the :ref:"
+            index = s.index(pattern)
+            s = s[:index]
+            # trim the docstring to fit within char_lim
+            if len(s) > char_lim:
+                s = "{}...".format(s[:char_lim - 3])
+            return s.strip()
+        except ValueError:
+            logging.warning("'Read more' not found in docstring. "
+                            "Trying to trim at 'Parameters' instead.")
+        try:
+            # if 'Read more' doesn't exist, trim at 'Parameters'
+            pattern = "Parameters"
+            index = s.index(match_format(pattern))
+        except ValueError:
+            # return the full docstring
+            logging.warning("'Parameters' not found in docstring. Skipping docstring trimming.")
+            index = len(s)
+        s = s[:index]
+        # trim the docstring to fit within char_lim
+        if len(s) > char_lim:
+            s = "{}...".format(s[:char_lim - 3])
+        return s.strip()
+
+    def _extract_sklearn_parameter_docstring(self, model: Any) -> Union[None, str]:
+        '''Extracts the part of the sklearn docstring containing parameter information.
+
+        Fetches the entire docstring and trims it down to just the Parameters section.
+        The assumption is that 'Parameters' is the first section in sklearn docstrings,
+        followed by other sections titled 'Attributes', 'See also', 'Note', 'References',
+        appearing in that order if defined.
+        Returns None if no 'Parameters' section can be found in the docstring.
+
+        Parameters
+        ----------
+        model : sklearn model
+
+        Returns
+        -------
+        str, or None
+        '''
+        def match_format(s):
+            return "{}\n{}\n".format(s, len(s) * '-')
+
+        s = inspect.getdoc(model)
+        if s is None:
+            return None
+        try:
+            index1 = s.index(match_format("Parameters"))
+        except ValueError as e:
+            # the sklearn docstring has no 'Parameters' section
+            logging.warning("No 'Parameters' section found in the docstring: {}".format(e))
+            return None
+
+        headings = ["Attributes", "Notes", "See also", "Note", "References"]
+        for h in headings:
+            try:
+                # find the end of the Parameters section
+                index2 = s.index(match_format(h))
+                break
+            except ValueError:
+                logging.warning("'{}' section not found in docstring".format(h))
+                continue
+        else:
+            # if only 'Parameters' exists, trim till the end of the docstring
+            index2 = len(s)
+        s = s[index1:index2]
+        return s.strip()
+
+    def _extract_sklearn_param_info(self, model: Any, char_lim: int = 1024) -> Union[None, Dict]:
+        '''Parses parameter types and descriptions from the sklearn docstring.
+
+        Parameters
+        ----------
+        model : sklearn model
+        char_lim : int
+            The maximum length of the returned strings.
+            OpenML servers allow at most 1024 characters for string fields.
+
+        Returns
+        -------
+        Dict, or None
+        '''
+        docstring = self._extract_sklearn_parameter_docstring(model)
+        if docstring is None:
+            # the sklearn docstring has no 'Parameters' section
+            return None
+
+        n = re.compile("[.]*\n", flags=IGNORECASE)
+        lines = n.split(docstring)
+        p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
+        # The above regular expression detects sklearn parameter names and types
+        # written in the format [variable_name][space]:[space][type].
+        # The parameter description is expected to be all the docstring lines
+        # between one parameter match and the next.
+
+        # collecting parameters and their descriptions
+        description = []  # type: List
+        for s in lines:
+            param = p.findall(s)
+            if param:
+                # the regex found a parameter definition:
+                # create a placeholder list of strings for its description;
+                # description lines are appended in subsequent iterations
+                # until another parameter is found and a new placeholder is created
+                placeholder = ['']  # type: List[str]
+                description.append(placeholder)
+            elif len(description) > 0:  # description == [] means no parameters found yet
+                # append the line to the placeholder of the last parameter found
+                description[-1].append(s)
+        for i in range(len(description)):
+            # concatenate the lines of each parameter description
+            description[i] = '\n'.join(description[i]).strip()
+            # limit each parameter description to the accepted OpenML string length
+            if len(description[i]) > char_lim:
+                description[i] = "{}...".format(description[i][:char_lim - 3])
+
+        # collecting parameters and their types
+        parameter_docs = OrderedDict()  # type: Dict
+        matches = p.findall(docstring)
+        for i, param in enumerate(matches):
+            key, value = str(param).split(':')
+            parameter_docs[key.strip()] = [value.strip(), description[i]]
+
+        # add entries for parameters missing from the docstring to avoid KeyErrors
+        param_list_true = list(model.get_params().keys())
+        param_list_found = list(parameter_docs.keys())
+        for param in list(set(param_list_true) - set(param_list_found)):
+            parameter_docs[param] = [None, None]
+
+        return parameter_docs
+
     def _serialize_model(self, model: Any) -> OpenMLFlow:
         """Create an OpenMLFlow.
@@ -534,10 +696,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
         sklearn_version = self._format_external_version('sklearn', sklearn.__version__)
         sklearn_version_formatted = sklearn_version.replace('==', '_')
+
+        sklearn_description = self._get_sklearn_description(model)
         flow = OpenMLFlow(name=name,
                           class_name=class_name,
                           custom_name=short_name,
-                          description='Automatically created scikit-learn flow.',
+                          description=sklearn_description,
                           model=model,
                           components=subcomponents,
                           parameters=parameters,
@@ -623,6 +787,7 @@ def _extract_information_from_model(
         sub_components_explicit = set()
         parameters = OrderedDict()  # type: OrderedDict[str, Optional[str]]
         parameters_meta_info = OrderedDict()  # type: OrderedDict[str, Optional[Dict]]
+        parameters_docs = self._extract_sklearn_param_info(model)

         model_parameters = model.get_params(deep=False)
         for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
@@ -743,7 +908,12 @@ def flatten_all(list_):
             else:
                 parameters[k] = None

-            parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))
+            if parameters_docs is not None:
+                data_type, description = parameters_docs[k]
+                parameters_meta_info[k] = OrderedDict((('description', description),
+                                                       ('data_type', data_type)))
+            else:
+                parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))

         return parameters, parameters_meta_info, sub_components, sub_components_explicit

diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index d12bcfe91..aa6f64600 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -308,7 +308,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
 def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                        ignore_parameter_values_on_older_children: str = None,
                        ignore_parameter_values: bool = False,
-                       ignore_custom_name_if_none: bool = False) -> None:
+                       ignore_custom_name_if_none: bool = False,
+                       check_description: bool = True) -> None:
     """Check equality of two flows.

     Two flows are equal if their all keys which are not set by the server
@@ -327,8 +328,11 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
     ignore_parameter_values : bool
         Whether to ignore parameter values when comparing flows.

-    ignore_custom_name_if_none : bool
+    ignore_custom_name_if_none : bool
         Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.
+
+    check_description : bool
+        If True, the flow descriptions are not compared: sklearn-based flow
+        descriptions are generated from docstrings and may differ across versions.
     """
     if not isinstance(flow1, OpenMLFlow):
         raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
@@ -366,6 +370,10 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                                ignore_custom_name_if_none)
         elif key == '_extension':
             continue
+        elif check_description and key == 'description':
+            # descriptions are not compared: sklearn-based flows generate them from
+            # docstrings, which are not guaranteed to be consistent across versions
+            continue
         else:
             if key == 'parameters':
                 if ignore_parameter_values or \
@@ -397,6 +405,35 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
                 # Helps with backwards compatibility as `custom_name` is now auto-generated, but
                 # before it used to be `None`.
                 continue
+            elif key == 'parameters_meta_info':
+                # this value is a dictionary where each key is a parameter name, mapping to
+                # another dictionary with the parameter's 'description' and 'data_type';
+                # parameter descriptions are skipped during the check since they may change,
+                # and the data type check is also skipped if either side is not defined (None)
+                params1 = set(flow1.parameters_meta_info.keys())
+                params2 = set(flow2.parameters_meta_info.keys())
+                if params1 != params2:
+                    raise ValueError('Parameter lists in parameters_meta_info differ '
+                                     'between the two flows.')
+                # iterating over each parameter's meta info
+                for param in params1:
+                    if isinstance(flow1.parameters_meta_info[param], dict) and \
+                            isinstance(flow2.parameters_meta_info[param], dict) and \
+                            'data_type' in flow1.parameters_meta_info[param] and \
+                            'data_type' in flow2.parameters_meta_info[param]:
+                        value1 = flow1.parameters_meta_info[param]['data_type']
+                        value2 = flow2.parameters_meta_info[param]['data_type']
+                    else:
+                        value1 = flow1.parameters_meta_info[param]
+                        value2 = flow2.parameters_meta_info[param]
+                    if value1 is None or value2 is None:
+                        continue
+                    elif value1 != value2:
+                        raise ValueError("Flow {}: data type for parameter {} in {} differs: "
+                                         "{}\nvs\n{}".format(flow1.name, param, key,
+                                                             value1, value2))
+                # the continue avoids the 'attr1 != attr2' check at the end of the function
+                continue

         if attr1 != attr2:
             raise ValueError("Flow %s: values for attribute '%s' differ: "
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
index 8bc615516..4e7e40dc3 100644
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
@@ -8,6 +8,7 @@
 from collections import OrderedDict
 from unittest import mock
 import warnings
+from packaging import version

 import numpy as np
 import scipy.optimize
@@ -75,7 +76,8 @@ def test_serialize_model(self):
         fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
         fixture_short_name = 'sklearn.DecisionTreeClassifier'
-        fixture_description = 'Automatically created scikit-learn flow.'
+        # str obtained from self.extension._get_sklearn_description(model)
+        fixture_description = 'A decision tree classifier.'
         version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                           % sklearn.__version__
         # min_impurity_decrease has been introduced in 0.20
@@ -143,7 +145,8 @@ def test_serialize_model_clustering(self):
         fixture_name = 'sklearn.cluster.k_means_.KMeans'
         fixture_short_name = 'sklearn.KMeans'
-        fixture_description = 'Automatically created scikit-learn flow.'
+        # str obtained from self.extension._get_sklearn_description(model)
+        fixture_description = 'K-Means clustering'
         version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
                           % sklearn.__version__
         # n_jobs default has changed to None in 0.20
@@ -207,10 +210,18 @@ def test_serialize_model_with_subcomponent(self):
             '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
         fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
         fixture_short_name = 'sklearn.AdaBoostClassifier'
-        fixture_description = 'Automatically created scikit-learn flow.'
+        # str obtained from self.extension._get_sklearn_description(model)
+        fixture_description = 'An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a '\
+                              'meta-estimator that begins by fitting a\nclassifier on the original'\
+                              ' dataset and then fits additional copies of the\nclassifier on the '\
+                              'same dataset but where the weights of incorrectly\nclassified '\
+                              'instances are adjusted such that subsequent classifiers focus\nmore'\
+                              ' on difficult cases.\n\nThis class implements the algorithm known '\
+                              'as AdaBoost-SAMME [2].'
         fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
         fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
-        fixture_subcomponent_description = 'Automatically created scikit-learn flow.'
+        # str obtained from self.extension._get_sklearn_description(model.base_estimator)
+        fixture_subcomponent_description = 'A decision tree classifier.'
         fixture_structure = {
             fixture_name: [],
             'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator']
@@ -264,7 +275,25 @@ def test_serialize_pipeline(self):
             'scaler=sklearn.preprocessing.data.StandardScaler,' \
             'dummy=sklearn.dummy.DummyClassifier)'
         fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)'
-        fixture_description = 'Automatically created scikit-learn flow.'
+
+        if version.parse(sklearn.__version__) >= version.parse("0.21.0"):
+            fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially"\
+                                  " apply a list of transforms and a final estimator.\n"\
+                                  "Intermediate steps of the pipeline must be 'transforms', that "\
+                                  "is, they\nmust implement fit and transform methods.\nThe final "\
+                                  "estimator only needs to implement fit.\nThe transformers in "\
+                                  "the pipeline can be cached using ``memory`` argument.\n\nThe "\
+                                  "purpose of the pipeline is to assemble several steps that can "\
+                                  "be\ncross-validated together while setting different parameters"\
+                                  ".\nFor this, it enables setting parameters of the various steps"\
+                                  " using their\nnames and the parameter name separated by a '__',"\
+                                  " as in the example below.\nA step's estimator may be replaced "\
+                                  "entirely by setting the parameter\nwith its name to another "\
+                                  "estimator, or a transformer removed by setting\nit to "\
+                                  "'passthrough' or ``None``."
+        else:
+            fixture_description = self.extension._get_sklearn_description(model)
+
         fixture_structure = {
             fixture_name: [],
             'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -353,7 +382,24 @@ def test_serialize_pipeline_clustering(self):
             'scaler=sklearn.preprocessing.data.StandardScaler,' \
             'clusterer=sklearn.cluster.k_means_.KMeans)'
         fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)'
-        fixture_description = 'Automatically created scikit-learn flow.'
+
+        if version.parse(sklearn.__version__) >= version.parse("0.21.0"):
+            fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially"\
+                                  " apply a list of transforms and a final estimator.\n"\
+                                  "Intermediate steps of the pipeline must be 'transforms', that "\
+                                  "is, they\nmust implement fit and transform methods.\nThe final "\
+                                  "estimator only needs to implement fit.\nThe transformers in "\
+                                  "the pipeline can be cached using ``memory`` argument.\n\nThe "\
+                                  "purpose of the pipeline is to assemble several steps that can "\
+                                  "be\ncross-validated together while setting different parameters"\
+                                  ".\nFor this, it enables setting parameters of the various steps"\
+                                  " using their\nnames and the parameter name separated by a '__',"\
+                                  " as in the example below.\nA step's estimator may be replaced "\
+                                  "entirely by setting the parameter\nwith its name to another "\
+                                  "estimator, or a transformer removed by setting\nit to "\
+                                  "'passthrough' or ``None``."
+        else:
+            fixture_description = self.extension._get_sklearn_description(model)
         fixture_structure = {
             fixture_name: [],
             'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -445,7 +491,20 @@ def test_serialize_column_transformer(self):
             'numeric=sklearn.preprocessing.data.StandardScaler,' \
             'nominal=sklearn.preprocessing._encoders.OneHotEncoder)'
         fixture_short_name = 'sklearn.ColumnTransformer'
-        fixture_description = 'Automatically created scikit-learn flow.'
+
+        if version.parse(sklearn.__version__) >= version.parse("0.21.0"):
+            # str obtained from self.extension._get_sklearn_description(model)
+            fixture_description = 'Applies transformers to columns of an array or pandas '\
+                                  'DataFrame.\n\nThis estimator allows different columns or '\
+                                  'column subsets of the input\nto be transformed separately and '\
+                                  'the features generated by each transformer\nwill be '\
+                                  'concatenated to form a single feature space.\nThis is useful '\
+                                  'for heterogeneous or columnar data, to combine several\nfeature'\
+                                  ' extraction mechanisms or transformations into a single '\
+                                  'transformer.'
+        else:
+            fixture_description = self.extension._get_sklearn_description(model)
+
         fixture_structure = {
             fixture: [],
             'sklearn.preprocessing.data.StandardScaler': ['numeric'],
@@ -504,7 +563,25 @@ def test_serialize_column_transformer_pipeline(self):
             fixture_name: [],
         }

-        fixture_description = 'Automatically created scikit-learn flow.'
+        if version.parse(sklearn.__version__) >= version.parse("0.21.0"):
+            # str obtained from self.extension._get_sklearn_description(model)
+            fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially"\
+                                  " apply a list of transforms and a final estimator.\n"\
+                                  "Intermediate steps of the pipeline must be 'transforms', that "\
+                                  "is, they\nmust implement fit and transform methods.\nThe final"\
+                                  " estimator only needs to implement fit.\nThe transformers in "\
+                                  "the pipeline can be cached using ``memory`` argument.\n\nThe "\
+                                  "purpose of the pipeline is to assemble several steps that can "\
+                                  "be\ncross-validated together while setting different "\
+                                  "parameters.\nFor this, it enables setting parameters of the "\
+                                  "various steps using their\nnames and the parameter name "\
+                                  "separated by a '__', as in the example below.\nA step's "\
+                                  "estimator may be replaced entirely by setting the parameter\n"\
+                                  "with its name to another estimator, or a transformer removed by"\
+                                  " setting\nit to 'passthrough' or ``None``."
+        else:
+            fixture_description = self.extension._get_sklearn_description(model)
+
         serialization = self.extension.model_to_flow(model)
         structure = serialization.get_structure('name')
         self.assertEqual(serialization.name, fixture_name)
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 95b4fa3f0..9c4d49439 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -95,7 +95,6 @@ def test_are_flows_equal(self):
         # Test most important values that can be set by a user
         openml.flows.functions.assert_flows_equal(flow, flow)
         for attribute, new_value in [('name', 'Tes'),
-                                     ('description', 'Test flo'),
                                      ('external_version', '2'),
                                      ('language', 'english'),
                                      ('dependencies', 'ab'),
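
Reviewer note: a minimal sketch (not part of the diff) of how the new helpers behave, assuming scikit-learn is installed; the exact strings depend on the installed sklearn version:

    from sklearn.tree import DecisionTreeClassifier

    from openml.extensions.sklearn import SklearnExtension

    extension = SklearnExtension()
    model = DecisionTreeClassifier()

    # Flow description: the docstring trimmed at 'Read more'/'Parameters',
    # capped at 1024 characters.
    print(extension._get_sklearn_description(model))  # 'A decision tree classifier.'

    # Per-parameter metadata parsed from the docstring: {name: [data_type, description]};
    # parameters absent from the docstring map to [None, None].
    param_info = extension._extract_sklearn_param_info(model)
    print(param_info['criterion'][0])  # e.g. "string, optional (default='gini')"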