Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 172 additions & 2 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import logging
import re
from re import IGNORECASE
import sys
import time
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
Expand Down Expand Up @@ -476,6 +477,167 @@ def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool:
or ',sklearn==' in flow.external_version
)

def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
    '''Fetches the sklearn docstring summary used as the flow description

    The docstring of ``model`` is cut at the first marker that applies:
    * the text 'Read more in the :ref:', if it occurs in the docstring;
    * otherwise the underlined 'Parameters' section header, if it occurs;
    * otherwise nothing is cut and the complete docstring is kept.
    Leading and trailing whitespace is then removed, and only if the result
    is still longer than char_lim is it shortened to char_lim characters,
    the last three of which are '...'.

    Parameters
    ----------
    model : sklearn model
        Object whose docstring is read with ``inspect.getdoc``.
    char_lim : int
        Specifying the max length of the returned string.
        OpenML servers have a constraint of 1024 characters for the 'description' field.

    Returns
    -------
    str
        The trimmed description, or an empty string if no docstring is available.
    '''
    docstring = inspect.getdoc(model)
    if docstring is None:
        return ''

    cut = docstring.find("Read more in the :ref:")
    if cut == -1:
        logging.warning("'Read more' not found in descriptions. "
                        "Trying to trim till 'Parameters' if available in docstring.")
        header = "Parameters"
        # numpydoc section header: the title followed by an underline of equal length
        cut = docstring.find("{}\n{}\n".format(header, len(header) * '-'))
        if cut == -1:
            # returning full docstring
            logging.warning("'Parameters' not found in docstring. Omitting docstring trimming.")
            cut = len(docstring)

    # Strip before measuring: the blank lines that precede the cut marker must not
    # count against char_lim, otherwise a summary that fits gets truncated needlessly.
    summary = docstring[:cut].strip()
    if len(summary) > char_lim:
        summary = "{}...".format(summary[:char_lim - 3])
    return summary

def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
    '''Extracts the part of sklearn docstring containing parameter information

    Fetches the entire docstring and keeps just the 'Parameters' section: the text
    from the underlined 'Parameters' header up to whichever of the section headers
    'Attributes', 'Notes', 'See also', 'Note' or 'References' appears earliest
    after it. The order in which those sections occur in the docstring does not
    matter. If none of them follows, the section runs to the end of the docstring.
    Returns a None if no section with 'Parameters' can be found in the docstring.

    Parameters
    ----------
    model : sklearn model
        Object whose docstring is read with ``inspect.getdoc``.

    Returns
    -------
    str, or None
        The 'Parameters' section (header included) with surrounding whitespace
        removed, or None when there is no docstring or no such section.
    '''
    def match_format(s):
        # numpydoc section header: the title followed by an underline of equal length
        return "{}\n{}\n".format(s, len(s) * '-')

    s = inspect.getdoc(model)
    if s is None:
        return None

    parameters_header = match_format("Parameters")
    try:
        start = s.index(parameters_header)
    except ValueError as e:
        # when sklearn docstring has no 'Parameters' section
        logging.warning("{} {}".format(parameters_header, e))
        return None

    # Only look for the closing header after the 'Parameters' header itself, and
    # keep the earliest hit: taking the first heading of a fixed list would let
    # sections that precede it in the text leak into the parameter block.
    search_from = start + len(parameters_header)
    end = len(s)  # in the case only 'Parameters' exist, trim till end of docstring
    for heading in ("Attributes", "Notes", "See also", "Note", "References"):
        position = s.find(match_format(heading), search_from)
        if position == -1:
            # every heading is probed now, so a missing optional section is
            # routine and is reported at debug level rather than as a warning
            logging.debug("{} not available in docstring".format(heading))
        elif position < end:
            end = position
    return s[start:end].strip()

def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
    '''Parses parameter type and description from sklearn docstring

    The 'Parameters' section returned by ``_extract_sklearn_parameter_docstring``
    is scanned line by line. Every ``name : type`` match opens a new entry, and
    every following line without a match is added to the description of the most
    recently opened entry. Names reported by ``model.get_params()`` that were not
    found in the docstring are appended with ``[None, None]`` so that lookups by
    parameter name do not raise a KeyError.

    Parameters
    ----------
    model : sklearn model
    char_lim : int
        Specifying the max length of the returned string.
        OpenML servers have a constraint of 1024 characters string fields.

    Returns
    -------
    Dict, or None
        Ordered mapping of parameter name to ``[data_type, description]``, or
        None when the docstring has no 'Parameters' section.
    '''
    docstring = self._extract_sklearn_parameter_docstring(model)
    if docstring is None:
        # when sklearn docstring has no 'Parameters' section
        return None

    # NOTE: '[.]*' matches literal dots, so dots directly before a line break are
    # consumed together with the newline when splitting.
    line_break = re.compile("[.]*\n")
    # The regular expression below is designed to detect sklearn parameter names and type
    # in the format of [variable_name][space]:[space][type]
    # The expectation is that the parameter description for this detected parameter will
    # be all the lines in the docstring till the regex finds another parameter match
    param_pattern = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)

    # Names, types and description lines are gathered in a single pass so they can
    # never get out of step. Pairing per-line placeholders with a separate findall
    # over the whole text breaks as soon as one line holds more than one match.
    entries = []  # type: List[List]
    for line in line_break.split(docstring):
        found = param_pattern.findall(line)
        if found:
            for match in found:
                # neither character class admits ':', so each match has exactly one
                name, _, data_type = match.partition(':')
                entries.append([name.strip(), data_type.strip(), []])
        elif entries:  # lines before the first parameter belong to no description
            entries[-1][2].append(line)

    parameter_docs = OrderedDict()  # type: Dict
    for name, data_type, description_lines in entries:
        description = '\n'.join(description_lines).strip()
        # limiting all parameter descriptions to accepted OpenML string length
        if len(description) > char_lim:
            description = "{}...".format(description[:char_lim - 3])
        parameter_docs[name] = [data_type, description]

    # to avoid KeyError for missing parameters; iterating get_params() directly
    # (instead of a set difference) keeps the insertion order deterministic
    for name in model.get_params():
        if name not in parameter_docs:
            parameter_docs[name] = [None, None]

    return parameter_docs

def _serialize_model(self, model: Any) -> OpenMLFlow:
"""Create an OpenMLFlow.

Expand Down Expand Up @@ -534,10 +696,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:

sklearn_version = self._format_external_version('sklearn', sklearn.__version__)
sklearn_version_formatted = sklearn_version.replace('==', '_')

sklearn_description = self._get_sklearn_description(model)
flow = OpenMLFlow(name=name,
class_name=class_name,
custom_name=short_name,
description='Automatically created scikit-learn flow.',
description=sklearn_description,
model=model,
components=subcomponents,
parameters=parameters,
Expand Down Expand Up @@ -623,6 +787,7 @@ def _extract_information_from_model(
sub_components_explicit = set()
parameters = OrderedDict() # type: OrderedDict[str, Optional[str]]
parameters_meta_info = OrderedDict() # type: OrderedDict[str, Optional[Dict]]
parameters_docs = self._extract_sklearn_param_info(model)

model_parameters = model.get_params(deep=False)
for k, v in sorted(model_parameters.items(), key=lambda t: t[0]):
Expand Down Expand Up @@ -743,7 +908,12 @@ def flatten_all(list_):
else:
parameters[k] = None

parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))
if parameters_docs is not None:
data_type, description = parameters_docs[k]
parameters_meta_info[k] = OrderedDict((('description', description),
('data_type', data_type)))
else:
parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None)))

return parameters, parameters_meta_info, sub_components, sub_components_explicit

Expand Down
41 changes: 39 additions & 2 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values_on_older_children: str = None,
ignore_parameter_values: bool = False,
ignore_custom_name_if_none: bool = False) -> None:
ignore_custom_name_if_none: bool = False,
check_description: bool = True) -> None:
"""Check equality of two flows.

Two flows are equal if their all keys which are not set by the server
Expand All @@ -327,8 +328,11 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values : bool
Whether to ignore parameter values when comparing flows.

ignore_custom_name_if_none : bool
ignore_custom_name_if_none : bool
Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.

check_description : bool
Whether to ignore matching of flow descriptions.
"""
if not isinstance(flow1, OpenMLFlow):
raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
Expand Down Expand Up @@ -366,6 +370,10 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_custom_name_if_none)
elif key == '_extension':
continue
elif check_description and key == 'description':
# to ignore matching of descriptions since sklearn based flows may have
# altering docstrings and is not guaranteed to be consistent
continue
else:
if key == 'parameters':
if ignore_parameter_values or \
Expand Down Expand Up @@ -397,6 +405,35 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
# Helps with backwards compatibility as `custom_name` is now auto-generated, but
# before it used to be `None`.
continue
elif key == 'parameters_meta_info':
# this value is a dictionary where each key is a parameter name, containing another
# dictionary with keys specifying the parameter's 'description' and 'data_type'
# checking parameter descriptions can be ignored since that might change
# data type check can also be ignored if one of them is not defined, i.e., None
params1 = set(flow1.parameters_meta_info.keys())
params2 = set(flow2.parameters_meta_info.keys())
if params1 != params2:
raise ValueError('Parameter list in meta info for parameters differ '
'in the two flows.')
# iterating over the parameter's meta info list
for param in params1:
if isinstance(flow1.parameters_meta_info[param], Dict) and \
isinstance(flow2.parameters_meta_info[param], Dict) and \
'data_type' in flow1.parameters_meta_info[param] and \
'data_type' in flow2.parameters_meta_info[param]:
value1 = flow1.parameters_meta_info[param]['data_type']
value2 = flow2.parameters_meta_info[param]['data_type']
else:
value1 = flow1.parameters_meta_info[param]
value2 = flow2.parameters_meta_info[param]
if value1 is None or value2 is None:
continue
elif value1 != value2:
raise ValueError("Flow {}: data type for parameter {} in {} differ "
"as {}\nvs\n{}".format(flow1.name, param, key,
value1, value2))
# the continue is to avoid the 'attr != attr2' check at end of function
continue

if attr1 != attr2:
raise ValueError("Flow %s: values for attribute '%s' differ: "
Expand Down
Loading