Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Changelog
* DOC #639: More descriptive documentation for function to convert array format.
* ADD #687: Adds a function to retrieve the list of evaluation measures available.
* ADD #695: A function to retrieve all the data quality measures available.
* ADD #412: Add a function to trim flow names for scikit-learn flows.
* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value).
* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible.
* MAINT #726: Update examples to remove deprecation warnings from scikit-learn
Expand Down
118 changes: 118 additions & 0 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,122 @@ def can_handle_model(cls, model: Any) -> bool:
"""
return isinstance(model, sklearn.base.BaseEstimator)

@classmethod
def trim_flow_name(
    cls,
    long_name: str,
    extra_trim_length: int = 100,
    _outer: bool = True
) -> str:
    """ Shorten generated sklearn flow name to a short, readable representation.

    Flows are assumed to have the following naming structure:
    (model_selection)? (pipeline)? (steps)+
    and will be shortened to:
    sklearn.(selection.)?(pipeline.)?(steps)+
    e.g. (white spaces and newlines added for readability)
    sklearn.pipeline.Pipeline(
        columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
            numeric=sklearn.pipeline.Pipeline(
                imputer=sklearn.preprocessing.imputation.Imputer,
                standardscaler=sklearn.preprocessing.data.StandardScaler),
            nominal=sklearn.pipeline.Pipeline(
                simpleimputer=sklearn.impute.SimpleImputer,
                onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)),
        variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
        svc=sklearn.svm.classes.SVC)
    ->
    sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)

    Parameters
    ----------
    long_name : str
        The full flow name generated by the scikit-learn extension.
    extra_trim_length: int (default=100)
        If the trimmed name would exceed `extra_trim_length` characters, additional trimming
        of the short name is performed. This reduces the produced short name length.
        There is no guarantee the end result will not exceed `extra_trim_length`.
    _outer : bool (default=True)
        For internal use only. Specifies if the function is called recursively.

    Returns
    -------
    str
        The trimmed flow name, e.g. ``sklearn.Pipeline(ColumnTransformer,SVC)``.
    """
    def remove_all_in_parentheses(string: str) -> str:
        # Repeatedly strip innermost "(...)" groups; nested parentheses are
        # removed from the inside out until none remain.
        removals = 1
        while removals > 0:
            string, removals = re.subn(r"\([^()]*\)", "", string)
        return string

    # Generally, we want to trim all hyperparameters, the exception to that is for model
    # selection, as the `estimator` hyperparameter is very indicative of what is in the flow.
    # So we first trim the name of the `estimator` specified in model selection. For
    # reference, in the example below, we want to trim
    # `sklearn.tree.tree.DecisionTreeClassifier` and keep it in the final trimmed flow name:
    # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer,
    #   VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,
    #   Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator=
    #   sklearn.tree.tree.DecisionTreeClassifier))
    if 'sklearn.model_selection' in long_name:
        start_index = long_name.index('sklearn.model_selection')
        estimator_start = (start_index
                           + long_name[start_index:].index('estimator=')
                           + len('estimator='))

        model_select_boilerplate = long_name[start_index:estimator_start]
        # above is e.g. "sklearn.model_selection._search.RandomizedSearchCV(estimator="
        model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1]

        # Now we want to also find and parse the `estimator`, for this we find the closing
        # parenthesis to the model selection technique:
        closing_parenthesis_expected = 1
        for i, char in enumerate(long_name[estimator_start:], start=estimator_start):
            if char == '(':
                closing_parenthesis_expected += 1
            if char == ')':
                closing_parenthesis_expected -= 1
            if closing_parenthesis_expected == 0:
                break

        model_select_pipeline = long_name[estimator_start:i]
        trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False)
        _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1)  # trim module prefix
        model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline)
        name = long_name[:start_index] + model_select_short + long_name[i + 1:]
    else:
        name = long_name

    module_name = long_name.split('.')[0]
    short_name = module_name + '.{}'

    if name.startswith('sklearn.pipeline'):
        full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1)
        pipeline_class = full_pipeline_class.split('.')[-1]
        # We don't want nested pipelines in the short name, so we trim all complicated
        # subcomponents, i.e. those with parentheses:
        pipeline = remove_all_in_parentheses(pipeline)

        # then the pipeline steps are formatted e.g.:
        # step1name=sklearn.submodule.ClassName,step2name...
        components = [component.split('.')[-1] for component in pipeline.split(',')]
        pipeline = "{}({})".format(pipeline_class, ','.join(components))
        if len(short_name.format(pipeline)) > extra_trim_length:
            # Best-effort extra trim: keep only the last (usually most telling) step.
            pipeline = "{}(...,{})".format(pipeline_class, components[-1])
    else:
        # Just a simple component: e.g. sklearn.tree.DecisionTreeClassifier
        pipeline = remove_all_in_parentheses(name).split('.')[-1]

    if not _outer:
        # Anything from parenthesis in inner calls should not be culled, so we use brackets
        pipeline = pipeline.replace('(', '[').replace(')', ']')
    else:
        # Square brackets may be introduced with nested model_selection
        pipeline = pipeline.replace('[', '(').replace(']', ')')

    return short_name.format(pipeline)

################################################################################################
# Methods for flow serialization and de-serialization

Expand Down Expand Up @@ -402,6 +518,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
name = '%s(%s)' % (class_name, sub_components_names[1:])
else:
name = class_name
short_name = SklearnExtension.trim_flow_name(name)

# Get the external versions of all sub-components
external_version = self._get_external_version_string(model, subcomponents)
Expand All @@ -419,6 +536,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
sklearn_version_formatted = sklearn_version.replace('==', '_')
flow = OpenMLFlow(name=name,
class_name=class_name,
custom_name=short_name,
description='Automatically created scikit-learn flow.',
model=model,
components=subcomponents,
Expand Down
11 changes: 6 additions & 5 deletions openml/flows/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,14 +417,15 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
_copy_server_fields(flow, self)
try:
openml.flows.functions.assert_flows_equal(
self, flow, flow.upload_date, ignore_parameter_values=True
self, flow, flow.upload_date,
ignore_parameter_values=True,
ignore_custom_name_if_none=True
)
except ValueError as e:
message = e.args[0]
raise ValueError("Flow was not stored correctly on the server. "
"New flow ID is %d. Please check manually and "
"remove the flow if necessary! Error is:\n'%s'" %
(flow_id, message))
raise ValueError("The flow on the server is inconsistent with the local flow. "
"The server flow ID is {}. Please check manually and remove "
"the flow if necessary! Error is:\n'{}'".format(flow_id, message))
return self

def get_structure(self, key_item: str) -> Dict[str, List[str]]:
Expand Down
16 changes: 14 additions & 2 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:

def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values_on_older_children: str = None,
ignore_parameter_values: bool = False) -> None:
ignore_parameter_values: bool = False,
ignore_custom_name_if_none: bool = False) -> None:
"""Check equality of two flows.

Two flows are equal if their all keys which are not set by the server
Expand All @@ -325,6 +326,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,

ignore_parameter_values : bool
Whether to ignore parameter values when comparing flows.

ignore_custom_name_if_none : bool
Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.
"""
if not isinstance(flow1, OpenMLFlow):
raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
Expand Down Expand Up @@ -358,7 +362,8 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
'argument2, but not in argument1.' % name)
assert_flows_equal(attr1[name], attr2[name],
ignore_parameter_values_on_older_children,
ignore_parameter_values)
ignore_parameter_values,
ignore_custom_name_if_none)
elif key == '_extension':
continue
else:
Expand All @@ -385,6 +390,13 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
# Continue needs to be done here as the first if
# statement triggers in both special cases
continue
elif (key == 'custom_name'
and ignore_custom_name_if_none
and (attr1 is None or attr2 is None)):
# If specified, we allow `custom_name` inequality if one flow's name is None.
# Helps with backwards compatibility as `custom_name` is now auto-generated, but
# before it used to be `None`.
continue

if attr1 != attr2:
raise ValueError("Flow %s: values for attribute '%s' differ: "
Expand Down
Loading