Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Changed target name/series ID divider and added ability to return series ID column with predictions :pr:`4357`
* Fixes
* Changes
* Pinned networkx version below 3.2 for Python version compatibility :pr:`4351`
Expand Down
37 changes: 36 additions & 1 deletion evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
TimeSeriesRegressionPipeline,
)
from evalml.problem_types import ProblemTypes
from evalml.utils import infer_feature_types


class MultiseriesRegressionPipeline(TimeSeriesRegressionPipeline):
Expand Down Expand Up @@ -91,6 +92,7 @@ def predict_in_sample(
y_train,
objective=None,
calculating_residuals=False,
include_series_id=False,
):
"""Predict on future data where the target is known, e.g. cross validation.

Expand All @@ -102,6 +104,7 @@ def predict_in_sample(
objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional.
calculating_residuals (bool): Whether we're calling predict_in_sample to calculate the residuals. This means
the X and y arguments are not future data, but actually the train data.
include_series_id (bool): If true, include the series ID value in the prediction results

Returns:
pd.Series: Estimated labels.
Expand All @@ -125,6 +128,33 @@ def predict_in_sample(
self.time_index,
self.input_target_name,
)

# Order series columns to be same as expected input feature names
# and filter to only include features in `X_unstacked`.
input_features = list(self.input_feature_names.values())[0]
X_unstacked = X_unstacked[
[feature for feature in input_features if feature in X_unstacked.columns]
]
X_train_unstacked = X_train_unstacked[
[
feature
for feature in input_features
if feature in X_train_unstacked.columns
]
]
y_overlapping_features = [
feature
for feature in y_train_unstacked.columns
if feature in y_unstacked.columns
]
y_unstacked = y_unstacked[y_overlapping_features]
y_train_unstacked = y_train_unstacked[y_overlapping_features]

X_train_unstacked = infer_feature_types(X_train_unstacked)
y_train_unstacked = infer_feature_types(y_train_unstacked)
X_unstacked = infer_feature_types(X_unstacked)
y_unstacked = infer_feature_types(y_unstacked)

unstacked_predictions = super().predict_in_sample(
X_unstacked,
y_unstacked,
Expand All @@ -133,10 +163,15 @@ def predict_in_sample(
objective,
calculating_residuals,
)
stacked_predictions = stack_data(unstacked_predictions)
stacked_predictions = stack_data(
unstacked_predictions,
include_series_id=include_series_id,
series_id_name=self.series_id,
)

# Index will start at the unstacked index, so we need to reset it to the original index
stacked_predictions.index = X.index
stacked_predictions = infer_feature_types(stacked_predictions)
return stacked_predictions

def get_forecast_period(self, X):
Expand Down
10 changes: 8 additions & 2 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
return return_intervals

if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
from evalml.pipelines.utils import stack_data, unstack_multiseries
from evalml.pipelines.utils import (
MULTISERIES_SEPARATOR_SYMBOL,
stack_data,
unstack_multiseries,
)

X, y = unstack_multiseries(
X,
Expand Down Expand Up @@ -268,7 +272,9 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
# `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
for series_id, series_intervals in pred_intervals.items():
series_id_target_name = (
self.input_target_name + "_" + str(series_id)
self.input_target_name
+ MULTISERIES_SEPARATOR_SYMBOL
+ str(series_id)
)
series_id_prediction_intervals = _get_series_intervals(
series_intervals,
Expand Down
24 changes: 17 additions & 7 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
from evalml.utils.gen_utils import contains_all_ts_parameters

DECOMPOSER_PERIOD_CAP = 1000
MULTISERIES_SEPARATOR_SYMBOL = "|"


def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
Expand Down Expand Up @@ -1418,7 +1419,7 @@ def unstack_multiseries(
for column_name in full_dataset.columns.drop([time_index, series_id]):
new_column = single_series[column_name]
new_column.index = new_time_index
new_column.name = f"{column_name}_{s_id}"
new_column.name = f"{column_name}{MULTISERIES_SEPARATOR_SYMBOL}{s_id}"

if column_name == target_name:
y_unstacked_cols.append(new_column)
Expand All @@ -1435,11 +1436,15 @@ def unstack_multiseries(
# Reset the axes now that they've been unstacked, keep time info in X
X_unstacked = X_unstacked.reset_index()
y_unstacked = y_unstacked.reset_index(drop=True)

return X_unstacked, y_unstacked


def stack_data(data, include_series_id=False, series_id_name=None, starting_index=None):
def stack_data(
data,
include_series_id=False,
series_id_name=None,
starting_index=None,
):
"""Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True.

Should only be used for data that is expected to be a single series. To stack multiple unstacked columns,
Expand All @@ -1464,7 +1469,9 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde

# Extract the original column name
series_id_with_name = stacked_series.index.droplevel()
stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1])
stacked_series.name = MULTISERIES_SEPARATOR_SYMBOL.join(
series_id_with_name[0].split(MULTISERIES_SEPARATOR_SYMBOL)[:-1],
)

# If the index is the time index, keep it
if not data.index.is_numeric() and starting_index is None:
Expand All @@ -1481,11 +1488,14 @@ def stack_data(data, include_series_id=False, series_id_name=None, starting_inde
# Pull out the series id information, if requested
if include_series_id:
series_id_col = pd.Series(
series_id_with_name.map(lambda col_name: col_name.split("_")[-1]),
series_id_with_name.map(
lambda col_name: col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1],
),
name=series_id_name or "series_id",
index=stacked_series.index,
)
stacked_series = pd.concat([series_id_col, stacked_series], axis=1)

return stacked_series


Expand All @@ -1511,8 +1521,8 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values
for col in X.columns:
if col == time_index:
continue
separated_name = col.split("_")
original_columns.add("_".join(separated_name[:-1]))
separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL)
original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1]))
series_ids.add(separated_name[-1])

if len(series_ids) == 0:
Expand Down
5 changes: 4 additions & 1 deletion evalml/tests/component_tests/test_time_series_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)

from evalml.pipelines import TimeSeriesFeaturizer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

ROLLING_TRANSFORM_METHOD_NAME = "_compute_rolling_transforms"
DELAYED_FEATURES_METHOD_NAME = "_compute_delays"
Expand Down Expand Up @@ -991,7 +992,9 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked):

assert featurizer.statistically_significant_lags == [6]

expected_y_cols = [f"target_{i}_delay_6" for i in range(y.shape[1])]
expected_y_cols = [
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1])
]
X_t = featurizer.transform(X, y)
for expected_y_col in expected_y_cols:
assert expected_y_col in X_t.columns
13 changes: 11 additions & 2 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)

from evalml.pipelines.components import TimeSeriesImputer
from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL


def test_invalid_strategy_parameters():
Expand Down Expand Up @@ -745,7 +746,12 @@ def test_time_series_imputer_multiseries(
_, y_imputed = imputer.transform(X, y)
assert isinstance(y_imputed, pd.DataFrame)

y_expected = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
y_expected = pd.DataFrame(
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(5)
},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)


Expand Down Expand Up @@ -777,7 +783,10 @@ def test_time_series_imputer_multiseries_some_columns_all_nan(
_, y_imputed = imputer.transform(X, y)

y_expected = pd.DataFrame(
{f"target_{i}": range(i, 100, 5) for i in range(num_nan_cols, 5)},
{
f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}": range(i, 100, 5)
for i in range(num_nan_cols, 5)
},
)
assert_frame_equal(y_imputed, y_expected, check_dtype=False)

Expand Down
21 changes: 18 additions & 3 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1094,12 +1094,27 @@ def multiseries_ts_data_stacked():

@pytest.fixture
def multiseries_ts_data_unstacked():
    """Build an unstacked multiseries dataset: 5 series, 20 observations each.

    X contains two features per series (``feature_a<SEP>i`` and
    ``feature_b<SEP>i`` columns, where <SEP> is the multiseries separator
    symbol) plus a ``date`` column; y contains one ``target<SEP>i`` column
    per series.
    """
    # Imported locally to avoid importing evalml at conftest collection time.
    from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL

    def _col(prefix, series_idx):
        # Column naming convention used throughout the multiseries code path.
        return f"{prefix}{MULTISERIES_SEPARATOR_SYMBOL}{series_idx}"

    series_ids = range(5)
    feature_a = pd.DataFrame(
        {_col("feature_a", i): range(i, 100, 5) for i in series_ids},
    )
    feature_b = pd.DataFrame(
        {_col("feature_b", i): range(99 - i, -1, -5) for i in series_ids},
    )
    X = pd.concat([feature_a, feature_b], axis=1)
    y = pd.DataFrame({_col("target", i): range(i, 100, 5) for i in series_ids})

    X["date"] = pd.date_range(start="1/1/2018", periods=20)
    return X, y
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from evalml.pipelines import MultiseriesRegressionPipeline
from evalml.pipelines.utils import unstack_multiseries
from evalml.preprocessing import split_multiseries_data
from evalml.utils import infer_feature_types


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -90,7 +91,9 @@ def test_multiseries_pipeline_fit(
assert pipeline.frequency is not None


@pytest.mark.parametrize("include_series_id", [True, False])
def test_multiseries_pipeline_predict_in_sample(
include_series_id,
multiseries_ts_data_stacked,
component_graph,
pipeline_parameters,
Expand All @@ -111,14 +114,69 @@ def test_multiseries_pipeline_predict_in_sample(
y_holdout,
X_train=X_train,
y_train=y_train,
include_series_id=include_series_id,
)
expected = pd.Series(
range(55, 65),
index=range(90, 100),
name="target",
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)
if include_series_id:
expected = pd.concat([X_holdout["series_id"], expected], axis=1)
expected = infer_feature_types(expected)
pd.testing.assert_frame_equal(y_pred, expected)
else:
pd.testing.assert_series_equal(y_pred, expected)


@pytest.mark.parametrize("include_series_id", [True, False])
def test_multiseries_pipeline_predict_in_sample_series_out_of_order(
    include_series_id,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Check predict_in_sample handles holdout rows whose series IDs are
    shuffled within each date, returning predictions (and, when requested,
    the series ID column) in the canonical order.
    """
    X, y = multiseries_ts_data_stacked
    X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
        X,
        y,
        "series_id",
        "date",
    )

    # Reorder rows but keep ordered by date.
    # Seed the shuffle so the test is reproducible: sort_values is not a
    # stable sort, so an unseeded shuffle would leave the within-date series
    # order varying from run to run.
    # Store ordered series ID values to compare to output later.
    X_holdout_series_id = X_holdout["series_id"]
    X_index = X_holdout.index
    X_holdout = X_holdout.sample(frac=1, random_state=0).sort_values(by="date")
    y_holdout = y_holdout.reindex(X_holdout.index)

    # Restore the original index so predictions align positionally.
    X_holdout.index = X_index
    y_holdout.index = X_index

    pipeline = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict_in_sample(
        X_holdout,
        y_holdout,
        X_train=X_train,
        y_train=y_train,
        include_series_id=include_series_id,
    )
    expected = pd.Series(
        range(55, 65),
        index=range(90, 100),
        name="target",
        dtype="float64",
    )
    if include_series_id:
        # With the series ID included, the result is a DataFrame whose ID
        # column matches the pre-shuffle (canonical) series ordering.
        expected = pd.concat([X_holdout_series_id, expected], axis=1)
        expected = infer_feature_types(expected)
        pd.testing.assert_frame_equal(y_pred, expected)
    else:
        pd.testing.assert_series_equal(y_pred, expected)


@pytest.mark.parametrize("forecast_horizon", [1, 7])
Expand Down
4 changes: 3 additions & 1 deletion evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
handle_component_class,
)
from evalml.pipelines.utils import (
MULTISERIES_SEPARATOR_SYMBOL,
_get_pipeline_base_class,
_get_preprocessing_components,
_make_pipeline_from_multiple_graphs,
Expand Down Expand Up @@ -1404,7 +1405,8 @@ def test_unstack_multiseries(
X_unstacked, y_unstacked = multiseries_ts_data_unstacked
y.name = target_name
y_unstacked.columns = [
f"{target_name}_{i}" for i in range(len(y_unstacked.columns))
f"{target_name}{MULTISERIES_SEPARATOR_SYMBOL}{i}"
for i in range(len(y_unstacked.columns))
]

X_unstacked_transformed, y_unstacked_transformed = unstack_multiseries(
Expand Down