Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Release Notes
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Added multiseries regression pipeline class :pr:`4256`
* Added multiseries VARMAX regressor :pr:`4238`
* Added support for prediction intervals for VARMAX regressor :pr:`4267`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class ExponentialSmoothingRegressor(Estimator):
random_seed (int): Seed for the random number generator. Defaults to 0.
"""

_N_REPETITIONS = 400

name = "Exponential Smoothing Regressor"
hyperparameter_ranges = {
"trend": [None, "additive"],
Expand Down Expand Up @@ -167,7 +169,7 @@ def get_prediction_intervals(
# anchor represents where the simulations should start from (forecasting is done from the "end")
y_pred = self._component_obj._fitted_forecaster.simulate(
nsimulations=X.shape[0],
repetitions=400,
repetitions=self._N_REPETITIONS,
anchor="end",
random_state=self.parameters["random_state"],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class VARMAXRegressor(Estimator):
solely be based off of the datetimes and target values. Defaults to True.
"""

_N_REPETITIONS = 400

name = "VARMAX Regressor"
hyperparameter_ranges = {
"p": Integer(0, 10),
Expand Down Expand Up @@ -215,11 +217,51 @@ def get_prediction_intervals(
predictions (pd.Series): Not used for VARMAX regressor.

Returns:
dict: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper.
dict[dict]: A dict of prediction intervals, where the dict is in the format {series_id: {coverage}_lower or {coverage}_upper}.
"""
raise NotImplementedError(
"VARMAX does not have prediction intervals implemented yet.",
if coverage is None:
coverage = [0.95]

X, y = self._manage_woodwork(X, y)
use_exog = (
# If exogenous variables were used during training
self._component_obj._fitted_forecaster.model.exog is not None
and self.use_covariates
)
if use_exog:
X = X.ww.select(exclude=["Datetime"])
X = convert_bool_to_double(X)
# Accesses the fitted statsmodels model within sktime
# nsimulations represents how many steps should be simulated
# repetitions represents the number of simulations that should be run (confusing, I know)
# anchor represents where the simulations should start from (forecasting is done from the "end")
y_pred = self._component_obj._fitted_forecaster.simulate(
nsimulations=X.shape[0],
repetitions=self._N_REPETITIONS,
anchor="end",
random_state=self.random_seed,
exog=X if use_exog else None,
)
prediction_interval_result = {}
# Access the target column names (i.e. the series_id values) that the VARMAX component obj was fitted on
for series in self._component_obj._fitted_forecaster.model.endog_names:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are endog_names, where do those come from? Is that the columns of y in unstacked/dataframe format?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they are and they are set internally in statsmodels during the fit process. I can add a comment describing this. I don't think there's a better way to access this info other than storing it as class variable during the fit process?

series_result = {}
series_preds = y_pred[[col for col in y_pred.columns if series in col]]
for conf_int in coverage:
prediction_interval_lower = series_preds.quantile(
q=round((1 - conf_int) / 2, 3),
axis="columns",
)
prediction_interval_upper = series_preds.quantile(
q=round((1 + conf_int) / 2, 3),
axis="columns",
)
prediction_interval_lower.index = X.index
prediction_interval_upper.index = X.index
series_result[f"{conf_int}_lower"] = prediction_interval_lower
series_result[f"{conf_int}_upper"] = prediction_interval_upper
prediction_interval_result[series] = series_result
return prediction_interval_result

@property
def feature_importance(self) -> np.ndarray:
Expand Down
59 changes: 43 additions & 16 deletions evalml/tests/component_tests/test_varmax_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,26 +260,10 @@ def test_varmax_regressor_respects_use_covariates(
assert "X" not in mock_predict.call_args.kwargs


def test_varmax_regressor_prediction_intervals_not_implemented_yet_error(
ts_multiseries_data,
):
X_train, X_test, y_train = ts_multiseries_data(n_series=2)

clf = VARMAXRegressor()

clf.fit(X_train, y_train)
with pytest.raises(
NotImplementedError,
match="VARMAX does not have prediction intervals implemented yet.",
):
clf.get_prediction_intervals(X_test)


def test_varmax_regressor_can_forecast_arbitrary_dates_no_covariates(
ts_multiseries_data,
):
X, _, y = ts_multiseries_data(n_series=2)

X_train, X_test, y_train, y_test = split_data(
X,
y,
Expand Down Expand Up @@ -329,3 +313,46 @@ def test_varmax_regressor_can_forecast_arbitrary_dates_past_holdout(
varmax.fit(X_train, y_train)

varmax.predict(X_test)


@pytest.mark.parametrize("use_X_train", [True, False])
@pytest.mark.parametrize("coverages", [None, [0.85], [0.95, 0.90, 0.85]])
@pytest.mark.parametrize("use_covariates", [True, False])
def test_varmax_regressor_prediction_intervals(
use_covariates,
coverages,
use_X_train,
ts_multiseries_data,
):
X_train, X_test, y_train = ts_multiseries_data(no_features=not use_covariates)

clf = VARMAXRegressor(use_covariates=use_covariates)
clf.fit(X=X_train if use_X_train else None, y=y_train)

# Check we are not using exogenous variables if use_covariates=False, even if X_test is passed.
if not use_covariates or not use_X_train:
with patch.object(
clf._component_obj._fitted_forecaster,
"simulate",
) as mock_simulate:
clf.get_prediction_intervals(X_test, None, coverages)
assert mock_simulate.call_args[1]["exog"] is None

results_coverage = clf.get_prediction_intervals(X_test, None, coverages)
predictions = clf.predict(X_test)

series_id_targets = list(results_coverage.keys())
for series in series_id_targets:
series_results_coverage = results_coverage[series]
conf_ints = list(series_results_coverage.keys())
data = list(series_results_coverage.values())

assert len(conf_ints) == (len(coverages) if coverages is not None else 1) * 2
assert len(data) == (len(coverages) if coverages is not None else 1) * 2

for interval in coverages if coverages is not None else [0.95]:
conf_int_lower = f"{interval}_lower"
conf_int_upper = f"{interval}_upper"

assert (series_results_coverage[conf_int_upper] > predictions[series]).all()
assert (predictions[series] > series_results_coverage[conf_int_lower]).all()