Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Release Notes
* Added baseline regressor for multiseries time series problems :pr:`4246`
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Added multiseries regression pipeline class :pr:`4256`
* Added multiseries VARMAX regressor :pr:`4238`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Fixed bug where time series pipelines would fail due to MASE needing `y_train` when scoring :pr:`4258`
Expand Down
4 changes: 4 additions & 0 deletions evalml/model_family/model_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ class ModelFamily(Enum):
ARIMA = "arima"
"""ARIMA model family."""

VARMAX = "varmax"
"""VARMAX model family."""

BASELINE = "baseline"
"""Baseline model family."""

Expand Down Expand Up @@ -69,6 +72,7 @@ def __str__(self):
ModelFamily.ENSEMBLE.name: "Ensemble",
ModelFamily.EXPONENTIAL_SMOOTHING.name: "Exponential Smoothing",
ModelFamily.ARIMA.name: "ARIMA",
ModelFamily.VARMAX.name: "VARMAX",
ModelFamily.PROPHET.name: "Prophet",
ModelFamily.NONE.name: "None",
}
Expand Down
1 change: 1 addition & 0 deletions evalml/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
SVMRegressor,
ExponentialSmoothingRegressor,
ARIMARegressor,
VARMAXRegressor,
ProphetRegressor,
VowpalWabbitBinaryClassifier,
VowpalWabbitMulticlassClassifier,
Expand Down
1 change: 1 addition & 0 deletions evalml/pipelines/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
SVMRegressor,
ExponentialSmoothingRegressor,
ARIMARegressor,
VARMAXRegressor,
VowpalWabbitBinaryClassifier,
VowpalWabbitMulticlassClassifier,
VowpalWabbitRegressor,
Expand Down
1 change: 1 addition & 0 deletions evalml/pipelines/components/estimators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
SVMRegressor,
ExponentialSmoothingRegressor,
ARIMARegressor,
VARMAXRegressor,
ProphetRegressor,
VowpalWabbitRegressor,
)
3 changes: 3 additions & 0 deletions evalml/pipelines/components/estimators/regressors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
from evalml.pipelines.components.estimators.regressors.arima_regressor import (
ARIMARegressor,
)
from evalml.pipelines.components.estimators.regressors.varmax_regressor import (
VARMAXRegressor,
)
from evalml.pipelines.components.estimators.regressors.vowpal_wabbit_regressor import (
VowpalWabbitRegressor,
)
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import pandas as pd
from pandas.api.types import is_integer_dtype
from skopt.space import Integer
from sktime.forecasting.base import ForecastingHorizon

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.pipelines.components.utils import convert_bool_to_double, match_indices
from evalml.problem_types import ProblemTypes
from evalml.utils import (
import_or_raise,
Expand Down Expand Up @@ -139,21 +141,7 @@ def _remove_datetime(

return data_no_dt

def _match_indices(
self,
X: pd.DataFrame,
y: pd.Series,
) -> Tuple[pd.DataFrame, pd.Series]:
if X is not None:
if X.index.equals(y.index):
return X, y
else:
y.index = X.index
return X, y

def _set_forecast(self, X: pd.DataFrame):
from sktime.forecasting.base import ForecastingHorizon

# we can only calculate the difference if the indices are of the same type
units_diff = 1
if isinstance(X.index[0], type(self.last_X_index)) and isinstance(
Expand Down Expand Up @@ -220,14 +208,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
X = self._remove_datetime(X, features=True)

if X is not None:
X.ww.set_types(
{
col: "Double"
for col in X.ww.select(["Boolean"], return_schema=True).columns
},
)
X = convert_bool_to_double(X)
y = self._remove_datetime(y)
X, y = self._match_indices(X, y)
X, y = match_indices(X, y)

if X is not None and not X.empty and self.use_covariates:
self._component_obj.fit(y=y, X=X)
Expand All @@ -238,12 +221,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
def _manage_types_and_forecast(self, X: pd.DataFrame) -> tuple:
fh_ = self._set_forecast(X)
X = X.ww.select(exclude=["Datetime"])
X.ww.set_types(
{
col: "Double"
for col in X.ww.select(["Boolean"], return_schema=True).columns
},
)
X = convert_bool_to_double(X)
return X, fh_

@staticmethod
Expand Down
227 changes: 227 additions & 0 deletions evalml/pipelines/components/estimators/regressors/varmax_regressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
"""Vector Autoregressive Moving Average with eXogenous regressors model. The two parameters (p, q) are the AR order and the MA order. More information here: https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html."""
from typing import Dict, Hashable, List, Optional, Union

import numpy as np
import pandas as pd
from skopt.space import Categorical, Integer
from sktime.forecasting.base import ForecastingHorizon

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.pipelines.components.utils import convert_bool_to_double, match_indices
from evalml.problem_types import ProblemTypes
from evalml.utils import (
import_or_raise,
infer_feature_types,
)


class VARMAXRegressor(Estimator):
"""Vector Autoregressive Moving Average with eXogenous regressors model. The two parameters (p, q) are the AR order and the MA order. More information here: https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html.

Currently VARMAXRegressor isn't supported via conda install. It's recommended that it be installed via PyPI.

Args:
time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
p (int): Maximum Autoregressive order. Defaults to 1.
q (int): Maximum Moving Average order. Defaults to 0.
trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
as [1, 1, 0, 1].
random_seed (int): Seed for the random number generator. Defaults to 0.
Comment thread
christopherbunn marked this conversation as resolved.
maxiter (int): Maximum number of iterations for solver. Defaults to 10.
use_covariates (bool): If True, will pass exogenous variables in fit/predict methods. If False, forecasts will
solely be based off of the datetimes and target values. Defaults to True.
"""

name = "VARMAX Regressor"
# Search space for tuning. The attribute docstring below mirrors the real
# bounds (it previously advertised Integer(1, 10) while the code used 0).
# NOTE(review): with both lower bounds at 0 the pair (p, q) == (0, 0) can be
# sampled; the underlying VARMAX model presumably rejects that order --
# confirm whether the bounds should start at 1 instead.
hyperparameter_ranges = {
    "p": Integer(0, 10),
    "q": Integer(0, 10),
    "trend": Categorical(["n", "c", "t", "ct"]),
}
"""{
    "p": Integer(0, 10),
    "q": Integer(0, 10),
    "trend": Categorical(['n', 'c', 't', 'ct']),
}"""
model_family = ModelFamily.VARMAX
"""ModelFamily.VARMAX"""
# Declared after the model_family docstring so that docstring stays attached
# to the attribute it describes.
is_multiseries = True
supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
"""[ProblemTypes.TIME_SERIES_REGRESSION]"""

def __init__(
    self,
    time_index: Optional[Hashable] = None,
    p: int = 1,
    q: int = 0,
    trend: Optional[str] = "c",
    random_seed: Union[int, float] = 0,
    maxiter: int = 10,
    use_covariates: bool = True,
    **kwargs,
):
    """Create the wrapped sktime VARMAX model and record the component parameters.

    Args:
        time_index: Stored on the instance and in the component parameters.
        p: First element of the ``order`` tuple given to the model.
        q: Second element of the ``order`` tuple given to the model.
        trend: Forwarded to the model as ``trend``.
        random_seed: Forwarded to the parent constructor.
        maxiter: Forwarded to the model as ``maxiter``.
        use_covariates: Stored on the instance and in the component parameters.
        **kwargs: Extra model arguments; they are merged over the values above.
    """
    self.preds_95_upper = None
    self.preds_95_lower = None

    # Everything in this mapping goes straight to the VARMAX constructor.
    model_kwargs = dict(order=(p, q), trend=trend, maxiter=maxiter)
    model_kwargs.update(kwargs)

    sktime_varmax = import_or_raise(
        "sktime.forecasting.varmax",
        error_msg="sktime is not installed. Please install using `pip install sktime.`",
    )
    forecaster = sktime_varmax.VARMAX(**model_kwargs)

    self.use_covariates = use_covariates
    self.time_index = time_index

    # The component-level settings are recorded alongside the model arguments
    # but are never passed to the model itself.
    component_parameters = {
        **model_kwargs,
        "use_covariates": use_covariates,
        "time_index": time_index,
    }
    super().__init__(
        parameters=component_parameters,
        component_obj=forecaster,
        random_seed=random_seed,
    )

def _set_forecast_horizon(self, X: pd.DataFrame):
    """Build the relative forecasting horizon covering the rows of ``X``.

    The horizon has one step per row of ``X``. Its first step is the distance
    between ``self.last_X_index`` (recorded during ``fit``) and the first index
    value of ``X``; when that distance cannot be computed it defaults to 1.

    Args:
        X (pd.DataFrame): Data to forecast on. Only its index is read.

    Returns:
        ForecastingHorizon: Relative horizon with ``len(X)`` steps.
    """
    # we can only calculate the difference if the indices are of the same type
    units_diff = 1
    if isinstance(X.index[0], type(self.last_X_index)):
        if isinstance(X.index, pd.DatetimeIndex):
            # Count the periods between the end of training and the start of X.
            # NOTE(review): this relies on X.index.freq being set -- confirm
            # callers always provide a datetime index with a frequency.
            dates_diff = pd.date_range(
                start=self.last_X_index,
                end=X.index[0],
                freq=X.index.freq,
            )
            units_diff = len(dates_diff) - 1
        elif X.index.inferred_type in ("integer", "floating"):
            # Same check that the deprecated ``Index.is_numeric()`` performed,
            # without the pandas 2 deprecation warning.
            units_diff = X.index[0] - self.last_X_index
    fh_ = ForecastingHorizon(
        [units_diff + i for i in range(len(X))],
        is_relative=True,
    )
    return fh_
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super nit, but should X here be more like X_forecast? It's what we're forecasting on, right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't X what we're forecasting on?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're forecasting on the entire dataset? or a subset of X?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see what you mean. X in this case is the covariate data which is indeed what we're forecasting on. I can change X to be X_forecast in this case if that clarifies it.


def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
"""Fits VARMAX regressor to data.

Args:
X (pd.DataFrame): The input training data of shape [n_samples, n_features].
y (pd.DataFrame): The target training data of shape [n_samples, n_series_id_values].

Returns:
self

Raises:
ValueError: If y was not passed in.
"""
X, y = self._manage_woodwork(X, y)

if y is None:
raise ValueError("VARMAX Regressor requires y as input.")
Comment thread
christopherbunn marked this conversation as resolved.
Comment on lines +132 to +133
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I recall correctly, VARMAX can predict on single series data just fine. I'm curious of your thoughts, should we enforce that we only run it for multiseries? We can do that easily here by double checking y's shape - it would also help prevent the case where we try to predict on stacked multiseries data (I did that in local VARMAX testing out of curiosity and it didn't error but also didn't do what we'd want it to do)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think sktime enforces that there must be two or more columns for y iirc. The docstring for sktime fit is not super clear on this but it says it errors out if it has a specific multivariate tag.

I'm open to having a ValueError raised if there's a single column saying it might need to be unstacked if you think it's worth having a more specific error message!

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If VARMAX raises it, I'm sure that's fine 😀


if X is not None and self.use_covariates:
self.last_X_index = X.index[-1]
X = X.ww.select(exclude=["Datetime"])

X = convert_bool_to_double(X)
y = convert_bool_to_double(y)
X, y = match_indices(X, y)

if not X.empty:
self._component_obj.fit(y=y, X=X)
else:
self._component_obj.fit(y=y)
else:
self.last_X_index = y.index[-1]
self._component_obj.fit(y=y)
return self

def _manage_types_and_forecast(self, X: pd.DataFrame) -> tuple:
    """Prepare ``X`` for the forecaster and compute its forecasting horizon.

    Args:
        X (pd.DataFrame): Woodwork-initialized data to forecast on.

    Returns:
        tuple: ``X`` without its Datetime columns and passed through
            ``convert_bool_to_double``, followed by the forecasting horizon.
    """
    # The horizon is derived from the untouched input, before columns are dropped.
    horizon = self._set_forecast_horizon(X)
    without_datetime = X.ww.select(exclude=["Datetime"])
    return convert_bool_to_double(without_datetime), horizon

def predict(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> pd.Series:
    """Make predictions using fitted VARMAX regressor.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features]. Its index is used to build the forecast horizon
            and is assigned to the returned predictions.
        y (pd.DataFrame): Target data of shape [n_samples, n_series_id_values]. Not used to compute the predictions.

    Returns:
        pd.Series: Predicted values, indexed like ``X``.

    Raises:
        ValueError: If X was not passed in.
    """
    X, y = self._manage_woodwork(X, y)
    if X is None:
        # The forecast horizon is derived from X's index, so fail with a clear
        # message instead of an AttributeError further down.
        raise ValueError("VARMAX Regressor requires X as input to predict.")
    X, fh_ = self._manage_types_and_forecast(X=X)

    if not X.empty and self.use_covariates:
        if fh_[0] != 1:
            # statsmodels (which sktime uses under the hood) only forecasts off the training data
            # but sktime circumvents this by predicting everything from the end of training data to the date / periods requested
            # and only returning the values for dates / periods given to sktime. Because of this,
            # the number of covariate rows must equal the total number of periods (X.shape[0] == fh_[-1]) if covariates are used.
            # We circumvent this by adding arbitrary rows to the start of X since sktime discards these values when predicting.
            num_rows_diff = fh_[-1] - X.shape[0]
            filler = pd.DataFrame(
                columns=X.columns,
                index=range(num_rows_diff),
            ).fillna(0)
            X_ = pd.concat([filler, X], ignore_index=True)
            X_.ww.init(schema=X.ww.schema)
        else:
            X_ = X
        y_pred = self._component_obj.predict(
            fh=fh_,
            X=X_,
        )
    else:
        y_pred = self._component_obj.predict(
            fh=fh_,
        )
    # Report the predictions against the caller's index rather than the horizon's.
    y_pred.index = X.index
    return infer_feature_types(y_pred)

def get_prediction_intervals(
    self,
    X: pd.DataFrame,
    y: pd.DataFrame = None,
    coverage: List[float] = None,
    predictions: pd.Series = None,
) -> Dict[str, pd.Series]:
    """Prediction intervals for the fitted VARMAXRegressor (not available yet).

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].
        y (pd.DataFrame): Target data of shape [n_samples, n_series_id_values]. Optional.
        coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the
            prediction interval should be calculated for.
        predictions (pd.Series): Not used for VARMAX regressor.

    Returns:
        dict: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper.

    Raises:
        NotImplementedError: Always; none of the arguments are inspected.
    """
    message = "VARMAX does not have prediction intervals implemented yet."
    raise NotImplementedError(message)

@property
def feature_importance(self) -> np.ndarray:
    """Placeholder importance: a length-1 array of zeros, since feature_importance is not defined for VARMAX regressor."""
    placeholder = np.zeros(shape=1)
    return placeholder
Loading