"""
Rolling OLS and WLS

Implements an efficient rolling estimator that avoids repeated matrix
multiplication.

Copyright (c) 2019 Kevin Sheppard
License: 3-clause BSD
"""
from statsmodels.compat.numpy import lstsq

from collections import namedtuple

import numpy as np
from pandas import Series, DataFrame, MultiIndex
from pandas.util._decorators import Appender, Substitution
from scipy import stats

from statsmodels.base import model
from statsmodels.base.model import LikelihoodModelResults, Model
from statsmodels.compat.pandas import cache_readonly
from statsmodels.regression.linear_model import RegressionResults, \
    RegressionModel
from statsmodels.tools.validation import array_like, int_like, string_like


def strip4(line):
    # Remove the leading 4-space indent from the shared model docstring
    if line.startswith(' '):
        return line[4:]
    return line


RollingStore = namedtuple('RollingStore', ['params', 'ssr', 'llf', 'nobs',
                                           's2', 'xpxi', 'xeex',
                                           'centered_tss', 'uncentered_tss'])

common_params = '\n'.join(map(strip4, model._model_params_doc.split('\n')))
window_parameters = """\
window : int
    Length of the rolling window. Must be strictly larger than the number
    of variables in the model.
"""

weight_parameters = """
weights : array_like, optional
    A 1d array of weights.  If you supply 1/W then the variables are
    pre-multiplied by 1/sqrt(W).  If no weights are supplied the
    default value is 1 and WLS results are the same as OLS.
"""

_missing_param_doc = """\
min_nobs : {int, None}
    Minimum number of observations required to estimate a model when
    data are missing.  If None, the minimum depends on the number of
    regressors in the model. Must be smaller than window.
missing : str
    Available options are 'drop', 'skip' and 'raise'. If 'drop', any
    observations with nans are dropped and the estimates are computed using
    only the non-missing values in each window. If 'skip', blocks containing
    missing values are skipped and the corresponding results contain NaN.
    If 'raise', an error is raised. Default is 'drop'."""
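

# A small sketch (illustrative only; RollingOLS is defined further down in
# this module, so call this only after the module is fully imported)
# contrasting the 'drop' and 'skip' options documented above:
def _demo_missing_options():
    """Illustrative only: 'drop' estimates, 'skip' leaves NaN."""
    rs = np.random.RandomState(0)
    x = np.column_stack([np.ones(40), rs.standard_normal(40)])
    y = x @ np.array([1.0, 0.5]) + rs.standard_normal(40)
    y[5] = np.nan
    drop_res = RollingOLS(y, x, window=10, missing='drop').fit()
    skip_res = RollingOLS(y, x, window=10, missing='skip').fit()
    # 'drop' re-estimates each full window on its non-missing subset
    assert np.isfinite(drop_res.params[9:]).all()
    # 'skip' leaves NaN for every window that touches the missing value
    assert np.isnan(skip_res.params[:15]).all()
    return drop_res, skip_res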


extra_base = _missing_param_doc
extra_parameters = window_parameters + weight_parameters + extra_base

_doc = """
Rolling %(model_type)s Least Squares

%(parameters)s
%(extra_parameters)s

See Also
--------
statsmodels.regression.linear_model.%(model)s
    %(model)s estimation and parameter testing.

Notes
-----
Tested against %(model)s for accuracy.

Results may differ from %(model)s applied to windows of data if this
model contains an implicit constant (i.e., includes dummies for all
categories) rather than an explicit constant (e.g., a column of 1s).

Examples
--------
>>> from statsmodels.regression.rolling import Rolling%(model)s
>>> from statsmodels.datasets import longley
>>> from statsmodels.tools import add_constant
>>> data = longley.load(as_pandas=False)
>>> exog = add_constant(data.exog, prepend=False)
>>> mod = Rolling%(model)s(data.endog, exog)
>>> rolling_res = mod.fit(reset=50)

Use params_only to skip all calculations except parameter estimation

>>> rolling_params = mod.fit(params_only=True)
"""


@Substitution(model_type='Weighted', model='WLS', parameters=common_params,
              extra_parameters=extra_parameters)
@Appender(_doc)
class RollingWLS(object):
    def __init__(self, endog, exog, window=None, weights=None, min_nobs=None,
                 missing='drop'):
        # Call Model.__init__ twice: the first pass uses constant detection,
        # the second does not drop observations
        missing = string_like(missing, 'missing',
                              options=('drop', 'raise', 'skip'))
        temp_msng = 'drop' if missing != 'raise' else 'raise'
        Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
        k_const = self.k_constant
        const_idx = self.data.const_idx
        Model.__init__(self, endog, exog, missing='none', hasconst=False)
        self.k_constant = k_const
        self.data.const_idx = const_idx
        self._y = array_like(endog, 'endog')
        nobs = self._y.shape[0]
        self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
        window = int_like(window, 'window', optional=True)
        weights = array_like(weights, 'weights', optional=True, shape=(nobs,))
        self._window = window if window is not None else self._y.shape[0]
        self._weighted = weights is not None
        self._weights = np.ones(nobs) if weights is None else weights
        w12 = np.sqrt(self._weights)
        self._wy = w12 * self._y
        self._wx = w12[:, None] * self._x

        self._is_nan = np.zeros_like(self._y, dtype=bool)
        self._has_nan = self._find_nans()
        self.const_idx = self.data.const_idx
        self._skip_missing = missing == 'skip'

        min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
        self._min_nobs = (min_nobs if min_nobs is not None
                          else self._x.shape[1])
        if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
            raise ValueError('min_nobs must be larger than the number of '
                             'regressors in the model and less than window')

    def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
        return Model._handle_data(self, endog, exog, missing, hasconst,
                                  **kwargs)

    def _find_nans(self):
        nans = np.isnan(self._y)
        nans |= np.any(np.isnan(self._x), axis=1)
        nans |= np.isnan(self._weights)
        self._is_nan[:] = nans
        has_nan = np.cumsum(nans)
        w = self._window
        has_nan[w - 1:] = has_nan[w - 1:] - has_nan[:-(w - 1)]
        has_nan[:w - 1] = False

        return has_nan.astype(bool)

    def _get_data(self, idx):
        window = self._window
        # TODO: Make a helper function
        y = self._y[idx - window:idx]
        wy = self._wy[idx - window:idx]
        wx = self._wx[idx - window:idx]
        weights = self._weights[idx - window:idx]
        missing = self._is_nan[idx - window:idx]
        not_missing = ~missing
        if np.any(missing):
            y = y[not_missing]
            wy = wy[not_missing]
            wx = wx[not_missing]
            weights = weights[not_missing]
        return y, wy, wx, weights, not_missing

    def _fit_single(self, idx, wxpwx, wxpwy, nobs, store, params_only,
                    method):
        if nobs < self._min_nobs:
            return
        try:
            wxpwxi = np.linalg.inv(wxpwx)
            if method == 'inv':
                params = wxpwxi @ wxpwy
            else:
                _, wy, wx, _, _ = self._get_data(idx)
                if method == 'lstsq':
                    params = lstsq(wx, wy)[0]
                else:  # 'pinv'
                    wxpwxiwxp = np.linalg.pinv(wx)
                    params = wxpwxiwxp @ wy
        except np.linalg.LinAlgError:
            return
        store.params[idx - 1] = params
        if params_only:
            return
        y, wy, wx, weights, _ = self._get_data(idx)

        wresid, ssr, llf = self._loglike(params, wy, wx, weights, nobs)
        wxwresid = wx * wresid[:, None]
        wxepwxe = wxwresid.T @ wxwresid
        tot_params = wx.shape[1]
        s2 = ssr / (nobs - tot_params)

        centered_tss, uncentered_tss = self._sum_of_squares(y, wy, weights)

        store.ssr[idx - 1] = ssr
        store.llf[idx - 1] = llf
        store.nobs[idx - 1] = nobs
        store.s2[idx - 1] = s2
        store.xpxi[idx - 1] = wxpwxi
        store.xeex[idx - 1] = wxepwxe
        store.centered_tss[idx - 1] = centered_tss
        store.uncentered_tss[idx - 1] = uncentered_tss

    def _loglike(self, params, wy, wx, weights, nobs):
        nobs2 = nobs / 2.0
        wresid = wy - wx @ params
        ssr = np.sum(wresid ** 2, axis=0)
        llf = -np.log(ssr) * nobs2  # concentrated likelihood
        llf -= (1 + np.log(np.pi / nobs2)) * nobs2  # with constant
        llf += 0.5 * np.sum(np.log(weights))
        return wresid, ssr, llf

    def _sum_of_squares(self, y, wy, weights):
        mean = np.average(y, weights=weights)
        centered_tss = np.sum(weights * (y - mean) ** 2)
        uncentered_tss = np.dot(wy, wy)
        return centered_tss, uncentered_tss

    def _reset(self, idx):
        """Compute xpx and xpy using a single dot product"""
        _, wy, wx, _, not_missing = self._get_data(idx)
        nobs = not_missing.sum()
        xpx = wx.T @ wx
        xpy = wx.T @ wy
        return xpx, xpy, nobs

    def fit(self, method='inv', cov_type='nonrobust', cov_kwds=None,
            reset=None, use_t=False, params_only=False):
        """
        Estimate model parameters.

        Parameters
        ----------
        method : {'inv', 'lstsq', 'pinv'}
            Method to use when computing the model parameters.

            * 'inv' - use moving window inner-products and matrix inversion.
              This method is the fastest, but may be less accurate than the
              other methods.
            * 'lstsq' - Use numpy.linalg.lstsq
            * 'pinv' - Use numpy.linalg.pinv. This method matches the default
              estimator in non-moving regression estimators.
        cov_type : {'nonrobust', 'HCCM', 'HC0'}
            Covariance estimator:

            * nonrobust - The classic OLS covariance estimator
            * HCCM, HC0 - White heteroskedasticity robust covariance
        cov_kwds : dict
            Unused
        reset : int, optional
            Interval to recompute the moving window inner products used to
            estimate the model parameters. Smaller values improve accuracy,
            although in practice this setting is not required to be set.
        use_t : bool, optional
            Flag indicating to use the Student's t distribution when
            computing p-values.
        params_only : bool, optional
            Flag indicating that only parameters should be computed. Avoids
            calculating all other statistics or performing inference.

        Returns
        -------
        RollingRegressionResults
            Estimation results where all pre-sample values are nan-filled.
        """
        method = string_like(method, 'method',
                             options=('inv', 'lstsq', 'pinv'))
        reset = int_like(reset, 'reset', optional=True)
        reset = self._y.shape[0] if reset is None else reset
        if reset < 1:
            raise ValueError('reset must be a positive integer')

        window = self._window
        nobs, k = self._x.shape
        store = RollingStore(params=np.full((nobs, k), np.nan),
                             ssr=np.full(nobs, np.nan),
                             llf=np.full(nobs, np.nan),
                             nobs=np.zeros(nobs, dtype=int),
                             s2=np.full(nobs, np.nan),
                             xpxi=np.full((nobs, k, k), np.nan),
                             xeex=np.full((nobs, k, k), np.nan),
                             centered_tss=np.full(nobs, np.nan),
                             uncentered_tss=np.full(nobs, np.nan))
        xpx, xpy, nobs = self._reset(window)
        w = self._window
        if not (self._has_nan[window - 1] and self._skip_missing):
            self._fit_single(w, xpx, xpy, nobs, store, params_only, method)
        wx, wy = self._wx, self._wy
        for i in range(w + 1, self._x.shape[0] + 1):
            if self._has_nan[i - 1] and self._skip_missing:
                continue
            if i % reset == 0:
                xpx, xpy, nobs = self._reset(i)
            else:
                # Rank-one update: drop the observation leaving the window
                # and add the one entering it
                if not self._is_nan[i - w - 1]:
                    remove_x = wx[i - w - 1:i - w]
                    xpx -= remove_x.T @ remove_x
                    xpy -= remove_x.T @ wy[i - w - 1:i - w]
                    nobs -= 1
                if not self._is_nan[i - 1]:
                    add_x = wx[i - 1:i]
                    xpx += add_x.T @ add_x
                    xpy += add_x.T @ wy[i - 1:i]
                    nobs += 1

            self._fit_single(i, xpx, xpy, nobs, store, params_only, method)

        return RollingRegressionResults(self, store, self.k_constant, use_t,
                                        cov_type)

    @classmethod
    @Appender(Model.from_formula.__doc__)
    def from_formula(cls, formula, data, window, weights=None, subset=None,
                     *args, **kwargs):
        if subset is not None:
            data = data.loc[subset]
        eval_env = kwargs.pop('eval_env', None)
        if eval_env is None:
            eval_env = 2
        elif eval_env == -1:
            from patsy import EvalEnvironment
            eval_env = EvalEnvironment({})
        else:
            eval_env += 1  # we're going down the stack again
        missing = kwargs.get('missing', 'skip')
        from patsy import dmatrices, NAAction
        na_action = NAAction(on_NA='raise', NA_types=[])
        result = dmatrices(formula, data, eval_env, return_type='dataframe',
                           NA_action=na_action)

        endog, exog = result
        if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
            raise ValueError('endog has evaluated to an array with multiple '
                             'columns that has shape {0}. This occurs when '
                             'the variable converted to endog is non-numeric '
                             '(e.g., bool or str).'.format(endog.shape))

        kwargs.update({'missing': missing, 'window': window})
        if weights is not None:
            kwargs['weights'] = weights
        mod = cls(endog, exog, *args, **kwargs)
        mod.formula = formula
        # since we got a dataframe, attach the original
        mod.data.frame = data
        return mod
extra_parameters = window_parameters + extra_base


@Substitution(model_type='Ordinary', model='OLS', parameters=common_params,
              extra_parameters=extra_parameters)
@Appender(_doc)
class RollingOLS(RollingWLS):
    def __init__(self, endog, exog, window=None, min_nobs=None,
                 missing='drop'):
        super().__init__(endog, exog, window, None, min_nobs, missing)


class RollingRegressionResults(object):
    """
    Results from rolling regressions

    Parameters
    ----------
    model : RollingWLS
        Model instance
    store : RollingStore
        Container for raw moving window results
    k_constant : bool
        Flag indicating that the model contains a constant
    use_t : bool
        Flag indicating to use the Student's t distribution when computing
        p-values.
    cov_type : str
        Name of covariance estimator
    """

    def __init__(self, model, store: RollingStore, k_constant, use_t,
                 cov_type):
        self.model = model
        self._params = store.params
        self._ssr = store.ssr
        self._llf = store.llf
        self._nobs = store.nobs
        self._s2 = store.s2
        self._xpxi = store.xpxi
        self._xepxe = store.xeex
        self._centered_tss = store.centered_tss
        self._uncentered_tss = store.uncentered_tss
        self._k_constant = k_constant
        self._nvar = self._xpxi.shape[-1]
        if use_t is None:
            use_t = cov_type == 'nonrobust'
        self._use_t = use_t
        self._cov_type = cov_type
        self._use_pandas = self.model.data.row_labels is not None
        self._data_attr = []

    def _wrap(self, val):
        """Wrap output as pandas Series or DataFrames as needed"""
        if not self._use_pandas:
            return val
        col_names = self.model.data.param_names
        row_names = self.model.data.row_labels
        if val.ndim == 1:
            return Series(val, index=row_names)
        if val.ndim == 2:
            return DataFrame(val, columns=col_names, index=row_names)
        else:  # ndim == 3
            mi = MultiIndex.from_product((row_names, col_names))
            val = np.reshape(val, (-1, val.shape[-1]))
            return DataFrame(val, columns=col_names, index=mi)

    @cache_readonly
    @Appender(RegressionResults.aic.func.__doc__)
    def aic(self):
        return self._wrap(RegressionResults.aic.func(self))

    @cache_readonly
    @Appender(RegressionResults.bic.func.__doc__)
    def bic(self):
        with np.errstate(divide='ignore'):
            return self._wrap(RegressionResults.bic.func(self))

    @cache_readonly
    def params(self):
        """Estimated model parameters"""
        return self._wrap(self._params)

    @cache_readonly
    @Appender(RegressionResults.ssr.func.__doc__)
    def ssr(self):
        return self._wrap(self._ssr)

    @cache_readonly
    @Appender(RegressionResults.llf.func.__doc__)
    def llf(self):
        return self._wrap(self._llf)

    @cache_readonly
    @Appender(RegressionModel.df_model.__doc__)
    def df_model(self):
        return self._nvar - self._k_constant

    @cache_readonly
    def k_constant(self):
        """Flag indicating whether the model contains a constant"""
        return self._k_constant

    @cache_readonly
    @Appender(RegressionResults.centered_tss.func.__doc__)
    def centered_tss(self):
        return self._centered_tss

    @cache_readonly
    @Appender(RegressionResults.uncentered_tss.func.__doc__)
    def uncentered_tss(self):
        return self._uncentered_tss

    @cache_readonly
    @Appender(RegressionResults.rsquared.func.__doc__)
    def rsquared(self):
        return self._wrap(RegressionResults.rsquared.func(self))

    @cache_readonly
    @Appender(RegressionResults.rsquared_adj.func.__doc__)
    def rsquared_adj(self):
        return self._wrap(RegressionResults.rsquared_adj.func(self))

    @cache_readonly
    @Appender(RegressionResults.nobs.func.__doc__)
    def nobs(self):
        return self._wrap(self._nobs)

    @cache_readonly
    @Appender(RegressionModel.df_resid.__doc__)
    def df_resid(self):
        return self._wrap(self._nobs - self.df_model - self._k_constant)

    @cache_readonly
    @Appender(RegressionResults.use_t.__doc__)
    def use_t(self):
        return self._use_t

    @cache_readonly
    @Appender(RegressionResults.ess.func.__doc__)
    def ess(self):
        return self._wrap(RegressionResults.ess.func(self))

    @cache_readonly
    @Appender(RegressionResults.mse_model.func.__doc__)
    def mse_model(self):
        return self._wrap(RegressionResults.mse_model.func(self))

    @cache_readonly
    @Appender(RegressionResults.mse_resid.func.__doc__)
    def mse_resid(self):
        return self._wrap(RegressionResults.mse_resid.func(self))

    @cache_readonly
    @Appender(RegressionResults.mse_total.func.__doc__)
    def mse_total(self):
        return self._wrap(RegressionResults.mse_total.func(self))

    @cache_readonly
    def _cov_params(self):
        if self._cov_type == 'nonrobust':
            return self._s2[:, None, None] * self._xpxi
        else:
            # White (sandwich) heteroskedasticity-robust covariance
            return self._xpxi @ self._xepxe @ self._xpxi

    def cov_params(self):
        """
        Estimated parameter covariance

        Returns
        -------
        array_like
            The estimated model covariances. If the original input is a numpy
            array, the returned covariance is a 3-d array with shape
            (nobs, nvar, nvar). If the original inputs are pandas types, then
            the returned covariance is a DataFrame with a MultiIndex with
            key (observation, variable), so that the covariance for
            observation with index i is cov.loc[i].
        """
        return self._wrap(self._cov_params)

    @cache_readonly
    @Appender(RegressionResults.f_pvalue.func.__doc__)
    def f_pvalue(self):
        with np.errstate(invalid='ignore'):
            return self._wrap(RegressionResults.f_pvalue.func(self))

    @cache_readonly
    @Appender(RegressionResults.fvalue.func.__doc__)
    def fvalue(self):
        if self._cov_type == 'nonrobust':
            return self.mse_model / self.mse_resid
        else:
            # Robust covariance: window-by-window Wald statistic testing
            # that all coefficients except the constant are jointly zero
            nobs = self._params.shape[0]
            stat = np.full(nobs, np.nan)
            k = self._params.shape[1]
            r = np.eye(k)
            locs = list(range(k))
            if self.k_constant:
                locs.pop(self.model.const_idx)
            if not locs:
                return stat
            r = r[locs]
            vcv = self._cov_params
            rvcvr = r @ vcv @ r.T
            p = self._params
            for i in range(nobs):
                rp = p[i:i + 1] @ r.T
                denom = rp.shape[1]
                inv_cov = np.linalg.inv(rvcvr[i])
                stat[i] = (rp @ inv_cov @ rp.T) / denom
            return stat

    @cache_readonly
    @Appender(RegressionResults.bse.func.__doc__)
    def bse(self):
        with np.errstate(invalid='ignore'):
            return self._wrap(np.sqrt(np.diagonal(self._cov_params, 0, 2)))

    @cache_readonly
    @Appender(LikelihoodModelResults.tvalues.func.__doc__)
    def tvalues(self):
        with np.errstate(invalid='ignore'):
            return self._wrap(LikelihoodModelResults.tvalues.func(self))

    @cache_readonly
    @Appender(LikelihoodModelResults.pvalues.func.__doc__)
    def pvalues(self):
        if self.use_t:
            df_resid = getattr(self, 'df_resid_inference', self.df_resid)
            df_resid = np.asarray(df_resid)[:, None]
            with np.errstate(invalid='ignore'):
                return stats.t.sf(np.abs(self.tvalues), df_resid) * 2
        else:
            with np.errstate(invalid='ignore'):
                return stats.norm.sf(np.abs(self.tvalues)) * 2

    def _conf_int(self, alpha, cols):
        bse = np.asarray(self.bse)
        if self.use_t:
            dist = stats.t
            df_resid = getattr(self, 'df_resid_inference', self.df_resid)
            df_resid = np.asarray(df_resid)[:, None]
            q = dist.ppf(1 - alpha / 2, df_resid)
        else:
            dist = stats.norm
            q = dist.ppf(1 - alpha / 2)
        params = np.asarray(self.params)
        lower = params - q * bse
        upper = params + q * bse
        if cols is not None:
            cols = np.asarray(cols)
            lower = lower[:, cols]
            upper = upper[:, cols]
        return np.asarray(list(zip(lower, upper)))

    @Appender(LikelihoodModelResults.conf_int.__doc__)
    def conf_int(self, alpha=.05, cols=None):
        ci = self._conf_int(alpha, cols)
        if not self._use_pandas:
            return ci
        ci_names = ('lower', 'upper')
        row_names = self.model.data.row_labels
        col_names = self.model.data.param_names
        if cols is not None:
            col_names = [col_names[i] for i in cols]
        mi = MultiIndex.from_product((col_names, ci_names))
        ci = np.reshape(np.swapaxes(ci, 1, 2), (ci.shape[0], -1))
        return DataFrame(ci, columns=mi, index=row_names)

    @property
    def cov_type(self):
        """Name of covariance estimator"""
        return self._cov_type

    @classmethod
    @Appender(LikelihoodModelResults.load.__doc__)
    def load(cls, fname):
        return LikelihoodModelResults.load(fname)

    remove_data = LikelihoodModelResults.remove_data

    @Appender(LikelihoodModelResults.save.__doc__)
    def save(self, fname, remove_data=False):
        return LikelihoodModelResults.save(self, fname, remove_data)

    def plot_recursive_coefficient(self, variables=None, alpha=0.05,
                                   legend_loc='upper left', fig=None,
                                   figsize=None):
        r"""
        Plot the recursively estimated coefficients on a given variable

        Parameters
        ----------
        variables : {int, str, Iterable[int], Iterable[str], None}, optional
            Integer index or string name of the variables whose coefficients
            to plot. Can also be an iterable of integers or strings. Default
            plots all coefficients.
        alpha : float, optional
            The confidence intervals for the coefficient are
            (1 - alpha) * 100%. Set to None to exclude confidence intervals.
        legend_loc : str, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided figure
            using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        Figure
            The matplotlib Figure object.
        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        if alpha is not None:
            ci = self._conf_int(alpha, None)

        row_labels = self.model.data.row_labels
        if row_labels is None:
            row_labels = np.arange(self._params.shape[0])
        k_variables = self._params.shape[1]
        param_names = self.model.data.param_names
        if variables is None:
            variable_idx = list(range(k_variables))
        else:
            if isinstance(variables, (int, str)):
                variables = [variables]
            variable_idx = []
            for i in range(len(variables)):
                variable = variables[i]
                if variable in param_names:
                    variable_idx.append(param_names.index(variable))
                elif isinstance(variable, int):
                    variable_idx.append(variable)
                else:
                    msg = ('variable {0} is not an integer and was not found '
                           'in the list of variable '
                           'names: {1}'.format(variables[i],
                                               ', '.join(param_names)))
                    raise ValueError(msg)

        _import_mpl()
        fig = create_mpl_fig(fig, figsize)

        loc = 0
        import pandas as pd
        if isinstance(row_labels, pd.PeriodIndex):
            row_labels = row_labels.to_timestamp()
        row_labels = np.asarray(row_labels)
        for i in variable_idx:
            ax = fig.add_subplot(len(variable_idx), 1, loc + 1)
            params = self._params[:, i]
            valid = ~np.isnan(self._params[:, i])
            row_lbl = row_labels[valid]
            ax.plot(row_lbl, params[valid])
            if alpha is not None:
                this_ci = np.reshape(ci[:, :, i], (-1, 2))
                if not np.all(np.isnan(this_ci)):
                    ax.plot(row_lbl, this_ci[:, 0][valid], 'k:',
                            label='Lower CI')
                    ax.plot(row_lbl, this_ci[:, 1][valid], 'k:',
                            label='Upper CI')
                    if loc == 0:
                        ax.legend(loc=legend_loc)
            ax.set_xlim(row_lbl[0], row_lbl[-1])
            ax.set_title(param_names[i])
            loc += 1

        fig.tight_layout()
        return fig
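

# A minimal plotting sketch for the results class above (illustrative only;
# assumes matplotlib is installed and uses simulated monthly data):
def _demo_plot_recursive_coefficient():
    """Illustrative only: rolling coefficients with confidence bands."""
    import pandas as pd
    rs = np.random.RandomState(12345)
    idx = pd.date_range('2000-01-31', periods=300, freq='M')
    exog = pd.DataFrame({'const': 1.0, 'x1': rs.standard_normal(300)},
                        index=idx)
    endog = pd.Series(0.5 * exog['x1'] + rs.standard_normal(300), index=idx)
    res = RollingOLS(endog, exog, window=60).fit()
    return res.plot_recursive_coefficient(variables=['x1'], alpha=0.05)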