Source code for statsmodels.tsa.ar_model

from __future__ import division
from statsmodels.compat.python import iteritems, range, string_types, lmap, long

import numpy as np
from numpy import dot, identity
from numpy.linalg import inv, slogdet
from scipy.stats import norm
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.tsatools import (lagmat, add_trend,
                                      _ar_transparams, _ar_invtransparams)
import statsmodels.tsa.base.tsa_model as tsbase
import statsmodels.base.model as base
from statsmodels.tools.decorators import (resettable_cache,
                                          cache_readonly, cache_writable)
from statsmodels.tools.numdiff import approx_fprime, approx_hess
from statsmodels.tsa.kalmanf.kalmanfilter import KalmanFilter
import statsmodels.base.wrapper as wrap
from statsmodels.tsa.vector_ar import util
from statsmodels.tsa.base.datetools import _index_date


__all__ = ['AR']


def sumofsq(x, axis=0):
    """Helper function to calculate sum of squares along first axis"""
    return np.sum(x**2, axis=0)


def _check_ar_start(start, k_ar, method, dynamic):
    if (method == 'cmle' or dynamic) and start < k_ar:
        raise ValueError("Start must be >= k_ar for conditional MLE "
                         "or dynamic forecast. Got %d" % start)


def _validate(start, k_ar, dates, method):
    """
    Checks the date and then returns an integer
    """
    from datetime import datetime
    if isinstance(start, (string_types, datetime)):
        start_date = start
        start = _index_date(start, dates)
    if 'mle' not in method and start < k_ar:
        raise ValueError("Start must be >= k_ar for conditional MLE or "
                         "dynamic forecast. Got %s" % start_date)
    return start


def _ar_predict_out_of_sample(y, params, p, k_trend, steps, start=0):
    mu = params[:k_trend] or 0  # only have to worry about constant
    arparams = params[k_trend:][::-1]  # reverse for dot

    # dynamic endogenous variable
    endog = np.zeros(p + steps)  # this is one too big but doesn't matter
    if start:
        endog[:p] = y[start-p:start]
    else:
        endog[:p] = y[-p:]

    forecast = np.zeros(steps)
    for i in range(steps):
        fcast = mu + np.dot(arparams, endog[i:i+p])
        forecast[i] = fcast
        endog[i + p] = fcast

    return forecast


[docs]class AR(tsbase.TimeSeriesModel): __doc__ = tsbase._tsa_doc % {"model" : "Autoregressive AR(p) model", "params" : """endog : array-like 1-d endogenous response variable. The independent variable.""", "extra_params" : base._missing_param_doc, "extra_sections" : ""} def __init__(self, endog, dates=None, freq=None, missing='none'): super(AR, self).__init__(endog, None, dates, freq, missing=missing) endog = self.endog # original might not have been an ndarray if endog.ndim == 1: endog = endog[:, None] self.endog = endog # to get shapes right elif endog.ndim > 1 and endog.shape[1] != 1: raise ValueError("Only the univariate case is implemented")
[docs] def initialize(self): pass
def _transparams(self, params): """ Transforms params to induce stationarity/invertability. Reference --------- Jones(1980) """ p = self.k_ar k = self.k_trend newparams = params.copy() newparams[k:k+p] = _ar_transparams(params[k:k+p].copy()) return newparams def _invtransparams(self, start_params): """ Inverse of the Jones reparameterization """ p = self.k_ar k = self.k_trend newparams = start_params.copy() newparams[k:k+p] = _ar_invtransparams(start_params[k:k+p].copy()) return newparams def _presample_fit(self, params, start, p, end, y, predictedvalues): """ Return the pre-sample predicted values using the Kalman Filter Notes ----- See predict method for how to use start and p. """ k = self.k_trend # build system matrices T_mat = KalmanFilter.T(params, p, k, p) R_mat = KalmanFilter.R(params, p, k, 0, p) # Initial State mean and variance alpha = np.zeros((p, 1)) Q_0 = dot(inv(identity(p**2)-np.kron(T_mat, T_mat)), dot(R_mat, R_mat.T).ravel('F')) Q_0 = Q_0.reshape(p, p, order='F') # TODO: order might need to be p+k P = Q_0 Z_mat = KalmanFilter.Z(p) for i in range(end): # iterate p-1 times to fit presample v_mat = y[i] - dot(Z_mat, alpha) F_mat = dot(dot(Z_mat, P), Z_mat.T) Finv = 1./F_mat # inv. always scalar K = dot(dot(dot(T_mat, P), Z_mat.T), Finv) # update state alpha = dot(T_mat, alpha) + dot(K, v_mat) L = T_mat - dot(K, Z_mat) P = dot(dot(T_mat, P), L.T) + dot(R_mat, R_mat.T) #P[0,0] += 1 # for MA part, R_mat.R_mat.T above if i >= start - 1: # only record if we ask for it predictedvalues[i + 1 - start] = dot(Z_mat, alpha) def _get_predict_start(self, start, dynamic): method = getattr(self, 'method', 'mle') k_ar = getattr(self, 'k_ar', 0) if start is None: if method == 'mle' and not dynamic: start = 0 else: # can't do presample fit for cmle or dynamic start = k_ar elif isinstance(start, (int, long)): start = super(AR, self)._get_predict_start(start) else: # should be a date start = _validate(start, k_ar, self.data.dates, method) start = super(AR, self)._get_predict_start(start) _check_ar_start(start, k_ar, method, dynamic) self._set_predict_start_date(start) return start
[docs] def predict(self, params, start=None, end=None, dynamic=False): """ Returns in-sample and out-of-sample prediction. Parameters ---------- params : array The fitted model parameters. start : int, str, or datetime Zero-indexed observation number at which to start forecasting, ie., the first forecast is start. Can also be a date string to parse or a datetime type. end : int, str, or datetime Zero-indexed observation number at which to end forecasting, ie., the first forecast is start. Can also be a date string to parse or a datetime type. dynamic : bool The `dynamic` keyword affects in-sample prediction. If dynamic is False, then the in-sample lagged values are used for prediction. If `dynamic` is True, then in-sample forecasts are used in place of lagged dependent variables. The first forecasted value is `start`. Returns ------- predicted values : array Notes ----- The linear Gaussian Kalman filter is used to return pre-sample fitted values. The exact initial Kalman Filter is used. See Durbin and Koopman in the references for more information. """ # will return an index of a date start = self._get_predict_start(start, dynamic) end, out_of_sample = self._get_predict_end(end) if start - end > 1: raise ValueError("end is before start") k_ar = self.k_ar k_trend = self.k_trend method = self.method endog = self.endog.squeeze() if dynamic: out_of_sample += end - start + 1 return _ar_predict_out_of_sample(endog, params, k_ar, k_trend, out_of_sample, start) predictedvalues = np.zeros(end + 1 - start) # fit pre-sample if method == 'mle': # use Kalman Filter to get initial values if k_trend: mu = params[0]/(1-np.sum(params[k_trend:])) # modifies predictedvalues in place if start < k_ar: self._presample_fit(params, start, k_ar, min(k_ar-1, end), endog[:k_ar] - mu, predictedvalues) predictedvalues[:k_ar-start] += mu if end < k_ar: return predictedvalues # just do the whole thing and truncate fittedvalues = dot(self.X, params) pv_start = max(k_ar - start, 0) fv_start = max(start - k_ar, 0) fv_end = min(len(fittedvalues), end-k_ar+1) predictedvalues[pv_start:] = fittedvalues[fv_start:fv_end] if out_of_sample: forecastvalues = _ar_predict_out_of_sample(endog, params, k_ar, k_trend, out_of_sample) predictedvalues = np.r_[predictedvalues, forecastvalues] return predictedvalues
def _presample_varcov(self, params): """ Returns the inverse of the presample variance-covariance. Notes ----- See Hamilton p. 125 """ k = self.k_trend p = self.k_ar p1 = p+1 # get inv(Vp) Hamilton 5.3.7 params0 = np.r_[-1, params[k:]] Vpinv = np.zeros((p, p), dtype=params.dtype) for i in range(1, p1): Vpinv[i-1, i-1:] = np.correlate(params0, params0[:i],)[:-1] Vpinv[i-1, i-1:] -= np.correlate(params0[-i:], params0,)[:-1] Vpinv = Vpinv + Vpinv.T - np.diag(Vpinv.diagonal()) return Vpinv def _loglike_css(self, params): """ Loglikelihood of AR(p) process using conditional sum of squares """ nobs = self.nobs Y = self.Y X = self.X ssr = sumofsq(Y.squeeze() - np.dot(X, params)) sigma2 = ssr/nobs return (-nobs/2 * (np.log(2 * np.pi) + np.log(sigma2)) - ssr/(2 * sigma2)) def _loglike_mle(self, params): """ Loglikelihood of AR(p) process using exact maximum likelihood """ nobs = self.nobs X = self.X endog = self.endog k_ar = self.k_ar k_trend = self.k_trend # reparameterize according to Jones (1980) like in ARMA/Kalman Filter if self.transparams: params = self._transparams(params) # get mean and variance for pre-sample lags yp = endog[:k_ar].copy() if k_trend: c = [params[0]] * k_ar else: c = [0] mup = np.asarray(c / (1 - np.sum(params[k_trend:]))) diffp = yp - mup[:, None] # get inv(Vp) Hamilton 5.3.7 Vpinv = self._presample_varcov(params) diffpVpinv = np.dot(np.dot(diffp.T, Vpinv), diffp).item() ssr = sumofsq(endog[k_ar:].squeeze() - np.dot(X, params)) # concentrating the likelihood means that sigma2 is given by sigma2 = 1./nobs * (diffpVpinv + ssr) self.sigma2 = sigma2 logdet = slogdet(Vpinv)[1] # TODO: add check for singularity loglike = -1/2. * (nobs * (np.log(2 * np.pi) + np.log(sigma2)) - logdet + diffpVpinv / sigma2 + ssr / sigma2) return loglike
[docs] def loglike(self, params): """ The loglikelihood of an AR(p) process Parameters ---------- params : array The fitted parameters of the AR model Returns ------- llf : float The loglikelihood evaluated at `params` Notes ----- Contains constant term. If the model is fit by OLS then this returns the conditonal maximum likelihood. .. math:: \\frac{\\left(n-p\\right)}{2}\\left(\\log\\left(2\\pi\\right)+\\log\\left(\\sigma^{2}\\right)\\right)-\\frac{1}{\\sigma^{2}}\\sum_{i}\\epsilon_{i}^{2} If it is fit by MLE then the (exact) unconditional maximum likelihood is returned. .. math:: -\\frac{n}{2}log\\left(2\\pi\\right)-\\frac{n}{2}\\log\\left(\\sigma^{2}\\right)+\\frac{1}{2}\\left|V_{p}^{-1}\\right|-\\frac{1}{2\\sigma^{2}}\\left(y_{p}-\\mu_{p}\\right)^{\\prime}V_{p}^{-1}\\left(y_{p}-\\mu_{p}\\right)-\\frac{1}{2\\sigma^{2}}\\sum_{t=p+1}^{n}\\epsilon_{i}^{2} where :math:`\\mu_{p}` is a (`p` x 1) vector with each element equal to the mean of the AR process and :math:`\\sigma^{2}V_{p}` is the (`p` x `p`) variance-covariance matrix of the first `p` observations. """ #TODO: Math is on Hamilton ~pp 124-5 if self.method == "cmle": return self._loglike_css(params) else: return self._loglike_mle(params)
[docs] def score(self, params): """ Return the gradient of the loglikelihood at params. Parameters ---------- params : array-like The parameter values at which to evaluate the score function. Notes ----- Returns numerical gradient. """ loglike = self.loglike return approx_fprime(params, loglike, epsilon=1e-8)
[docs] def information(self, params): """ Not Implemented Yet """ return
[docs] def hessian(self, params): """ Returns numerical hessian for now. """ loglike = self.loglike return approx_hess(params, loglike)
def _stackX(self, k_ar, trend): """ Private method to build the RHS matrix for estimation. Columns are trend terms then lags. """ endog = self.endog X = lagmat(endog, maxlag=k_ar, trim='both') k_trend = util.get_trendorder(trend) if k_trend: X = add_trend(X, prepend=True, trend=trend) self.k_trend = k_trend return X
[docs] def select_order(self, maxlag, ic, trend='c', method='mle'): """ Select the lag order according to the information criterion. Parameters ---------- maxlag : int The highest lag length tried. See `AR.fit`. ic : str {'aic','bic','hqic','t-stat'} Criterion used for selecting the optimal lag length. See `AR.fit`. trend : str {'c','nc'} Whether to include a constant or not. 'c' - include constant. 'nc' - no constant. Returns ------- bestlag : int Best lag according to IC. """ endog = self.endog # make Y and X with same nobs to compare ICs Y = endog[maxlag:] self.Y = Y # attach to get correct fit stats X = self._stackX(maxlag, trend) # sets k_trend self.X = X k = self.k_trend # k_trend set in _stackX k = max(1, k) # handle if startlag is 0 results = {} if ic != 't-stat': for lag in range(k, maxlag+1): # have to reinstantiate the model to keep comparable models endog_tmp = endog[maxlag-lag:] fit = AR(endog_tmp).fit(maxlag=lag, method=method, full_output=0, trend=trend, maxiter=100, disp=0) results[lag] = eval('fit.'+ic) bestic, bestlag = min((res, k) for k, res in iteritems(results)) else: # choose by last t-stat. stop = 1.6448536269514722 # for t-stat, norm.ppf(.95) for lag in range(maxlag, k - 1, -1): # have to reinstantiate the model to keep comparable models endog_tmp = endog[maxlag - lag:] fit = AR(endog_tmp).fit(maxlag=lag, method=method, full_output=0, trend=trend, maxiter=35, disp=-1) bestlag = 0 if np.abs(fit.tvalues[-1]) >= stop: bestlag = lag break return bestlag
[docs] def fit(self, maxlag=None, method='cmle', ic=None, trend='c', transparams=True, start_params=None, solver='lbfgs', maxiter=35, full_output=1, disp=1, callback=None, **kwargs): """ Fit the unconditional maximum likelihood of an AR(p) process. Parameters ---------- maxlag : int If `ic` is None, then maxlag is the lag length used in fit. If `ic` is specified then maxlag is the highest lag order used to select the correct lag order. If maxlag is None, the default is round(12*(nobs/100.)**(1/4.)) method : str {'cmle', 'mle'}, optional cmle - Conditional maximum likelihood using OLS mle - Unconditional (exact) maximum likelihood. See `solver` and the Notes. ic : str {'aic','bic','hic','t-stat'} Criterion used for selecting the optimal lag length. aic - Akaike Information Criterion bic - Bayes Information Criterion t-stat - Based on last lag hqic - Hannan-Quinn Information Criterion If any of the information criteria are selected, the lag length which results in the lowest value is selected. If t-stat, the model starts with maxlag and drops a lag until the highest lag has a t-stat that is significant at the 95 % level. trend : str {'c','nc'} Whether to include a constant or not. 'c' - include constant. 'nc' - no constant. The below can be specified if method is 'mle' transparams : bool, optional Whether or not to transform the parameters to ensure stationarity. Uses the transformation suggested in Jones (1980). start_params : array-like, optional A first guess on the parameters. Default is cmle estimates. solver : str or None, optional Solver to be used if method is 'mle'. The default is 'lbfgs' (limited memory Broyden-Fletcher-Goldfarb-Shanno). Other choices are 'bfgs', 'newton' (Newton-Raphson), 'nm' (Nelder-Mead), 'cg' - (conjugate gradient), 'ncg' (non-conjugate gradient), and 'powell'. maxiter : int, optional The maximum number of function evaluations. Default is 35. tol : float The convergence tolerance. Default is 1e-08. full_output : bool, optional If True, all output from solver will be available in the Results object's mle_retvals attribute. Output is dependent on the solver. See Notes for more information. disp : bool, optional If True, convergence information is output. callback : function, optional Called after each iteration as callback(xk) where xk is the current parameter vector. kwargs See Notes for keyword arguments that can be passed to fit. References ---------- Jones, R.H. 1980 "Maximum likelihood fitting of ARMA models to time series with missing observations." `Technometrics`. 22.3. 389-95. See also -------- statsmodels.base.model.LikelihoodModel.fit """ method = method.lower() if method not in ['cmle', 'yw', 'mle']: raise ValueError("Method %s not recognized" % method) self.method = method self.trend = trend self.transparams = transparams nobs = len(self.endog) # overwritten if method is 'cmle' endog = self.endog if maxlag is None: maxlag = int(round(12*(nobs/100.)**(1/4.))) k_ar = maxlag # stays this if ic is None # select lag length if ic is not None: ic = ic.lower() if ic not in ['aic', 'bic', 'hqic', 't-stat']: raise ValueError("ic option %s not understood" % ic) k_ar = self.select_order(k_ar, ic, trend, method) self.k_ar = k_ar # change to what was chosen by ic # redo estimation for best lag # make LHS Y = endog[k_ar:, :] # make lagged RHS X = self._stackX(k_ar, trend) # sets self.k_trend k_trend = self.k_trend self.exog_names = util.make_lag_names(self.endog_names, k_ar, k_trend) self.Y = Y self.X = X if method == "cmle": # do OLS arfit = OLS(Y, X).fit() params = arfit.params self.nobs = nobs - k_ar self.sigma2 = arfit.ssr/arfit.nobs # needed for predict fcasterr elif method == "mle": solver = solver.lower() self.nobs = nobs if start_params is None: start_params = OLS(Y, X).fit().params else: if len(start_params) != k_trend + k_ar: raise ValueError("Length of start params is %d. There" " are %d parameters." % (len(start_params), k_trend + k_ar)) start_params = self._invtransparams(start_params) if solver == 'lbfgs': kwargs.setdefault('pgtol', 1e-8) kwargs.setdefault('factr', 1e2) kwargs.setdefault('m', 12) kwargs.setdefault('approx_grad', True) mlefit = super(AR, self).fit(start_params=start_params, method=solver, maxiter=maxiter, full_output=full_output, disp=disp, callback=callback, **kwargs) params = mlefit.params if self.transparams: params = self._transparams(params) self.transparams = False # turn off now for other results # don't use yw, because we can't estimate the constant #elif method == "yw": # params, omega = yule_walker(endog, order=maxlag, # method="mle", demean=False) # how to handle inference after Yule-Walker? # self.params = params #TODO: don't attach here # self.omega = omega pinv_exog = np.linalg.pinv(X) normalized_cov_params = np.dot(pinv_exog, pinv_exog.T) arfit = ARResults(self, params, normalized_cov_params) if method == 'mle' and full_output: arfit.mle_retvals = mlefit.mle_retvals arfit.mle_settings = mlefit.mle_settings return ARResultsWrapper(arfit)
[docs]class ARResults(tsbase.TimeSeriesModelResults): """ Class to hold results from fitting an AR model. Parameters ---------- model : AR Model instance Reference to the model that is fit. params : array The fitted parameters from the AR Model. normalized_cov_params : array inv(dot(X.T,X)) where X is the lagged values. scale : float, optional An estimate of the scale of the model. Returns ------- **Attributes** aic : float Akaike Information Criterion using Lutkephol's definition. :math:`log(sigma) + 2*(1 + k_ar + k_trend)/nobs` bic : float Bayes Information Criterion :math:`\\log(\\sigma) + (1 + k_ar + k_trend)*\\log(nobs)/nobs` bse : array The standard errors of the estimated parameters. If `method` is 'cmle', then the standard errors that are returned are the OLS standard errors of the coefficients. If the `method` is 'mle' then they are computed using the numerical Hessian. fittedvalues : array The in-sample predicted values of the fitted AR model. The `k_ar` initial values are computed via the Kalman Filter if the model is fit by `mle`. fpe : float Final prediction error using Lutkepohl's definition ((n_totobs+k_trend)/(n_totobs-k_ar-k_trend))*sigma hqic : float Hannan-Quinn Information Criterion. k_ar : float Lag length. Sometimes used as `p` in the docs. k_trend : float The number of trend terms included. 'nc'=0, 'c'=1. llf : float The loglikelihood of the model evaluated at `params`. See `AR.loglike` model : AR model instance A reference to the fitted AR model. nobs : float The number of available observations `nobs` - `k_ar` n_totobs : float The number of total observations in `endog`. Sometimes `n` in the docs. params : array The fitted parameters of the model. pvalues : array The p values associated with the standard errors. resid : array The residuals of the model. If the model is fit by 'mle' then the pre-sample residuals are calculated using fittedvalues from the Kalman Filter. roots : array The roots of the AR process are the solution to (1 - arparams[0]*z - arparams[1]*z**2 -...- arparams[p-1]*z**k_ar) = 0 Stability requires that the roots in modulus lie outside the unit circle. scale : float Same as sigma2 sigma2 : float The variance of the innovations (residuals). trendorder : int The polynomial order of the trend. 'nc' = None, 'c' or 't' = 0, 'ct' = 1, etc. tvalues : array The t-values associated with `params`. """ _cache = {} # for scale setter def __init__(self, model, params, normalized_cov_params=None, scale=1.): super(ARResults, self).__init__(model, params, normalized_cov_params, scale) self._cache = resettable_cache() self.nobs = model.nobs n_totobs = len(model.endog) self.n_totobs = n_totobs self.X = model.X # copy? self.Y = model.Y k_ar = model.k_ar self.k_ar = k_ar k_trend = model.k_trend self.k_trend = k_trend trendorder = None if k_trend > 0: trendorder = k_trend - 1 self.trendorder = trendorder #TODO: cmle vs mle? self.df_model = k_ar + k_trend self.df_resid = self.model.df_resid = n_totobs - self.df_model @cache_writable()
[docs] def sigma2(self): model = self.model if model.method == "cmle": # do DOF correction return 1. / self.nobs * sumofsq(self.resid) else: return self.model.sigma2
@cache_writable() # for compatability with RegressionResults
[docs] def scale(self): return self.sigma2
@cache_readonly
[docs] def bse(self): # allow user to specify? if self.model.method == "cmle": # uses different scale/sigma def. resid = self.resid ssr = np.dot(resid, resid) ols_scale = ssr / (self.nobs - self.k_ar - self.k_trend) return np.sqrt(np.diag(self.cov_params(scale=ols_scale))) else: hess = approx_hess(self.params, self.model.loglike) return np.sqrt(np.diag(-np.linalg.inv(hess)))
@cache_readonly
[docs] def pvalues(self): return norm.sf(np.abs(self.tvalues))*2
@cache_readonly
[docs] def aic(self): #JP: this is based on loglike with dropped constant terms ? # Lutkepohl #return np.log(self.sigma2) + 1./self.model.nobs * self.k_ar # Include constant as estimated free parameter and double the loss return np.log(self.sigma2) + 2 * (1 + self.df_model)/self.nobs
# Stata defintion #nobs = self.nobs #return -2 * self.llf/nobs + 2 * (self.k_ar+self.k_trend)/nobs @cache_readonly
[docs] def hqic(self): nobs = self.nobs # Lutkepohl # return np.log(self.sigma2)+ 2 * np.log(np.log(nobs))/nobs * self.k_ar # R uses all estimated parameters rather than just lags return (np.log(self.sigma2) + 2 * np.log(np.log(nobs))/nobs * (1 + self.df_model))
# Stata #nobs = self.nobs #return -2 * self.llf/nobs + 2 * np.log(np.log(nobs))/nobs * \ # (self.k_ar + self.k_trend) @cache_readonly
[docs] def fpe(self): nobs = self.nobs df_model = self.df_model #Lutkepohl return ((nobs+df_model)/(nobs-df_model))*self.sigma2
@cache_readonly
[docs] def bic(self): nobs = self.nobs # Lutkepohl #return np.log(self.sigma2) + np.log(nobs)/nobs * self.k_ar # Include constant as est. free parameter return np.log(self.sigma2) + (1 + self.df_model) * np.log(nobs)/nobs
# Stata # return -2 * self.llf/nobs + np.log(nobs)/nobs * (self.k_ar + \ # self.k_trend) @cache_readonly
[docs] def resid(self): #NOTE: uses fittedvalues because it calculate presample values for mle model = self.model endog = model.endog.squeeze() if model.method == "cmle": # elimate pre-sample return endog[self.k_ar:] - self.fittedvalues else: return model.endog.squeeze() - self.fittedvalues
#def ssr(self): # resid = self.resid # return np.dot(resid, resid) @cache_readonly
[docs] def roots(self): k = self.k_trend return np.roots(np.r_[1, -self.params[k:]]) ** -1
@cache_readonly
[docs] def fittedvalues(self): return self.model.predict(self.params)
[docs] def predict(self, start=None, end=None, dynamic=False): params = self.params predictedvalues = self.model.predict(params, start, end, dynamic) return predictedvalues
#start = self.model._get_predict_start(start) #end, out_of_sample = self.model._get_predict_end(end) ##TODO: return forecast errors and confidence intervals #from statsmodels.tsa.arima_process import arma2ma #ma_rep = arma2ma(np.r_[1,-params[::-1]], [1], out_of_sample) #fcasterr = np.sqrt(self.sigma2 * np.cumsum(ma_rep**2)) preddoc = AR.predict.__doc__.split('\n') extra_doc = (""" confint : bool, float Whether to return confidence intervals. If `confint` == True, 95 % confidence intervals are returned. Else if `confint` is a float, then it is assumed to be the alpha value of the confidence interval. That is confint == .05 returns a 95% confidence interval, and .10 would return a 90% confidence interval.""" ).split('\n') #ret_doc = """ # fcasterr : array-like # confint : array-like #""" predict.__doc__ = '\n'.join(preddoc[:5] + preddoc[7:20] + extra_doc + preddoc[20:])
class ARResultsWrapper(wrap.ResultsWrapper): _attrs = {} _wrap_attrs = wrap.union_dicts(tsbase.TimeSeriesResultsWrapper._wrap_attrs, _attrs) _methods = {} _wrap_methods = wrap.union_dicts(tsbase.TimeSeriesResultsWrapper._wrap_methods, _methods) wrap.populate_wrapper(ARResultsWrapper, ARResults) if __name__ == "__main__": import statsmodels.api as sm sunspots = sm.datasets.sunspots.load() # Why does R demean the data by defaut? ar_ols = AR(sunspots.endog) res_ols = ar_ols.fit(maxlag=9) ar_mle = AR(sunspots.endog) res_mle_bfgs = ar_mle.fit(maxlag=9, method="mle", solver="bfgs", maxiter=500, gtol=1e-10) # res_mle2 = ar_mle.fit(maxlag=1, method="mle", maxiter=500, penalty=True, # tol=1e-13) # ar_yw = AR(sunspots.endog) # res_yw = ar_yw.fit(maxlag=4, method="yw") # # Timings versus talkbox # from timeit import default_timer as timer # print "Time AR fit vs. talkbox" # # generate a long series of AR(2) data # # nobs = 1000000 # y = np.empty(nobs) # y[0:2] = 0 # for i in range(2,nobs): # y[i] = .25 * y[i-1] - .75 * y[i-2] + np.random.rand() # # mod_sm = AR(y) # t = timer() # res_sm = mod_sm.fit(method="yw", trend="nc", demean=False, maxlag=2) # t_end = timer() # print str(t_end - t) + " seconds for sm.AR with yule-walker, 2 lags" # try: # import scikits.talkbox as tb # except: # raise ImportError("You need scikits.talkbox installed for timings") # t = timer() # mod_tb = tb.lpc(y, 2) # t_end = timer() # print str(t_end - t) + " seconds for talkbox.lpc" # print """For higher lag lengths ours quickly fills up memory and starts #thrashing the swap. Should we include talkbox C code or Cythonize the #Levinson recursion algorithm?""" ## Try with a pandas series import pandas import scikits.timeseries as ts d1 = ts.Date(year=1700, freq='A') #NOTE: have to have yearBegin offset for annual data until parser rewrite #should this be up to the user, or should it be done in TSM init? #NOTE: not anymore, it's end of year now ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog)) pandas_dr = pandas.DatetimeIndex(start=d1.datetime, periods=len(sunspots.endog), freq='A-DEC') #pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin) dates = np.arange(1700, 1700 + len(sunspots.endog)) dates = ts.date_array(dates, freq='A') #sunspots = pandas.Series(sunspots.endog, index=dates) #NOTE: pandas only does business days for dates it looks like import datetime dt_dates = np.asarray(lmap(datetime.datetime.fromordinal, ts_dr.toordinal().astype(int))) sunspots = pandas.Series(sunspots.endog, index=dt_dates) #NOTE: pandas can't handle pre-1900 dates mod = AR(sunspots, freq='A') res = mod.fit(method='mle', maxlag=9) # some data for an example in Box Jenkins IBM = np.asarray([460, 457, 452, 459, 462, 459, 463, 479, 493, 490.]) w = np.diff(IBM) theta = .5