from statsmodels.compat.python import range
import numpy as np
from scipy.stats import f as fdist
from scipy.stats import t as student_t
from scipy import stats
from statsmodels.tools.tools import clean0, fullrank
from statsmodels.compat.numpy import np_matrix_rank
#TODO: should this be public if it's just a container?
[docs]class ContrastResults(object):
"""
Class for results of tests of linear restrictions on coefficients in a model.
This class functions mainly as a container for `t_test`, `f_test` and
`wald_test` for the parameters of a model.
The attributes depend on the statistical test and are either based on the
normal, the t, the F or the chisquare distribution.
"""
def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
df_num=None, alpha=0.05, **kwds):
self.effect = effect # Let it be None for F
if F is not None:
self.distribution = 'F'
self.fvalue = F
self.statistic = self.fvalue
self.df_denom = df_denom
self.df_num = df_num
self.dist = fdist
self.dist_args = (df_num, df_denom)
self.pvalue = fdist.sf(F, df_num, df_denom)
elif t is not None:
self.distribution = 't'
self.tvalue = t
self.statistic = t # generic alias
self.sd = sd
self.df_denom = df_denom
self.dist = student_t
self.dist_args = (df_denom,)
self.pvalue = self.dist.sf(np.abs(t), df_denom) * 2
elif 'statistic' in kwds:
# TODO: currently targeted to normal distribution, and chi2
self.distribution = kwds['distribution']
self.statistic = kwds['statistic']
self.tvalue = value = kwds['statistic'] # keep alias
# TODO: for results instance we decided to use tvalues also for normal
self.sd = sd
self.dist = getattr(stats, self.distribution)
self.dist_args = ()
if self.distribution is 'chi2':
self.pvalue = self.dist.sf(self.statistic, df_denom)
else:
"normal"
self.pvalue = self.dist.sf(np.abs(value)) * 2
# cleanup
# should we return python scalar?
self.pvalue = np.squeeze(self.pvalue)
[docs] def conf_int(self, alpha=0.05):
"""
Returns the confidence interval of the value, `effect` of the constraint.
This is currently only available for t and z tests.
Parameters
----------
alpha : float, optional
The significance level for the confidence interval.
ie., The default `alpha` = .05 returns a 95% confidence interval.
Returns
-------
ci : ndarray, (k_constraints, 2)
The array has the lower and the upper limit of the confidence
interval in the columns.
"""
if self.effect is not None:
# confidence intervals
q = self.dist.ppf(1 - alpha / 2., *self.dist_args)
lower = self.effect - q * self.sd
upper = self.effect + q * self.sd
return np.column_stack((lower, upper))
else:
raise NotImplementedError('Confidence Interval not available')
def __array__(self):
if hasattr(self, "fvalue"):
return self.fvalue
else:
return self.tvalue
def __str__(self):
return self.summary().__str__()
def __repr__(self):
return str(self.__class__) + '\n' + self.__str__()
[docs] def summary(self, xname=None, alpha=0.05, title=None):
"""Summarize the Results of the hypothesis test
Parameters
-----------
xname : list of strings, optional
Default is `c_##` for ## in p the number of regressors
alpha : float
significance level for the confidence intervals. Default is
alpha = 0.05 which implies a confidence level of 95%.
title : string, optional
Title for the params table. If not None, then this replaces the
default title
Returns
-------
smry : string or Summary instance
This contains a parameter results table in the case of t or z test
in the same form as the parameter results table in the model
results summary.
For F or Wald test, the return is a string.
"""
if self.effect is not None:
# TODO: should also add some extra information, e.g. robust cov ?
# TODO: can we infer names for constraints, xname in __init__ ?
if title is None:
title = 'Test for Constraints'
elif title == '':
# don't add any title,
# I think SimpleTable skips on None - check
title = None
# we have everything for a params table
use_t = (self.distribution == 't')
yname='constraints' # Not used in params_frame
if xname is None:
xname = ['c%d'%ii for ii in range(len(self.effect))]
from statsmodels.iolib.summary import summary_params
pvalues = np.atleast_1d(self.pvalue)
summ = summary_params((self, self.effect, self.sd, self.statistic,
pvalues, self.conf_int(alpha)),
yname=yname, xname=xname, use_t=use_t,
title=title, alpha=alpha)
return summ
elif hasattr(self, 'fvalue'):
# TODO: create something nicer for these casee
return '<F test: F=%s, p=%s, df_denom=%d, df_num=%d>' % \
(repr(self.fvalue), self.pvalue, self.df_denom, self.df_num)
else:
# generic
return '<Wald test: statistic=%s, p-value=%s>' % \
(self.statistic, self.pvalue)
[docs] def summary_frame(self, xname=None, alpha=0.05):
"""Return the parameter table as a pandas DataFrame
This is only available for t and normal tests
"""
if self.effect is not None:
# we have everything for a params table
use_t = (self.distribution == 't')
yname='constraints' # Not used in params_frame
if xname is None:
xname = ['c%d'%ii for ii in range(len(self.effect))]
from statsmodels.iolib.summary import summary_params_frame
summ = summary_params_frame((self, self.effect, self.sd,
self.statistic,self.pvalue,
self.conf_int(alpha)), yname=yname,
xname=xname, use_t=use_t,
alpha=alpha)
return summ
else:
# TODO: create something nicer
raise NotImplementedError('only available for t and z')
class Contrast(object):
"""
This class is used to construct contrast matrices in regression models.
They are specified by a (term, design) pair. The term, T, is a linear
combination of columns of the design matrix. The matrix attribute of
Contrast is a contrast matrix C so that
colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T)))
where pinv(D) is the generalized inverse of D. Further, the matrix
Tnew = dot(C, D)
is full rank. The rank attribute is the rank of
dot(D, dot(pinv(D), T))
In a regression model, the contrast tests that E(dot(Tnew, Y)) = 0
for each column of Tnew.
Parameters
----------
term : array-like
design : array-like
Attributes
----------
contrast_matrix
Examples
--------
>>> import statsmodels.api as sm
>>> from statsmodels.stats.contrast import Contrast
>>> import numpy as np
>>> np.random.seed(54321)
>>> X = np.random.standard_normal((40,10))
# Get a contrast
>>> new_term = np.column_stack((X[:,0], X[:,2]))
>>> c = Contrast(new_term, X)
>>> test = [[1] + [0]*9, [0]*2 + [1] + [0]*7]
>>> np.allclose(c.contrast_matrix, test)
True
Get another contrast
>>> P = np.dot(X, np.linalg.pinv(X))
>>> resid = np.identity(40) - P
>>> noise = np.dot(resid,np.random.standard_normal((40,5)))
>>> new_term2 = np.column_stack((noise,X[:,2]))
>>> c2 = Contrast(new_term2, X)
>>> print(c2.contrast_matrix)
[ -1.26424750e-16 8.59467391e-17 1.56384718e-01 -2.60875560e-17
-7.77260726e-17 -8.41929574e-18 -7.36359622e-17 -1.39760860e-16
1.82976904e-16 -3.75277947e-18]
Get another contrast
>>> zero = np.zeros((40,))
>>> new_term3 = np.column_stack((zero,X[:,2]))
>>> c3 = Contrast(new_term3, X)
>>> test2 = [0]*2 + [1] + [0]*7
>>> np.allclose(c3.contrast_matrix, test2)
True
"""
def _get_matrix(self):
"""
Gets the contrast_matrix property
"""
if not hasattr(self, "_contrast_matrix"):
self.compute_matrix()
return self._contrast_matrix
contrast_matrix = property(_get_matrix)
def __init__(self, term, design):
self.term = np.asarray(term)
self.design = np.asarray(design)
def compute_matrix(self):
"""
Construct a contrast matrix C so that
colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T)))
where pinv(D) is the generalized inverse of D=design.
"""
T = self.term
if T.ndim == 1:
T = T[:,None]
self.T = clean0(T)
self.D = self.design
self._contrast_matrix = contrastfromcols(self.T, self.D)
try:
self.rank = self.matrix.shape[1]
except:
self.rank = 1
#TODO: fix docstring after usage is settled
def contrastfromcols(L, D, pseudo=None):
"""
From an n x p design matrix D and a matrix L, tries
to determine a p x q contrast matrix C which
determines a contrast of full rank, i.e. the
n x q matrix
dot(transpose(C), pinv(D))
is full rank.
L must satisfy either L.shape[0] == n or L.shape[1] == p.
If L.shape[0] == n, then L is thought of as representing
columns in the column space of D.
If L.shape[1] == p, then L is thought of as what is known
as a contrast matrix. In this case, this function returns an estimable
contrast corresponding to the dot(D, L.T)
Note that this always produces a meaningful contrast, not always
with the intended properties because q is always non-zero unless
L is identically 0. That is, it produces a contrast that spans
the column space of L (after projection onto the column space of D).
Parameters
----------
L : array-like
D : array-like
"""
L = np.asarray(L)
D = np.asarray(D)
n, p = D.shape
if L.shape[0] != n and L.shape[1] != p:
raise ValueError("shape of L and D mismatched")
if pseudo is None:
pseudo = np.linalg.pinv(D) # D^+ \approx= ((dot(D.T,D))^(-1),D.T)
if L.shape[0] == n:
C = np.dot(pseudo, L).T
else:
C = L
C = np.dot(pseudo, np.dot(D, C.T)).T
Lp = np.dot(D, C.T)
if len(Lp.shape) == 1:
Lp.shape = (n, 1)
if np_matrix_rank(Lp) != Lp.shape[1]:
Lp = fullrank(Lp)
C = np.dot(pseudo, Lp).T
return np.squeeze(C)
# TODO: this is currently a minimal version, stub
class WaldTestResults(object):
# for F and chi2 tests of joint hypothesis, mainly for vectorized
def __init__(self, statistic, distribution, dist_args, table=None,
pvalues=None):
self.table = table
self.distribution = distribution
self.statistic = statistic
#self.sd = sd
self.dist_args = dist_args
# The following is because I don't know which we want
if table is not None:
self.statistic = table['statistic'].values
self.pvalues = table['pvalue'].values
self.df_constraints = table['df_constraint'].values
if self.distribution == 'F':
self.df_denom = table['df_denom'].values
else:
if self.distribution is 'chi2':
self.dist = stats.chi2
self.df_constraints = self.dist_args[0] # assumes tuple
# using dist_args[0] is a bit dangerous,
elif self.distribution is 'F':
self.dist = stats.f
self.df_constraints, self.df_denom = self.dist_args
else:
raise ValueError('only F and chi2 are possible distribution')
if pvalues is None:
self.pvalues = self.dist.sf(np.abs(statistic), *dist_args)
else:
self.pvalues = pvalues
@property
def col_names(self):
"""column names for summary table
"""
pr_test = "P>%s" % self.distribution
col_names = [self.distribution, pr_test, 'df constraint']
if self.distribution == 'F':
col_names.append('df denom')
return col_names
def summary_frame(self):
# needs to be a method for consistency
if hasattr(self, '_dframe'):
return self._dframe
# rename the column nambes, but don't copy data
renaming = dict(zip(self.table.columns, self.col_names))
self.dframe = self.table.rename(columns=renaming)
return self.dframe
def __str__(self):
return self.summary_frame().to_string()
def __repr__(self):
return str(self.__class__) + '\n' + self.__str__()