"""
Empirical likelihood inference on descriptive statistics
This module conducts hypothesis tests and constructs confidence
intervals for the mean, variance, skewness, kurtosis and correlation.
If matplotlib is installed, this module can also generate multivariate
confidence region plots as well as mean-variance contour plots.
See _OptFuncts docstring for technical details and optimization variable
definitions.
General References:
------------------
Owen, A. (2001). "Empirical Likelihood." Chapman and Hall
"""
import numpy as np
from scipy import optimize
from scipy.stats import chi2, skew, kurtosis
from statsmodels.base.optimizer import _fit_newton
import itertools
from statsmodels.graphics import utils
[docs]def DescStat(endog):
"""
Returns an instance to conduct inference on descriptive statistics
via empirical likelihood. See DescStatUV and DescStatMV for more
information.
Parameters
----------
endog : ndarray
Array of data
Returns : DescStat instance
If k=1, the function returns a univariate instance, DescStatUV.
If k>1, the function returns a multivariate instance, DescStatMV.
"""
if endog.ndim == 1:
endog = endog.reshape(len(endog), 1)
if endog.shape[1] == 1:
return DescStatUV(endog)
if endog.shape[1] > 1:
return DescStatMV(endog)
class _OptFuncts(object):
"""
A class that holds functions that are optimized/solved.
The general setup of the class is simple. Any method that starts with
_opt_ creates a vector of estimating equations named est_vect such that
np.dot(p, (est_vect))=0 where p is the weight on each
observation as a 1 x n array and est_vect is n x k. Then _modif_Newton is
called to determine the optimal p by solving for the Lagrange multiplier
(eta) in the profile likelihood maximization problem. In the presence
of nuisance parameters, _opt_ functions are optimized over to profile
out the nuisance parameters.
Any method starting with _ci_limits calculates the log likelihood
ratio for a specific value of a parameter and then subtracts a
pre-specified critical value. This is solved so that llr - crit = 0.
"""
def __init__(self, endog):
pass
def _log_star(self, eta, est_vect, weights, nobs):
"""
Transforms the log of observation probabilities in terms of the
Lagrange multiplier to the log 'star' of the probabilities.
Parameters
----------
eta : float
Lagrange multiplier
est_vect : ndarray (n,k)
Estimating equations vector
wts : nx1 array
Observation weights
Returns
------
data_star : array
The weighted logstar of the estimting equations
Notes
-----
This function is only a placeholder for the _fit_Newton.
The function value is not used in optimization and the optimal value
is disregarded when computing the log likelihood ratio.
"""
data_star = np.log(weights) + (np.sum(weights) +\
np.dot(est_vect, eta))
idx = data_star < 1. / nobs
not_idx = ~idx
nx = nobs * data_star[idx]
data_star[idx] = np.log(1. / nobs) - 1.5 + nx * (2. - nx / 2)
data_star[not_idx] = np.log(data_star[not_idx])
return data_star
def _hess(self, eta, est_vect, weights, nobs):
"""
Calculates the hessian of a weighted empirical likelihood
problem.
Parameters
----------
eta : ndarray, (1,m)
Lagrange multiplier in the profile likelihood maximization
est_vect : ndarray (n,k)
Estimating equations vector
weights : 1darray
Observation weights
Returns
-------
hess : m x m array
Weighted hessian used in _wtd_modif_newton
"""
#eta = np.squeeze(eta)
data_star_doub_prime = np.sum(weights) + np.dot(est_vect, eta)
idx = data_star_doub_prime < 1. / nobs
not_idx = ~idx
data_star_doub_prime[idx] = - nobs ** 2
data_star_doub_prime[not_idx] = - (data_star_doub_prime[not_idx]) ** -2
wtd_dsdp = weights * data_star_doub_prime
return np.dot(est_vect.T, wtd_dsdp[:, None] * est_vect)
def _grad(self, eta, est_vect, weights, nobs):
"""
Calculates the gradient of a weighted empirical likelihood
problem
Parameters
----------
eta : ndarray, (1,m)
Lagrange multiplier in the profile likelihood maximization
est_vect : ndarray, (n,k)
Estimating equations vector
weights : 1darray
Observation weights
Returns
-------
gradient : ndarray (m,1)
The gradient used in _wtd_modif_newton
"""
#eta = np.squeeze(eta)
data_star_prime = np.sum(weights) + np.dot(est_vect, eta)
idx = data_star_prime < 1. / nobs
not_idx = ~idx
data_star_prime[idx] = nobs * (2 - nobs * data_star_prime[idx])
data_star_prime[not_idx] = 1. / data_star_prime[not_idx]
return np.dot(weights * data_star_prime, est_vect)
def _modif_newton(self, eta, est_vect, weights):
"""
Modified Newton's method for maximizing the log 'star' equation. This
function calls _fit_newton to find the optimal values of eta.
Parameters
----------
eta : ndarray, (1,m)
Lagrange multiplier in the profile likelihood maximization
est_vect : ndarray, (n,k)
Estimating equations vector
weights : 1darray
Observation weights
Returns
-------
params : 1xm array
Lagrange multiplier that maximizes the log-likelihood
"""
nobs = len(est_vect)
f = lambda x0: - np.sum(self._log_star(x0, est_vect, weights, nobs))
grad = lambda x0: - self._grad(x0, est_vect, weights, nobs)
hess = lambda x0: - self._hess(x0, est_vect, weights, nobs)
kwds = {'tol': 1e-8}
eta = eta.squeeze()
res = _fit_newton(f, grad, eta, (), kwds, hess=hess, maxiter=50, \
disp=0)
return res[0]
def _find_eta(self, eta):
"""
Finding the root of sum(xi-h0)/(1+eta(xi-mu)) solves for
eta when computing ELR for univariate mean.
Parameters
----------
eta : float
Lagrange multiplier in the empirical likelihood maximization
Returns
-------
llr : float
n times the log likelihood value for a given value of eta
"""
return np.sum((self.endog - self.mu0) / \
(1. + eta * (self.endog - self.mu0)))
def _ci_limits_mu(self, mu):
"""
Calculates the difference between the log likelihood of mu_test and a
specified critical value.
Parameters
----------
mu : float
Hypothesized value of the mean.
Returns
-------
diff : float
The difference between the log likelihood value of mu0 and
a specified value.
"""
return self.test_mean(mu)[0] - self.r0
def _find_gamma(self, gamma):
"""
Finds gamma that satisfies
sum(log(n * w(gamma))) - log(r0) = 0
Used for confidence intervals for the mean
Parameters
----------
gamma : float
Lagrange multiplier when computing confidence interval
Returns
-------
diff : float
The difference between the log-liklihood when the Lagrange
multiplier is gamma and a pre-specified value
"""
denom = np.sum((self.endog - gamma) ** -1)
new_weights = (self.endog - gamma) ** -1 / denom
return -2 * np.sum(np.log(self.nobs * new_weights)) - \
self.r0
def _opt_var(self, nuisance_mu, pval=False):
"""
This is the function to be optimized over a nuisance mean parameter
to determine the likelihood ratio for the variance
Parameters
----------
nuisance_mu : float
Value of a nuisance mean parameter
Returns
-------
llr : float
Log likelihood of a pre-specified variance holding the nuisance
parameter constant
"""
endog = self.endog
nobs = self.nobs
sig_data = ((endog - nuisance_mu) ** 2 \
- self.sig2_0)
mu_data = (endog - nuisance_mu)
est_vect = np.column_stack((mu_data, sig_data))
eta_star = self._modif_newton(np.array([1. / nobs,
1. / nobs]), est_vect,
np.ones(nobs) * (1. / nobs))
denom = 1 + np.dot(eta_star, est_vect.T)
self.new_weights = 1. / nobs * 1. / denom
llr = np.sum(np.log(nobs * self.new_weights))
if pval: # Used for contour plotting
return chi2.sf(-2 * llr, 1)
return -2 * llr
def _ci_limits_var(self, var):
"""
Used to determine the confidence intervals for the variance.
It calls test_var and when called by an optimizer,
finds the value of sig2_0 that is chi2.ppf(significance-level)
Parameters
----------
var_test : float
Hypothesized value of the variance
Returns
-------
diff : float
The difference between the log likelihood ratio at var_test and a
pre-specified value.
"""
return self.test_var(var)[0] - self.r0
def _opt_skew(self, nuis_params):
"""
Called by test_skew. This function is optimized over
nuisance parameters mu and sigma
Parameters
----------
nuis_params : 1darray
An array with a nuisance mean and variance parameter
Returns
-------
llr : float
The log likelihood ratio of a pre-specified skewness holding
the nuisance parameters constant.
"""
endog = self.endog
nobs = self.nobs
mu_data = endog - nuis_params[0]
sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1]
skew_data = ((((endog - nuis_params[0]) ** 3) /
(nuis_params[1] ** 1.5))) - self.skew0
est_vect = np.column_stack((mu_data, sig_data, skew_data))
eta_star = self._modif_newton(np.array([1. / nobs,
1. / nobs,
1. / nobs]), est_vect,
np.ones(nobs) * (1. / nobs))
denom = 1. + np.dot(eta_star, est_vect.T)
self.new_weights = 1. / nobs * 1. / denom
llr = np.sum(np.log(nobs * self.new_weights))
return -2 * llr
def _opt_kurt(self, nuis_params):
"""
Called by test_kurt. This function is optimized over
nuisance parameters mu and sigma
Parameters
----------
nuis_params : 1darray
An array with a nuisance mean and variance parameter
Returns
-------
llr : float
The log likelihood ratio of a pre-speified kurtosis holding the
nuisance parameters constant
"""
endog = self.endog
nobs = self.nobs
mu_data = endog - nuis_params[0]
sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1]
kurt_data = (((((endog - nuis_params[0]) ** 4) / \
(nuis_params[1] ** 2))) - 3) - self.kurt0
est_vect = np.column_stack((mu_data, sig_data, kurt_data))
eta_star = self._modif_newton(np.array([1. / nobs,
1. / nobs,
1. / nobs]), est_vect,
np.ones(nobs) * (1. / nobs))
denom = 1 + np.dot(eta_star, est_vect.T)
self.new_weights = 1. / nobs * 1. / denom
llr = np.sum(np.log(nobs * self.new_weights))
return -2 * llr
def _opt_skew_kurt(self, nuis_params):
"""
Called by test_joint_skew_kurt. This function is optimized over
nuisance parameters mu and sigma
Parameters
----------
nuis_params : 1darray
An array with a nuisance mean and variance parameter
Returns
------
llr : float
The log likelihood ratio of a pre-speified skewness and
kurtosis holding the nuisance parameters constant.
"""
endog = self.endog
nobs = self.nobs
mu_data = endog - nuis_params[0]
sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1]
skew_data = ((((endog - nuis_params[0]) ** 3) / \
(nuis_params[1] ** 1.5))) - self.skew0
kurt_data = (((((endog - nuis_params[0]) ** 4) / \
(nuis_params[1] ** 2))) - 3) - self.kurt0
est_vect = np.column_stack((mu_data, sig_data, skew_data, kurt_data))
eta_star = self._modif_newton(np.array([1. / nobs,
1. / nobs,
1. / nobs,
1. / nobs]), est_vect,
np.ones(nobs) * (1. / nobs))
denom = 1. + np.dot(eta_star, est_vect.T)
self.new_weights = 1. / nobs * 1. / denom
llr = np.sum(np.log(nobs * self.new_weights))
return -2 * llr
def _ci_limits_skew(self, skew):
"""
Parameters
----------
skew0 : float
Hypothesized value of skewness
Returns
-------
diff : float
The difference between the log likelihood ratio at skew and a
pre-specified value.
"""
return self.test_skew(skew)[0] - self.r0
def _ci_limits_kurt(self, kurt):
"""
Parameters
----------
skew0 : float
Hypothesized value of kurtosis
Returns
-------
diff : float
The difference between the log likelihood ratio at kurt and a
pre-specified value.
"""
return self.test_kurt(kurt)[0] - self.r0
def _opt_correl(self, nuis_params, corr0, endog, nobs, x0, weights0):
"""
Parameters
----------
nuis_params : 1darray
Array containing two nuisance means and two nuisance variances
Returns
-------
llr : float
The log-likelihood of the correlation coefficient holding nuisance
parameters constant
"""
mu1_data, mu2_data = (endog - nuis_params[::2]).T
sig1_data = mu1_data ** 2 - nuis_params[1]
sig2_data = mu2_data ** 2 - nuis_params[3]
correl_data = ((mu1_data * mu2_data) - corr0 *
(nuis_params[1] * nuis_params[3]) ** .5)
est_vect = np.column_stack((mu1_data, sig1_data,
mu2_data, sig2_data, correl_data))
eta_star = self._modif_newton(x0, est_vect, weights0)
denom = 1. + np.dot(est_vect, eta_star)
self.new_weights = 1. / nobs * 1. / denom
llr = np.sum(np.log(nobs * self.new_weights))
return -2 * llr
def _ci_limits_corr(self, corr):
return self.test_corr(corr)[0] - self.r0
[docs]class DescStatUV(_OptFuncts):
"""
A class to compute confidence intervals and hypothesis tests involving
mean, variance, kurtosis and skewness of a univariate random variable.
Parameters
----------
endog : 1darray
Data to be analyzed
Attributes
----------
endog : 1darray
Data to be analyzed
nobs : float
Number of observations
"""
def __init__(self, endog):
self.endog = np.squeeze(endog)
self.nobs = endog.shape[0]
def test_mean(self, mu0, return_weights=False):
"""
Returns - 2 x log-likelihood ratio, p-value and weights
for a hypothesis test of the mean.
Parameters
----------
mu0 : float
Mean value to be tested
return_weights : bool
If return_weights is True the function returns
the weights of the observations under the null hypothesis.
Default is False
Returns
-------
test_results : tuple
The log-likelihood ratio and p-value of mu0
"""
self.mu0 = mu0
endog = self.endog
nobs = self.nobs
eta_min = (1. - (1. / nobs)) / (self.mu0 - max(endog))
eta_max = (1. - (1. / nobs)) / (self.mu0 - min(endog))
eta_star = optimize.brentq(self._find_eta, eta_min, eta_max)
new_weights = (1. / nobs) * 1. / (1. + eta_star * (endog - self.mu0))
llr = -2 * np.sum(np.log(nobs * new_weights))
if return_weights:
return llr, chi2.sf(llr, 1), new_weights
else:
return llr, chi2.sf(llr, 1)
def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8,
gamma_low=-10 ** 10, gamma_high=10 ** 10):
"""
Returns the confidence interval for the mean.
Parameters
----------
sig : float
significance level. Default is .05
method : str
Root finding method, Can be 'nested-brent' or
'gamma'. Default is 'gamma'
'gamma' Tries to solve for the gamma parameter in the
Lagrange (see Owen pg 22) and then determine the weights.
'nested brent' uses brents method to find the confidence
intervals but must maximize the likelihood ratio on every
iteration.
gamma is generally much faster. If the optimizations does not
converge, try expanding the gamma_high and gamma_low
variable.
gamma_low : float
Lower bound for gamma when finding lower limit.
If function returns f(a) and f(b) must have different signs,
consider lowering gamma_low.
gamma_high : float
Upper bound for gamma when finding upper limit.
If function returns f(a) and f(b) must have different signs,
consider raising gamma_high.
epsilon : float
When using 'nested-brent', amount to decrease (increase)
from the maximum (minimum) of the data when
starting the search. This is to protect against the
likelihood ratio being zero at the maximum (minimum)
value of the data. If data is very small in absolute value
(<10 ``**`` -6) consider shrinking epsilon
When using 'gamma', amount to decrease (increase) the
minimum (maximum) by to start the search for gamma.
If function returns f(a) and f(b) must have different signs,
consider lowering epsilon.
Returns
-------
Interval : tuple
Confidence interval for the mean
"""
endog = self.endog
sig = 1 - sig
if method == 'nested-brent':
self.r0 = chi2.ppf(sig, 1)
middle = np.mean(endog)
epsilon_u = (max(endog) - np.mean(endog)) * epsilon
epsilon_l = (np.mean(endog) - min(endog)) * epsilon
ulim = optimize.brentq(self._ci_limits_mu, middle,
max(endog) - epsilon_u)
llim = optimize.brentq(self._ci_limits_mu, middle,
min(endog) + epsilon_l)
return llim, ulim
if method == 'gamma':
self.r0 = chi2.ppf(sig, 1)
gamma_star_l = optimize.brentq(self._find_gamma, gamma_low,
min(endog) - epsilon)
gamma_star_u = optimize.brentq(self._find_gamma, \
max(endog) + epsilon, gamma_high)
weights_low = ((endog - gamma_star_l) ** -1) / \
np.sum((endog - gamma_star_l) ** -1)
weights_high = ((endog - gamma_star_u) ** -1) / \
np.sum((endog - gamma_star_u) ** -1)
mu_low = np.sum(weights_low * endog)
mu_high = np.sum(weights_high * endog)
return mu_low, mu_high
def test_var(self, sig2_0, return_weights=False):
"""
Returns -2 x log-likelihood ratio and the p-value for the
hypothesized variance
Parameters
----------
sig2_0 : float
Hypothesized variance to be tested
return_weights : bool
If True, returns the weights that maximize the
likelihood of observing sig2_0. Default is False
Returns
-------
test_results : tuple
The log-likelihood ratio and the p_value of sig2_0
Examples
--------
>>> import numpy as np
>>> import statsmodels.api as sm
>>> random_numbers = np.random.standard_normal(1000)*100
>>> el_analysis = sm.emplike.DescStat(random_numbers)
>>> hyp_test = el_analysis.test_var(9500)
"""
self.sig2_0 = sig2_0
mu_max = max(self.endog)
mu_min = min(self.endog)
llr = optimize.fminbound(self._opt_var, mu_min, mu_max, \
full_output=1)[1]
p_val = chi2.sf(llr, 1)
if return_weights:
return llr, p_val, self.new_weights.T
else:
return llr, p_val
def ci_var(self, lower_bound=None, upper_bound=None, sig=.05):
"""
Returns the confidence interval for the variance.
Parameters
----------
lower_bound : float
The minimum value the lower confidence interval can
take. The p-value from test_var(lower_bound) must be lower
than 1 - significance level. Default is .99 confidence
limit assuming normality
upper_bound : float
The maximum value the upper confidence interval
can take. The p-value from test_var(upper_bound) must be lower
than 1 - significance level. Default is .99 confidence
limit assuming normality
sig : float
The significance level. Default is .05
Returns
-------
Interval : tuple
Confidence interval for the variance
Examples
--------
>>> import numpy as np
>>> import statsmodels.api as sm
>>> random_numbers = np.random.standard_normal(100)
>>> el_analysis = sm.emplike.DescStat(random_numbers)
>>> el_analysis.ci_var()
(0.7539322567470305, 1.229998852496268)
>>> el_analysis.ci_var(.5, 2)
(0.7539322567469926, 1.2299988524962664)
Notes
-----
If the function returns the error f(a) and f(b) must have
different signs, consider lowering lower_bound and raising
upper_bound.
"""
endog = self.endog
if upper_bound is None:
upper_bound = ((self.nobs - 1) * endog.var()) / \
(chi2.ppf(.0001, self.nobs - 1))
if lower_bound is None:
lower_bound = ((self.nobs - 1) * endog.var()) / \
(chi2.ppf(.9999, self.nobs - 1))
self.r0 = chi2.ppf(1 - sig, 1)
llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var())
ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound)
return llim, ulim
def plot_contour(self, mu_low, mu_high, var_low, var_high, mu_step,
var_step,
levs=[.2, .1, .05, .01, .001]):
"""
Returns a plot of the confidence region for a univariate
mean and variance.
Parameters
----------
mu_low : float
Lowest value of the mean to plot
mu_high : float
Highest value of the mean to plot
var_low : float
Lowest value of the variance to plot
var_high : float
Highest value of the variance to plot
mu_step : float
Increments to evaluate the mean
var_step : float
Increments to evaluate the mean
levs : list
Which values of significance the contour lines will be drawn.
Default is [.2, .1, .05, .01, .001]
Returns
-------
fig : matplotlib figure instance
The contour plot
"""
fig, ax = utils.create_mpl_ax()
ax.set_ylabel('Variance')
ax.set_xlabel('Mean')
mu_vect = list(np.arange(mu_low, mu_high, mu_step))
var_vect = list(np.arange(var_low, var_high, var_step))
z = []
for sig0 in var_vect:
self.sig2_0 = sig0
for mu0 in mu_vect:
z.append(self._opt_var(mu0, pval=True))
z = np.asarray(z).reshape(len(var_vect), len(mu_vect))
ax.contour(mu_vect, var_vect, z, levels=levs)
return fig
def test_skew(self, skew0, return_weights=False):
"""
Returns -2 x log-likelihood and p-value for the hypothesized
skewness.
Parameters
----------
skew0 : float
Skewness value to be tested
return_weights : bool
If True, function also returns the weights that
maximize the likelihood ratio. Default is False.
Returns
-------
test_results : tuple
The log-likelihood ratio and p_value of skew0
"""
self.skew0 = skew0
start_nuisance = np.array([self.endog.mean(),
self.endog.var()])
llr = optimize.fmin_powell(self._opt_skew, start_nuisance,
full_output=1, disp=0)[1]
p_val = chi2.sf(llr, 1)
if return_weights:
return llr, p_val, self.new_weights.T
return llr, p_val
def test_kurt(self, kurt0, return_weights=False):
"""
Returns -2 x log-likelihood and the p-value for the hypothesized
kurtosis.
Parameters
----------
kurt0 : float
Kurtosis value to be tested
return_weights : bool
If True, function also returns the weights that
maximize the likelihood ratio. Default is False.
Returns
-------
test_results : tuple
The log-likelihood ratio and p-value of kurt0
"""
self.kurt0 = kurt0
start_nuisance = np.array([self.endog.mean(),
self.endog.var()])
llr = optimize.fmin_powell(self._opt_kurt, start_nuisance,
full_output=1, disp=0)[1]
p_val = chi2.sf(llr, 1)
if return_weights:
return llr, p_val, self.new_weights.T
return llr, p_val
def test_joint_skew_kurt(self, skew0, kurt0, return_weights=False):
"""
Returns - 2 x log-likelihood and the p-value for the joint
hypothesis test for skewness and kurtosis
Parameters
----------
skew0 : float
Skewness value to be tested
kurt0 : float
Kurtosis value to be tested
return_weights : bool
If True, function also returns the weights that
maximize the likelihood ratio. Default is False.
Returns
-------
test_results : tuple
The log-likelihood ratio and p-value of the joint hypothesis test.
"""
self.skew0 = skew0
self.kurt0 = kurt0
start_nuisance = np.array([self.endog.mean(),
self.endog.var()])
llr = optimize.fmin_powell(self._opt_skew_kurt, start_nuisance,
full_output=1, disp=0)[1]
p_val = chi2.sf(llr, 2)
if return_weights:
return llr, p_val, self.new_weights.T
return llr, p_val
def ci_skew(self, sig=.05, upper_bound=None, lower_bound=None):
"""
Returns the confidence interval for skewness.
Parameters
----------
sig : float
The significance level. Default is .05
upper_bound : float
Maximum value of skewness the upper limit can be.
Default is .99 confidence limit assuming normality.
lower_bound : float
Minimum value of skewness the lower limit can be.
Default is .99 confidence level assuming normality.
Returns
-------
Interval : tuple
Confidence interval for the skewness
Notes
-----
If function returns f(a) and f(b) must have different signs, consider
expanding lower and upper bounds
"""
nobs = self.nobs
endog = self.endog
if upper_bound is None:
upper_bound = skew(endog) + \
2.5 * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5
if lower_bound is None:
lower_bound = skew(endog) - \
2.5 * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5
self.r0 = chi2.ppf(1 - sig, 1)
llim = optimize.brentq(self._ci_limits_skew, lower_bound, skew(endog))
ulim = optimize.brentq(self._ci_limits_skew, skew(endog), upper_bound)
return llim, ulim
def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None):
"""
Returns the confidence interval for kurtosis.
Parameters
----------
sig : float
The significance level. Default is .05
upper_bound : float
Maximum value of kurtosis the upper limit can be.
Default is .99 confidence limit assuming normality.
lower_bound : float
Minimum value of kurtosis the lower limit can be.
Default is .99 confidence limit assuming normality.
Returns
-------
Interval : tuple
Lower and upper confidence limit
Notes
-----
For small n, upper_bound and lower_bound may have to be
provided by the user. Consider using test_kurt to find
values close to the desired significance level.
If function returns f(a) and f(b) must have different signs, consider
expanding the bounds.
"""
endog = self.endog
nobs = self.nobs
if upper_bound is None:
upper_bound = kurtosis(endog) + \
(2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5) * \
(((nobs ** 2.) - 1.) / ((nobs - 3.) *\
(nobs + 5.))) ** .5)
if lower_bound is None:
lower_bound = kurtosis(endog) - \
(2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5) * \
(((nobs ** 2.) - 1.) / ((nobs - 3.) *\
(nobs + 5.))) ** .5)
self.r0 = chi2.ppf(1 - sig, 1)
llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \
kurtosis(endog))
ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \
upper_bound)
return llim, ulim
[docs]class DescStatMV(_OptFuncts):
"""
A class for conducting inference on multivariate means and correlation.
Parameters
----------
endog : ndarray
Data to be analyzed
Attributes
----------
endog : ndarray
Data to be analyzed
nobs : float
Number of observations
"""
def __init__(self, endog):
self.endog = endog
self.nobs = endog.shape[0]
def mv_test_mean(self, mu_array, return_weights=False):
"""
Returns -2 x log likelihood and the p-value
for a multivariate hypothesis test of the mean
Parameters
----------
mu_array : 1d array
Hypothesized values for the mean. Must have same number of
elements as columns in endog
return_weights : bool
If True, returns the weights that maximize the
likelihood of mu_array. Default is False.
Returns
-------
test_results : tuple
The log-likelihood ratio and p-value for mu_array
"""
endog = self.endog
nobs = self.nobs
if len(mu_array) != endog.shape[1]:
raise ValueError('mu_array must have the same number of '
'elements as the columns of the data.')
mu_array = mu_array.reshape(1, endog.shape[1])
means = np.ones((endog.shape[0], endog.shape[1]))
means = mu_array * means
est_vect = endog - means
start_vals = 1. / nobs * np.ones(endog.shape[1])
eta_star = self._modif_newton(start_vals, est_vect,
np.ones(nobs) * (1. / nobs))
denom = 1 + np.dot(eta_star, est_vect.T)
self.new_weights = 1 / nobs * 1 / denom
llr = -2 * np.sum(np.log(nobs * self.new_weights))
p_val = chi2.sf(llr, mu_array.shape[1])
if return_weights:
return llr, p_val, self.new_weights.T
else:
return llr, p_val
def mv_mean_contour(self, mu1_low, mu1_upp, mu2_low, mu2_upp, step1, step2,
levs=(.001, .01, .05, .1, .2), var1_name=None,
var2_name=None, plot_dta=False):
"""
Creates a confidence region plot for the mean of bivariate data
Parameters
----------
m1_low : float
Minimum value of the mean for variable 1
m1_upp : float
Maximum value of the mean for variable 1
mu2_low : float
Minimum value of the mean for variable 2
mu2_upp : float
Maximum value of the mean for variable 2
step1 : float
Increment of evaluations for variable 1
step2 : float
Increment of evaluations for variable 2
levs : list
Levels to be drawn on the contour plot.
Default = (.001, .01, .05, .1, .2)
plot_dta : bool
If True, makes a scatter plot of the data on
top of the contour plot. Defaultis False.
var1_name : str
Name of variable 1 to be plotted on the x-axis
var2_name : str
Name of variable 2 to be plotted on the y-axis
Notes
-----
The smaller the step size, the more accurate the intervals
will be
If the function returns optimization failed, consider narrowing
the boundaries of the plot
Examples
--------
>>> import statsmodels.api as sm
>>> two_rvs = np.random.standard_normal((20,2))
>>> el_analysis = sm.emplike.DescStat(two_rvs)
>>> contourp = el_analysis.mv_mean_contour(-2, 2, -2, 2, .1, .1)
>>> contourp.show()
"""
if self.endog.shape[1] != 2:
raise ValueError('Data must contain exactly two variables')
fig, ax = utils.create_mpl_ax()
if var2_name is None:
ax.set_ylabel('Variable 2')
else:
ax.set_ylabel(var2_name)
if var1_name is None:
ax.set_xlabel('Variable 1')
else:
ax.set_xlabel(var1_name)
x = np.arange(mu1_low, mu1_upp, step1)
y = np.arange(mu2_low, mu2_upp, step2)
pairs = itertools.product(x, y)
z = []
for i in pairs:
z.append(self.mv_test_mean(np.asarray(i))[0])
X, Y = np.meshgrid(x, y)
z = np.asarray(z)
z = z.reshape(X.shape[1], Y.shape[0])
ax.contour(x, y, z.T, levels=levs)
if plot_dta:
ax.plot(self.endog[:, 0], self.endog[:, 1], 'bo')
return fig
def test_corr(self, corr0, return_weights=0):
"""
Returns -2 x log-likelihood ratio and p-value for the
correlation coefficient between 2 variables
Parameters
----------
corr0 : float
Hypothesized value to be tested
return_weights : bool
If true, returns the weights that maximize
the log-likelihood at the hypothesized value
"""
nobs = self.nobs
endog = self.endog
if endog.shape[1] != 2:
raise NotImplementedError('Correlation matrix not yet implemented')
nuis0 = np.array([endog[:, 0].mean(),
endog[:, 0].var(),
endog[:, 1].mean(),
endog[:, 1].var()])
x0 = np.zeros(5)
weights0 = np.array([1. / nobs] * int(nobs))
args = (corr0, endog, nobs, x0, weights0)
llr = optimize.fmin(self._opt_correl, nuis0, args=args,
full_output=1, disp=0)[1]
p_val = chi2.sf(llr, 1)
if return_weights:
return llr, p_val, self.new_weights.T
return llr, p_val
def ci_corr(self, sig=.05, upper_bound=None, lower_bound=None):
"""
Returns the confidence intervals for the correlation coefficient
Parameters
----------
sig : float
The significance level. Default is .05
upper_bound : float
Maximum value the upper confidence limit can be.
Default is 99% confidence limit assuming normality.
lower_bound : float
Minimum value the lower confidence limit can be.
Default is 99% confidence limit assuming normality.
Returns
-------
interval : tuple
Confidence interval for the correlation
"""
endog = self.endog
nobs = self.nobs
self.r0 = chi2.ppf(1 - sig, 1)
point_est = np.corrcoef(endog[:, 0], endog[:, 1])[0, 1]
if upper_bound is None:
upper_bound = min(.999, point_est + \
2.5 * ((1. - point_est ** 2.) / \
(nobs - 2.)) ** .5)
if lower_bound is None:
lower_bound = max(- .999, point_est - \
2.5 * (np.sqrt((1. - point_est ** 2.) / \
(nobs - 2.))))
llim = optimize.brenth(self._ci_limits_corr, lower_bound, point_est)
ulim = optimize.brenth(self._ci_limits_corr, point_est, upper_bound)
return llim, ulim