"""
Empirical CDF Functions
"""
import numpy as np
from scipy.interpolate import interp1d
def _conf_set(F, alpha=.05):
r"""
Constructs a Dvoretzky-Kiefer-Wolfowitz confidence band for the eCDF.
Parameters
----------
F : array_like
The empirical distributions
alpha : float
Set alpha for a (1 - alpha) % confidence band.
Notes
-----
Based on the DKW inequality.
.. math:: P \left( \sup_x \left| F(x) - \hat(F)_n(X) \right| > \epsilon \right) \leq 2e^{-2n\epsilon^2}
References
----------
Wasserman, L. 2006. `All of Nonparametric Statistics`. Springer.
"""
nobs = len(F)
epsilon = np.sqrt(np.log(2./alpha) / (2 * nobs))
lower = np.clip(F - epsilon, 0, 1)
upper = np.clip(F + epsilon, 0, 1)
return lower, upper
[docs]class StepFunction(object):
"""
A basic step function.
Values at the ends are handled in the simplest way possible:
everything to the left of x[0] is set to ival; everything
to the right of x[-1] is set to y[-1].
Parameters
----------
x : array_like
y : array_like
ival : float
ival is the value given to the values to the left of x[0]. Default
is 0.
sorted : bool
Default is False.
side : {'left', 'right'}, optional
Default is 'left'. Defines the shape of the intervals constituting the
steps. 'right' correspond to [a, b) intervals and 'left' to (a, b].
Examples
--------
>>> import numpy as np
>>> from statsmodels.distributions.empirical_distribution import StepFunction
>>>
>>> x = np.arange(20)
>>> y = np.arange(20)
>>> f = StepFunction(x, y)
>>>
>>> print(f(3.2))
3.0
>>> print(f([[3.2,4.5],[24,-3.1]]))
[[ 3. 4.]
[ 19. 0.]]
>>> f2 = StepFunction(x, y, side='right')
>>>
>>> print(f(3.0))
2.0
>>> print(f2(3.0))
3.0
"""
def __init__(self, x, y, ival=0., sorted=False, side='left'):
if side.lower() not in ['right', 'left']:
msg = "side can take the values 'right' or 'left'"
raise ValueError(msg)
self.side = side
_x = np.asarray(x)
_y = np.asarray(y)
if _x.shape != _y.shape:
msg = "x and y do not have the same shape"
raise ValueError(msg)
if len(_x.shape) != 1:
msg = 'x and y must be 1-dimensional'
raise ValueError(msg)
self.x = np.r_[-np.inf, _x]
self.y = np.r_[ival, _y]
if not sorted:
asort = np.argsort(self.x)
self.x = np.take(self.x, asort, 0)
self.y = np.take(self.y, asort, 0)
self.n = self.x.shape[0]
def __call__(self, time):
tind = np.searchsorted(self.x, time, self.side) - 1
return self.y[tind]
[docs]class ECDF(StepFunction):
"""
Return the Empirical CDF of an array as a step function.
Parameters
----------
x : array_like
Observations
side : {'left', 'right'}, optional
Default is 'right'. Defines the shape of the intervals constituting the
steps. 'right' correspond to [a, b) intervals and 'left' to (a, b].
Returns
-------
Empirical CDF as a step function.
Examples
--------
>>> import numpy as np
>>> from statsmodels.distributions.empirical_distribution import ECDF
>>>
>>> ecdf = ECDF([3, 3, 1, 4])
>>>
>>> ecdf([3, 55, 0.5, 1.5])
array([ 0.75, 1. , 0. , 0.25])
"""
def __init__(self, x, side='right'):
x = np.array(x, copy=True)
x.sort()
nobs = len(x)
y = np.linspace(1./nobs,1,nobs)
super(ECDF, self).__init__(x, y, side=side, sorted=True)
# TODO: make `step` an arg and have a linear interpolation option?
# This is the path with `step` is True
# If `step` is False, a previous version of the code read
# `return interp1d(x,y,drop_errors=False,fill_values=ival)`
# which would have raised a NameError if hit, so would need to be
# fixed. See GH#5701.
[docs]def monotone_fn_inverter(fn, x, vectorized=True, **keywords):
"""
Given a monotone function fn (no checking is done to verify monotonicity)
and a set of x values, return an linearly interpolated approximation
to its inverse from its values on x.
"""
x = np.asarray(x)
if vectorized:
y = fn(x, **keywords)
else:
y = []
for _x in x:
y.append(fn(_x, **keywords))
y = np.array(y)
a = np.argsort(y)
return interp1d(y[a], x[a])
if __name__ == "__main__":
#TODO: Make sure everything is correctly aligned and make a plotting
# function
from urllib.request import urlopen
import matplotlib.pyplot as plt
nerve_data = urlopen('http://www.statsci.org/data/general/nerve.txt')
nerve_data = np.loadtxt(nerve_data)
x = nerve_data / 50. # was in 1/50 seconds
cdf = ECDF(x)
x.sort()
F = cdf(x)
plt.step(x, F, where='post')
lower, upper = _conf_set(F)
plt.step(x, lower, 'r', where='post')
plt.step(x, upper, 'r', where='post')
plt.xlim(0, 1.5)
plt.ylim(0, 1.05)
plt.vlines(x, 0, .05)
plt.show()