"""
The RegressionFDR class implements the 'Knockoff' approach for
controlling false discovery rates (FDR) in regression analysis.
The knockoff approach does not require standard errors. Thus one
application is to provide inference for parameter estimates that are
not smooth functions of the data. For example, the knockoff approach
can be used to do inference for parameter estimates obtained from the
LASSO, of from stepwise variable selection.
The knockoff approach controls FDR for parameter estimates that may be
dependent, such as coefficient estimates in a multiple regression
model.
The knockoff approach is applicable whenever the test statistic can be
computed entirely from x'y and x'x, where x is the design matrix and y
is the vector of responses.
Reference
---------
Rina Foygel Barber, Emmanuel Candes (2015). Controlling the False
Discovery Rate via Knockoffs. Annals of Statistics 43:5.
http://statweb.stanford.edu/~candes/papers/FDR_regression.pdf
"""
import numpy as np
import pandas as pd
from statsmodels.iolib import summary2
[docs]class RegressionFDR(object):
"""
Control FDR in a regression procedure.
Parameters
----------
endog : array_like
The dependent variable of the regression
exog : array_like
The independent variables of the regression
regeffects : RegressionEffects instance
An instance of a RegressionEffects class that can compute
effect sizes for the regression coefficients.
method : str
The approach used to assess and control FDR, currently
must be 'knockoff'.
Returns
-------
Returns an instance of the RegressionFDR class. The `fdr` attribute
holds the estimated false discovery rates.
Notes
-----
This class Implements the knockoff method of Barber and Candes.
This is an approach for controlling the FDR of a variety of
regression estimation procedures, including correlation
coefficients, OLS regression, OLS with forward selection, and
LASSO regression.
For other approaches to FDR control in regression, see the
statsmodels.stats.multitest module. Methods provided in that
module use Z-scores or p-values, and therefore require standard
errors for the coefficient estimates to be available.
The default method for constructing the augmented design matrix is
the 'equivariant' approach, set `design_method='sdp'` to use an
alternative approach involving semidefinite programming. See
Barber and Candes for more information about both approaches. The
sdp approach requires that the cvxopt package be installed.
"""
def __init__(self, endog, exog, regeffects, method="knockoff",
**kwargs):
if hasattr(exog, "columns"):
self.xnames = exog.columns
else:
self.xnames = ["x%d" % j for j in range(exog.shape[1])]
exog = np.asarray(exog)
endog = np.asarray(endog)
if "design_method" not in kwargs:
kwargs["design_method"] = "equi"
nobs, nvar = exog.shape
if kwargs["design_method"] == "equi":
exog1, exog2, _ = _design_knockoff_equi(exog)
elif kwargs["design_method"] == "sdp":
exog1, exog2, _ = _design_knockoff_sdp(exog)
endog = endog - np.mean(endog)
self.endog = endog
self.exog = np.concatenate((exog1, exog2), axis=1)
self.exog1 = exog1
self.exog2 = exog2
self.stats = regeffects.stats(self)
unq, inv, cnt = np.unique(self.stats, return_inverse=True,
return_counts=True)
# The denominator of the FDR
cc = np.cumsum(cnt)
denom = len(self.stats) - cc + cnt
denom[denom < 1] = 1
# The numerator of the FDR
ii = np.searchsorted(unq, -unq, side='right') - 1
numer = cc[ii]
numer[ii < 0] = 0
# The knockoff+ estimated FDR
fdrp = (1 + numer) / denom
# The knockoff estimated FDR
fdr = numer / denom
self.fdr = fdr[inv]
self.fdrp = fdrp[inv]
self._ufdr = fdr
self._unq = unq
df = pd.DataFrame(index=self.xnames)
df["Stat"] = self.stats
df["FDR+"] = self.fdrp
df["FDR"] = self.fdr
self.fdr_df = df
def threshold(self, tfdr):
"""
Returns the threshold statistic for a given target FDR.
"""
if np.min(self._ufdr) <= tfdr:
return self._unq[self._ufdr <= tfdr][0]
else:
return np.inf
def summary(self):
summ = summary2.Summary()
summ.add_title("Regression FDR results")
summ.add_df(self.fdr_df)
return summ
def _design_knockoff_sdp(exog):
"""
Use semidefinite programming to construct a knockoff design
matrix.
Requires cvxopt to be installed.
"""
try:
from cvxopt import solvers, matrix
except ImportError:
raise ValueError("SDP knockoff designs require installation of cvxopt")
nobs, nvar = exog.shape
# Standardize exog
xnm = np.sum(exog**2, 0)
xnm = np.sqrt(xnm)
exog = exog / xnm
Sigma = np.dot(exog.T, exog)
c = matrix(-np.ones(nvar))
h0 = np.concatenate((np.zeros(nvar), np.ones(nvar)))
h0 = matrix(h0)
G0 = np.concatenate((-np.eye(nvar), np.eye(nvar)), axis=0)
G0 = matrix(G0)
h1 = 2 * Sigma
h1 = matrix(h1)
i, j = np.diag_indices(nvar)
G1 = np.zeros((nvar*nvar, nvar))
G1[i*nvar + j, i] = 1
G1 = matrix(G1)
solvers.options['show_progress'] = False
sol = solvers.sdp(c, G0, h0, [G1], [h1])
sl = np.asarray(sol['x']).ravel()
xcov = np.dot(exog.T, exog)
exogn = _get_knmat(exog, xcov, sl)
return exog, exogn, sl
def _design_knockoff_equi(exog):
"""
Construct an equivariant design matrix for knockoff analysis.
Follows the 'equi-correlated knockoff approach of equation 2.4 in
Barber and Candes.
Constructs a pair of design matrices exogs, exogn such that exogs
is a scaled/centered version of the input matrix exog, exogn is
another matrix of the same shape with cov(exogn) = cov(exogs), and
the covariances between corresponding columns of exogn and exogs
are as small as possible.
"""
nobs, nvar = exog.shape
if nobs < 2*nvar:
msg = "The equivariant knockoff can ony be used when n >= 2*p"
raise ValueError(msg)
# Standardize exog
xnm = np.sum(exog**2, 0)
xnm = np.sqrt(xnm)
exog = exog / xnm
xcov = np.dot(exog.T, exog)
ev, _ = np.linalg.eig(xcov)
evmin = np.min(ev)
sl = min(2*evmin, 1)
sl = sl * np.ones(nvar)
exogn = _get_knmat(exog, xcov, sl)
return exog, exogn, sl
def _get_knmat(exog, xcov, sl):
# Utility function, see equation 2.2 of Barber & Candes.
nobs, nvar = exog.shape
ash = np.linalg.inv(xcov)
ash *= -np.outer(sl, sl)
i, j = np.diag_indices(nvar)
ash[i, j] += 2 * sl
umat = np.random.normal(size=(nobs, nvar))
u, _ = np.linalg.qr(exog)
umat -= np.dot(u, np.dot(u.T, umat))
umat, _ = np.linalg.qr(umat)
ashr, xc, _ = np.linalg.svd(ash, 0)
ashr *= np.sqrt(xc)
ashr = ashr.T
ex = (sl[:, None] * np.linalg.solve(xcov, exog.T)).T
exogn = exog - ex + np.dot(umat, ashr)
return exogn