Source code for statsmodels.stats.correlation_tools

# -*- coding: utf-8 -*-
"""

Created on Fri Aug 17 13:10:52 2012

Author: Josef Perktold
License: BSD-3
"""

from statsmodels.tools.sm_exceptions import (IterationLimitWarning,
    iteration_limit_doc)
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import svds
from scipy.optimize import fminbound

def clip_evals(x, value=0): #threshold=0, value=0):
    evals, evecs = np.linalg.eigh(x)
    clipped = np.any(evals < 0)
    x_new = np.dot(evecs * np.maximum(evals, value), evecs.T)
    return x_new, clipped


[docs]def corr_nearest(corr, threshold=1e-15, n_fact=100): ''' Find the nearest correlation matrix that is positive semi-definite. The function iteratively adjust the correlation matrix by clipping the eigenvalues of a difference matrix. The diagonal elements are set to one. Parameters ---------- corr : ndarray, (k, k) initial correlation matrix threshold : float clipping threshold for smallest eigenvalue, see Notes n_fact : int or float factor to determine the maximum number of iterations. The maximum number of iterations is the integer part of the number of columns in the correlation matrix times n_fact. Returns ------- corr_new : ndarray, (optional) corrected correlation matrix Notes ----- The smallest eigenvalue of the corrected correlation matrix is approximately equal to the ``threshold``. If the threshold=0, then the smallest eigenvalue of the correlation matrix might be negative, but zero within a numerical error, for example in the range of -1e-16. Assumes input correlation matrix is symmetric. Stops after the first step if correlation matrix is already positive semi-definite or positive definite, so that smallest eigenvalue is above threshold. In this case, the returned array is not the original, but is equal to it within numerical precision. See Also -------- corr_clipped cov_nearest ''' k_vars = corr.shape[0] if k_vars != corr.shape[1]: raise ValueError("matrix is not square") diff = np.zeros(corr.shape) x_new = corr.copy() diag_idx = np.arange(k_vars) for ii in range(int(len(corr) * n_fact)): x_adj = x_new - diff x_psd, clipped = clip_evals(x_adj, value=threshold) if not clipped: x_new = x_psd break diff = x_psd - x_adj x_new = x_psd.copy() x_new[diag_idx, diag_idx] = 1 else: import warnings warnings.warn(iteration_limit_doc, IterationLimitWarning) return x_new
[docs]def corr_clipped(corr, threshold=1e-15): ''' Find a near correlation matrix that is positive semi-definite This function clips the eigenvalues, replacing eigenvalues smaller than the threshold by the threshold. The new matrix is normalized, so that the diagonal elements are one. Compared to corr_nearest, the distance between the original correlation matrix and the positive definite correlation matrix is larger, however, it is much faster since it only computes eigenvalues once. Parameters ---------- corr : ndarray, (k, k) initial correlation matrix threshold : float clipping threshold for smallest eigenvalue, see Notes Returns ------- corr_new : ndarray, (optional) corrected correlation matrix Notes ----- The smallest eigenvalue of the corrected correlation matrix is approximately equal to the ``threshold``. In examples, the smallest eigenvalue can be by a factor of 10 smaller than the threshold, e.g. threshold 1e-8 can result in smallest eigenvalue in the range between 1e-9 and 1e-8. If the threshold=0, then the smallest eigenvalue of the correlation matrix might be negative, but zero within a numerical error, for example in the range of -1e-16. Assumes input correlation matrix is symmetric. The diagonal elements of returned correlation matrix is set to ones. If the correlation matrix is already positive semi-definite given the threshold, then the original correlation matrix is returned. ``cov_clipped`` is 40 or more times faster than ``cov_nearest`` in simple example, but has a slightly larger approximation error. See Also -------- corr_nearest cov_nearest ''' x_new, clipped = clip_evals(corr, value=threshold) if not clipped: return corr #cov2corr x_std = np.sqrt(np.diag(x_new)) x_new = x_new / x_std / x_std[:,None] return x_new
[docs]def cov_nearest(cov, method='clipped', threshold=1e-15, n_fact=100, return_all=False): ''' Find the nearest covariance matrix that is postive (semi-) definite This leaves the diagonal, i.e. the variance, unchanged Parameters ---------- cov : ndarray, (k,k) initial covariance matrix method : string if "clipped", then the faster but less accurate ``corr_clipped`` is used. if "nearest", then ``corr_nearest`` is used threshold : float clipping threshold for smallest eigen value, see Notes nfact : int or float factor to determine the maximum number of iterations in ``corr_nearest``. See its doc string return_all : bool if False (default), then only the covariance matrix is returned. If True, then correlation matrix and standard deviation are additionally returned. Returns ------- cov_ : ndarray corrected covariance matrix corr_ : ndarray, (optional) corrected correlation matrix std_ : ndarray, (optional) standard deviation Notes ----- This converts the covariance matrix to a correlation matrix. Then, finds the nearest correlation matrix that is positive semidefinite and converts it back to a covariance matrix using the initial standard deviation. The smallest eigenvalue of the intermediate correlation matrix is approximately equal to the ``threshold``. If the threshold=0, then the smallest eigenvalue of the correlation matrix might be negative, but zero within a numerical error, for example in the range of -1e-16. Assumes input covariance matrix is symmetric. See Also -------- corr_nearest corr_clipped ''' from statsmodels.stats.moment_helpers import cov2corr, corr2cov cov_, std_ = cov2corr(cov, return_std=True) if method == 'clipped': corr_ = corr_clipped(cov_, threshold=threshold) elif method == 'nearest': corr_ = corr_nearest(cov_, threshold=threshold, n_fact=n_fact) cov_ = corr2cov(corr_, std_) if return_all: return cov_, corr_, std_ else: return cov_
def _nmono_linesearch(obj, grad, x, d, obj_hist, M=10, sig1=0.1, sig2=0.9, gam=1e-4, maxiter=100): """ Implements the non-monotone line search of Grippo et al. (1986), as described in Birgin, Martinez and Raydan (2013). Parameters ---------- obj : real-valued function The objective function, to be minimized grad : vector-valued function The gradient of the objective function x : array_like The starting point for the line search d : array_like The search direction obj_hist : array_like Objective function history (must contain at least one value) M : positive integer Number of previous function points to consider (see references for details). sig1 : real Tuning parameter, see references for details. sig2 : real Tuning parameter, see references for details. gam : real Tuning parameter, see references for details. maxiter : positive integer The maximum number of iterations; returns Nones if convergence does not occur by this point Returns ------- alpha : real The step value x : Array_like The function argument at the final step obval : Real The function value at the final step g : Array_like The gradient at the final step Notes ----- The basic idea is to take a big step in the direction of the gradient, even if the function value is not decreased (but there is a maximum allowed increase in terms of the recent history of the iterates). References ---------- Grippo L, Lampariello F, Lucidi S (1986). A Nonmonotone Line Search Technique for Newton's Method. SIAM Journal on Numerical Analysis, 23, 707-716. E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected gradient methods: Review and perspectives. Journal of Statistical Software (preprint). """ alpha = 1. last_obval = obj(x) obj_max = max(obj_hist[-M:]) for iter in range(maxiter): obval = obj(x + alpha*d) g = grad(x) gtd = (g * d).sum() if obval <= obj_max + gam*alpha*gtd: return alpha, x + alpha*d, obval, g a1 = -0.5*alpha**2*gtd / (obval - last_obval - alpha*gtd) if (sig1 <= a1) and (a1 <= sig2*alpha): alpha = a1 else: alpha /= 2. last_obval = obval return None, None, None, None def _spg_optim(func, grad, start, project, maxiter=1e4, M=10, ctol=1e-3, maxiter_nmls=200, lam_min=1e-30, lam_max=1e30, sig1=0.1, sig2=0.9, gam=1e-4): """ Implements the spectral projected gradient method for minimizing a differentiable function on a convex domain. Parameters ---------- func : real valued function The objective function to be minimized. grad : real array-valued function The gradient of the objective function start : array_like The starting point project : function In-place projection of the argument to the domain of func. ... See notes regarding additional arguments Returns ------- rslt : Bunch rslt.params is the final iterate, other fields describe convergence status. Notes ----- This can be an effective heuristic algorithm for problems where no gauranteed algorithm for computing a global minimizer is known. There are a number of tuning parameters, but these generally should not be changed except for `maxiter` (positive integer) and `ctol` (small positive real). See the Birgin et al reference for more information about the tuning parameters. Reference --------- E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected gradient methods: Review and perspectives. Journal of Statistical Software (preprint). Available at: http://www.ime.usp.br/~egbirgin/publications/bmr5.pdf """ lam = min(10*lam_min, lam_max) params = start.copy() gval = grad(params) obj_hist = [func(params),] for itr in range(int(maxiter)): # Check convergence df = params - gval project(df) df -= params if np.max(np.abs(df)) < ctol: return Bunch(**{"Converged": True, "params": params, "objective_values": obj_hist, "Message": "Converged successfully"}) # The line search direction d = params - lam*gval project(d) d -= params # Carry out the nonmonotone line search alpha, params1, fval, gval1 = _nmono_linesearch(func, grad, params, d, obj_hist, M=M, sig1=sig1, sig2=sig2, gam=gam, maxiter=maxiter_nmls) if alpha is None: return Bunch(**{"Converged": False, "params": params, "objective_values": obj_hist, "Message": "Failed in nmono_linesearch"}) obj_hist.append(fval) s = params1 - params y = gval1 - gval sy = (s*y).sum() if sy <= 0: lam = lam_max else: ss = (s*s).sum() lam = max(lam_min, min(ss/sy, lam_max)) params = params1 gval = gval1 return Bunch(**{"Converged": False, "params": params, "objective_values": obj_hist, "Message": "spg_optim did not converge"}) def _project_correlation_factors(X): """ Project a matrix into the domain of matrices whose row-wise sums of squares are less than or equal to 1. The input matrix is modified in-place. """ nm = np.sqrt((X*X).sum(1)) ii = np.flatnonzero(nm > 1) if len(ii) > 0: X[ii,:] /= nm[ii][:, None] #TODO does this belong in a tools module somewhere? class Bunch(object): def __init__(self, **kwargs): self.__dict__.update(kwargs)
[docs]class FactoredPSDMatrix: """ Representation of a positive semidefinite matrix in factored form. The representation is constructed based on a vector `diag` and rectangular matrix `root`, such that the PSD matrix represented by the class instance is Diag + root * root', where Diag is the square diagonal matrix with `diag` on its main diagonal. Parameters ---------- diag : 1d array-like See above root : 2d array-like See above Notes ----- The matrix is represented internally in the form Diag^{1/2}(I + factor * scales * factor')Diag^{1/2}, where `Diag` and `scales` are diagonal matrices, and `factor` is an orthogonal matrix. """ def __init__(self, diag, root): self.diag = diag self.root = root root = root / np.sqrt(diag)[:, None] u, s, vt = np.linalg.svd(root, 0) self.factor = u self.scales = s**2
[docs] def to_matrix(self): """ Returns the PSD matrix represented by this instance as a full (square) matrix. """ return np.diag(self.diag) + np.dot(self.root, self.root.T)
[docs] def decorrelate(self, rhs): """ Decorrelate the columns of `rhs`. Parameters ---------- rhs : array-like A 2 dimensional array with the same number of rows as the PSD matrix represented by the class instance. Returns ------- C^{-1/2} * rhs, where C is the covariance matrix represented by this class instance. Notes ----- The returned matrix has the identity matrix as its row-wise population covariance matrix. This function exploits the factor structure for efficiency. """ # I + factor * qval * factor' is the inverse square root of # the covariance matrix in the homogeneous case where diag = # 1. qval = -1 + 1 / np.sqrt(1 + self.scales) # Decorrelate in the general case. rhs = rhs / np.sqrt(self.diag)[:, None] rhs1 = np.dot(self.factor.T, rhs) rhs1 *= qval[:, None] rhs1 = np.dot(self.factor, rhs1) rhs += rhs1 return rhs
[docs] def solve(self, rhs): """ Solve a linear system of equations with factor-structured coefficients. Parameters ---------- rhs : array-like A 2 dimensional array with the same number of rows as the PSD matrix represented by the class instance. Returns ------- C^{-1} * rhs, where C is the covariance matrix represented by this class instance. Notes ----- This function exploits the factor structure for efficiency. """ qval = -self.scales / (1 + self.scales) dr = np.sqrt(self.diag) rhs = rhs / dr[:, None] mat = qval[:, None] * np.dot(self.factor.T, rhs) rhs = rhs + np.dot(self.factor, mat) return rhs / dr[:, None]
[docs] def logdet(self): """ Returns the logarithm of the determinant of a factor-structured matrix. """ logdet = np.sum(np.log(self.diag)) logdet += np.sum(np.log(self.scales)) logdet += np.sum(np.log(1 + 1 / self.scales)) return logdet
[docs]def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30, lam_max=1e30, maxiter=1000): """ Find the nearest correlation matrix with factor structure to a given square matrix. Parameters ---------- corr : square array The target matrix (to which the nearest correlation matrix is sought). Must be square, but need not be positive semidefinite. rank : positive integer The rank of the factor structure of the solution, i.e., the number of linearly independent columns of X. ctol : positive real Convergence criterion. lam_min : float Tuning parameter for spectral projected gradient optimization (smallest allowed step in the search direction). lam_max : float Tuning parameter for spectral projected gradient optimization (largest allowed step in the search direction). maxiter : integer Maximum number of iterations in spectral projected gradient optimization. Returns ------- rslt : Bunch rslt.corr is a FactoredPSDMatrix defining the estimated correlation structure. Other fields of `rslt` contain returned values from spg_optim. Notes ----- A correlation matrix has factor structure if it can be written in the form I + XX' - diag(XX'), where X is n x k with linearly independent columns, and with each row having sum of squares at most equal to 1. The approximation is made in terms of the Frobenius norm. This routine is useful when one has an approximate correlation matrix that is not positive semidefinite, and there is need to estimate the inverse, square root, or inverse square root of the population correlation matrix. The factor structure allows these tasks to be done without constructing any n x n matrices. This is a non-convex problem with no known gauranteed globally convergent algorithm for computing the solution. Borsdof, Higham and Raydan (2010) compared several methods for this problem and found the spectral projected gradient (SPG) method (used here) to perform best. The input matrix `corr` can be a dense numpy array or any scipy sparse matrix. The latter is useful if the input matrix is obtained by thresholding a very large sample correlation matrix. If `corr` is sparse, the calculations are optimized to save memory, so no working matrix with more than 10^6 elements is constructed. References ---------- R Borsdof, N Higham, M Raydan (2010). Computing a nearest correlation matrix with factor structure. SIAM J Matrix Anal Appl, 31:5, 2603-2622. http://eprints.ma.man.ac.uk/1523/01/covered/MIMS_ep2009_87.pdf Examples -------- Hard thresholding a correlation matrix may result in a matrix that is not positive semidefinite. We can approximate a hard thresholded correlation matrix with a PSD matrix as follows, where `corr` is the input correlation matrix. >>> import numpy as np >>> from statsmodels.stats.correlation_tools import corr_nearest_factor >>> np.random.seed(1234) >>> b = 1.5 - np.random.rand(10, 1) >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10) >>> corr = np.corrcoef(x.T) >>> corr = corr * (np.abs(corr) >= 0.3) >>> rslt = corr_nearest_factor(corr, 3) """ p, _ = corr.shape # Starting values (following the PCA method in BHR). u,s,vt = svds(corr, rank) X = u * np.sqrt(s) nm = np.sqrt((X**2).sum(1)) ii = np.flatnonzero(nm > 1e-5) X[ii,:] /= nm[ii][:, None] # Zero the diagonal corr1 = corr.copy() if type(corr1) == np.ndarray: np.fill_diagonal(corr1, 0) elif sparse.issparse(corr1): corr1.setdiag(np.zeros(corr1.shape[0])) corr1.eliminate_zeros() corr1.sort_indices() else: raise ValueError("Matrix type not supported") # The gradient, from lemma 4.1 of BHR. def grad(X): gr = np.dot(X, np.dot(X.T, X)) if type(corr1) == np.ndarray: gr -= np.dot(corr1, X) else: gr -= corr1.dot(X) gr -= (X*X).sum(1)[:, None] * X return 4*gr # The objective function (sum of squared deviations between fitted # and observed arrays). def func(X): if type(corr1) == np.ndarray: M = np.dot(X, X.T) np.fill_diagonal(M, 0) M -= corr1 fval = (M*M).sum() return fval else: fval = 0. # Control the size of intermediates max_ws = 1e6 bs = int(max_ws / X.shape[0]) ir = 0 while ir < X.shape[0]: ir2 = min(ir+bs, X.shape[0]) u = np.dot(X[ir:ir2, :], X.T) ii = np.arange(u.shape[0]) u[ii, ir+ii] = 0 u -= np.asarray(corr1[ir:ir2, :].todense()) fval += (u*u).sum() ir += bs return fval rslt = _spg_optim(func, grad, X, _project_correlation_factors) root = rslt.params diag = 1 - (root**2).sum(1) soln = FactoredPSDMatrix(diag, root) rslt.corr = soln del rslt.params return rslt
[docs]def cov_nearest_factor_homog(cov, rank): """ Approximate an arbitrary square matrix with a factor-structured matrix of the form k*I + XX'. Parameters ---------- cov : array-like The input array, must be square but need not be positive semidefinite rank : positive integer The rank of the fitted factor structure Returns ------- A FactoredPSDMatrix instance containing the fitted matrix Notes ----- This routine is useful if one has an estimated covariance matrix that is not SPD, and the ultimate goal is to estimate the inverse, square root, or inverse square root of the true covariance matrix. The factor structure allows these tasks to be performed without constructing any n x n matrices. The calculations use the fact that if k is known, then X can be determined from the eigen-decomposition of cov - k*I, which can in turn be easily obtained form the eigen-decomposition of `cov`. Thus the problem can be reduced to a 1-dimensional search for k that does not require repeated eigen-decompositions. If the input matrix is sparse, then cov - k*I is also sparse, so the eigen-decomposition can be done effciciently using sparse routines. The one-dimensional search for the optimal value of k is not convex, so a local minimum could be obtained. Examples -------- Hard thresholding a covariance matrix may result in a matrix that is not positive semidefinite. We can approximate a hard thresholded covariance matrix with a PSD matrix as follows: >>> import numpy as np >>> np.random.seed(1234) >>> b = 1.5 - np.random.rand(10, 1) >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10) >>> cov = np.cov(x) >>> cov = cov * (np.abs(cov) >= 0.3) >>> rslt = cov_nearest_factor_homog(cov, 3) """ m, n = cov.shape Q, Lambda, _ = svds(cov, rank) if sparse.issparse(cov): QSQ = np.dot(Q.T, cov.dot(Q)) ts = cov.diagonal().sum() tss = cov.dot(cov).diagonal().sum() else: QSQ = np.dot(Q.T, np.dot(cov, Q)) ts = np.trace(cov) tss = np.trace(np.dot(cov, cov)) def fun(k): Lambda_t = Lambda - k v = tss + m*(k**2) + np.sum(Lambda_t**2) - 2*k*ts v += 2*k*np.sum(Lambda_t) - 2*np.sum(np.diag(QSQ) * Lambda_t) return v # Get the optimal decomposition k_opt = fminbound(fun, 0, 1e5) Lambda_opt = Lambda - k_opt fac_opt = Q * np.sqrt(Lambda_opt) diag = k_opt * np.ones(m, dtype=np.float64) #- (fac_opt**2).sum(1) return FactoredPSDMatrix(diag, fac_opt)
[docs]def corr_thresholded(data, minabs=None, max_elt=1e7): """ Construct a sparse matrix containing the thresholded row-wise correlation matrix from a data array. Parameters ---------- data : array_like The data from which the row-wise thresholded correlation matrix is to be computed. minabs : non-negative real The threshold value; correlation coefficients smaller in magnitude than minabs are set to zero. If None, defaults to 1 / sqrt(n), see Notes for more information. Returns ------- cormat : sparse.coo_matrix The thresholded correlation matrix, in COO format. Notes ----- This is an alternative to C = np.corrcoef(data); C \*= (np.abs(C) >= absmin), suitable for very tall data matrices. If the data are jointly Gaussian, the marginal sampling distributions of the elements of the sample correlation matrix are approximately Gaussian with standard deviation 1 / sqrt(n). The default value of ``minabs`` is thus equal to 1 standard error, which will set to zero approximately 68% of the estimated correlation coefficients for which the population value is zero. No intermediate matrix with more than ``max_elt`` values will be constructed. However memory use could still be high if a large number of correlation values exceed `minabs` in magnitude. The thresholded matrix is returned in COO format, which can easily be converted to other sparse formats. Examples -------- Here X is a tall data matrix (e.g. with 100,000 rows and 50 columns). The row-wise correlation matrix of X is calculated and stored in sparse form, with all entries smaller than 0.3 treated as 0. >>> import numpy as np >>> np.random.seed(1234) >>> b = 1.5 - np.random.rand(10, 1) >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10) >>> cmat = corr_thresholded(x, 0.3) """ nrow, ncol = data.shape if minabs is None: minabs = 1. / float(ncol) # Row-standardize the data data = data.copy() data -= data.mean(1)[:, None] sd = data.std(1, ddof=1) ii = np.flatnonzero(sd > 1e-5) data[ii, :] /= sd[ii][:, None] ii = np.flatnonzero(sd <= 1e-5) data[ii, :] = 0 # Number of rows to process in one pass bs = int(np.floor(max_elt / nrow)) ipos_all, jpos_all, cor_values = [], [], [] ir = 0 while ir < nrow: ir2 = min(data.shape[0], ir + bs) cm = np.dot(data[ir:ir2,:], data.T) / (ncol - 1) cma = np.abs(cm) ipos, jpos = np.nonzero(cma >= minabs) ipos_all.append(ipos + ir) jpos_all.append(jpos) cor_values.append(cm[ipos, jpos]) ir += bs ipos = np.concatenate(ipos_all) jpos = np.concatenate(jpos_all) cor_values = np.concatenate(cor_values) cmat = sparse.coo_matrix((cor_values, (ipos, jpos)), (nrow, nrow)) return cmat
if __name__ == '__main__': pass