Source code for statsmodels.multivariate.multivariate_ols

# -*- coding: utf-8 -*-

"""General linear model

author: Yichuan Liu
"""
import numpy as np
from numpy.linalg import eigvals, inv, solve, matrix_rank, pinv, svd
from scipy import stats
import pandas as pd
from patsy import DesignInfo

from statsmodels.compat.pandas import Substitution
from statsmodels.base.model import Model
from statsmodels.iolib import summary2
__docformat__ = 'restructuredtext en'

_hypotheses_doc = \
"""hypotheses : list[tuple]
    Hypothesis `L*B*M = C` to be tested where B is the parameters in
    regression Y = X*B. Each element is a tuple of length 2, 3, or 4:

      * (name, contrast_L)
      * (name, contrast_L, transform_M)
      * (name, contrast_L, transform_M, constant_C)

    containing a string `name`, the contrast matrix L, the transform
    matrix M (for transforming dependent variables), and right-hand side
    constant matrix constant_C, respectively.

    contrast_L : 2D array or an array of strings
        Left-hand side contrast matrix for hypotheses testing.
        If 2D array, each row is an hypotheses and each column is an
        independent variable. At least 1 row
        (1 by k_exog, the number of independent variables) is required.
        If an array of strings, it will be passed to
        patsy.DesignInfo().linear_constraint.

    transform_M : 2D array or an array of strings or None, optional
        Left hand side transform matrix.
        If `None` or left out, it is set to a k_endog by k_endog
        identity matrix (i.e. do not transform y matrix).
        If an array of strings, it will be passed to
        patsy.DesignInfo().linear_constraint.

    constant_C : 2D array or None, optional
        Right-hand side constant matrix.
        if `None` or left out it is set to a matrix of zeros
        Must has the same number of rows as contrast_L and the same
        number of columns as transform_M

    If `hypotheses` is None: 1) the effect of each independent variable
    on the dependent variables will be tested. Or 2) if model is created
    using a formula,  `hypotheses` will be created according to
    `design_info`. 1) and 2) is equivalent if no additional variables
    are created by the formula (e.g. dummy variables for categorical
    variables and interaction terms)
"""


def _multivariate_ols_fit(endog, exog, method='svd', tolerance=1e-8):
    """
    Solve multivariate linear model y = x * params
    where y is dependent variables, x is independent variables

    Parameters
    ----------
    endog : array_like
        each column is a dependent variable
    exog : array_like
        each column is a independent variable
    method : str
        'svd' - Singular value decomposition
        'pinv' - Moore-Penrose pseudoinverse
    tolerance : float, a small positive number
        Tolerance for eigenvalue. Values smaller than tolerance is considered
        zero.
    Returns
    -------
    a tuple of matrices or values necessary for hypotheses testing

    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
    Notes
    -----
    Status: experimental and incomplete
    """
    y = endog
    x = exog
    nobs, k_endog = y.shape
    nobs1, k_exog= x.shape
    if nobs != nobs1:
        raise ValueError('x(n=%d) and y(n=%d) should have the same number of '
                         'rows!' % (nobs1, nobs))

    # Calculate the matrices necessary for hypotheses testing
    df_resid = nobs - k_exog
    if method == 'pinv':
        # Regression coefficients matrix
        pinv_x = pinv(x)
        params = pinv_x.dot(y)

        # inverse of x'x
        inv_cov = pinv_x.dot(pinv_x.T)
        if matrix_rank(inv_cov,tol=tolerance) < k_exog:
            raise ValueError('Covariance of x singular!')

        # Sums of squares and cross-products of residuals
        # Y'Y - (X * params)'B * params
        t = x.dot(params)
        sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
        return (params, df_resid, inv_cov, sscpr)
    elif method == 'svd':
        u, s, v = svd(x, 0)
        if (s > tolerance).sum() < len(s):
            raise ValueError('Covariance of x singular!')
        invs = 1. / s

        params = v.T.dot(np.diag(invs)).dot(u.T).dot(y)
        inv_cov = v.T.dot(np.diag(np.power(invs, 2))).dot(v)
        t = np.diag(s).dot(v).dot(params)
        sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
        return (params, df_resid, inv_cov, sscpr)
    else:
        raise ValueError('%s is not a supported method!' % method)


def multivariate_stats(eigenvals,
                       r_err_sscp,
                       r_contrast, df_resid, tolerance=1e-8):
    """
    For multivariate linear model Y = X * B
    Testing hypotheses
        L*B*M = 0
    where L is contrast matrix, B is the parameters of the
    multivariate linear model and M is dependent variable transform matrix.
        T = L*inv(X'X)*L'
        H = M'B'L'*inv(T)*LBM
        E =  M'(Y'Y - B'X'XB)M

    Parameters
    ----------
    eigenvals : array
        The eigenvalues of inv(E + H)*H
    r_err_sscp : int
        Rank of E + H
    r_contrast : int
        Rank of T matrix
    df_resid : int
        Residual degree of freedom (n_samples minus n_variables of X)
    tolerance : float
        smaller than which eigenvalue is considered 0

    Returns
    -------
    A DataFrame

    References
    ----------
    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
    """
    v = df_resid
    p = r_err_sscp
    q = r_contrast
    s = np.min([p, q])
    ind = eigenvals > tolerance
    n_e = ind.sum()
    eigv2 = eigenvals[ind]
    eigv1 = np.array([i / (1 - i) for i in eigv2])
    m = (np.abs(p - q) - 1) / 2
    n = (v - p - 1) / 2

    cols = ['Value', 'Num DF', 'Den DF', 'F Value', 'Pr > F']
    index = ["Wilks' lambda", "Pillai's trace",
             "Hotelling-Lawley trace", "Roy's greatest root"]
    results = pd.DataFrame(columns=cols,
                           index=index)

    def fn(x):
        return np.real([x])[0]

    results.loc["Wilks' lambda", 'Value'] = fn(np.prod(1 - eigv2))

    results.loc["Pillai's trace", 'Value'] = fn(eigv2.sum())

    results.loc["Hotelling-Lawley trace", 'Value'] = fn(eigv1.sum())

    results.loc["Roy's greatest root", 'Value'] = fn(eigv1.max())

    r = v - (p - q + 1)/2
    u = (p*q - 2) / 4
    df1 = p * q
    if p*p + q*q - 5 > 0:
        t = np.sqrt((p*p*q*q - 4) / (p*p + q*q - 5))
    else:
        t = 1
    df2 = r*t - 2*u
    lmd = results.loc["Wilks' lambda", 'Value']
    lmd = np.power(lmd, 1 / t)
    F = (1 - lmd) / lmd * df2 / df1
    results.loc["Wilks' lambda", 'Num DF'] = df1
    results.loc["Wilks' lambda", 'Den DF'] = df2
    results.loc["Wilks' lambda", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Wilks' lambda", 'Pr > F'] = pval

    V = results.loc["Pillai's trace", 'Value']
    df1 = s * (2*m + s + 1)
    df2 = s * (2*n + s + 1)
    F = df2 / df1 * V / (s - V)
    results.loc["Pillai's trace", 'Num DF'] = df1
    results.loc["Pillai's trace", 'Den DF'] = df2
    results.loc["Pillai's trace", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Pillai's trace", 'Pr > F'] = pval

    U = results.loc["Hotelling-Lawley trace", 'Value']
    if n > 0:
        b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
        df1 = p * q
        df2 = 4 + (p*q + 2) / (b - 1)
        c = (df2 - 2) / 2 / n
        F = df2 / df1 * U / c
    else:
        df1 = s * (2*m + s + 1)
        df2 = s * (s*n + 1)
        F = df2 / df1 / s * U
    results.loc["Hotelling-Lawley trace", 'Num DF'] = df1
    results.loc["Hotelling-Lawley trace", 'Den DF'] = df2
    results.loc["Hotelling-Lawley trace", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Hotelling-Lawley trace", 'Pr > F'] = pval

    sigma = results.loc["Roy's greatest root", 'Value']
    r = np.max([p, q])
    df1 = r
    df2 = v - r + q
    F = df2 / df1 * sigma
    results.loc["Roy's greatest root", 'Num DF'] = df1
    results.loc["Roy's greatest root", 'Den DF'] = df2
    results.loc["Roy's greatest root", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Roy's greatest root", 'Pr > F'] = pval
    return results


def _multivariate_ols_test(hypotheses, fit_results, exog_names,
                            endog_names):
    def fn(L, M, C):
        # .. [1] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
        params, df_resid, inv_cov, sscpr = fit_results
        # t1 = (L * params)M
        t1 = L.dot(params).dot(M) - C
        # H = t1'L(X'X)^L't1
        t2 = L.dot(inv_cov).dot(L.T)
        q = matrix_rank(t2)
        H = t1.T.dot(inv(t2)).dot(t1)

        # E = M'(Y'Y - B'(X'X)B)M
        E = M.T.dot(sscpr).dot(M)
        return E, H, q, df_resid

    return _multivariate_test(hypotheses, exog_names, endog_names, fn)


@Substitution(hypotheses_doc=_hypotheses_doc)
def _multivariate_test(hypotheses, exog_names, endog_names, fn):
    """
    Multivariate linear model hypotheses testing

    For y = x * params, where y are the dependent variables and x are the
    independent variables, testing L * params * M = 0 where L is the contrast
    matrix for hypotheses testing and M is the transformation matrix for
    transforming the dependent variables in y.

    Algorithm:
        T = L*inv(X'X)*L'
        H = M'B'L'*inv(T)*LBM
        E =  M'(Y'Y - B'X'XB)M
    And then finding the eigenvalues of inv(H + E)*H

    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm

    Parameters
    ----------
    %(hypotheses_doc)s
    k_xvar : int
        The number of independent variables
    k_yvar : int
        The number of dependent variables
    fn : function
        a function fn(contrast_L, transform_M) that returns E, H, q, df_resid
        where q is the rank of T matrix

    Returns
    -------
    results : MANOVAResults
    """

    k_xvar = len(exog_names)
    k_yvar = len(endog_names)
    results = {}
    for hypo in hypotheses:
        if len(hypo) ==2:
            name, L = hypo
            M = None
            C = None
        elif len(hypo) == 3:
            name, L, M = hypo
            C = None
        elif len(hypo) == 4:
            name, L, M, C = hypo
        else:
            raise ValueError('hypotheses must be a tuple of length 2, 3 or 4.'
                             ' len(hypotheses)=%d' % len(hypo))
        if any(isinstance(j, str) for j in L):
            L = DesignInfo(exog_names).linear_constraint(L).coefs
        else:
            if not isinstance(L, np.ndarray) or len(L.shape) != 2:
                raise ValueError('Contrast matrix L must be a 2-d array!')
            if L.shape[1] != k_xvar:
                raise ValueError('Contrast matrix L should have the same '
                                 'number of columns as exog! %d != %d' %
                                 (L.shape[1], k_xvar))
        if M is None:
            M = np.eye(k_yvar)
        elif any(isinstance(j, str) for j in M):
            M = DesignInfo(endog_names).linear_constraint(M).coefs.T
        else:
            if M is not None:
                if not isinstance(M, np.ndarray) or len(M.shape) != 2:
                    raise ValueError('Transform matrix M must be a 2-d array!')
                if M.shape[0] != k_yvar:
                    raise ValueError('Transform matrix M should have the same '
                                     'number of rows as the number of columns '
                                     'of endog! %d != %d' %
                                     (M.shape[0], k_yvar))
        if C is None:
            C = np.zeros([L.shape[0], M.shape[1]])
        elif not isinstance(C, np.ndarray):
            raise ValueError('Constant matrix C must be a 2-d array!')

        if C.shape[0] != L.shape[0]:
            raise ValueError('contrast L and constant C must have the same '
                             'number of rows! %d!=%d'
                             % (L.shape[0], C.shape[0]))
        if C.shape[1] != M.shape[1]:
            raise ValueError('transform M and constant C must have the same '
                             'number of columns! %d!=%d'
                             % (M.shape[1], C.shape[1]))
        E, H, q, df_resid = fn(L, M, C)
        EH = np.add(E, H)
        p = matrix_rank(EH)

        # eigenvalues of inv(E + H)H
        eigv2 = np.sort(eigvals(solve(EH, H)))
        stat_table = multivariate_stats(eigv2, p, q, df_resid)

        results[name] = {'stat':stat_table, 'contrast_L':L,
                         'transform_M':M, 'constant_C':C}
    return results


[docs]class _MultivariateOLS(Model): """ Multivariate linear model via least squares Parameters ---------- endog : array_like Dependent variables. A nobs x k_endog array where nobs is the number of observations and k_endog is the number of dependent variables exog : array_like Independent variables. A nobs x k_exog array where nobs is the number of observations and k_exog is the number of independent variables. An intercept is not included by default and should be added by the user (models specified using a formula include an intercept by default) Attributes ---------- endog : array See Parameters. exog : array See Parameters. """ _formula_max_endog = None def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs): if len(endog.shape) == 1 or endog.shape[1] == 1: raise ValueError('There must be more than one dependent variable' ' to fit multivariate OLS!') super(_MultivariateOLS, self).__init__(endog, exog, missing=missing, hasconst=hasconst, **kwargs)
[docs] def fit(self, method='svd'): self._fittedmod = _multivariate_ols_fit( self.endog, self.exog, method=method) return _MultivariateOLSResults(self)
[docs]class _MultivariateOLSResults(object): """ _MultivariateOLS results class """ def __init__(self, fitted_mv_ols): if (hasattr(fitted_mv_ols, 'data') and hasattr(fitted_mv_ols.data, 'design_info')): self.design_info = fitted_mv_ols.data.design_info else: self.design_info = None self.exog_names = fitted_mv_ols.exog_names self.endog_names = fitted_mv_ols.endog_names self._fittedmod = fitted_mv_ols._fittedmod def __str__(self): return self.summary().__str__()
[docs] @Substitution(hypotheses_doc=_hypotheses_doc) def mv_test(self, hypotheses=None): """ Linear hypotheses testing Parameters ---------- %(hypotheses_doc)s Returns ------- results: _MultivariateOLSResults Notes ----- Tests hypotheses of the form L * params * M = C where `params` is the regression coefficient matrix for the linear model y = x * params, `L` is the contrast matrix, `M` is the dependent variable transform matrix and C is the constant matrix. """ k_xvar = len(self.exog_names) if hypotheses is None: if self.design_info is not None: terms = self.design_info.term_name_slices hypotheses = [] for key in terms: L_contrast = np.eye(k_xvar)[terms[key], :] hypotheses.append([key, L_contrast, None]) else: hypotheses = [] for i in range(k_xvar): name = 'x%d' % (i) L = np.zeros([1, k_xvar]) L[i] = 1 hypotheses.append([name, L, None]) results = _multivariate_ols_test(hypotheses, self._fittedmod, self.exog_names, self.endog_names) return MultivariateTestResults(results, self.endog_names, self.exog_names)
[docs] def summary(self): raise NotImplementedError
[docs]class MultivariateTestResults(object): """ Multivariate test results class Returned by `mv_test` method of `_MultivariateOLSResults` class Attributes ---------- results : dict For hypothesis name `key`: results[key]['stat'] contains the multivariate test results results[key]['contrast_L'] contains the contrast_L matrix results[key]['transform_M'] contains the transform_M matrix results[key]['constant_C'] contains the constant_C matrix endog_names : str exog_names : str summary_frame : multiindex dataframe Returns results as a multiindex dataframe """ def __init__(self, mv_test_df, endog_names, exog_names): self.results = mv_test_df self.endog_names = endog_names self.exog_names = exog_names def __str__(self): return self.summary().__str__() def __getitem__(self, item): return self.results[item] @property def summary_frame(self): """ Return results as a multiindex dataframe """ df = [] for key in self.results: tmp = self.results[key]['stat'].copy() tmp.loc[:, 'Effect'] = key df.append(tmp.reset_index()) df = pd.concat(df, axis=0) df = df.set_index(['Effect', 'index']) df.index.set_names(['Effect', 'Statistic'], inplace=True) return df
[docs] def summary(self, show_contrast_L=False, show_transform_M=False, show_constant_C=False): """ Parameters ---------- contrast_L : True or False Whether to show contrast_L matrix transform_M : True or False Whether to show transform_M matrix """ summ = summary2.Summary() summ.add_title('Multivariate linear model') for key in self.results: summ.add_dict({'':''}) df = self.results[key]['stat'].copy() df = df.reset_index() c = df.columns.values c[0] = key df.columns = c df.index = ['', '', '', ''] summ.add_df(df) if show_contrast_L: summ.add_dict({key:' contrast L='}) df = pd.DataFrame(self.results[key]['contrast_L'], columns=self.exog_names) summ.add_df(df) if show_transform_M: summ.add_dict({key:' transform M='}) df = pd.DataFrame(self.results[key]['transform_M'], index=self.endog_names) summ.add_df(df) if show_constant_C: summ.add_dict({key:' constant C='}) df = pd.DataFrame(self.results[key]['constant_C']) summ.add_df(df) return summ