Source code for statsmodels.sandbox.descstats

'''
Glue for returning descriptive statistics.
'''
import numpy as np
from scipy import stats
import os
from statsmodels.stats.descriptivestats import sign_test

#############################################
#
#============================================
#       Univariate Descriptive Statistics
#============================================
#

# Percentile levels listed in the single-variable report.
_PERCENTILE_LEVELS = (1, 5, 10, 25, 50, 75, 90, 95, 99)

_UNIVARIATE_SUMMARY = '''
    ---------------------------------------------
    Univariate Descriptive Statistics
    ---------------------------------------------

    Var. Name   %(name)12s
    ----------
    Obs.          %(nobs)22i  Range                   %(range)22s
    Sum of Wts.   %(sum)22s  Coeff. of Variation     %(coeffvar)22.4g
    Mode          %(mode)22.4g  Skewness                %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis                %(kurtosis)22.4g
    Mean          %(mean)22.4g  Uncorrected SS          %(uss)22.4g
    Median        %(median)22.4g  Corrected SS            %(ss)22.4g
    Variance      %(variance)22.4g  Sum Observations        %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    '''

_UNIVARIATE_PERCENTILES = '''

    Percentiles
    -------------
    1  %%          %12.4g
    5  %%          %12.4g
    10 %%          %12.4g
    25 %%          %12.4g

    50 %%          %12.4g

    75 %%          %12.4g
    90 %%          %12.4g
    95 %%          %12.4g
    99 %%          %12.4g
    '''

_UNIVARIATE_TESTS = '''

    Tests of Location (H0: Mu0=0)
    -----------------------------
    Test                Statistic       Two-tailed probability
    -----------------+-----------------------------------------
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f

    '''

_MULTIVARIATE_HEADER = '''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
    ------------+--------------------------------------------------------
'''

_MULTIVARIATE_ROW = ("%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g "
                     "%(range)20s\n")


def descstats(data, cols=None, axis=0):
    '''
    Return a text report of descriptive statistics for one or more variables.

    Parameters
    ----------
    data : array_like
        The observations. It is converted with ``np.array``. A 1-D input is
        treated as a single variable; a 2-D input is treated as one variable
        per column.
    cols : object, optional
        Only used as the "Var. Name" label of the single-variable report.
        It does not select columns: every column of `data` is reported.
    axis : int, optional
        Not used by the implementation; accepted for interface
        compatibility. Data are always read as column-ordered.

    Returns
    -------
    str
        For a single variable: summary statistics, percentiles and tests of
        location (t test, sign test, Wilcoxon signed rank) against zero.
        For several variables: one row per column with the number of
        observations, mean, standard deviation and range.

    Raises
    ------
    ValueError
        If `data` is not 1-D or 2-D, or holds no values
        ("data not understood").

    Examples
    --------
    >>> report = descstats(data.exog)
    >>> print(report)
    '''
    x = np.array(data)  # or rather, the data we're interested in

    # The check has to look at `data`: np.array() returns a base-class
    # ndarray, so testing `x` could never detect a recarray.
    if isinstance(data, np.recarray):
        # deprecated: remove recarray support after 0.12
        import warnings
        from statsmodels.tools.sm_exceptions import recarray_warning
        warnings.warn(recarray_warning, FutureWarning)

    # Promote a single variable to one column whether or not `cols` was
    # given; otherwise the shape test below fails on 1-D input.
    if x.ndim == 1:
        x = x[:, None]

    if x.ndim != 2 or x.size == 0:
        raise ValueError("data not understood")

    if x.shape[1] == 1:
        return _univariate_report(x[:, 0], cols)
    return _multivariate_report(x)


def _univariate_report(col, name):
    '''Build the detailed report for one variable given as a 1-D array.'''
    # Working on a 1-D array makes every statistic a scalar, so the
    # %-formatting below never relies on size-1 array -> scalar conversion.
    mode_result = stats.mode(col)
    # Depending on the SciPy version the result fields are scalars or
    # length-1 arrays; np.ravel handles both.
    mode_value = np.ravel(mode_result[0])[0]
    mode_count = np.ravel(mode_result[1])[0]
    mean = col.mean()

    summary = _UNIVARIATE_SUMMARY % {
        'name': name,
        'sum': 'N/A',
        'nobs': len(col),
        'mode': mode_value,
        'nmode': mode_count,
        'mean': mean,
        'median': np.median(col),
        'range': '(' + str(col.min()) + ', ' + str(col.max()) + ')',
        'variance': col.var(),
        'stddev': col.std(),
        'coeffvar': stats.variation(col),
        'skewness': stats.skew(col),
        'kurtosis': stats.kurtosis(col),
        'uss': np.sum(col ** 2),
        'ss': np.sum((col - mean) ** 2),
        'sobs': np.sum(col),
    }

    percentiles = _UNIVARIATE_PERCENTILES % tuple(
        stats.scoreatpercentile(col, level) for level in _PERCENTILE_LEVELS)

    t, p_t = stats.ttest_1samp(col, 0)
    M, p_M = sign_test(col)
    S, p_S = stats.wilcoxon(col)
    tests = _UNIVARIATE_TESTS % (t, p_t, M, p_M, S, p_S)

    return summary + percentiles + tests


def _multivariate_report(x):
    '''Build the one-row-per-column table for a 2-D array.'''
    parts = [_MULTIVARIATE_HEADER]
    for index, column in enumerate(x.T):
        parts.append(_MULTIVARIATE_ROW % {
            'name': index,
            'obs': len(column),
            'mean': column.mean(),
            'stddev': column.std(),
            # The line break is part of the row template, not of this
            # field, so the padded width is the same on every platform.
            'range': '(' + str(column.min()) + ', ' + str(column.max()) + ')',
        })
    return "".join(parts)
# ---------------------------------------------------------------------------
# Legacy manual test driver, kept commented out for reference.
# ---------------------------------------------------------------------------
#if __name__=='__main__':
# test descstats
#    import os
#    loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv'
#    relpath=(load_dataset(loc))
#    dta=np.recfromcsv(relpath)
#    descstats(dta,['stpop'])
#    raw_input('Hit enter for multivariate test')
#    descstats(dta,['stpop','avginc','vio'])

# with plain arrays
#    import string2dummy as s2d
#    dts=s2d.string2dummy(dta)
#    ndts=np.vstack(dts[col] for col in dts.dtype.names)
# observations in columns and data in rows
# is easier for the call to stats

# what to make of
# ndts=np.column_stack(dts[col] for col in dts.dtype.names)
# ntda=ntds.swapaxis(1,0)
# ntda is ntds returns false?

# or now we just have detailed information about the different strings
# would this approach ever be inappropriate for a string typed variable
# other than dates?
#    descstats(ndts, [1])
#    raw_input("Enter to try second part")
#    descstats(ndts, [1,20,3])

if __name__ == '__main__':
    import statsmodels.api as sm

    # Longley data with a constant appended as the last exog column:
    # one report over all columns, one over the first column only.
    data = sm.datasets.longley.load(as_pandas=False)
    data.exog = sm.add_constant(data.exog, prepend=False)
    sum1 = descstats(data.exog)
    sum1a = descstats(data.exog[:, :1])

#    loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv'
#    dta=np.recfromcsv(loc)
#    summary2 = descstats(dta,['stpop'])
#    summary3 = descstats(dta,['stpop','avginc','vio'])
#TODO: needs a by argument
#    summary4 = descstats(dta) this fails
# this is a bug
# p = dta[['stpop']]
# p.view(dtype = np.float, type = np.ndarray)
# this works
# p.view(dtype = np.int, type = np.ndarray)

    ### This is *really* slow ###
    # Optional run on a local data file; skipped when the file is absent.
    if os.path.isfile('./Econ724_PS_I_Data.csv'):
        data2 = np.recfromcsv('./Econ724_PS_I_Data.csv')
        sum2 = descstats(data2.ahe)
        sum3 = descstats(np.column_stack((data2.ahe, data2.yrseduc)))
        sum4 = descstats(np.column_stack(
            [data2[field] for field in data2.dtype.names]))