from __future__ import division
from scipy import stats
from scipy.linalg import inv
from numpy import log, pi, sqrt, square, diagonal, c_, ones, dot, diff
from numpy.random import randn, seed
import time

class ols:
    """
    Author: Vincent Nijs (+ ?)

    Email: v-nijs at kellogg.northwestern.edu

    Last Modified: Mon Jan 15 17:56:17 CST 2007

    Dependencies: See the import statements at the top of this file.

    Doc: Class for multivariate regression using OLS.

    For usage examples of the class methods, see the tests at the bottom of
    this file. To see the class in action, simply run this file with
    'python ols.py'. This will generate simulated data and run various
    analyses. If you have rpy installed, the same model will also be
    estimated in R for confirmation.

    Input:
        y = dependent variable
        y_varnm = string with the variable label for y
        x = independent variables; note that a constant is added by default
        x_varnm = string or list of variable labels for the independent variables

    Output:
        The class returns no values; summary() provides printed output.
        All other measures can be accessed as follows:

        Step 1: Create an OLS instance by passing data to the class

            m = ols(y, x, y_varnm='y', x_varnm=['x1','x2','x3','x4'])

        Step 2: Get specific metrics

            To print the coefficients:
                >>> print m.b
            To print the coefficients' p-values:
                >>> print m.p
    """

    def __init__(self, y, x, y_varnm='y', x_varnm=''):
        """
        Initializing the ols class.
        """
        self.y = y
        self.x = c_[ones(x.shape[0]), x]    # prepend a column of ones (intercept)
        self.y_varnm = y_varnm
        if not isinstance(x_varnm, list):
            # wrap a single string label; list('x1') would split it into characters
            x_varnm = [x_varnm] if x_varnm else []
        self.x_varnm = ['const'] + x_varnm

        # Estimate model using OLS
        self.estimate()

    def estimate(self):

        # estimating coefficients and basic stats
        self.inv_xx = inv(dot(self.x.T, self.x))
        xy = dot(self.x.T, self.y)
        self.b = dot(self.inv_xx, xy)                    # estimate coefficients

        self.nobs = self.y.shape[0]                      # number of observations
        self.ncoef = self.x.shape[1]                     # number of coefficients
        self.df_e = self.nobs - self.ncoef               # degrees of freedom, error
        self.df_r = self.ncoef - 1                       # degrees of freedom, regression

        self.e = self.y - dot(self.x, self.b)            # residuals
        self.sse = dot(self.e, self.e) / self.df_e       # residual variance, e'e / df_e
        self.se = sqrt(diagonal(self.sse * self.inv_xx)) # coef. standard errors
        self.t = self.b / self.se                        # coef. t-statistics
        self.p = (1 - stats.t.cdf(abs(self.t), self.df_e)) * 2  # coef. p-values (two-tailed)

        self.R2 = 1 - self.e.var() / self.y.var()        # model R-squared
        self.R2adj = 1 - (1 - self.R2) * ((self.nobs - 1) / (self.nobs - self.ncoef))  # adjusted R-squared

        self.F = (self.R2 / self.df_r) / ((1 - self.R2) / self.df_e)  # model F-statistic
        self.Fpv = 1 - stats.f.cdf(self.F, self.df_r, self.df_e)      # F-statistic p-value
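
    # Note on the algebra in estimate(): these are the textbook normal
    # equations,
    #     b      = (X'X)^{-1} X'y
    #     Var(b) = s^2 (X'X)^{-1},  where s^2 = e'e / (n - k)
    #     t_j    = b_j / se(b_j),   on n - k degrees of freedom.
    # Explicitly inverting X'X is fine for small, well-conditioned problems;
    # a QR- or SVD-based solver is numerically safer when the columns of X
    # are nearly collinear.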
    def dw(self):
        """
        Calculates the Durbin-Watson statistic
        """
        de = diff(self.e, 1)
        dw = dot(de, de) / dot(self.e, self.e)

        return dw

    def omni(self):
        """
        Omnibus test for normality
        """
        return stats.normaltest(self.e)

    def JB(self):
        """
        Calculate residual skewness, kurtosis, and do the JB test for normality
        """

        # Calculate residual skewness and kurtosis
        skew = stats.skew(self.e)
        kurtosis = 3 + stats.kurtosis(self.e)   # stats.kurtosis returns excess kurtosis

        # Calculate the Jarque-Bera test for normality
        JB = (self.nobs / 6) * (square(skew) + (1 / 4) * square(kurtosis - 3))
        JBpv = 1 - stats.chi2.cdf(JB, 2)

        return JB, JBpv, skew, kurtosis

    def ll(self):
        """
        Calculate model log-likelihood and two information criteria
        """

        # Model log-likelihood, AIC, and BIC criterion values
        ll = -(self.nobs / 2) * (1 + log(2 * pi)) - (self.nobs / 2) * log(dot(self.e, self.e) / self.nobs)
        aic = -2 * ll / self.nobs + (2 * self.ncoef / self.nobs)
        bic = -2 * ll / self.nobs + (self.ncoef * log(self.nobs)) / self.nobs

        return ll, aic, bic

    def summary(self):
        """
        Printing model output to screen
        """

        # local time & date
        t = time.localtime()

        # extra stats
        ll, aic, bic = self.ll()
        JB, JBpv, skew, kurtosis = self.JB()
        omni, omnipv = self.omni()

        # printing output to screen
        print '\n=============================================================================='
        print "Dependent Variable: " + self.y_varnm
        print "Method: Least Squares"
        print "Date: ", time.strftime("%a, %d %b %Y", t)
        print "Time: ", time.strftime("%H:%M:%S", t)
        print '# obs:               %5.0f' % self.nobs
        print '# variables:         %5.0f' % self.ncoef
        print '=============================================================================='
        print 'variable     coefficient     std. Error      t-statistic     prob.'
        print '=============================================================================='
        for i in range(len(self.x_varnm)):
            print '''% -5s          % -5.6f     % -5.6f     % -5.6f     % -5.6f''' % tuple([self.x_varnm[i], self.b[i], self.se[i], self.t[i], self.p[i]])
        print '=============================================================================='
        print 'Model stats                          Residual stats'
        print '=============================================================================='
        print 'R-squared            % -5.6f         Durbin-Watson stat  % -5.6f' % tuple([self.R2, self.dw()])
        print 'Adjusted R-squared   % -5.6f         Omnibus stat        % -5.6f' % tuple([self.R2adj, omni])
        print 'F-statistic          % -5.6f         Prob(Omnibus stat)  % -5.6f' % tuple([self.F, omnipv])
        print 'Prob (F-statistic)   % -5.6f         JB stat             % -5.6f' % tuple([self.Fpv, JB])
        print 'Log likelihood       % -5.6f         Prob(JB)            % -5.6f' % tuple([ll, JBpv])
        print 'AIC criterion        % -5.6f         Skew                % -5.6f' % tuple([aic, skew])
        print 'BIC criterion        % -5.6f         Kurtosis            % -5.6f' % tuple([bic, kurtosis])
        print '=============================================================================='
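
# A minimal verification sketch (not part of the original class): cross-check
# the normal-equations coefficients against numpy's SVD-based least-squares
# solver. The helper name check_against_lstsq is hypothetical, added here for
# illustration only.
def check_against_lstsq(y, x):
    """Return True if ols and numpy.linalg.lstsq agree on the coefficients."""
    from numpy.linalg import lstsq
    from numpy import allclose
    m = ols(y, x)                        # fit with the class above
    X = c_[ones(x.shape[0]), x]          # same design matrix ols builds
    b_ref = lstsq(X, y)[0]               # reference solution via SVD
    return allclose(m.b, b_ref)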
if __name__ == '__main__':

    ##########################
    ### testing the ols class
    ##########################

    # creating simulated data and variable labels
    seed(1)
    data = randn(100, 5)    # the data array

    # intercept is added by default
    m = ols(data[:, 0], data[:, 1:], y_varnm='y', x_varnm=['x1', 'x2', 'x3', 'x4'])
    m.summary()

    # if you have rpy installed, use it to test the results
    have_rpy = False
    try:
        print "\n"
        print "=" * 30
        print "Validating OLS results in R"
        print "=" * 30

        import rpy
        have_rpy = True
    except ImportError:
        print "\n"
        print "=" * 30
        print "Validating OLS results in R"
        print "=" * 30
        print "rpy is not installed"
        print "=" * 30

    if have_rpy:
        y = data[:, 0]
        x1 = data[:, 1]
        x2 = data[:, 2]
        x3 = data[:, 3]
        x4 = data[:, 4]
        rpy.set_default_mode(rpy.NO_CONVERSION)
        linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"),
                                data=rpy.r.data_frame(x1=x1, x2=x2, x3=x3, x4=x4, y=y))
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        print linear_model.as_py()['coefficients']
        summary = rpy.r.summary(linear_model)
        print summary
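
    # Illustrative extra check (a sketch, not in the original script): the
    # Durbin-Watson statistic is approximately 2*(1 - rho), where rho is the
    # lag-1 autocorrelation of the residuals, so values near 2 indicate
    # little serial correlation.
    rho = dot(m.e[1:], m.e[:-1]) / dot(m.e, m.e)
    print 'Durbin-Watson: %5.6f  2*(1-rho): %5.6f' % (m.dw(), 2 * (1 - rho))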