使用具有特定数据的OLS代码的Python多元线性回归?

时间:2011-09-17 22:58:28

标签: python scipy regression

我正在使用scipy Cookbook下载的ols.py代码(下载是第一段带有粗体OLS),但我需要了解而不是使用随机数据来执行ols功能多元线性回归。

我有一个特定的因变量y和三个解释变量。每当我尝试用变量代替随机变量时,它就会给出错误:

  

TypeError:此构造函数不带参数。

有人可以帮忙吗?这可能吗?


以下是我尝试使用的ols代码的副本以及我尝试输入的变量

from __future__ import division
from scipy import c_, ones, dot, stats, diff
from scipy.linalg import inv, solve, det
from numpy import log, pi, sqrt, square, diagonal
from numpy.random import randn, seed
import time

class ols:
"""
Author: Vincent Nijs (+ ?)

Email: v-nijs at kellogg.northwestern.edu

Last Modified: Mon Jan 15 17:56:17 CST 2007

Dependencies: See import statement at the top of this file

Doc: Class for multi-variate regression using OLS

Input:
     dependent variable
    y_varnm = string with the variable label for y
    x = independent variables, note that a constant is added by default
    x_varnm = string or list of variable labels for the independent variables

Output:
    There are no values returned by the class. Summary provides printed output.
    All other measures can be accessed as follows:

    Step 1: Create an OLS instance by passing data to the class

        m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])

    Step 2: Get specific metrics

        To print the coefficients: 
            >>> print m.b
        To print the coefficients p-values: 
            >>> print m.p

"""

y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8,
            38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5,
            77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7]
        #tuition
x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891,
            971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179,
            2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291]
        #research and development
x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00,
            72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00,
            151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00, 
            267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00,
            397629.00]
        #one/none parents 
x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649,
            15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978,
            22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319]


def __init__(self,y,x1,y_varnm = 'y',x_varnm = ''):
"""
Initializing the ols class. 
"""
self.y = y
#self.x1 = c_[ones(x1.shape[0]),x1]
self.y_varnm = y_varnm
if not isinstance(x_varnm,list): 
    self.x_varnm = ['const'] + list(x_varnm)
else:
    self.x_varnm = ['const'] + x_varnm

# Estimate model using OLS
self.estimate()

def estimate(self):

# estimating coefficients, and basic stats
self.inv_xx = inv(dot(self.x.T,self.x))
xy = dot(self.x.T,self.y)
self.b = dot(self.inv_xx,xy)                    # estimate coefficients

self.nobs = self.y.shape[0]                     # number of observations
self.ncoef = self.x.shape[1]                    # number of coef.
self.df_e = self.nobs - self.ncoef              # degrees of freedom, error 
self.df_r = self.ncoef - 1                      # degrees of freedom, regression 

self.e = self.y - dot(self.x,self.b)            # residuals
self.sse = dot(self.e,self.e)/self.df_e         # SSE
self.se = sqrt(diagonal(self.sse*self.inv_xx))  # coef. standard errors
self.t = self.b / self.se                       # coef. t-statistics
self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2    # coef. p-values

self.R2 = 1 - self.e.var()/self.y.var()         # model R-squared
self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef))   # adjusted R-square

self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e)  # model F-statistic
self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e)  # F-statistic p-value

def dw(self):
"""
Calculates the Durbin-Waston statistic
"""
de = diff(self.e,1)
dw = dot(de,de) / dot(self.e,self.e);

return dw

def omni(self):
"""
Omnibus test for normality
"""
return stats.normaltest(self.e) 

def JB(self):
"""
Calculate residual skewness, kurtosis, and do the JB test for normality
"""

# Calculate residual skewness and kurtosis
skew = stats.skew(self.e) 
kurtosis = 3 + stats.kurtosis(self.e) 

# Calculate the Jarque-Bera test for normality
JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
JBpv = 1-stats.chi2.cdf(JB,2);

return JB, JBpv, skew, kurtosis

def ll(self):
"""
Calculate model log-likelihood and two information criteria
"""

# Model log-likelihood, AIC, and BIC criterion values 
ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs

return ll, aic, bic

def summary(self):
"""
Printing model output to screen
"""

# local time & date
t = time.localtime()

# extra stats
ll, aic, bic = self.ll()
JB, JBpv, skew, kurtosis = self.JB()
omni, omnipv = self.omni()

# printing output to screen
                                                                                         print                                                                                         '\n=============================================================================='
print "Dependent Variable: " + self.y_varnm
print "Method: Least Squares"
print "Date: ", time.strftime("%a, %d %b %Y",t)
print "Time: ", time.strftime("%H:%M:%S",t)
print '# obs:               %5.0f' % self.nobs
print '# variables:     %5.0f' % self.ncoef 
print '=============================================================================='
print 'variable     coefficient     std. Error      t-statistic     prob.'
print '=============================================================================='
for i in range(len(self.x_varnm)):
    print '''% -5s          % -5.6f     % -5.6f     % -5.6f     % -5.6f'''  %            tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]]) 
print '=============================================================================='
print 'Models stats                         Residual stats'
print '=============================================================================='
print 'R-squared            % -5.6f         Durbin-Watson stat  % -5.6f' %              tuple([self.R2, self.dw()])
print 'Adjusted R-squared   % -5.6f         Omnibus stat        % -5.6f' % tuple([self.R2adj, omni])
print 'F-statistic          % -5.6f         Prob(Omnibus stat)  % -5.6f' % tuple([self.F, omnipv])
print 'Prob (F-statistic)   % -5.6f         JB stat             % -5.6f' % tuple([self.Fpv, JB])
print 'Log likelihood       % -5.6f         Prob(JB)            % -5.6f' % tuple([ll, JBpv])
print 'AIC criterion        % -5.6f         Skew                % -5.6f' % tuple([aic, skew])
print 'BIC criterion        % -5.6f         Kurtosis            % -5.6f' % tuple([bic, kurtosis])
print '=============================================================================='

if __name__ == '__main__':

##########################
### testing the ols class
##########################

# intercept is added, by default
m = ols(y,x1,y_varnm = 'y',x_varnm = ['x1','x2','x3'])
m.summary()

3 个答案:

答案 0 :(得分:5)

你应区分两种情况:i)你只想解决这个问题。 ii)您还想知道有关您的模型的统计信息。你可以用np.linalg.lstsq做i);对于ii),你最好使用statsmodels。

下面是两个解决方案的示例示例:

# The standard imports
import numpy as np
import pandas as pd

# For the statistic
from statsmodels.formula.api import ols

def generatedata():
    ''' Generate and show the data '''
    x = np.linspace(-5,5,101)
    (X,Y) = np.meshgrid(x,x)

    # To get reproducable values, I provide a seed value
    np.random.seed(987654321)   

    Z = -5 + 3*X-0.5*Y+np.random.randn(np.shape(X)[0], np.shape(X)[1])

    return (X.flatten(),Y.flatten(),Z.flatten())

def regressionmodel(X,Y,Z):
    '''Multilinear regression model, calculating fit, P-values, confidence intervals etc.'''

    # Convert the data into a Pandas DataFrame
    df = pd.DataFrame({'x':X, 'y':Y, 'z':Z})

    # Fit the model
    model = ols("z ~ x + y", df).fit()

    # Print the summary
    print(model.summary())

    return model._results.params  # should be array([-4.99754526,  3.00250049, -0.50514907])

def linearmodel(X,Y,Z):
    '''Just fit the plane'''

    M = np.vstack((np.ones(len(X)), X, Y)).T
    bestfit = np.linalg.lstsq(M,Z)[0]
    print('Best fit plane:', bestfit)

    return bestfit

if __name__ == '__main__':
    (X,Y,Z) = generatedata()    
    regressionmodel(X,Y,Z)    
    linearmodel(X,Y,Z)

答案 1 :(得分:3)

使用http://pypi.python.org/pypi/scikits.statsmodels可能更容易,而且功能更多

import numpy as np
import scikits.statsmodels.api as sm


y = [29.4, 29.9, 31.4, 32.8, 33.6, 34.6, 35.5, 36.3, 37.2, 37.8, 38.5, 38.8,
            38.6, 38.8, 39, 39.7, 40.6, 41.3, 42.5, 43.9, 44.9, 45.3, 45.8, 46.5,
            77.1, 48.2, 48.8, 50.5, 51, 51.3, 50.7, 50.7, 50.6, 50.7, 50.6, 50.7]
        #tuition
x1 = [376, 407, 438, 432, 433, 479, 512, 543, 583, 635, 714, 798, 891,
            971, 1045, 1106, 1218, 1285, 1356, 1454, 1624, 1782, 1942, 2057, 2179,
            2271, 2360, 2506, 2562, 2700, 2903, 3319, 3629, 3874, 4102, 4291]
        #research and development
x2 = [28740.00, 30952.00, 33359.00, 35671.00, 39435.00, 43338.00, 48719.00, 55379.00, 63224.00,
            72292.00, 80748.00, 89950.00, 102244.00, 114671.00, 120249.00, 126360.00, 133881.00, 141891.00,
            151993.00, 160876.00, 165350.00, 165730.00, 169207.00, 183625.00, 197346.00, 212152.00, 226402.00, 
            267298.00, 277366.00, 276022.00, 288324.00, 299201.00, 322104.00, 347048.00, 372535.00,
            397629.00]
        #one/none parents 
x3 = [11610, 12143, 12486, 13015, 13028, 13327, 14074, 14094, 14458, 14878, 15610, 15649,
            15584, 16326, 16379, 16923, 17237, 17088, 17634, 18435, 19327, 19712, 21424, 21978,
            22684, 22597, 22735, 22217, 22214, 22655, 23098, 23602, 24013, 24003, 21593, 22319]


x = np.column_stack((x1,x2,x3))  #stack explanatory variables into an array
x = sm.add_constant(x, prepend=True) #add a constant

res = sm.OLS(y,x).fit() #create a model and fit it
print res.params
print res.bse
print res.summary()

答案 2 :(得分:1)

ols函数将整个独立数据集作为第二个参数。试试

m = ols(y, [x1, x2, x3], ...)

虽然我怀疑你可能需要将它包装在numpy数组中:

x = numpy.ndarray([3, len(x1)])
x[0] = numpy.array(x1)
x[1] = numpy.array(x2)
x[2] = numpy.array(x3)
m = ols(y, x, ...)