I am trying to run a linear regression on a fairly large dataset in a way that actually gives me a p-value for each coefficient. This is straightforward when the dataset is small, but everything breaks when I run it on my actual dataset. The code below reproduces the problem with a toy dataset. I can do the linear regression with sklearn, but I can't get p-values that way, and I also prefer statsmodels for this task, especially because of how it handles categorical data.
How can I get statsmodels to run a linear model on a more complicated dataset? I know that statistically this isn't recommended because there are more attributes than observations, but I'm just trying it as an exercise.
This also happens with OLS, GLM, and MixedLM.
I even tried setting my recursion limit higher, but it did not work.
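For reference, this is roughly what I did; 10000 is just an arbitrary value well above Python's default of 1000:
import sys
sys.setrecursionlimit(10000)  # raising the limit still ends in the same RecursionError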
There are a few posts on this topic, but none of them deal with a dataset that produces a recursion error: Find p-value (significance) in scikit-learn LinearRegression
# Make dataset
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
X, y = make_regression(n_features=4000)
X = pd.DataFrame(X,
    index=[*map(lambda i: f"sample_{i}", range(X.shape[0]))],
    columns=[*map(lambda j: f"attr_{j}", range(X.shape[1]))],
)
y = pd.Series(y,index=X.index)
# X.iloc[:5,:5]
# attr_0 attr_1 attr_2 attr_3 attr_4
# sample_0 -2.077675 -0.222409 -0.782709 1.265239 1.606933
# sample_1 0.040124 -1.427598 -0.595388 0.403271 2.098169
# sample_2 -0.864165 0.465151 0.636452 -0.127071 -0.405423
# sample_3 -1.725911 0.148566 0.343320 -0.351172 1.755546
# sample_4 0.695828 1.313974 1.149156 1.846968 -0.009125
# Import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
data = X.copy()
data["y"] = y
formula = "y ~ " + " + ".join(X.columns)
model = smf.ols(formula=formula, data=data).fit()
# ---------------------------------------------------------------------------
# RecursionError Traceback (most recent call last)
# <ipython-input-11-4479099d07d7> in <module>()
# 24 data["y"] = y
# 25 formula = "y ~ " + " + ".join(X.columns)
# ---> 26 model = smf.ols(formula=formula, data=data)
# ...
# ~/anaconda/envs/python3/lib/python3.6/site-packages/patsy/desc.py in eval(self, tree, require_evalexpr)
# 398 "'%s' operator" % (tree.type,),
# 399 tree.token)
# --> 400 result = self._evaluators[key](self, tree)
# 401 if require_evalexpr and not isinstance(result, IntermediateExpr):
# 402 if isinstance(result, ModelDesc):
# RecursionError: maximum recursion depth exceeded
# https://pastebin.com/JhmqPKp4
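One thing I have not ruled out is the array-based statsmodels API, which as far as I understand skips the patsy formula parsing entirely. A sketch of what I mean, assuming the per-coefficient p-values then come from results.pvalues (whether they are meaningful with more attributes than samples is a separate question):
import statsmodels.api as sm
X_design = sm.add_constant(X)        # add an intercept column explicitly instead of via a formula
results = sm.OLS(y, X_design).fit()  # no formula string, so no patsy recursion
print(results.pvalues.head())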
Alternatively, I tried adapting some code I found for use with sklearn, but I got the same error:
# Sklearn method
# https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d
from sklearn.linear_model import LinearRegression
import scipy.stats as stat  # needed for stat.norm.sf below

class LinearRegression:
    """
    Wrapper Class for Logistic Regression which has the usual sklearn instance
    in an attribute self.model, and pvalues, z scores and estimated
    errors for each coefficient in
        self.z_scores
        self.p_values
        self.sigma_estimates
    as well as the negative hessian of the log Likelihood (Fisher information)
        self.F_ij
    """

    def __init__(self,*args,**kwargs):#,**kwargs):
        self.model = LinearRegression(*args,**kwargs)#,**args)

    def fit(self,X,y):
        self.model.fit(X,y)
        #### Get p-values for the fitted model ####
        denom = (2.0*(1.0+np.cosh(self.model.decision_function(X))))
        F_ij = np.dot((X/denom[:,None]).T,X) ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
        sigma_estimates = np.array([np.sqrt(Cramer_Rao[i,i]) for i in range(Cramer_Rao.shape[0])]) # sigma for each coefficient
        z_scores = self.model.coef_[0]/sigma_estimates # z-score for each model coefficient
        p_values = [stat.norm.sf(abs(x))*2 for x in z_scores] ### two tailed test for p-values
        self.z_scores = z_scores
        self.p_values = p_values
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_iJ

model = LinearRegression().fit(X,y)
# RecursionError Traceback (most recent call last)
# <ipython-input-18-6f8d228c181e> in <module>()
# 35 self.F_ij = F_iJ
# 36
# ---> 37 model = LinearRegression().fit(X,y)
# <ipython-input-18-6f8d228c181e> in __init__(self, *args, **kwargs)
# 18
# 19 def __init__(self,*args,**kwargs):#,**kwargs):
# ---> 20 self.model = LinearRegression(*args,**kwargs)#,**args)
# 21
# 22 def fit(self,X,y):
# ... last 1 frames repeated, from the frame below ...
# <ipython-input-18-6f8d228c181e> in __init__(self, *args, **kwargs)
# 18
# 19 def __init__(self,*args,**kwargs):#,**kwargs):
# ---> 20 self.model = LinearRegression(*args,**kwargs)#,**args)
# 21
# 22 def fit(self,X,y):
# RecursionError: maximum recursion depth exceeded
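For reference, this is the kind of per-coefficient p-value computation I was hoping to reproduce for plain OLS, sketched with numpy/scipy under the assumption that there are more samples than features (which my real data violates, so this is only for the exercise); ols_pvalues is a name I made up:
# Sketch: classical OLS t-test p-values, assuming n_samples > n_features so X'X is invertible.
import numpy as np
from scipy import stats

def ols_pvalues(X, y):
    X_ = np.column_stack([np.ones(len(X)), X])     # design matrix with an intercept column
    n, k = X_.shape
    beta, *_ = np.linalg.lstsq(X_, y, rcond=None)  # OLS coefficient estimates
    resid = y - X_ @ beta
    sigma2 = resid @ resid / (n - k)               # residual variance with n - k degrees of freedom
    se = np.sqrt(np.diag(sigma2 * np.linalg.inv(X_.T @ X_)))  # standard error per coefficient
    t = beta / se                                  # t-statistic per coefficient
    return 2 * stats.t.sf(np.abs(t), df=n - k)     # two-sided p-values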