给定以下代码:
import statsmodels.api as sm
import statsmodels.formula.api as smf
df.reset_index(drop=True, inplace=True)
display(df.describe())
md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
mdf = md.fit()
如果 `df` 是一个 `pandas.DataFrame`,调用 `smf.mixedlm` 时我会收到以下错误:
IndexError Traceback (most recent call last)
<ipython-input-34-5373fe9b774a> in <module>()
4 df.reset_index(drop=True, inplace=True)
5 display(df.describe())
----> 6 md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
7 # mdf = md.fit()
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in from_formula(cls, formula, data, re_formula, subset, *args, **kwargs)
651 subset=None,
652 exog_re=exog_re,
--> 653 *args, **kwargs)
654
655 # expand re names to account for pairs of RE
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, *args, **kwargs)
148 kwargs.update({'missing_idx': missing_idx,
149 'missing': missing})
--> 150 mod = cls(endog, exog, *args, **kwargs)
151 mod.formula = formula
152
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in __init__(self, endog, exog, groups, exog_re, use_sqrt, missing, **kwargs)
537
538 # Split the data by groups
--> 539 self.endog_li = self.group_list(self.endog)
540 self.exog_li = self.group_list(self.exog)
541 self.exog_re_li = self.group_list(self.exog_re)
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in group_list(self, array)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in <listcomp>(.0)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
IndexError: index 7214 is out of bounds for axis 1 with size 7214
为什么会出现此错误?`len(df)` 报告共有 7296 行,因此访问索引 7214 不应该有问题,而且显式重置索引已确保索引范围为 0 到 7295。
如果您愿意,可以从此处下载 `df` 自行试验。
答案 0 :(得分:4)
`iscorr` 列中有 82 个空值(7296 − 82 = 7214,正好对应错误信息中的大小 7214):
>>> df.iscorr.isnull().sum()
82
删除这些含空值的行即可解决问题:
df = df[df.iscorr.notnull()]
根据函数的文档字符串:
Notes
------
`data` must define __getitem__ with the keys in the formula
terms args and kwargs are passed on to the model
instantiation. E.g., a numpy structured or rec array, a
dictionary, or a pandas DataFrame.
If `re_formula` is not provided, the default is a random
intercept for each group.
This method currently does not correctly handle missing
values, so missing values should be explicitly dropped from
the DataFrame before calling this method.
"""
输出:
>>> mdf.params
Intercept 0.032000
iscorr[T.True] 0.030670
Intercept RE -0.057462