我正在尝试使用 CSV 格式的数据构建混合效应模型(mixed-effects model)。数据集见:https://github.com/nisengweregis/mixed-effects-model/blob/master/mixed_effects.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn as sk
import scipy.stats as stats
import seaborn as sns; sns.set()
from math import sqrt
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
# Load the mixed-effects dataset from a local CSV file into a DataFrame.
df = pd.read_csv(r"C:\Users\E0434232\Documents\PlatformIO\Python DS\mixed_effects.csv")
# The model construction, fit and summary calls are left commented out here;
# the from_formula call is the line the tracebacks in this post point at.
# model = sm.MixedLM.from_formula('score ~ exercise_hrs + age + gender', df, groups =df['trainer'])
# result = model.fit()
# result.summary()
将数据加载到 df 时没有产生任何错误,但在创建模型时出现了以下错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-68-7f704114eb8c> in <module>
----> 1 model = sm.MixedLM.from_formula('score ~ exercise_hrs + age + gender', df, groups =df['trainer'])
2 result = model.fit()
3 result.summary()
~\Anaconda3\lib\site-packages\statsmodels\regression\mixed_linear_model.py in from_formula(cls, formula, data, re_formula, vc_formula, subset, use_sparse, missing, *args, **kwargs)
1039 kwargs["exog_vc"] = exog_vc
1040 kwargs["groups"] = groups
-> 1041 mod = super(MixedLM, cls).from_formula(
1042 formula, data, *args, **kwargs)
1043
~\Anaconda3\lib\site-packages\statsmodels\base\model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
193 'formula': formula, # attach formula for unpckling
194 'design_info': design_info})
--> 195 mod = cls(endog, exog, *args, **kwargs)
196 mod.formula = formula
197
~\Anaconda3\lib\site-packages\statsmodels\regression\mixed_linear_model.py in __init__(self, endog, exog, groups, exog_re, exog_vc, use_sqrt, missing, **kwargs)
787 # list of arrays, corresponding to the groups.
788 group_labels = list(set(groups))
--> 789 group_labels.sort()
790 row_indices = dict((s, []) for s in group_labels)
791 for i, g in enumerate(groups):
TypeError: '<' not supported between instances of 'str' and 'float'
我怀疑这可能是由 'trainer' 列的 dtype 造成的,所以我尝试将其转换为 'str' 和 'category':
# Cast the grouping column to the pandas categorical dtype, then display the
# dtype of every column to confirm the conversion.
df['trainer']=df['trainer'].astype('category')
df.dtypes
student_id float64
exercise_hrs float64
books float64
score float64
gender float64
trainer category
age float64
dtype: object
但随后又出现了 IndexError(索引错误):
IndexError Traceback (most recent call last)
<ipython-input-73-7f704114eb8c> in <module>
----> 1 model = sm.MixedLM.from_formula('score ~ exercise_hrs + age + gender', df, groups =df['trainer'])
2 result = model.fit()
3 result.summary()
~\Anaconda3\lib\site-packages\statsmodels\regression\mixed_linear_model.py in from_formula(cls, formula, data, re_formula, vc_formula, subset, use_sparse, missing, *args, **kwargs)
1039 kwargs["exog_vc"] = exog_vc
1040 kwargs["groups"] = groups
-> 1041 mod = super(MixedLM, cls).from_formula(
1042 formula, data, *args, **kwargs)
1043
~\Anaconda3\lib\site-packages\statsmodels\base\model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
193 'formula': formula, # attach formula for unpckling
194 'design_info': design_info})
--> 195 mod = cls(endog, exog, *args, **kwargs)
196 mod.formula = formula
197
~\Anaconda3\lib\site-packages\statsmodels\regression\mixed_linear_model.py in __init__(self, endog, exog, groups, exog_re, exog_vc, use_sqrt, missing, **kwargs)
796
797 # Split the data by groups
--> 798 self.endog_li = self.group_list(self.endog)
799 self.exog_li = self.group_list(self.exog)
800 self.exog_re_li = self.group_list(self.exog_re)
~\Anaconda3\lib\site-packages\statsmodels\regression\mixed_linear_model.py in group_list(self, array)
1097
1098 if array.ndim == 1:
-> 1099 return [np.array(array[self.row_indices[k]])
1100 for k in self.group_labels]
1101 else:
~\Anaconda3\lib\site-packages\statsmodels\regression\mixed_linear_model.py in <listcomp>(.0)
1097
1098 if array.ndim == 1:
-> 1099 return [np.array(array[self.row_indices[k]])
1100 for k in self.group_labels]
1101 else:
IndexError: index 105 is out of bounds for axis 0 with size 105
在 CSV 文件中,数据看起来正常,也没有缺失值。但显然 pandas 无法正确处理第 103 行之后的数据。请问数据有什么问题吗?