创建线性模型检查相关性时出现标记化(tokenize)错误

时间:2019-01-05 19:34:03

标签: python-3.x pandas linear-regression statsmodels

我的数据类似下面的示例:包含 4 个连续列(x0 至 x3)和一个二元列 y,y 只有 1.0 和 0.0 两个取值。我想用下面的 CatConCor 函数检查二元列 y 与其中一个连续列(x0)之间的相关性,但收到了下面的错误消息。该函数会创建一个线性回归模型,并通过方差分析(ANOVA)比较包含与不包含分类变量时的残差,从而计算 p 值。如果有人能指出问题所在或给出解决方法,将不胜感激。

数据:

   x_r        x0        x1        x2        x3    y
0    0  0.466726  0.030126  0.998330  0.892770  0.0
1    1  0.173168  0.525810 -0.079341 -0.112151  0.0
2    2 -0.854467  0.770712  0.929614 -0.224779  0.0
3    3 -0.370574  0.568183 -0.928269  0.843253  0.0
4    4 -0.659431 -0.948491 -0.091534  0.706157  0.0

代码:

import numpy as np
import pandas as pd
from time import time
import scipy.stats as stats

from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# correlation between categorical variable and continuous variable

def CatConCor(df, catVar, conVar, alpha=0.05):
    """Print whether a categorical column is associated with a continuous one.

    Fits the OLS model ``conVar ~ catVar`` (with ``catVar`` cast to the
    pandas ``category`` dtype), runs a type-2 ANOVA on the fit, and prints
    ``'Correlated p=<p>'`` when the first p-value in the ANOVA table is
    below ``alpha``, otherwise ``'Uncorrelated p=<p>'``.

    Args:
        df: DataFrame containing both columns. It is not modified.
        catVar: Name of the categorical column. The name is pasted into the
            model formula as-is, so it must be usable as a formula term.
        conVar: Name of the continuous (response) column; same naming
            constraint as ``catVar``.
        alpha: Significance threshold for the p-value. Defaults to 0.05,
            the value that was previously hard-coded.

    Returns:
        None. The verdict is printed.
    """
    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    # Subset first, then copy: only the two needed columns are duplicated,
    # and the dtype change below never touches the caller's frame.
    data2 = df[[catVar, conVar]].copy()
    data2[catVar] = data2[catVar].astype('category')

    mod = ols(conVar + '~' + catVar, data=data2).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)

    # The ANOVA table is indexed by string labels, so the first row must be
    # fetched by position with .iloc; a plain ``[0]`` relies on the
    # deprecated positional fallback of Series.__getitem__.
    p_value = aov_table['PR(>F)'].iloc[0]

    verdict = 'Correlated' if p_value < alpha else 'Uncorrelated'
    print(verdict + ' p=' + str(p_value))



# Check whether the binary column 'y' is associated with the continuous
# column 'x0'. NOTE(review): `train_df` is not defined in this snippet;
# presumably it is the DataFrame whose first rows are shown in the sample
# data -- confirm.

CatConCor(df=train_df,catVar='y',conVar='x0')

错误:

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-6-80f83b8c8e14> in <module>()
      1 # checking for correlation between categorical and continuous variables
      2 
----> 3 CatConCor(df=train_df,catVar='y',conVar='x0')

<ipython-input-2-35404ba1d697> in CatConCor(df, catVar, conVar)
    103 
    104     mod = ols(conVar+'~'+catVar,
--> 105                 data=data2).fit()
    106 
    107     aov_table = sm.stats.anova_lm(mod, typ=2)

~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
    153 
    154         tmp = handle_formula_data(data, None, formula, depth=eval_env,
--> 155                                   missing=missing)
    156         ((endog, exog), missing_idx, design_info) = tmp
    157 

~/anaconda2/envs/py36/lib/python3.6/site-packages/statsmodels/formula/formulatools.py in handle_formula_data(Y, X, formula, depth, missing)
     63         if data_util._is_using_pandas(Y, None):
     64             result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65                                NA_action=na_action)
     66         else:
     67             result = dmatrices(formula, Y, depth, return_type='dataframe',

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in dmatrices(formula_like, data, eval_env, NA_action, return_type)
    308     eval_env = EvalEnvironment.capture(eval_env, reference=1)
    309     (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310                                       NA_action, return_type)
    311     if lhs.shape[1] == 0:
    312         raise PatsyError("model is missing required outcome variables")

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
    163         return iter([data])
    164     design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165                                       NA_action)
    166     if design_infos is not None:
    167         return build_design_matrices(design_infos, data,

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
     60                 "ascii-only, or else upgrade to Python 3.")
     61     if isinstance(formula_like, str):
---> 62         formula_like = ModelDesc.from_formula(formula_like)
     63         # fallthrough
     64     if isinstance(formula_like, ModelDesc):

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/desc.py in from_formula(cls, tree_or_string)
    162             tree = tree_or_string
    163         else:
--> 164             tree = parse_formula(tree_or_string)
    165         value = Evaluator().eval(tree, require_evalexpr=False)
    166         assert isinstance(value, cls)

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in parse_formula(code, extra_operators)
    146     tree = infix_parse(_tokenize_formula(code, operator_strings),
    147                        operators,
--> 148                        _atomic_token_types)
    149     if not isinstance(tree, ParseNode) or tree.type != "~":
    150         tree = ParseNode("~", None, [tree], tree.origin)

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/infix_parser.py in infix_parse(tokens, operators, atomic_types, trace)
    208 
    209     want_noun = True
--> 210     for token in token_source:
    211         if c.trace:
    212             print("Reading next token (want_noun=%r)" % (want_noun,))

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _tokenize_formula(code, operator_strings)
     92         else:
     93             it.push_back((pytype, token_string, origin))
---> 94             yield _read_python_expr(it, end_tokens)
     95 
     96 def test__tokenize_formula():

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/parse_formula.py in _read_python_expr(it, end_tokens)
     42     origins = []
     43     bracket_level = 0
---> 44     for pytype, token_string, origin in it:
     45         assert bracket_level >= 0
     46         if bracket_level == 0 and token_string in end_tokens:

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/util.py in next(self)
    330         else:
    331             # May raise StopIteration
--> 332             return six.advance_iterator(self._it)
    333     __next__ = next
    334 

~/anaconda2/envs/py36/lib/python3.6/site-packages/patsy/tokens.py in python_tokenize(code)
     33                 break
     34             origin = Origin(code, start, end)
---> 35             assert pytype not in (tokenize.NL, tokenize.NEWLINE)
     36             if pytype == tokenize.ERRORTOKEN:
     37                 raise PatsyError("error tokenizing input "

AssertionError:

1 个答案:

答案 0 :(得分:1)

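将 patsy 升级到 0.5.1 可以解决此问题(例如执行 pip install --upgrade patsy)。原因是 Python 3.6.7 / 3.7.1 修改了标准库 tokenize 模块的行为,会在输入末尾额外产生一个 NEWLINE 标记,从而触发旧版 patsy 在 tokens.py 中的断言;patsy 0.5.1 已兼容这一变化。我在这里找到的提示: https://github.com/statsmodels/statsmodels/issues/5343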