Question

我需要一种基于p-values的正向选择方法我很累：

model = lm(price ~ sqft_living , data = hs.sample)
s = summary(model)

cat(format("+sqft_living", width = 15), " Adj. R2:", format(s$adj.r.squared,width = 15, justify = 'right'),"t-value:", format(s$coefficients[2,3],width = 15, nsmall = 3, digits = 3), " p-value:", s$coefficients[2,4], "\n")

model = lm(price ~ yr_built , data = hs.sample)
s = summary(model)
cat(format("+yr_built", width = 15)," Adj. R2:", format(s$adj.r.squared,width = 15, justify = 'right'),"t-value:", format(s$coefficients[2,3],width = 15, nsmall = 3, digits = 5), " p-value:", s$coefficients[2,4], "\n")

我应该针对上述不同特征运行模型，并记录或打印p值，然后选择一个具有最低p值的特征。这样，我选择了一个特征，然后必须再次添加所有其他特征，并记录所得模型的p值，然后再次添加具有最佳p值的特征。我也可以从所有功能开始，然后删除最差的功能，以向后进行。您可以在以下地址了解有关该过程的更多信息

https://gerardnico.com/data_mining/stepwise_regression#forward

我的问题是如何使用循环在r中将其自动化？至少，我要遍历hs.sample的所有列以在模型上进行测试的部分。...

Answer 1

以下是两种方法的实现

基于p值的前向选择
基于调整后R平方的向后消除

代码：

forward_p_value = function(dtframe, response, exclude, alpha=0.05)
{
  exclude = c(exclude, response)
  cols = names(dtframe)
  min_val = 0
  sel_cols = ""
  model = NULL
  while (min_val < alpha)
  {
    min_val = 100
    min_col = ""
    for (col in cols)
    {
      if (!(col %in% exclude))
      {
        col2add = if (sel_cols == "") col else paste(" + ", col)
        formula = paste(response, " ~ ", sel_cols, col2add)
        model = lm(formula = formula, data = dtframe)
        s = summary(model)
        # print(s$coefficients)
        col2retrive = col
        if (is.factor(dtframe[,col])) {
          levels = levels(dtframe[,col])
          if (length(levels) > 1)
            col2retrive =paste(col,levels[2],sep = "")
        }
        p_val = s$coefficients[col2retrive,4]
        adj_r2 = s$adj.r.squared
        if (p_val < min_val)
        {
          min_val = p_val
          min_col = col
        }
        #print(s$coefficients)
        cat(format(paste("+",col), width = 15), " Adj. R2:", format(adj_r2,width = 15, justify = 'right'),"t-value:", ... = format(s$coefficients[2,2],width = 15, nsmall = 3, digits = 3), " p-value:", p_val, "\n")
      }
    }
    cat("\n==> +",min_col, ": p_vlaue:", min_val, "\n\n")
    if (min_val  < alpha)
    {
      sel_cols = if (sel_cols != "") paste(sel_cols, " + ", min_col) else min_col
    }
    exclude = c(exclude, min_col)
  }
  formula = paste(response, " ~ ", sel_cols)
  model = lm(formula = formula, data = dtframe)
  return(model)
}

backward_adj_r2 = function(dtframe, response, exclude)
{
  exclude = c(exclude, response)
  cols = names(dtframe)
  col_list = c()
  for (col in cols)
  {
    if (!(col %in% exclude))
    { 
      col_list = c(col_list, col)
    }
  }
  all_cols = c(cols,"")
  max_adj_r2 = 0
  old_adj_r2 = -1
  model = NULL
  while (max_adj_r2 > old_adj_r2)
  {
    min_col = ""
    old_adj_r2 = max_adj_r2
    for (col in all_cols)
    {
      if (!(col %in% exclude))
      {
        sel_cols = setdiff(col_list, col)
        sel_cols = paste(sel_cols,collapse = "+")
        # print(sel_cols)
        model = lm(formula = paste(response, " ~ ",sel_cols) , data = dtframe)
        s = summary(model)
        adj_r2 = s$adj.r.squared
        if (adj_r2 > max_adj_r2)
        {
          max_adj_r2 = adj_r2
          min_col = col
        }
        #print(s$coefficients)
        col = if(col=="") "NONE" else col
        cat(format(paste("-",col), width = 15), " Adj. R2:", format(adj_r2,width = 15, justify = 'right'),"\n")
      }
    }
    min_col = if(min_col=="") "NONE" else min_col
    cat("\n==> -",min_col,": Adj R2:",max_adj_r2, "\n\n")
    col_list = setdiff(col_list, min_col)
    exclude = c(exclude, min_col)
  }
  # print("Backward (Adj R2) Selected features:")
  # cat(col_list, sep = ", ") 
  return(model)
}


model1 = backward_adj_r2(dtframe = mtcars,response = "mpg", exclude = c("wt"))
summary(model1)

model2 = forward_p_value(dtframe = mtcars,response = "mpg", exclude = c("wt"))
summary(model2)

我在github上为代码创建了一个存储库

https://github.com/puraminy/model_selection/

R中基于p值（正向）的模型选择

1 个答案: