我需要一种基于p-values
的正向选择方法
我很累:
model = lm(price ~ sqft_living , data = hs.sample)
s = summary(model)
cat(format("+sqft_living", width = 15), " Adj. R2:", format(s$adj.r.squared,width = 15, justify = 'right'),"t-value:", format(s$coefficients[2,3],width = 15, nsmall = 3, digits = 3), " p-value:", s$coefficients[2,4], "\n")
model = lm(price ~ yr_built , data = hs.sample)
s = summary(model)
cat(format("+yr_built", width = 15)," Adj. R2:", format(s$adj.r.squared,width = 15, justify = 'right'),"t-value:", format(s$coefficients[2,3],width = 15, nsmall = 3, digits = 5), " p-value:", s$coefficients[2,4], "\n")
我应该针对上述不同特征运行模型,并记录或打印p值,然后选择一个具有最低p值的特征。这样,我选择了一个特征,然后必须再次添加所有其他特征,并记录所得模型的p值,然后再次添加具有最佳p值的特征。我也可以从所有功能开始,然后删除最差的功能,以向后进行。您可以在以下地址了解有关该过程的更多信息
https://gerardnico.com/data_mining/stepwise_regression#forward
我的问题是如何使用循环在r中将其自动化?至少,我要遍历hs.sample
的所有列以在模型上进行测试的部分。...
答案 0 :(得分:0)
以下是两种方法的实现
代码:
forward_p_value = function(dtframe, response, exclude, alpha=0.05)
{
exclude = c(exclude, response)
cols = names(dtframe)
min_val = 0
sel_cols = ""
model = NULL
while (min_val < alpha)
{
min_val = 100
min_col = ""
for (col in cols)
{
if (!(col %in% exclude))
{
col2add = if (sel_cols == "") col else paste(" + ", col)
formula = paste(response, " ~ ", sel_cols, col2add)
model = lm(formula = formula, data = dtframe)
s = summary(model)
# print(s$coefficients)
col2retrive = col
if (is.factor(dtframe[,col])) {
levels = levels(dtframe[,col])
if (length(levels) > 1)
col2retrive =paste(col,levels[2],sep = "")
}
p_val = s$coefficients[col2retrive,4]
adj_r2 = s$adj.r.squared
if (p_val < min_val)
{
min_val = p_val
min_col = col
}
#print(s$coefficients)
cat(format(paste("+",col), width = 15), " Adj. R2:", format(adj_r2,width = 15, justify = 'right'),"t-value:", ... = format(s$coefficients[2,2],width = 15, nsmall = 3, digits = 3), " p-value:", p_val, "\n")
}
}
cat("\n==> +",min_col, ": p_vlaue:", min_val, "\n\n")
if (min_val < alpha)
{
sel_cols = if (sel_cols != "") paste(sel_cols, " + ", min_col) else min_col
}
exclude = c(exclude, min_col)
}
formula = paste(response, " ~ ", sel_cols)
model = lm(formula = formula, data = dtframe)
return(model)
}
backward_adj_r2 = function(dtframe, response, exclude)
{
exclude = c(exclude, response)
cols = names(dtframe)
col_list = c()
for (col in cols)
{
if (!(col %in% exclude))
{
col_list = c(col_list, col)
}
}
all_cols = c(cols,"")
max_adj_r2 = 0
old_adj_r2 = -1
model = NULL
while (max_adj_r2 > old_adj_r2)
{
min_col = ""
old_adj_r2 = max_adj_r2
for (col in all_cols)
{
if (!(col %in% exclude))
{
sel_cols = setdiff(col_list, col)
sel_cols = paste(sel_cols,collapse = "+")
# print(sel_cols)
model = lm(formula = paste(response, " ~ ",sel_cols) , data = dtframe)
s = summary(model)
adj_r2 = s$adj.r.squared
if (adj_r2 > max_adj_r2)
{
max_adj_r2 = adj_r2
min_col = col
}
#print(s$coefficients)
col = if(col=="") "NONE" else col
cat(format(paste("-",col), width = 15), " Adj. R2:", format(adj_r2,width = 15, justify = 'right'),"\n")
}
}
min_col = if(min_col=="") "NONE" else min_col
cat("\n==> -",min_col,": Adj R2:",max_adj_r2, "\n\n")
col_list = setdiff(col_list, min_col)
exclude = c(exclude, min_col)
}
# print("Backward (Adj R2) Selected features:")
# cat(col_list, sep = ", ")
return(model)
}
model1 = backward_adj_r2(dtframe = mtcars,response = "mpg", exclude = c("wt"))
summary(model1)
model2 = forward_p_value(dtframe = mtcars,response = "mpg", exclude = c("wt"))
summary(model2)
我在github上为代码创建了一个存储库