我想使用5折交叉验证对高基数变量进行均值编码(mean encoding)。我的代码如下:
# Toy data for the example: `var` is a categorical predictor taking the values
# 1-5 (stored as doubles) and `target` is a binary 0/1 response; 1000 rows.
# No seed is set here, so the sampled values differ between sessions.
df <- data.frame(
  var = sample(c(1, 2, 3, 4, 5), 1000, replace = TRUE),
  target = sample(c(1, 0), 1000, replace = TRUE)
)
# Mean-encode `column_var` against `target_var` using five random groups.
# NOTE: this is the question's failing version. `target_var` and `column_var`
# arrive as character strings (see the call below the function), so
# `mean(target_var)` is applied to the string itself rather than to the column
# it names, which yields NA together with the "argument is not numeric or
# logical" warning.
encode <- function(df, target_var, column_var){
# Fix the seed so the random group assignment is the same on every call.
set.seed(520)
# Assign each row to one of five groups, drawn with equal probability.
df$group <- as.factor(sample(c(1,2,3,4,5), nrow(df), replace=T, prob=c(0.2,0.2,0.2,0.2,0.2)))
var.enc <- df %>%
# The underscore verbs (select_, group_by_) accept column names as strings.
select_("group", column_var, target_var) %>%
group_by_("group", column_var) %>%
# BUG: `target_var` is a string here, not the column it names.
mutate(var_encoded = mean(target_var)) %>%
ungroup() %>%
# Keep one row per distinct (column value, group-level mean) pair.
select_(column_var, "var_encoded") %>%
distinct() %>%
# Average the group-level means within each value of the column.
group_by_(column_var) %>%
mutate(var.enc = mean(var_encoded)) %>%
distinct()
return(var.enc)
}
# The column names are passed as strings.
encoding <- encode(df = df, column_var = "var", target_var = "target")
当我运行上面的代码时,我得到了警告:
In mean.default(target_var) : argument is not numeric or logical: returning NA(参数不是数值或逻辑值:返回 NA)
那么,如何在函数内部把参数正确地传递给 mean()?我尝试过使用 as.name(),但同样不起作用。另外,我也试过 mean(df[[target_var]]),但这样 group_by 就不起作用了,得到的是全局平均值。
编辑:我添加了一个可复现的示例。
答案 0 :(得分:3)
由于输入是字符串,因此先用 rlang::sym() 将其转换为符号(symbol),然后用 !! 对其取消引用(求值)。
#' Mean-encode a column using five random groups
#'
#' Rows are randomly assigned to one of five groups. The target mean is
#' computed within each (group, column value) cell, and those cell means are
#' then averaged per column value.
#'
#' @param df A data frame containing the columns named by `column_var` and
#'   `target_var`. An existing column called `group` is overwritten.
#' @param target_var Name of the target column, as a string.
#' @param column_var Name of the column to encode, as a string.
#' @return A tibble grouped by the encoded column, with one row per distinct
#'   (column value, group-level mean) pair and three columns: the encoded
#'   column, `var_encoded` (target mean within one group) and `var.enc`
#'   (mean of the `var_encoded` values for that column value).
encode <- function(df, target_var, column_var) {
  # A fixed seed keeps the group assignment reproducible; note that this also
  # resets the session's RNG state on every call.
  set.seed(520)
  df$group <- as.factor(sample(
    c(1, 2, 3, 4, 5), nrow(df),
    replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2)
  ))

  # The column names arrive as strings; convert them to symbols so they can
  # be unquoted with `!!` inside the dplyr verbs.
  column_var <- rlang::sym(column_var)
  target_var <- rlang::sym(target_var)

  df %>%
    select(group, !!column_var, !!target_var) %>%
    group_by(group, !!column_var) %>%
    # Target mean within each (group, column value) cell.
    mutate(var_encoded = mean(!!target_var)) %>%
    ungroup() %>%
    select(!!column_var, var_encoded) %>%
    distinct() %>%
    group_by(!!column_var) %>%
    # Average the cell means across groups for each column value.
    mutate(var.enc = mean(var_encoded)) %>%
    distinct()
}
检查结果:
# Call the function with the column names given as strings, then print it.
encoding <- encode(df = df, target_var = "target", column_var = "var")
encoding
# Printed result. The exact numbers presumably depend on the randomly
# generated `df`, so they will differ between runs unless `df` is fixed.
# A tibble: 25 x 3
# Groups: var [5]
# var var_encoded var.enc
# <dbl> <dbl> <dbl>
# 1 5 0.462 0.497
# 2 5 0.553 0.497
# 3 4 0.585 0.493
# 4 2 0.543 0.536
# 5 3 0.364 0.453
# 6 4 0.46 0.493
# 7 1 0.465 0.476
# 8 3 0.474 0.453
# 9 5 0.529 0.497
#10 1 0.417 0.476
# ... with 15 more rows