我正在寻找用于此操作的最优雅的方法。我目前有一个小标题,其中包含一些模型的列表列,另一列中包含用于预测的测试数据集。
我可以使用dplyr :: mutate
手动计算每个模型的预测值,但我想知道是否存在一些套用或循环会加快该过程。
lab_formula <- as.formula("pop ~ lifeExp ")
temp_formula <- as.formula("gdpPercap ~ year")
last_formula <- as.formula("year ~ gdpPercap")
formula_list <- list(lab_formula,temp_formula,last_formula)
country_model <- function(df, formula_list, index) {
list(lm(formula = formula_list[[index]] , data = df),
randomForest(formula=formula_list[[index]], data = df),
gbm(formula=formula_list[[index]], data = df, n.minobsinnode = 2))
}
by_country <- gapminder %>%
dplyr :: group_by(country, continent) %>%
nest()
df1 <- by_country %>%
mutate(model1 = map(data, ~country_model(., formula_list, 1)),
model2 = map(data, ~country_model(., formula_list, 2)),
model3 = map(data, ~country_model(., formula_list, 3))
)
pred_1 <- df1 %>%
mutate(pred_1= map2(data,model1, function(x, y)
map(seq_along(y), function(i)
if (i == 3) predict(y[[i]], n.trees = y[[i]]$n.trees)
else as.numeric(predict(y[[i]])))))
Is there an elegant code to apply model1, model2, model3 on the data column? And to subsequently extract the predictions from each list of models embedded within the list column? (something to do with ```unnest```)
country data model1 model2 model3 pred_1
<fct> <list> <list> <list> <list> <list>
1 Afghanistan <tibble [12 x 4]> <list [3]> <list [3]> <list [3]> <list [3]>
2 Albania <tibble [12 x 4]> <list [3]> <list [3]> <list [3]> <list [3]>
3 Algeria <tibble [12 x 4]> <list [3]> <list [3]> <list [3]> <list [3]>
4 Angola <tibble [12 x 4]> <list [3]> <list [3]> <list [3]> <list [3]>
5 Argentina <tibble [12 x 4]> <list [3]> <list [3]> <list [3]> <list [3]>
Desired Outcome:
country data model1 model2 model3 pred_1 pred_2 pred_3
答案 0 :(得分:1)
我们可以使用switch
创建函数以识别正确的模型,然后使用map
遍历创建的列
library(randomForest)
library(gbm)
library(purrr)
library(dplyr)
library(stringr)
功能
country_model <- function(df, formula, model_name) {
switch(model_name,
"model1" = lm(formula = formula , data = df),
"model2" = randomForest(formula= formula, data = df),
"model3" = gbm(formula=formula, data = df, n.minobsinnode = 2)
)
}
country_pred <- function(model, model_name) {
switch(model_name,
"model1" = as.numeric(predict(model)),
"model2" = as.numeric(predict(model)),
"model3" = predict(model, n.trees = model[["n.trees"]])
)
}
使用模型名称设置公式列表的名称
fmlst <- set_names(formula_list, str_c("model", seq_along(formula_list)))
使用imap
df1 <- imap_dfc(fmlst, ~ by_country %>%
transmute(!! .y := map(data,
country_model, formula = .x, model_name = .y ))) %>%
bind_cols(by_country, .)
str1 <- names(df1)[startsWith(names(df1), "model")]
str2 <- str_c("pred_", 1:3)
也创建预测列
df2 <- map_dfc(str1, ~ {
nm1 <- .x
df1 %>%
select(.x) %>%
pull(1) %>%
map(., country_pred, model_name = nm1) %>%
list
}
) %>%
rename_all(~ str2) %>%
bind_cols(df1, .)
df2
# A tibble: 142 x 9
# country continent data model1 model2 model3 pred_1 pred_2 pred_3
# <fct> <fct> <list> <list> <list> <list> <list> <list> <list>
# 1 Afghanistan Asia <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 2 Albania Europe <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 3 Algeria Africa <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 4 Angola Africa <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 5 Argentina Americas <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 6 Australia Oceania <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 7 Austria Europe <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 8 Bahrain Asia <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# 9 Bangladesh Asia <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
#10 Belgium Europe <tibble [12 × 4]> <lm> <rndmFrs.> <gbm> <dbl [12]> <dbl [12]> <dbl [12]>
# … with 132 more rows
答案 1 :(得分:-1)
基本上,我尝试使用for循环遍历所有存储的变量名称,在其上应用模型1/2/3,然后使用这些名称在现有的df1数据框中创建新的变量列。
pred_names <- c('labour_pred','temp_pred', 'last_pred')
for (c in seq_along(pred_names)) {
model_pred <- df1 %>%
mutate(pred_names[c] = map2(data_2018,model_list[c], function(x, y)
map(seq_along(y), function(i)
if (i == 3) predict(y[[i]], n.trees = y[[i]]$n.trees)
else as.numeric(predict(y[[i]])))))
}
However, I get these errors instead:
Error: unexpected '=' in:
" model_pred <- model_fit %>%
mutate(pred_names[c] ="
Error: unexpected ')' in:
" if (i == 3) predict(y[[i]], n.trees = y[[i]]$n.trees)
else as.numeric(predict(y[[i]]))))"
> }
Error: unexpected '}' in "}"