Question

我想将3个线性模型应用于我的数据，并为每个模型提取残差。我想知道是否有办法使用dplyr和purrr的组合为每个模型应用相同的步骤：

我想保留：

每个模型的lm对象
每个型号的augment输出
每个型号的残差

这是一个分析mpg数据集的工作示例：

library(dplyr)
library(tidyr)
library(purrr)
library(broom)
library(ggplot2)

以下是我想用于我的lm的三种不同公式

f1 = hwy ~ cyl
f2 = hwy ~ displ
f3 = hwy ~ cyl + displ

lin_mod = function(formula) {
  function(data) {
    lm(formula, data = data)
  }
}

这是我为单个公式提取残差的方法：

mpg %>% 
group_by(manufacturer) %>% 
nest() %>% 
mutate(model = map(data, lin_mod(f1)), 
       aug = map(model, augment), 
       res = map(aug, ".resid"))

但是，对于所有公式来说，这种技术似乎是一种不好的方法，因为我重写了很多代码：

mpg %>% 
group_by(manufacturer) %>% 
nest() %>% 
mutate(model1 = map(data, lin_mod(f1)), 
       aug1 = map(model1, augment), 
       res1 = map(aug1, ".resid"),
       model2 = map(data, lin_mod(f2)), 
       aug2 = map(model2, augment), 
       res2 = map(aug2, ".resid"),
       model3 = map(data, lin_mod(f3)), 
       aug3 = map(model3, augment), 
       res3 = map(aug3, ".resid"))

如何以优雅的方式将此功能应用于每个公式？我当时认为mutate_all，或者将公式放入列表可能会有所帮助，但唉，我被卡住了。

Answer 1

您可以使用mutate_at（或mutate_if）改变列表列的位置。这样可以节省多次迭代，并使代码可管理且更紧凑。

library(dplyr)
library(tidyr)
library(purrr)
library(broom)

lin_mod = function(formula) {
  function(data,...){
  map(data,~lm(formula, data = .x))
  }
}

list_model <- list(cyl_model= hwy ~ cyl,
                   displ_model= hwy ~ displ,
                   full_model= hwy ~ cyl + displ) %>% 
              lapply(lin_mod)

ggplot2::mpg %>% 
  group_by(manufacturer) %>% nest() %>% 
    mutate_at(.vars=("data"),.funs=list_model) %>% 
    mutate_at(.vars=vars(ends_with("model")), .funs=~map(.x, augment)) %>% 
    mutate_at(.vars=vars(ends_with("model")), .funs=~map(.x, ".resid")) %>% unnest()

Answer 2

这是我能够提出的最接近的例子here

library(dplyr)
library(tidyr)
library(purrr)
library(broom)
library(ggplot2)

# Here are the three different formulas I want to use for my lm

f1 = hwy ~ cyl
f2 = hwy ~ displ
f3 = hwy ~ cyl + displ

formulas = c(f1,f2,f3)

lin_mod = function(formula) {
  function(data) {
    lm(formula, data = data)
  }
}

list_model = lapply(formulas, lin_mod)
names(list_model) = c('cyl_model', 'displ_model', 'full_model')


fn_model <- function(.model, df){
  df$model <- map(df$data, possibly(.model, NULL))
  df
}

mpg_nested = mpg %>% 
group_by(manufacturer) %>% 
nest()

mpg_nested_new = list_model %>% 
                 map_df(fn_model, mpg_nested, .id = 'id_model') %>% 
                 arrange(manufacturer) %>% 
                 mutate(aug = map(model, augment), 
                 res = map(aug, ".resid"))


output = mpg_nested_new %>% 
gather(Var, val, c('model', 'aug', 'res')) %>% 
unite(desc, id_model, Var)%>% 
spread(desc, val)

将多个模型公式应用于数据组

2 个答案: