重新排列列表:我正在进行类似于以下代码的交叉验证,其中cv_chunk
的数量是任意的:
library(purrr)
# randomly assign to a cross validation chunk
set.seed(11)
mtcars$cv_chunk <- sample(rep(1:3), nrow(mtcars), 1)
model_confint <- mtcars %>%
split(.$cv_chunk) %>%
map(~lm(mpg ~ cyl*qsec + gear - cv_chunk, data = .)) %>%
map(confint, levels = 0.95) %>%
map(t)
names(model_confint) <- paste0("CV_", names(model_confint))
# first element of the list
$CV_1
(Intercept) cyl qsec gear cyl:qsec
2.5 % -54.8983 -25.691233 -8.958490 -5.175215 -0.7161008
97.5 % 215.2629 9.901694 4.322784 5.185608 1.2804372
为了进一步处理每个这样的模型的置信水平,我需要重新排列model_confint
,以便获得模型中每个系数的data.frame / list。例如。对于(Intercept)
(依据cyl
,qsec
,...):
$`(Intercept)`
2.5 % 97.5 %
CV_1 -54.8983 215.26290
CV_2 -193.2070 84.48072
CV_3 -361.1489 545.04010
我确信使用apply
函数或purrr
包有一个很好的方法。但我被卡住了。
感谢您的帮助。
答案 0 :(得分:2)
谢谢你的好榜样!我有两个直觉可以使这个问题更容易:您可以使用purrr::set_names
在循环中更早地命名列表,并且可以通过先前映射到数据帧来避免转置。
library(purrr)
# randomly assign to a cross validation chunk
set.seed(11)
mtcars$cv_chunk <- sample(rep(1:3), nrow(mtcars), 1)
mtcars %>%
split(.$cv_chunk) %>%
# Name the list here to use it as an argument later
set_names(paste0("CV_", names(.))) %>%
# Map the actual list and the names of the list to create one dataframe
map(~lm(mpg ~ cyl*qsec + gear - cv_chunk, data = .)) %>%
map2_dfr(.x = ., .y = names(.), function(x, y) {
df <- as.data.frame(confint(x, levels = .95))
df$coeffs <- rownames(df)
df$cv_chunk <- y
df
}
) %>%
# Split the dataframe on the "coeffs" column
split(.$coeffs) %>%
# Remove the "coeffs" column from each dataframe
map(function(x) x[colnames(x) != 'coeffs'])
#> $`(Intercept)`
#> 2.5 % 97.5 % cv_chunk
#> 1 -54.8983 215.26286 CV_1
#> 6 -193.2069 84.48072 CV_2
#> 11 -361.1489 545.04010 CV_3
#>
#> $cyl
#> 2.5 % 97.5 % cv_chunk
#> 2 -25.69123 9.901694 CV_1
#> 7 -11.44012 24.073391 CV_2
#> 12 -26.36288 16.589356 CV_3
#>
#> $`cyl:qsec`
#> 2.5 % 97.5 % cv_chunk
#> 5 -0.7161008 1.2804372 CV_1
#> 10 -1.4870954 0.5295074 CV_2
#> 15 NA NA CV_3
#>
#> $gear
#> 2.5 % 97.5 % cv_chunk
#> 4 -5.175215 5.185608 CV_1
#> 9 -3.975621 8.092960 CV_2
#> 14 -30.785104 33.533967 CV_3
#>
#> $qsec
#> 2.5 % 97.5 % cv_chunk
#> 3 -8.958490 4.322784 CV_1
#> 8 -2.257827 11.261528 CV_2
#> 13 -17.741244 12.929339 CV_3
答案 1 :(得分:2)
可以使用broom
:
library(purrr)
library(dplyr)
library(tidyr)
# randomly assign to a cross validation chunk
set.seed(11)
mtcars$cv_chunk <- sample(seq(3), nrow(mtcars), replace = TRUE)
mtcars %>%
split(.$cv_chunk) %>%
map(~lm(mpg ~ cyl*qsec + gear - cv_chunk, data = .)) %>%
# the following will work uder different seed. I will report a bug to `broom``
#map_dfr(~broom::tidy(.x, conf.int=TRUE), .id="cv_chunk")
map_dfr(~bind_cols(broom::tidy(.x), drop_na(broom::confint_tidy(.x))), .id="cv_chunk") %>%
select(cv_chunk, term, conf.low, conf.high) %>%
split(.$term)
正确的tidyverse
做事方式将会使用
group_by(term) %>% nest() %>% pull(data)
,但base
函数split
正在提供您想要的内容
$`(Intercept)`
cv_chunk term conf.low conf.high
1 1 (Intercept) -54.8983 215.26286
6 2 (Intercept) -193.2069 84.48072
11 3 (Intercept) -361.1489 545.04010
$cyl
cv_chunk term conf.low conf.high
2 1 cyl -25.69123 9.901694
7 2 cyl -11.44012 24.073391
12 3 cyl -26.36288 16.589356
$`cyl:qsec`
cv_chunk term conf.low conf.high
5 1 cyl:qsec -0.7161008 1.2804372
10 2 cyl:qsec -1.4870954 0.5295074
$gear
cv_chunk term conf.low conf.high
4 1 gear -5.175215 5.185608
9 2 gear -3.975621 8.092960
14 3 gear -30.785104 33.533967
$qsec
cv_chunk term conf.low conf.high
3 1 qsec -8.958490 4.322784
8 2 qsec -2.257827 11.261528
13 3 qsec -17.741244 12.929339