已编辑以添加更多详细信息并进行澄清。 基本上,我有一个数据帧列表,它们的 行号相同,但列号不同,所以每个数据帧的维数都不相同 。我现在想要做的是选择每个数据框的第一行,将它们放入新的数据框,并将其用作新列表的第一元素,然后对第二行,第三行执行相同的操作...
我已经考虑过使用2 for循环来重新分配行,但是考虑到嵌套的for循环非常慢并且我拥有的数据很大,所以这似乎是一种非常糟糕的方法。衷心感谢您的深刻见解和帮助。
myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2
print(myList)
当前示例数据-
> print(myList)
[[1]]
V1 V2
1 1 4
2 2 5
3 3 6
[[2]]
V1 V2 V3
1 7 10 13
2 8 11 14
3 9 12 15
期望的结果
> print(myList2)
[[1]]
V1 V2 V3
1 1 4 0
2 7 10 13
[[2]]
V1 V2 V3
1 2 5 0
2 8 11 14
[[3]]
V1 V2 V3
1 3 6 0
2 9 12 15
当前数据帧的不同尺寸使其难以处理。
答案 0 :(得分:3)
这是以下的基本方法:
aperm
转置数组以匹配您的预期输出apply
将数组转换为列表。myListBase <- myList #added because we modify the original list
#get all of the unique names from the list of dataframes
##default ordering is by ordering in list
all_cols <- Reduce(base::union, lapply(myListBase, names))
#loop, add new columns, and then re-order them so all data.frames
# have the same order
myListBase <- lapply(myListBase,
function(DF){
DF[, base::setdiff(all_cols, names(DF))] <- 0 #initialze columns
DF[, all_cols] #reorder columns
}
)
#create 3D array - could be simplified using abind::abind(abind(myListBase, along = 3))
myArrayBase <- array(unlist(myListBase, use.names = F),
dim = c(nrow(myListBase[[1]]), #rows
length(all_cols), #columns
length(myListBase) #3rd dimension
),
dimnames = list(NULL, all_cols, NULL))
#rows and 3rd dimension are transposed
myPermBase <- aperm(myArrayBase, c(3,2,1))
myPermBase
#, , 1
#
# V1 V2 V3
#[1,] 1 4 0
#[2,] 7 10 13
#
#, , 2
#
# V1 V2 V3
#[1,] 2 5 0
#[2,] 8 11 14
#
#, , 3
#
# V1 V2 V3
#[1,] 3 6 0
#[2,] 9 12 15
#make list of dataframes - likely not necessary
apply(myPermBase, 3, data.frame)
#[[1]]
# V1 V2 V3
#1 1 4 0
#2 7 10 13
#
#[[2]]
# V1 V2 V3
#1 2 5 0
#2 8 11 14
#
#[[3]]
# V1 V2 V3
#1 3 6 0
#2 9 12 15
性能
答案的第一个版本具有data.table
和abind
方法,但我已将其删除-base
版本更快,并且没有太多其他清晰度。
Unit: microseconds
expr min lq mean median uq max neval
camille_purrr_dplyr 7910.9 8139.25 8614.956 8246.30 8387.20 60159.5 1000
cole_DT_abind 2555.8 2804.75 3012.671 2917.95 3061.55 6602.3 1000
cole_base 600.3 634.40 697.987 663.00 733.10 3761.6 1000
完整的代码以供参考:
library(dplyr)
library(purrr)
library(data.table)
library(abind)
library(microbenchmark)
myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2
microbenchmark(
camille_purrr_dplyr = {
myList %>%
map_dfr(tibble::rownames_to_column, var = "id") %>%
mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
split(.$id) %>%
map(select, -id)
}
,
cole_DT_abind = {
myListDT <- copy(myList)
all_cols <- Reduce(base::union, lapply(myListDT, names))
# data.table used for side effects of updating-by-reference in lapply
lapply(myListDT, setDT)
# add non-existing columns
lapply(myListDT,
function(DT) {
DT[, base::setdiff(all_cols, names(DT)) := 0]
setorderv(DT, all_cols)
})
# abind is used to make an array
myArray <- abind(myListDT, along = 3)
# aperm is used to transpose the array to the preferred route
myPermArray <- aperm(myArray, c(3,2,1))
# myPermArray
#or as a list of data.frames
apply(myPermArray, 3, data.frame)
}
,
cole_base = {
myListBase <- myList
all_cols <- Reduce(base::union, lapply(myListBase, names))
myListBase <- lapply(myListBase,
function(DF){
DF[, base::setdiff(all_cols, names(DF))] <- 0
DF[, all_cols]
}
)
myArrayBase <- array(unlist(myListBase, use.names = F),
dim = c(nrow(myListBase[[1]]), length(all_cols), length(myListBase)),
dimnames = list(NULL, all_cols, NULL))
myPermBase <- aperm(myArrayBase, c(3,2,1))
apply(myPermBase, 3, data.frame)
}
# ,
# cole_base_aperm = {
# myListBase <- myList
#
# all_cols <- Reduce(base::union, lapply(myListBase, names))
#
# myListBase <- lapply(myListBase,
# function(DF){
# DF[, base::setdiff(all_cols, names(DF))] <- 0
# DF[, all_cols]
# }
# )
#
# myArrayABind <- abind(myListBase, along = 3)
#
# myPermBase <- aperm(myArrayABind, c(3,2,1))
# apply(myPermBase, 3, data.frame)
# }
, times = 1000
)
答案 1 :(得分:1)
具有几个dplyr
和purrr
函数的一种方法是在每个数据帧的每一行中添加一个ID列,将它们全部绑定,然后按该ID进行拆分。由于列名不匹配,基本的rbind
会引发错误,但是dplyr::bind_rows
会获取任意数量的数据帧的列表,并为丢失的任何内容添加NA
列。
第一步为您提供一个数据框:
library(dplyr)
library(purrr)
myList %>%
map_dfr(tibble::rownames_to_column, var = "id")
#> id V1 V2 V3
#> 1 1 1 4 NA
#> 2 2 2 5 NA
#> 3 3 3 6 NA
#> 4 1 7 10 13
#> 5 2 8 11 14
#> 6 3 9 12 15
在NA
中填写ID除ID以外的所有列,并在其中填充0,也可以根据需要进行调整。按ID拆分,然后删除ID列,因为您不再需要它。
myList %>%
map_dfr(tibble::rownames_to_column, var = "id") %>%
mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
split(.$id) %>%
map(select, -id)
#> $`1`
#> V1 V2 V3
#> 1 1 4 0
#> 4 7 10 13
#>
#> $`2`
#> V1 V2 V3
#> 2 2 5 0
#> 5 8 11 14
#>
#> $`3`
#> V1 V2 V3
#> 3 3 6 0
#> 6 9 12 15