我有一个可变长度列表,看起来像这样
chr [1:249] "1" "29.12" "2" "20.78" "3" "12.09" ...
chr [1:200] "1" "20.45" "3" "10.56" "4" "12.34" ...
chr [1:213] "2" "12.20" "3" "19.93" "5" "23.05" ...
奇数位置的值(" 1"," 3"," 4"等)表示具有特定含义的变量,而偶数位置的值为变量的值由前面的数字表示。例如。在列表的第二个元素中,变量" 3"有价值" 10.56"。
我试图将其转换为数据框,其值为" 10.56"进入数据框的正确列,即列" 3"。这是我正在使用的代码
e <- unlist(d[[k]]) ## d is my list. k is the index for a for loop
pos_index <- seq(1, length(e), 2) ## gives positions for the variables
val_index <- seq(2, length(e), 2) ## gives positions for corresponding values
df_index <- as.numeric(e[pos_index])
## Populate a pre-defined data frame at calculated positions
CNNIBN_DF[k, df_index] <- as.numeric(e[val_index])
数据框应该看起来像这样
X1 X2 X3 X4 X5
1 29.12 20.78 12.09 NA NA
2 20.45 NA 10.56 12.34 NA
3 NA 12.20 19.93 NA 23.05
这可行,但需要很长时间。 1000个实体的system.time
给出了这个
user system elapsed
57.64 0.06 58.14
列表本身有33k个实体,每个实体有200多个元素。我只使用for
循环尝试了相同的操作,但两者往往需要大约相同的时间。
有更快的方法吗?我使用的win32机器配备4GB RAM,运行Intel Core i3 M350 CPU @ 2.27 GHz。
提前致谢!
答案 0 :(得分:3)
Akrun已经发布了一些可能的替代品;我只是添加一个更明确的方法,似乎尽可能少(使用akrun&#39; s&#34; lst&#34;):
ulst = unlist(lst)
cols = seq(1, length(ulst), 2)
inds = cbind(row = rep(seq_along(lst), lengths(lst) %/% 2),
col = as.integer(ulst[cols]))
vals = as.numeric(ulst[-cols])
ans = matrix(, max(inds[, "row"]), max(inds[, "col"]))
ans[inds] = vals
# [,1] [,2] [,3] [,4] [,5]
#[1,] 29.12 20.78 12.09 NA NA
#[2,] 20.45 NA 10.56 12.34 NA
#[3,] NA 12.20 19.93 NA 23.05
从你的目标来看,似乎你不一定需要一个&#34; data.frame&#34;,但是&#34;矩阵&#34;很容易转换成一个。此外,可能值得研究一下您是否可以操纵数据的构建/获取以避免这种奇怪的格式。
答案 1 :(得分:1)
尝试
lst1 <- lapply(lst, function(x) { x<- as.numeric(x)
indx <- c(TRUE, FALSE)
v1 <- tabulate(x[indx])
is.na(v1) <- v1==0
v1[!is.na(v1)] <- x[!indx]
v1 })
setNames(do.call(rbind.data.frame,lapply(lst1, `length<-`,
max(lengths(lst1)))), paste0('X', 1:5))
# X1 X2 X3 X4 X5
#1 29.12 20.78 12.09 NA NA
#2 20.45 NA 10.56 12.34 NA
#3 NA 12.20 19.93 NA 23.05
或者
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]
我们可以使用sparseMatrix
Matrix
library(Matrix)
d1 <- setNames(as.data.frame(m1), c('Row', 'Col', 'Value'))
with(d1, sparseMatrix(Row, Col, x=Value))
#3 x 5 sparse Matrix of class "dgCMatrix"
#[1,] 29.12 20.78 12.09 . .
#[2,] 20.45 . 10.56 12.34 .
#[3,] . 12.20 19.93 . 23.05
可以通过matrix
转换为as.matrix
。
或
library(tidyr)
library(dplyr)
d1 <- unnest(lst, group)
d2 <- bind_cols(slice(d1, seq(1, n(), by=2)), slice(d1, seq(2, n(), by=2))[2])
colnames(d2)[3] <- 'val'
spread(d2, x, val) %>%
select(-group)
# 1 2 3 4 5
#1 29.12 20.78 12.09 <NA> <NA>
#2 20.45 <NA> 10.56 12.34 <NA>
#3 <NA> 12.20 19.93 <NA> 23.05
或者
library(data.table)#v1.9.5+
library(reshape2)
dcast(setDT(melt(lst))[, list(indx= value[c(TRUE, FALSE)],
value=value[c(FALSE, TRUE)]) ,L1], L1~paste0('X', indx), value.var='value')
# L1 X1 X2 X3 X4 X5
#1: 1 29.12 20.78 12.09 NA NA
#2: 2 20.45 NA 10.56 12.34 NA
#3: 3 NA 12.20 19.93 NA 23.05
对于1000个实体列表,
set.seed(42)
lst <- lapply(1:1000, function(i) {v1 <- sample(50:200)[1L]
v2 <- sample(1:200, v1, replace=FALSE)
as.character(c(rbind(v2, rnorm(v1))))})
system.time({
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]
})
# user system elapsed
# 0.064 0.004 0.067
system.time({
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
d1 <- setNames(as.data.frame(m1), c('Row', 'Col', 'Value'))
with(d1, sparseMatrix(Row, Col, x=Value))
})
# user system elapsed
# 0.068 0.003 0.070
system.time({d1 <- unnest(lst, group)
d2 <- bind_cols(slice(d1, seq(1, n(), by=2)),
slice(d1, seq(2, n(), by=2))[2])
colnames(d2)[3] <- 'val'
res <- spread(d2, x, val) %>%
select(-group)})
# user system elapsed
# 0.259 0.002 0.261
使用第一种方法稍慢
system.time({
lst1 <- lapply(lst, function(x) { x<- as.numeric(x)
indx <- c(TRUE, FALSE)
v1 <- tabulate(x[indx])
is.na(v1) <- v1==0
v1[!is.na(v1)] <- x[!indx]
v1 })
setNames(do.call(rbind.data.frame,lapply(lst1, `length<-`,
max(lengths(lst1)))), paste0('X', 1:5))
})
# user system elapsed
#1.459 0.004 1.463
在33000
列表
set.seed(42)
lst <- lapply(1:33000, function(i) {v1 <- sample(50:200)[1L]
v2 <- sample(1:200, v1, replace=FALSE)
as.character(c(rbind(v2, rnorm(v1))))})
system.time({
m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]
})
# user system elapsed
# 6.160 0.102 6.260
@alexis_laz方法更快
system.time({
ulst = unlist(lst)
cols = seq(1, length(ulst), 2)
inds = cbind(row = rep(seq_along(lst), lengths(lst) %/% 2),
col = as.integer(ulst[cols]))
vals = as.numeric(ulst[-cols])
ans = matrix(, max(inds[, "row"]), max(inds[, "col"]))
ans[inds] = vals
})
# user system elapsed
# 2.421 0.041 2.460
lst <- list(c('1', '29.12', '2', '20.78', '3', '12.09'), c('1', '20.45',
'3', '10.56', '4', '12.34'), c('2', '12.20', '3', '19.93', '5', '23.05'))