将可变长度列表转换为R中的数据帧的更快方法

时间:2015-05-30 11:47:06

标签: r

我有一个可变长度列表,看起来像这样

chr [1:249] "1" "29.12" "2" "20.78" "3" "12.09" ...
chr [1:200] "1" "20.45" "3" "10.56" "4" "12.34" ...
chr [1:213] "2" "12.20" "3" "19.93" "5" "23.05" ...

奇数位置的值(" 1"," 3"," 4"等)表示具有特定含义的变量,而偶数位置的值为变量的值由前面的数字表示。例如。在列表的第二个元素中,变量" 3"有价值" 10.56"。

我试图将其转换为数据框,其值为" 10.56"进入数据框的正确列,即列" 3"。这是我正在使用的代码

e <- unlist(d[[k]])  ## d is my list. k is the index for a for loop
pos_index <- seq(1, length(e), 2) ## gives positions for the variables
val_index <- seq(2, length(e), 2) ## gives positions for corresponding values
df_index <- as.numeric(e[pos_index])

## Populate a pre-defined data frame at calculated positions
CNNIBN_DF[k, df_index] <- as.numeric(e[val_index])

数据框应该看起来像这样

   X1    X2    X3    X4    X5
1  29.12 20.78 12.09 NA    NA
2  20.45 NA    10.56 12.34 NA
3  NA    12.20 19.93 NA    23.05

这可行,但需要很长时间。 1000个实体的system.time给出了这个

user   system  elapsed 
57.64  0.06    58.14

列表本身有33k个实体,每个实体有200多个元素。我只使用for循环尝试了相同的操作,但两者往往需要大约相同的时间。

有更快的方法吗?我使用的win32机器配备4GB RAM,运行Intel Core i3 M350 CPU @ 2.27 GHz。

提前致谢!

2 个答案:

答案 0 :(得分:3)

Akrun已经发布了一些可能的替代品;我只是添加一个更明确的方法,似乎尽可能少(使用akrun&#39; s&#34; lst&#34;):

ulst = unlist(lst)
cols = seq(1, length(ulst), 2)
inds = cbind(row = rep(seq_along(lst), lengths(lst) %/% 2), 
             col = as.integer(ulst[cols]))
vals = as.numeric(ulst[-cols])
ans = matrix(, max(inds[, "row"]), max(inds[, "col"]))
ans[inds] = vals
#      [,1]  [,2]  [,3]  [,4]  [,5]
#[1,] 29.12 20.78 12.09    NA    NA
#[2,] 20.45    NA 10.56 12.34    NA
#[3,]    NA 12.20 19.93    NA 23.05

从你的目标来看,似乎你不一定需要一个&#34; data.frame&#34;,但是&#34;矩阵&#34;很容易转换成一个。此外,可能值得研究一下您是否可以操纵数据的构建/获取以避免这种奇怪的格式。

答案 1 :(得分:1)

尝试

lst1 <- lapply(lst, function(x) { x<- as.numeric(x)
                    indx <- c(TRUE, FALSE)
                   v1 <- tabulate(x[indx])
                   is.na(v1) <- v1==0
                   v1[!is.na(v1)] <- x[!indx]
                   v1 })

setNames(do.call(rbind.data.frame,lapply(lst1, `length<-`,
                         max(lengths(lst1)))), paste0('X', 1:5))
#     X1    X2    X3    X4    X5
#1 29.12 20.78 12.09    NA    NA
#2 20.45    NA 10.56 12.34    NA
#3    NA 12.20 19.93    NA 23.05

或者

m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
          nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
m2[m1[,-3]] <- m1[,3]

我们可以使用sparseMatrix

中的Matrix
 library(Matrix)
 d1 <- setNames(as.data.frame(m1), c('Row', 'Col', 'Value'))
 with(d1, sparseMatrix(Row, Col, x=Value))
  #3 x 5 sparse Matrix of class "dgCMatrix"

 #[1,] 29.12 20.78 12.09  .     .   
 #[2,] 20.45  .    10.56 12.34  .   
 #[3,]  .    12.20 19.93  .    23.05

可以通过matrix转换为as.matrix

library(tidyr)
library(dplyr)
d1 <- unnest(lst, group) 
d2 <- bind_cols(slice(d1, seq(1, n(), by=2)), slice(d1, seq(2, n(), by=2))[2])
colnames(d2)[3] <- 'val'
spread(d2, x, val) %>%
                  select(-group)
#     1     2     3     4     5
#1 29.12 20.78 12.09  <NA>  <NA>
#2 20.45  <NA> 10.56 12.34  <NA>
#3  <NA> 12.20 19.93  <NA> 23.05

或者

library(data.table)#v1.9.5+
library(reshape2)
dcast(setDT(melt(lst))[, list(indx= value[c(TRUE, FALSE)], 
   value=value[c(FALSE, TRUE)]) ,L1], L1~paste0('X', indx), value.var='value')
#   L1    X1    X2    X3    X4    X5
#1:  1 29.12 20.78 12.09    NA    NA
#2:  2 20.45    NA 10.56 12.34    NA
#3:  3    NA 12.20 19.93    NA 23.05

基准

对于1000个实体列表,

set.seed(42)
lst <- lapply(1:1000, function(i) {v1 <- sample(50:200)[1L]
                     v2 <- sample(1:200, v1, replace=FALSE)
                     as.character(c(rbind(v2, rnorm(v1))))})

system.time({
  m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y), 
     nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
  m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
  m2[m1[,-3]] <- m1[,3]
 })
#  user  system elapsed 
# 0.064   0.004   0.067 


system.time({
  m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y),
          nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
  d1 <- setNames(as.data.frame(m1), c('Row', 'Col', 'Value'))
   with(d1, sparseMatrix(Row, Col, x=Value))
   })
#  user  system elapsed 
# 0.068   0.003   0.070 


system.time({d1 <- unnest(lst, group)
          d2 <-  bind_cols(slice(d1, seq(1, n(), by=2)),
                  slice(d1, seq(2, n(), by=2))[2])
          colnames(d2)[3] <- 'val'
          res <- spread(d2, x, val) %>%
                       select(-group)}) 
 #    user  system elapsed 
 #  0.259   0.002   0.261 

使用第一种方法稍慢

 system.time({
       lst1 <- lapply(lst, function(x) { x<- as.numeric(x)
                indx <- c(TRUE, FALSE)
               v1 <- tabulate(x[indx])
               is.na(v1) <- v1==0
               v1[!is.na(v1)] <- x[!indx]
               v1 })

   setNames(do.call(rbind.data.frame,lapply(lst1, `length<-`,
                     max(lengths(lst1)))), paste0('X', 1:5))
      })

 #  user  system elapsed 
 #1.459   0.004   1.463 

33000列表

set.seed(42)
lst <- lapply(1:33000, function(i) {v1 <- sample(50:200)[1L]
                 v2 <- sample(1:200, v1, replace=FALSE)
                   as.character(c(rbind(v2, rnorm(v1))))})
 system.time({
  m1 <- do.call(rbind,Map(function(x,y) cbind(x,matrix(as.numeric(y), 
     nrow=length(y)/2, byrow=TRUE)), seq_along(lst), lst))
  m2 <- matrix(NA, ncol=max(m1[,2]), nrow=length(lst))
  m2[m1[,-3]] <- m1[,3]
 })
 #  user  system elapsed 
 # 6.160   0.102   6.260 

@alexis_laz方法更快

 system.time({
  ulst = unlist(lst)
  cols = seq(1, length(ulst), 2)
 inds = cbind(row = rep(seq_along(lst), lengths(lst) %/% 2), 
            col = as.integer(ulst[cols]))
 vals = as.numeric(ulst[-cols])
 ans = matrix(, max(inds[, "row"]), max(inds[, "col"]))
  ans[inds] = vals
 })
#  user  system elapsed 
#  2.421   0.041   2.460 

数据

lst <- list(c('1', '29.12', '2', '20.78', '3', '12.09'), c('1', '20.45',
'3', '10.56', '4', '12.34'), c('2', '12.20', '3', '19.93', '5', '23.05'))