R:将长度不等的列表合并到单个数据框中

时间:2016-03-11 23:13:58

标签: xml r list data.table

我已将以下代码(基于这篇帖子)应用于我的示例数据,生成了三个不同的列表,现在想把它们合并到一个数据框中。

# Select every <person> element that carries an `id` attribute.
idNodes <- getNodeSet(plans, "//person[@id]")

# One element per person: the `id` attribute as a named character vector.
ids <- lapply(idNodes, function(x) xmlAttrs(x)["id"])

# For each person, the attribute vectors of all <act> elements found below
# the plan marked selected='yes'.
attribact <- lapply(
  idNodes, xpathApply,
  path = "./plan[@selected='yes']//act", xmlAttrs
)

# Same for the <leg> elements of the selected plan.
attribleg <- lapply(
  idNodes, xpathApply,
  path = "./plan[@selected='yes']//leg", xmlAttrs
)

要生成数据框,我尝试使用x <- do.call(rbind.data.frame, mapply(cbind, ids, attribact, attribleg)),但它给了我以下错误:

  

(function (..., deparse.level = 1, make.row.names = TRUE) 中的错误:参数的列数不匹配。另外:有 50 个或更多警告(使用 warnings() 查看前 50 个)

我还想指出,上面的do.call命令适用于小数据样本(带警告)但不适用于大样本。

所需的输出

id        type   link   x              y              start_time end_time   mode  dep_time   trav_time arr_time
10000061  home   21258  334867.243653  3126570.70778  03:00:00   15:07:00   ride  15:07:00   00:03:28  15:10:28 
10000061  shop   13904  332634.86999   3127078.96383  15:12:00   16:21:00   car   16:21:00   00:09:02  16:30:02 
10000061  shop   14129  331666.364904  3129306.48785  16:25:00   17:37:00   ride  17:37:00   00:10:33  17:47:33 
10000061  home   21258  334867.243653  3126570.70778  17:45:00   26:59:00   NA    NA         NA        NA
10000302  home   21256  334598.361546  3126269.05167  03:00:00   07:56:00   car   07:56:00   00:03:31  07:59:31 
10000302  work   14057  335957.065395  3128105.16619  08:04:00   10:28:00   car   10:28:00   00:06:47  10:34:47 
10000302  social 21191  333032.807855  3128759.66141  10:33:00   11:52:00   car   11:52:00   00:07:50  11:59:50 
10000302  home   21256  334598.361546  3126269.05167  11:59:00   12:11:00   car   12:11:00   00:04:49  12:15:49 
10000302  social 13906  332302.159169  3127536.46778  12:17:00   13:30:00   car   13:30:00   00:05:30  13:35:30 
10000302  home   21256  334598.361546  3126269.05167  13:36:00   26:59:00   NA    NA         NA        NA

示例数据

> dput(head(ids,2))
list(structure("10000061", .Names = "id"), structure("10000302", .Names = "id"))

> dput(head(attribact,2))
list(list(structure(c("home", "21258", "334867.243653", "3126570.70778", "03:00:00", "15:07:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "13904", "332634.86999", "3127078.96383", "15:12:00", "16:21:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "14129", "331666.364904", "3129306.48785", "16:25:00", "17:37:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21258", "334867.243653", "3126570.70778", "17:45:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))), list(structure(c("home", "21256", "334598.361546", "3126269.05167", "03:00:00", "07:56:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("work", "14057", "335957.065395", "3128105.16619", "08:04:00", "10:28:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "21191", "333032.807855", "3128759.66141", "10:33:00", "11:52:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "11:59:00", "12:11:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "13906", "332302.159169", "3127536.46778", "12:17:00", "13:30:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "13:36:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))))

> dput(head(attribleg,2))
list(list(structure(c("ride", "15:07:00", "00:03:28", "15:10:28"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "16:21:00", "00:09:02", "16:30:02"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("ride", "17:37:00", "00:10:33", "17:47:33"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))), list(structure(c("car", "07:56:00", "00:03:31", "07:59:31"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "10:28:00", "00:06:47", "10:34:47"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "11:52:00", "00:07:50", "11:59:50"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "12:11:00", "00:04:49", "12:15:49"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "13:30:00", "00:05:30", "13:35:30"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))))

更新

我尝试过以下解决方案。但是,对于我的目的来说,这是非常缓慢的(尽管预先分配)。任何提高效率的建议都非常受欢迎。

library(data.table)

# Build the per-activity table in one vectorised pass instead of assigning
# every cell with its own `:=` call inside a nested loop. The table gets
# exactly one row per activity, so no row count has to be guessed up front
# and no placeholder rows are left over.
#
# Inputs (parallel lists, one element per person):
#   ids       - person id, one value per person
#   attribact - list of attribute vectors, one per activity
#   attribleg - list of attribute vectors, one per leg
# Attributes are read by position, as before:
#   act: type, link, x, y, start_time, end_time
#   leg: mode, dep_time, trav_time, arr_time
stopifnot(
  length(attribact) == length(ids),
  length(attribleg) == length(ids)
)

acts_per_person <- lengths(attribact)
legs_per_person <- lengths(attribleg)

# Flatten one nesting level (person -> act) and stack the attribute vectors
# into a character matrix with one row per activity.
act_mat <- do.call(
  rbind,
  lapply(unlist(attribact, recursive = FALSE), function(a) unname(a[1:6]))
)
if (is.null(act_mat)) {
  act_mat <- matrix(character(), nrow = 0L, ncol = 6L)
}

leg_mat <- do.call(
  rbind,
  lapply(unlist(attribleg, recursive = FALSE), function(l) unname(l[1:4]))
)
if (is.null(leg_mat)) {
  leg_mat <- matrix(character(), nrow = 0L, ncol = 4L)
}

# For every activity row: which person it belongs to and its position within
# that person's plan.
person <- rep(seq_along(ids), acts_per_person)
act_pos <- sequence(acts_per_person)

# The k-th activity of a person is paired with that person's k-th leg.
# Activities without a matching leg get an NA index, which selects an all-NA
# row from `leg_mat` below.
leg_offset <- cumsum(legs_per_person) - legs_per_person
leg_row <- leg_offset[person] + act_pos
leg_row[act_pos > legs_per_person[person]] <- NA_integer_
leg_mat <- leg_mat[leg_row, , drop = FALSE]

# Column types follow the original preallocation: id, link, x and y are
# numeric, everything else is character.
df <- data.table(
  id = as.numeric(rep(unlist(ids, use.names = FALSE), acts_per_person)),
  type = act_mat[, 1],
  link = as.numeric(act_mat[, 2]),
  x = as.numeric(act_mat[, 3]),
  y = as.numeric(act_mat[, 4]),
  start_time = act_mat[, 5],
  end_time = act_mat[, 6],
  mode = leg_mat[, 1],
  dep_time = leg_mat[, 2],
  trav_time = leg_mat[, 3],
  arr_time = leg_mat[, 4]
)

2 个答案:

答案 0 :(得分:1)

我会使用 `/*` 一次性取出 act 和 leg 节点,而不是使用三个单独的列表,并把 id 作为列表的名称。

library(plyr)

# For each person node, collect the attributes of every direct child of the
# selected plan, in document order.
a <- lapply(idNodes, function(node) {
  xpathApply(node, path = "./plan[@selected='yes']/*", xmlAttrs)
})

# Label each person's list with the value of its `id` attribute.
names(a) <- sapply(idNodes, function(node) xmlGetAttr(node, "id"))

# Combine using ldply: first one data frame per person, then a single data
# frame whose `id` column holds the list names.
x1 <- lapply(a, function(attrs) ldply(attrs, "rbind"))
x <- ldply(x1, "rbind", .id = "id")

现在只需整理这个 data.frame,并把 leg 的属性向上移动一行(前提是 leg 始终是 act 的下一个兄弟节点)。

# Rows without a `type` value are treated as the leg rows; each one is
# expected to sit directly below the activity it belongs to.
leg_rows <- which(is.na(x$type))

# Copy the contents of columns 8 to 11 from each leg row into the row above it.
x[leg_rows - 1, 8:11] <- x[leg_rows, 8:11]

# Keep only the rows that have a `type`, then renumber the rows.
x <- x[!is.na(x$type), ]
rownames(x) <- NULL
x
         id   type  link             x             y start_time end_time mode dep_time trav_time arr_time
1  10000061   home 21258 334867.243653 3126570.70778   03:00:00 15:07:00 ride 15:07:00  00:03:27 15:10:27
2  10000061   shop 13904  332634.86999 3127078.96383   15:12:00 16:21:00  car 16:21:00  00:09:44 16:30:44
3  10000061   shop 14129 331666.364904 3129306.48785   16:25:00 17:37:00 ride 17:37:00  00:09:46 17:46:46
4  10000061   home 21258 334867.243653 3126570.70778   17:45:00 26:59:00 <NA>     <NA>      <NA>     <NA>
5  10000302   home 21256 334598.361546 3126269.05167   03:00:00 07:56:00  car 07:56:00  00:03:00 07:59:00
6  10000302   work 14057 335957.065395 3128105.16619   08:04:00 10:28:00  car 10:28:00  00:08:20 10:36:20
7  10000302 social 21191 333032.807855 3128759.66141   10:33:00 11:52:00  car 11:52:00  00:08:33 12:00:33
8  10000302   home 21256 334598.361546 3126269.05167   11:59:00 12:11:00  car 12:11:00  00:06:35 12:17:35
9  10000302 social 13906 332302.159169 3127536.46778   12:17:00 13:30:00  car 13:30:00  00:05:30 13:35:30
10 10000302   home 21256 334598.361546 3126269.05167   13:36:00 26:59:00 <NA>     <NA>      <NA>     <NA>

另一种选择是跳过 idNodes,直接对下面 xmlAttrsToDataFrame 的输出进行整理。

# Select the person nodes that have an id together with every direct child of
# a selected plan, in a single XPath query.
selected_nodes <- plans["//person[@id]|//plan[@selected='yes']/*"]

# `:::` reaches a function that the XML package does not export, so it may
# change between package versions.
x <- XML:::xmlAttrsToDataFrame(selected_nodes)

答案 1 :(得分:1)

这可以是一个选项:将三个列表分别记为 'a'、'b' 和 'c'。

首先把列表 'a' 中的 ID 作为列表 'b' 和 'c' 的名称,然后对 'b' 和 'c' 中的每个元素分别执行 rbind,如下所示:

# Label the two attribute lists with the person ids held in `a`.
names(b) <- names(c) <- unlist(a)

# Stack each person's attribute vectors row-wise, giving one matrix per person.
list1 <- lapply(b, function(rows) do.call(rbind, rows))
list2 <- lapply(c, function(rows) do.call(rbind, rows))

接着按 list1 中各元素的行数,对 list1 和 list2 中对应的元素执行 cbind,最后用 rbind 把得到的新列表元素合并起来。

# Column names found in the leg tables. They define the layout of the leg part
# of every person's rows, including persons who have no leg at all.
leg_cols <- unique(unlist(lapply(list2, colnames)))

# Combine activities and legs into one data frame with one row per activity.
# `list1` and `list2` are walked by position (both carry the same names in the
# same order), so duplicated ids cannot pick the wrong element.
# The row names of `out` are plain sequence numbers.
out <- do.call(
  rbind,
  lapply(seq_along(list1), function(i) {
    acts <- list1[[i]]
    # A person without any activity contributes no rows.
    if (is.null(acts) || nrow(acts) == 0L) {
      return(NULL)
    }

    # Start from an all-NA leg block with the full set of leg columns, so
    # every person yields the same columns and the final rbind cannot fail
    # on a column-count mismatch.
    legs <- list2[[i]]
    padded <- matrix(
      NA_character_,
      nrow = nrow(acts), ncol = length(leg_cols),
      dimnames = list(NULL, leg_cols)
    )
    if (!is.null(legs)) {
      # Pair the k-th leg with the k-th activity; surplus legs are dropped.
      keep <- seq_len(min(nrow(acts), nrow(legs)))
      padded[keep, colnames(legs)] <- legs[keep, , drop = FALSE]
    }

    cbind(id = names(list1)[i], data.frame(acts), data.frame(padded))
  })
)


#> out
#         id   type  link             x             y start_time end_time mode
#1   10000061   home 21258 334867.243653 3126570.70778   03:00:00 15:07:00 ride
#2   10000061   shop 13904  332634.86999 3127078.96383   15:12:00 16:21:00  car
#3   10000061   shop 14129 331666.364904 3129306.48785   16:25:00 17:37:00 ride
#NA  10000061   home 21258 334867.243653 3126570.70778   17:45:00 26:59:00 <NA>
#11  10000302   home 21256 334598.361546 3126269.05167   03:00:00 07:56:00  car
#21  10000302   work 14057 335957.065395 3128105.16619   08:04:00 10:28:00  car
#31  10000302 social 21191 333032.807855 3128759.66141   10:33:00 11:52:00  car
#4   10000302   home 21256 334598.361546 3126269.05167   11:59:00 12:11:00  car
#5   10000302 social 13906 332302.159169 3127536.46778   12:17:00 13:30:00  car
#NA1 10000302   home 21256 334598.361546 3126269.05167   13:36:00 26:59:00 <NA>
#    dep_time trav_time arr_time
#1   15:07:00  00:03:28 15:10:28
#2   16:21:00  00:09:02 16:30:02
#3   17:37:00  00:10:33 17:47:33
#NA      <NA>      <NA>     <NA>
#11  07:56:00  00:03:31 07:59:31
#21  10:28:00  00:06:47 10:34:47
#31  11:52:00  00:07:50 11:59:50
#4   12:11:00  00:04:49 12:15:49
#5   13:30:00  00:05:30 13:35:30
#NA1     <NA>      <NA>     <NA>