我已将以下代码(基于此post)应用于我的示例data,以生成三个不同的列表,我尝试合并到一个数据框中。
# All <person> elements that carry an id attribute.
idNodes <- getNodeSet(plans, "//person[@id]")

# Attribute vectors of every <tag> element found under the selected plan of
# one person node (XPath is evaluated relative to that node).
selected_plan_attrs <- function(person, tag) {
  xpathApply(person, paste0("./plan[@selected='yes']//", tag), xmlAttrs)
}

# One length-1 named character vector ("id") per person.
ids <- lapply(idNodes, function(person) xmlAttrs(person)["id"])
# Per person: a list with one attribute vector per act / per leg.
attribact <- lapply(idNodes, selected_plan_attrs, tag = "act")
attribleg <- lapply(idNodes, selected_plan_attrs, tag = "leg")
为了生成数据框,我尝试使用 `x <- do.call(rbind.data.frame, mapply(cbind, ids, attribact, attribleg))`,但它给出了以下错误:
Error in (function (..., deparse.level = 1, make.row.names = TRUE) : 参数的列数不匹配。另外:有 50 个或更多警告(使用 warnings() 查看前 50 个)
我还想指出,上面的do.call
命令适用于小数据样本(带警告)但不适用于大样本。
所需的输出
id type link x y start_time end_time mode dep_time trav_time arr_time
10000061 home 21258 334867.243653 3126570.70778 03:00:00 15:07:00 ride 15:07:00 00:03:28 15:10:28
10000061 shop 13904 332634.86999 3127078.96383 15:12:00 16:21:00 car 16:21:00 00:09:02 16:30:02
10000061 shop 14129 331666.364904 3129306.48785 16:25:00 17:37:00 ride 17:37:00 00:10:33 17:47:33
10000061 home 21258 334867.243653 3126570.70778 17:45:00 26:59:00 NA NA NA NA
10000302 home 21256 334598.361546 3126269.05167 03:00:00 07:56:00 car 07:56:00 00:03:31 07:59:31
10000302 work 14057 335957.065395 3128105.16619 08:04:00 10:28:00 car 10:28:00 00:06:47 10:34:47
10000302 social 21191 333032.807855 3128759.66141 10:33:00 11:52:00 car 11:52:00 00:07:50 11:59:50
10000302 home 21256 334598.361546 3126269.05167 11:59:00 12:11:00 car 12:11:00 00:04:49 12:15:49
10000302 social 13906 332302.159169 3127536.46778 12:17:00 13:30:00 car 13:30:00 00:05:30 13:35:30
10000302 home 21256 334598.361546 3126269.05167 13:36:00 26:59:00 NA NA NA NA
示例数据
> dput(head(ids,2))
list(structure("10000061", .Names = "id"), structure("10000302", .Names = "id"))
> dput(head(attribact,2))
list(list(structure(c("home", "21258", "334867.243653", "3126570.70778", "03:00:00", "15:07:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "13904", "332634.86999", "3127078.96383", "15:12:00", "16:21:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "14129", "331666.364904", "3129306.48785", "16:25:00", "17:37:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21258", "334867.243653", "3126570.70778", "17:45:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))), list(structure(c("home", "21256", "334598.361546", "3126269.05167", "03:00:00", "07:56:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("work", "14057", "335957.065395", "3128105.16619", "08:04:00", "10:28:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "21191", "333032.807855", "3128759.66141", "10:33:00", "11:52:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "11:59:00", "12:11:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "13906", "332302.159169", "3127536.46778", "12:17:00", "13:30:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "13:36:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))))
> dput(head(attribleg,2))
list(list(structure(c("ride", "15:07:00", "00:03:28", "15:10:28"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "16:21:00", "00:09:02", "16:30:02"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("ride", "17:37:00", "00:10:33", "17:47:33"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))), list(structure(c("car", "07:56:00", "00:03:31", "07:59:31"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "10:28:00", "00:06:47", "10:34:47"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "11:52:00", "00:07:50", "11:59:50"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "12:11:00", "00:04:49", "12:15:49"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "13:30:00", "00:05:30", "13:35:30"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))))
更新
我尝试过以下解决方案。但是,对于我的目的来说,这是非常缓慢的(尽管预先分配)。任何提高效率的建议都非常受欢迎。
library(data.table)

# Build `df`: one row per activity ("act") of each person, with the attributes
# of the k-th leg of that person placed next to the k-th act (NA when the
# person has fewer legs than acts; surplus legs are ignored).
#
# Inputs (defined earlier in the script):
#   ids       - list with one length-1 character id per person
#   attribact - per person, a list of named attribute vectors, one per act
#   attribleg - per person, a list of named attribute vectors, one per leg
#
# The table is assembled in one vectorised pass instead of assigning single
# cells with `:=` inside a nested loop, and it has exactly one row per act, so
# no placeholder rows are left over from a guessed preallocation size.
stopifnot(
  length(attribact) == length(ids),
  length(attribleg) == length(ids),
  all(lengths(ids) == 1L)
)

act_cols <- c("type", "link", "x", "y", "start_time", "end_time")
leg_cols <- c("mode", "dep_time", "trav_time", "arr_time")

# Convert a flat list of named attribute vectors into a character matrix with
# one row per element and one column per name in `cols`. Attributes are looked
# up by name, so an element lacking an attribute yields NA in that column
# rather than shifting the remaining values.
attrs_to_matrix <- function(attr_list, cols) {
  vals <- vapply(
    attr_list,
    function(a) {
      if (is.null(a)) rep(NA_character_, length(cols)) else unname(a[cols])
    },
    character(length(cols))
  )
  matrix(
    as.vector(vals),
    ncol = length(cols), byrow = TRUE, dimnames = list(NULL, cols)
  )
}

n_act <- lengths(attribact)
n_leg <- lengths(attribleg)

# For every output row: which person it belongs to and its position within
# that person's acts.
person <- rep(seq_along(ids), times = n_act)
k <- sequence(n_act)

# Row of the flattened leg matrix that pairs with each act; the offset is the
# number of legs belonging to all preceding persons.
leg_idx <- (cumsum(n_leg) - n_leg)[person] + k
leg_idx[k > n_leg[person]] <- NA_integer_

act_mat <- attrs_to_matrix(unlist(attribact, recursive = FALSE), act_cols)
leg_mat <- attrs_to_matrix(unlist(attribleg, recursive = FALSE), leg_cols)

# Indexing with NA produces an all-NA row, which is the padding for acts
# that have no matching leg.
df <- as.data.table(cbind(act_mat, leg_mat[leg_idx, , drop = FALSE]))
df[, id := as.numeric(unlist(ids, use.names = FALSE))[person]]

# Keep the column types of the original preallocated table: id, link, x and y
# numeric, everything else character.
num_cols <- c("link", "x", "y")
df[, (num_cols) := lapply(.SD, as.numeric), .SDcols = num_cols]
setcolorder(df, c("id", act_cols, leg_cols))
答案 0 :(得分:1)
我会使用 `/*` 把 act 和 leg 标签一起取出,而不是使用三个单独的列表,并把各个 id 设为列表的名称。
# Per person: attribute vectors of every direct child (act and leg alike) of
# the selected plan, in the order xpathApply returns them.
a <- lapply(
  idNodes,
  function(person) xpathApply(person, "./plan[@selected='yes']/*", xmlAttrs)
)
# Label each person's entry with the person's id attribute.
names(a) <- sapply(idNodes, function(person) xmlGetAttr(person, "id"))

# Combine with plyr: first stack each person's attribute vectors into one
# data frame, then stack the per-person frames, keeping the list names in a
# column called "id".
library(plyr)
x1 <- lapply(a, function(attrs) ldply(attrs, "rbind"))
x <- ldply(x1, "rbind", .id = "id")
现在你只需要格式化data.frame并将leg
属性向上移动1行(如果leg始终是act的下一个兄弟?)。
# Rows without a `type` value are leg rows. Copy each leg's attributes onto
# the act row directly above it, then drop the leg rows so that one row per
# act remains.
#
# Leg columns are addressed by name rather than by position (8:11), so the
# step still works when the combined frame has extra or reordered columns.
leg_cols <- intersect(c("mode", "dep_time", "trav_time", "arr_time"), names(x))

n <- which(is.na(x$type))
prev <- n - 1L

# A leg is only moved when the row above exists, is an act, and belongs to the
# same person; otherwise the assignment would be misaligned or would leak a
# leg into another person's last act.
movable <- prev >= 1L
movable[movable] <- !is.na(x$type[prev[movable]]) &
  as.character(x$id[prev[movable]]) == as.character(x$id[n[movable]])

x[prev[movable], leg_cols] <- x[n[movable], leg_cols]

x <- x[!is.na(x$type), , drop = FALSE]
rownames(x) <- NULL
x
id type link x y start_time end_time mode dep_time trav_time arr_time
1 10000061 home 21258 334867.243653 3126570.70778 03:00:00 15:07:00 ride 15:07:00 00:03:27 15:10:27
2 10000061 shop 13904 332634.86999 3127078.96383 15:12:00 16:21:00 car 16:21:00 00:09:44 16:30:44
3 10000061 shop 14129 331666.364904 3129306.48785 16:25:00 17:37:00 ride 17:37:00 00:09:46 17:46:46
4 10000061 home 21258 334867.243653 3126570.70778 17:45:00 26:59:00 <NA> <NA> <NA> <NA>
5 10000302 home 21256 334598.361546 3126269.05167 03:00:00 07:56:00 car 07:56:00 00:03:00 07:59:00
6 10000302 work 14057 335957.065395 3128105.16619 08:04:00 10:28:00 car 10:28:00 00:08:20 10:36:20
7 10000302 social 21191 333032.807855 3128759.66141 10:33:00 11:52:00 car 11:52:00 00:08:33 12:00:33
8 10000302 home 21256 334598.361546 3126269.05167 11:59:00 12:11:00 car 12:11:00 00:06:35 12:17:35
9 10000302 social 13906 332302.159169 3127536.46778 12:17:00 13:30:00 car 13:30:00 00:05:30 13:35:30
10 10000302 home 21256 334598.361546 3126269.05167 13:36:00 26:59:00 <NA> <NA> <NA> <NA>
另一种选择是跳过idNodes,可能只是格式化下面的xmlAttrsToDataFrame
输出。
# Select the person nodes that have an id together with every direct child of
# a selected plan, and turn the attributes of those nodes into a data frame.
# NOTE(review): xmlAttrsToDataFrame is reached through `:::`, i.e. as an
# internal of the XML package - confirm it is available in the installed
# version.
node_xpath <- "//person[@id]|//plan[@selected='yes']/*"
x <- XML:::xmlAttrsToDataFrame(plans[node_xpath])
答案 1 :(得分:1)
一种可行的做法是把这三个列表分别记为 'a'、'b' 和 'c'(即 a = ids、b = attribact、c = attribleg)。
首先把列表 'a' 中的 id 设为列表 'b' 和 'c' 的名称,然后对列表 'b' 和 'c' 中的每个元素执行 rbind,如下所示:
# Label the per-person entries of `b` and `c` with the ids held in `a`.
person_ids <- unlist(a)
names(b) <- person_ids
names(c) <- person_ids

# Stack the attribute vectors of one person into a matrix, one row per vector.
stack_rows <- function(rows) do.call(rbind, rows)

list1 <- lapply(b, stack_rows)
list2 <- lapply(c, stack_rows)
下一个cbind
list1和list2元素,考虑list1中列表元素的长度,最后使用rbind
将新列表元素放在一起
# Assemble `out`: for every person, the act matrix from `list1` with the leg
# matrix from `list2` bound alongside it, padded with NA rows up to the number
# of acts, and the person's id in a leading `id` column.
#
# Column names of the leg matrices, collected from the persons that have legs.
leg_cols <- unique(unlist(lapply(list2, colnames)))

# Zero-row stand-in for a person without any leg, so that indexing it below
# still yields NA rows with the same columns as everybody else. Without it
# such a person contributes a frame with fewer columns and rbind fails.
empty_legs <- matrix(
  NA_character_,
  nrow = 0, ncol = length(leg_cols), dimnames = list(NULL, leg_cols)
)

per_person <- lapply(names(list1), function(x) {
  acts <- list1[[x]]
  if (is.null(acts)) {
    # No acts for this person: nothing to output.
    return(NULL)
  }
  legs <- list2[[x]]
  if (is.null(legs)) {
    legs <- empty_legs
  }
  cbind(
    id = x,
    data.frame(acts),
    # Row indices beyond the available legs produce all-NA rows.
    data.frame(legs)[seq_len(nrow(acts)), , drop = FALSE]
  )
})

out <- do.call(rbind, Filter(Negate(is.null), per_person))
#> out
# id type link x y start_time end_time mode
#1 10000061 home 21258 334867.243653 3126570.70778 03:00:00 15:07:00 ride
#2 10000061 shop 13904 332634.86999 3127078.96383 15:12:00 16:21:00 car
#3 10000061 shop 14129 331666.364904 3129306.48785 16:25:00 17:37:00 ride
#NA 10000061 home 21258 334867.243653 3126570.70778 17:45:00 26:59:00 <NA>
#11 10000302 home 21256 334598.361546 3126269.05167 03:00:00 07:56:00 car
#21 10000302 work 14057 335957.065395 3128105.16619 08:04:00 10:28:00 car
#31 10000302 social 21191 333032.807855 3128759.66141 10:33:00 11:52:00 car
#4 10000302 home 21256 334598.361546 3126269.05167 11:59:00 12:11:00 car
#5 10000302 social 13906 332302.159169 3127536.46778 12:17:00 13:30:00 car
#NA1 10000302 home 21256 334598.361546 3126269.05167 13:36:00 26:59:00 <NA>
# dep_time trav_time arr_time
#1 15:07:00 00:03:28 15:10:28
#2 16:21:00 00:09:02 16:30:02
#3 17:37:00 00:10:33 17:47:33
#NA <NA> <NA> <NA>
#11 07:56:00 00:03:31 07:59:31
#21 10:28:00 00:06:47 10:34:47
#31 11:52:00 00:07:50 11:59:50
#4 12:11:00 00:04:49 12:15:49
#5 13:30:00 00:05:30 13:35:30
#NA1 <NA> <NA> <NA>