我在网上找到了几个可用于完成此任务的解决方案。然而,在R中它们似乎都无法很好地扩展到大数据量(数据一多就非常慢)。最好的方法是什么?
问题描述:
数据: DT或DF是一个具有id列和其他大量随机数据的表
该表已按ID排序(由于构造)
任务:为表格中的每个ID提取最后n行
library(data.table)
library(dplyr)
library(magrittr)
# config
set.seed(20160313)
num.rows <- 10^5
# Number of trailing rows to extract per id. Every solution below reads `n`;
# without this definition the symbol resolves to the function dplyr::n
# (dplyr is attached above) and the script cannot run.
n <- 2L

# build data set: columns c1..c9 hold uniform random numbers
DT <- data.table(c1 = runif(num.rows))
for (i in 2:9) {
  # set() adds the column by reference instead of copying the whole table
  set(DT, j = paste0("c", i), value = runif(num.rows))
}
# Integer division of the row index by 4 gives non-decreasing ids, so the
# table is ordered by id by construction.
DT[, id := seq_len(num.rows) %/% 4]
# Approach 1 -- data.table: key the table on id, then take tail() of each
# group's subset (.SD). The result t1 is the reference for the checks below.
setkeyv(DT, "id")
cat("solution: data.table\n")
print(system.time({
  t1 <- DT[, tail(.SD, n = n), by = "id"]
}))
# Approach 2 -- dplyr: convert to a plain data.frame, group by id and run
# tail() on every group through do().
DF <- as.data.frame(DT)
cat("solution: dply\n")
print(system.time({
  t2 <- DF %>%
    group_by(id) %>%
    do(tail(., n = n))
}))
# Approach 3 -- dplyr: within each id, rank rows by descending row number
# and keep those whose rank is at most n (i.e. the last n rows).
cat("solution: dplyr 2\n")
print(system.time({
  t3 <- DF %>%
    group_by(id) %>%
    filter(rank(-row_number()) <= n)
}))
# Approach 4 -- base by(): apply tail() to the rows of each id, then bind
# the per-id pieces back into one table.
cat("solution: by\n")
print(system.time({
  temp <- by(data = DT, INDICES = DT$id, FUN = tail, n = n)
  t4 <- do.call("rbind", as.list(temp))
}))
# Approach 5 -- split() the table by id, take tail() of each piece with
# lapply(), and rbind the pieces together.
cat("solution: split and lapply\n")
print(system.time({
  temp <- lapply(split(DT, DT$id), tail, n = n)
  t5 <- do.call("rbind", temp)
}))
# Approach 6 -- data.table row indices: the inner query collects the last n
# row numbers (.I) of every id in column V1; the outer subset pulls exactly
# those rows from DT.
cat("solution: via data.table 3\n")
print(system.time({
  t6 <- DT[DT[, tail(.I, n), by = id][["V1"]]]
}))
# failsafe checks: every solution must return the same c1 values, in the
# same order, as the data.table reference result t1.

# Compare one candidate result against t1 and report the outcome.
#   label     - text printed before the verdict, e.g. "1==2"
#   candidate - result table of another solution; must have a c1 column
# identical() is used instead of all(a == b) because `==` recycles the
# shorter vector when lengths differ, which can hide a wrong row count.
# The values are copied unchanged from DT, so exact comparison is intended.
# A mismatch is reported explicitly instead of printing nothing.
check_same_c1 <- function(label, candidate) {
  same <- identical(as.numeric(t1$c1), as.numeric(candidate$c1))
  cat(label, if (same) "OK\n" else "MISMATCH\n")
  invisible(same)
}

check_same_c1("1==2", t2)
check_same_c1("1==3", t3)
check_same_c1("1==4", t4)
check_same_c1("1==5", t5)
check_same_c1("1==6", t6)
修改
我用10 ^ 7行测试了下面的答案(注意:上面的一些解决方案对于这么多行不起作用)
表现最好的方案:
对于n = 1 ,即提取每组的最后一行
# n = 1, variant A: drop duplicated ids scanning from the end, which keeps
# the last row of every id. TRUE is spelled out because T can be reassigned.
system.time(unique(DT, by = "id", fromLast = TRUE))
# user system elapsed
# 0.376 0.036 0.411
# n = 1, variant B: index each group's subset (.SD) at its last row (.N).
system.time(DT[, .SD[.N], by = id])
# user system elapsed
# 10.636 0.020 10.652
对于其他的n值:
# General n: collect the last n row numbers (.I) per id, then subset DT by
# them. The timings below are the ones reported for n = 2 and n = 3.
system.time(DT[DT[, tail(.I, n), by = id][["V1"]]])
# for n=2
# user system elapsed
# 33.740 0.112 33.872
# for n=3
# user system elapsed
# 33.988 0.184 34.194
这个耗时似乎仍然有点长,但对我的情况来说已经够用了。