Question

我在网上找到了几个可以完成此任务的解决方案。然而，在 R 中它们似乎都无法很好地扩展到大数据量。最好的方法是什么？

问题描述：
数据：DT 或 DF 是一个包含 id 列和大量其他随机数据的表。该表已按 id 排序（由构造方式保证）。

任务：为表格中的每个ID提取最后n行

library(data.table)
library(dplyr)
library(magrittr)

# config
set.seed(20160313)  # fixed seed so every run benchmarks the same random data
num.rows <- 10^5    # total number of rows in the test table
# Number of trailing rows to extract per id. Every solution below reads `n`;
# without this definition the bare symbol would resolve to dplyr::n (a
# function) once dplyr is attached, and the calls to tail() would fail.
n <- 2L

# build data set: columns c1..c9 of uniform random numbers plus an id column
DT <- data.table(c1 = runif(num.rows))
for (i in 2:9) {
  # set() adds the column by reference; `DT[[name]] <- value` goes through a
  # replacement method that copies the table (at least shallowly) each pass.
  # The runif() calls happen in the same order as before, so the generated
  # data is unchanged.
  set(DT, j = paste0("c", i), value = runif(num.rows))
}

# Integer division of the row number by 4 gives non-decreasing ids, so the
# table is ordered by id by construction. seq_len() is the safe spelling of
# 1:num.rows.
set(DT, j = "id", value = seq_len(num.rows) %/% 4)

# solution with data.table: key the table on id, then take the last n rows of
# each group's .SD
setkeyv(DT, "id")
cat("solution: data.table\n")
print(
  system.time({
    t1 <- DT[, tail(.SD, n = n), by = id]
  })
)

# solution with dplyr: convert to a plain data.frame, group by id and run
# tail() on every group through do()
DF <- as.data.frame(DT)
cat("solution: dply\n")
print(
  system.time({
    t2 <- DF %>%
      group_by(id) %>%
      do(tail(., n = n))
  })
)

# second solution with dplyr: within each id, rank rows from the end and keep
# those whose rank is at most n
cat("solution: dplyr 2\n")
print(
  system.time({
    t3 <- DF %>%
      group_by(id) %>%
      filter(rank(-row_number()) <= n)
  })
)


# solution with by command: apply tail() to the rows of each id, then bind the
# per-id pieces back into one table
cat("solution: by\n")
print(
  system.time({
    temp <- by(DT, INDICES = DT[["id"]], FUN = tail, n = n)
    t4 <- do.call("rbind", as.list(temp))
  })
)

# solution using split and lapply: one table per id, tail() of each piece,
# then bind the pieces back together
cat("solution: split and lapply\n")
print(
  system.time({
    temp <- lapply(split(DT, DT[["id"]]), tail, n = n)
    t5 <- do.call("rbind", temp)
  })
)

# solution via data.table 3: collect the row numbers (.I) of the last n rows
# per id, then subset the table once with them
cat("solution: via data.table 3\n")
print(
  system.time({
    t6 <- DT[DT[, tail(.I, n), by = id][["V1"]], ]
  })
)


# failsafe checks: every solution must return the same c1 values, in the same
# order, as the first data.table solution (t1).
# - Lengths are compared first, because `==` silently recycles the shorter
#   vector and could hide a row-count mismatch.
# - isTRUE() keeps an NA comparison from aborting the script.
# - Exact `==` is intended here: the values are copied rows, not recomputed.
# - A failed check is reported instead of being silently skipped.
candidates <- list("2" = t2, "3" = t3, "4" = t4, "5" = t5, "6" = t6)
for (k in names(candidates)) {
  other <- candidates[[k]]$c1
  same <- length(other) == length(t1$c1) && isTRUE(all(t1$c1 == other))
  cat("1==", k, if (same) " OK\n" else " FAILED\n", sep = "")
}

修改
我用10 ^ 7行测试了下面的答案（注意：上面的一些解决方案对于这么多行不起作用）

最佳表现者：
对于n = 1 ，即提取每组的最后一行

# last row per id: unique() on id, scanning from the end of the table
# (TRUE rather than T, because T is an ordinary variable that can be reassigned)
system.time(unique(DT, by = "id", fromLast = TRUE))
#   user  system elapsed 
#  0.376   0.036   0.411

# last row per id: .SD[.N] evaluated group by group
system.time(DT[, .SD[.N], by = id])
#   user  system elapsed
# 10.636   0.020  10.652

对于其他 n 值：

# row numbers (.I) of the last n rows per id, then a single subset of DT
system.time(DT[DT[, tail(.I, n), by = id][["V1"]], ])
# for n=2
#   user  system elapsed 
# 33.740   0.112  33.872

# for n=3
#   user  system elapsed 
# 33.988   0.184  34.194

这个耗时似乎仍然有点长，但对我的情况来说已经够用了。

可扩展地提取大数据表中每组的最后n行

0 个答案: