如何在.SD中为data.table执行进一步的分组和查找

时间:2015-07-08 17:50:14

标签: r data.table

在下面的 hflights 示例数据中,我想找出每个 Origin(出发机场)和 Month 组合下 ArrDelay 的最大值和最小值,以及与之对应的 UniqueCarrier 和 Dest。我的代码已经可以运行,但我觉得它还可以写得更简单。

对于每个 .SD,我可以求出 min(ArrDelay) 和 max(ArrDelay),但我还需要与最小和最大 ArrDelay 相对应的航空公司(UniqueCarrier)和 Dest。有没有办法进行这样的查找?

library(data.table)
library(hflights)

# Convert the hflights data frame to a data.table and key it on
# (Origin, Month) so rows can be looked up by those two columns.
DT <- as.data.table(hflights)
setkeyv(DT, c("Origin", "Month"))


# For every (Origin, Month) group, report the largest and smallest ArrDelay
# together with the carrier and destination of the very flight that produced
# each extreme.
#
# The row is located first (which.max / which.min skip NA values) and the
# sibling columns are then indexed with that position, so UniqueCarrier and
# Dest always belong to the same flight as the delay. Pairing a per-carrier
# max()/min() with the whole Dest column instead recycles the scalar over every
# flight of the carrier and returns an unrelated destination.
DT[,
  {
    # `[1L]` turns an empty result (a group whose delays are all NA) into NA,
    # so such a group still yields one row of NAs instead of being dropped.
    i_max <- which.max(ArrDelay)[1L]
    i_min <- which.min(ArrDelay)[1L]
    list(
      MaxUniqueCarrier = UniqueCarrier[i_max],
      MaxArrDelay      = ArrDelay[i_max],
      MaxDest          = Dest[i_max],
      MinUniqueCarrier = UniqueCarrier[i_min],
      MinArrDelay      = ArrDelay[i_min],
      MinDest          = Dest[i_min]
    )
  },
  by = .(Origin, Month)
]

# Sanity check for one group: keyed lookup of (Origin = "HOU", Month = 1),
# returning the largest and smallest non-missing ArrDelay.
DT[.("HOU", 1), .(max(ArrDelay, na.rm = TRUE), min(ArrDelay, na.rm = TRUE))]

1 个答案:

答案 0 :(得分:2)

你可以尝试

 library(data.table)#v1.9.5+
 # One row per (Origin, Month): destination, delay and carrier of the flight
 # with the smallest ArrDelay, followed by the same three fields for the flight
 # with the largest ArrDelay.
 res1 <- DT[,
   {
     # Row positions of the extreme arrival delays within the group
     # (which.min / which.max ignore NA values).
     lo <- which.min(ArrDelay)
     hi <- which.max(ArrDelay)
     list(
       DestMin = Dest[lo],
       ArrDelayMin = ArrDelay[lo],
       MinUniqueCarrier = UniqueCarrier[lo],
       DestMax = Dest[hi],
       ArrDelayMax = ArrDelay[hi],
       MaxUniqueCarrier = UniqueCarrier[hi]
     )
   },
   by = .(Origin, Month)
 ]

或者也可以通过以下方式实现:
 # Columns to carry along from the row holding each extreme ArrDelay.
 nm1 <- c("Dest", "ArrDelay", "UniqueCarrier")

 # Per (Origin, Month): the .SD row with the smallest ArrDelay, followed by the
 # .SD row with the largest one, concatenated into a single result row.
 res2 <- DT[,
   c(.SD[which.min(ArrDelay)], .SD[which.max(ArrDelay)]),
   by = .(Origin, Month),
   .SDcols = nm1
 ]

 # Columns 1-2 are the grouping columns; rename the rest by position so the
 # first copy of nm1 gets the suffix "Min" and the second copy gets "Max".
 setnames(res2, 3:ncol(res2), c(paste0(nm1, "Min"), paste0(nm1, "Max")))

 # check.attributes = FALSE: res1 and res2 name some columns differently
 # (e.g. MinUniqueCarrier vs UniqueCarrierMin), so only the values are meant
 # to be compared.
 all.equal(res1, res2, check.attributes = FALSE)
 # [1] TRUE

或使用dplyr

library(dplyr)
# Group the raw hflights data frame by departure airport and month.
grh <- group_by(hflights, Origin, Month)

# Row with the smallest ArrDelay in each group. The result is expected to have
# five columns (the two grouping columns first), so columns 3:5 are the
# selected fields and receive the "Min" suffix.
Min <- select(slice(grh, which.min(ArrDelay)), Dest, ArrDelay, UniqueCarrier)
names(Min)[3:5] <- paste0(names(Min)[3:5], "Min")

# Same for the row with the largest ArrDelay, suffixed with "Max".
Max <- select(slice(grh, which.max(ArrDelay)), Dest, ArrDelay, UniqueCarrier)
names(Max)[3:5] <- paste0(names(Max)[3:5], "Max")

# Put both results side by side, dropping the first two columns of Max so the
# grouping columns are not repeated.
bind_cols(Min, Max[-(1:2)])