在下面的示例 hflights 数据中,我想找出每个 Origin 机场和每个 Month 的最大和最小 ArrDelay,以及与之对应的 UniqueCarrier 和 Dest。我的代码已经可以运行,但我觉得它还可以写得更简单。
对于每个 .SD,我可以找到 min(ArrDelay) 和 max(ArrDelay),但我还需要与最小和最大 ArrDelay 相对应的航空公司(UniqueCarrier)和 Dest。有没有办法进行这样的查找?
library(data.table)
library(hflights)
# Convert the hflights data frame to a data.table keyed by Origin and Month.
DT <- as.data.table(hflights)
setkey(DT, Origin, Month)

# For every Origin/Month group, report the largest and the smallest ArrDelay
# together with the UniqueCarrier and Dest of the flight that produced it.
#
# The previous version computed max()/min() per carrier while returning every
# Dest of that carrier, then kept the first row after sorting. The Dest it
# reported was therefore the carrier's first flight in the group, not the
# flight with the extreme delay, and the result carried duplicated
# UniqueCarrier/Dest column names. Indexing every column with the same row
# position keeps carrier, delay and destination consistent.
#
# which.max()/which.min() ignore NA values, so no na.rm handling is needed.
DT[
  ,
  {
    i_max <- which.max(ArrDelay)
    i_min <- which.min(ArrDelay)
    list(
      MaxUniqueCarrier = UniqueCarrier[i_max],
      MaxArrDelay = ArrDelay[i_max],
      MaxDest = Dest[i_max],
      MinUniqueCarrier = UniqueCarrier[i_min],
      MinArrDelay = ArrDelay[i_min],
      MinDest = Dest[i_min]
    )
  },
  by = .(Origin, Month)
]

# Test the output for a single record...
# Keyed lookup of one Origin/Month group; the values should match the
# MaxArrDelay/MinArrDelay reported for that group above.
DT[
  .("HOU", 1L),
  .(
    MaxArrDelay = max(ArrDelay, na.rm = TRUE),
    MinArrDelay = min(ArrDelay, na.rm = TRUE)
  )
]
答案 0 :(得分:2)
你可以尝试
library(data.table)#v1.9.5+
# For each Origin/Month group, locate the row positions of the smallest and
# the largest ArrDelay, then read Dest, ArrDelay and UniqueCarrier at those
# positions. which.min()/which.max() skip NA values.
res1 <- DT[
  ,
  {
    at_min <- which.min(ArrDelay)
    at_max <- which.max(ArrDelay)
    list(
      DestMin = Dest[at_min],
      ArrDelayMin = ArrDelay[at_min],
      MinUniqueCarrier = UniqueCarrier[at_min],
      DestMax = Dest[at_max],
      ArrDelayMax = ArrDelay[at_max],
      MaxUniqueCarrier = UniqueCarrier[at_max]
    )
  },
  by = .(Origin, Month)
]
或者,也可以通过以下方式实现:
nm1 <- c('Dest', 'ArrDelay', 'UniqueCarrier')
# Same lookup expressed through .SD: .SDcols limits .SD to the columns named
# in nm1, and for each Origin/Month group the row with the smallest ArrDelay
# is concatenated with the row with the largest ArrDelay.
res2 <- DT[
  ,
  c(.SD[which.min(ArrDelay)], .SD[which.max(ArrDelay)]),
  by = .(Origin, Month),
  .SDcols = nm1
]

# Columns 1-2 are the grouping columns; the remaining ones get the nm1 names
# suffixed with "Min" (first copy) and "Max" (second copy).
setnames(
  res2,
  3:ncol(res2),
  paste0(rep(nm1, times = 2L), rep(c("Min", "Max"), each = length(nm1)))
)

# Compare values only; the column names of res1 and res2 differ.
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
或使用dplyr
library(dplyr)
# Group the flights by origin airport and month.
grh <- hflights %>% group_by(Origin, Month)

# Row with the smallest ArrDelay per group. select() on a grouped table keeps
# the grouping columns in front, so positions 3:5 are Dest, ArrDelay and
# UniqueCarrier; those three get a "Min" suffix.
Min <- select(slice(grh, which.min(ArrDelay)), Dest, ArrDelay, UniqueCarrier)
names(Min)[3:5] <- paste0(names(Min)[3:5], "Min")

# Row with the largest ArrDelay per group, suffixed with "Max" the same way.
Max <- select(slice(grh, which.max(ArrDelay)), Dest, ArrDelay, UniqueCarrier)
names(Max)[3:5] <- paste0(names(Max)[3:5], "Max")

# Place both summaries side by side, dropping the grouping columns from Max
# so they are not repeated.
bind_cols(Min, Max[-(1:2)])