根据不同列中的NA汇总列

时间:2014-08-28 03:31:29

标签: r aggregate

我想根据group1中的NA聚合group2:

Datetime            group1  group2
2011-08-08 21:00:00   1       1
2011-08-08 21:10:00   NA      2
2011-08-08 21:20:00   NA      3
2011-08-08 21:30:00   2       4
2011-08-08 21:40:00   NA      5
2011-08-08 21:50:00   NA      6
2011-08-08 22:00:00   3       7

这是我想要的输出:

Datetime            group1  group2
2011-08-08 21:00:00   1       1
2011-08-08 21:30:00   2       9 
2011-08-08 22:00:00   3       18

编辑: 9 = 2 + 3 + 4和18 = 5 + 6 + 7。

# Attempted call (does not give the desired result): `group1 == NA` evaluates
# to NA for every row, never TRUE, so it cannot pick out the NA rows;
# is.na(group1) is the way to test for NA.
aggregate(group2~group1, data=Data, subset(Data,group1==NA),sum)

如有任何建议，不胜感激。我可以用aggregate来实现吗？还是应该使用其他的包？

3 个答案:

答案 0 :(得分:2)

看起来zoo包中的na.locf在这里非常有用。

假设dat是您的原始数据，我们可以取group1不为NA的那些行的Datetime，并使用cbind将这些日期时间与汇总后的group2数据合并在一起

library(zoo)

# Timestamps of the rows that carry a group label (non-NA group1).
Datetime <- dat$Datetime[!is.na(dat$group1)]

# Back-fill group1 only (next observation carried backward), so each run of
# NAs takes the label of the labelled row that follows it. Filling just this
# column, rather than the whole data frame, leaves any NA in the other columns
# untouched instead of imputing it before it is summed. na.rm = FALSE keeps
# the vector the same length as dat even if group1 ends with an NA.
filled <- transform(
  dat,
  group1 = na.locf(group1, fromLast = TRUE, na.rm = FALSE)
)

# Sum group2 within each filled group and attach the timestamps.
# NOTE(review): cbind() pairs rows by position, so this assumes the group1
# labels occur in increasing order in dat and are unique - confirm for real data.
cbind(Datetime, aggregate(group2 ~ group1, data = filled, FUN = sum))
#              Datetime group1 group2
# 1 2011-08-08 21:00:00      1      1
# 2 2011-08-08 21:30:00      2      9
# 3 2011-08-08 22:00:00      3     18

PS:感谢您更新/编辑您的问题(+1)。

答案 1 :(得分:1)

使用基础R的解决方案:

# Base R solution: back-fill the NA entries of group1 with the next non-NA
# value below them, then sum group2 within each filled group.
ddf <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "2011-08-08", class = "factor"),
    time = structure(1:7, .Label = c("21:00:00", "21:10:00",
    "21:20:00", "21:30:00", "21:40:00", "21:50:00", "22:00:00"
    ), class = "factor"), group1 = c(1L, NA, NA, 2L, NA, NA,
    3L), group2 = 1:7), .Names = c("Date", "time", "group1",
"group2"), class = "data.frame", row.names = c(NA, -7L))

# Work on a copy so the original group1 column is left as it was.
ddf$group1a <- ddf$group1

# Walk from the last row up to the first so that each NA takes the value of
# the row below it, which has already been filled by then.
# rev(seq_len(n)) is empty when n is 0, whereas n:1 would give c(0, 1).
for (i in rev(seq_len(nrow(ddf)))) {
  if (is.na(ddf$group1a[i])) {
    # On the last row, i + 1 is out of range and indexing yields NA, so a
    # trailing NA simply stays NA.
    ddf$group1a[i] <- ddf$group1a[i + 1]
  }
}

# tapply() gives the per-group sums named by group; stack() turns that into a
# two-column data frame (values first, group labels second).
outdf <- stack(with(ddf, tapply(group2, group1a, sum)))
names(outdf) <- c("group2", "group1")
outdf <- outdf[, c(2, 1)]  # reorder so group1 comes first
outdf

#  group1 group2
#1      1      1
#2      2      9
#3      3     18

答案 2 :(得分:0)

使用data.table

library(data.table)
# Step 1: replace group1 with a running group id. c(0, head(group1, -1L)) is
# group1 shifted down by one row, so the id increases on the row *after* each
# non-NA label and every run of NAs ends up grouped with the labelled row that
# follows it. head(group1, -1L) is used instead of group1[1:(.N-1)] because
# 1:0 is c(1, 0), which would make the shifted vector too long for a
# one-row table.
# NOTE: `:=` overwrites group1 in DT by reference; the group1 reported below
# is this running id, which coincides with the original labels here because
# they are 1, 2, 3.
# Step 2: per group, keep the last Datetime and the sum of group2.
# Step 3: reorder the columns to Datetime, group1, group2.
DT1 <- DT[, group1 := cumsum(!is.na(c(0, head(group1, -1L))))][,
  list(Datetime = Datetime[.N], group2 = sum(group2)), by = group1][,
  c(2, 1, 3), with = FALSE]
DT1
 #              Datetime group1 group2
 #1: 2011-08-08 21:00:00      1      1
 #2: 2011-08-08 21:30:00      2      9
 #3: 2011-08-08 22:00:00      3     18

数据

 # Example data from the question: seven timestamps stored as character
 # strings, group1 with NA gaps between its labels, and group2 running 1 to 7.
 dat <- data.frame(
   Datetime = c(
     "2011-08-08 21:00:00",
     "2011-08-08 21:10:00",
     "2011-08-08 21:20:00",
     "2011-08-08 21:30:00",
     "2011-08-08 21:40:00",
     "2011-08-08 21:50:00",
     "2011-08-08 22:00:00"
   ),
   group1 = c(1L, NA, NA, 2L, NA, NA, 3L),
   group2 = 1:7,
   # Keep Datetime as character regardless of the R version's default.
   stringsAsFactors = FALSE
 )
 # data.table built from dat; this is the DT object that the data.table
 # answer operates on.
 DT <- data.table(dat)