用data.table在R中按时间段计数

时间:2019-05-07 05:57:48

标签: r data.table

mat-expansion-panel

并尝试像这样计算N:

library(data.table)

dt <- fread(" ID  DATE    
              A1 20170220
              A1 20170308
              A1 20170311
              A1 20170410
              A1 20170411
              A1 20170413
              A1 20170415
              A1 20170416
              A1 20170420
              A1 20170430
              A2 20170120
              A2 20170121
              A2 20170123
              A2 20170125
              A2 20170202 ")

我尝试过

 ID  DATE     count30day(count rows until after 30day)
 A1 20170220      3 (count row until 20170322)
 A1 20170308      2 (count row until 20170407)
 A1 20170311      2 (count row until 20170410)
 A1 20170410      7 (count row until 20170510)
 A1 20170411      6 (count row until 20170511)
 A1 20170413      5 (count row until 20170513)
 A1 20170415      4 (count row until 20170514)
 A1 20170416      3 (count row until 20170516)
 A1 20170420      2 (count row until 20170520)
 A1 20170430      1 (count row until 20170530)
 A2 20170120      5 (count row until 20170220)
 A2 20170121      4 (count row until 20170220)
 A2 20170123      3 (count row until 20170220) 
 A2 20170125      2 (count row until 20170220)
 A2 20170202      1 (count row until 20170220)      

这是工作,但最后5个值是错误的。 它必须是54321,但结果是55432。

我处理的实际数据大约为2500000行,因此需要很长时间

是否可以减少时间并解决最后一个值问题?

3 个答案:

答案 0 :(得分:2)

另一个data.table解决方案

#set strings to actual dates
dt[, DATE := lubridate::ymd( DATE ) ]
#set key for the join
setkey(dt, DATE)
#join, suspend output until we calculated the number of 'hits' per row (.EACHI)
dt[dt, N := {
  val = dt[ ID == i.ID & DATE %between% c( i.DATE, i.DATE + 30 ) ];
  list( nrow( val ) )
}, by = .EACHI]

输出

#     ID       DATE N
#  1: A2 2017-01-20 5
#  2: A2 2017-01-21 4
#  3: A2 2017-01-23 3
#  4: A2 2017-01-25 2
#  5: A2 2017-02-02 1
#  6: A1 2017-02-20 3
#  7: A1 2017-03-08 2
#  8: A1 2017-03-11 2
#  9: A1 2017-04-10 7
# 10: A1 2017-04-11 6
# 11: A1 2017-04-13 5
# 12: A1 2017-04-15 4
# 13: A1 2017-04-16 3
# 14: A1 2017-04-20 2
# 15: A1 2017-04-30 1

基准

# Unit: milliseconds
#              expr      min       lq     mean   median       uq      max neval
# data.table_wimpel 10.51381 10.73975 11.41636 11.32511 11.89540 13.31526    10
# data.table_ronak  25.42636 25.56223 27.39190 26.46919 29.55910 32.10598    10
# tidyverse_ronak   28.09526 28.73364 30.30307 28.98098 29.45968 38.50784    10

microbenchmark::microbenchmark(
  data.table_wimpel = {
    dt = copy(DT)
    dt[, DATE := lubridate::ymd( DATE ) ]
    setkey(dt, DATE)
    dt[dt, N := {
      val = dt[ ID == i.ID & DATE %between% c( i.DATE, i.DATE + 30 ) ];
      list( nrow( val ) )
    }, by = .EACHI ] },
  data.table_ronak = {
    dt = copy(DT)
    dt$DATE <- ymd(dt$DATE) #Convert to date
    dt$row <- 1:nrow(dt)    #Add row number
    dt[ , N:= mapply(function(x, y) 
      sum(x <= DATE & DATE < (x + months(1)) & y <= row), DATE, row)]    
  },
  tidyverse_ronak = {
    dt = copy(DT)
    dt %>%
      mutate(DATE = ymd(DATE),
             row = row_number(),
             N = map2_dbl(DATE, row, 
                          ~ sum(.x <= DATE & DATE < (.x + months(1)) & .y <= row))) %>%
      select(-row)
  },
  times = 10 )

答案 1 :(得分:2)

使用非装备自连接:

dt[, N := 
    dt[.(ID=ID, stt=DATE, end=DATE+30), on=.(ID, DATE>=stt, DATE<=end), .N, by=.EACHI]$N
]

输出:

    ID       DATE N
 1: A1 2017-02-20 3
 2: A1 2017-03-08 2
 3: A1 2017-03-11 2
 4: A1 2017-04-10 7
 5: A1 2017-04-11 6
 6: A1 2017-04-13 5
 7: A1 2017-04-15 4
 8: A1 2017-04-16 3
 9: A1 2017-04-20 2
10: A1 2017-04-30 1
11: A2 2017-01-20 5
12: A2 2017-01-21 4
13: A2 2017-01-23 3
14: A2 2017-01-25 2
15: A2 2017-02-02 1

数据:

library(data.table)    
dt <- fread(" ID  DATE    
              A1 20170220
              A1 20170308
              A1 20170311
              A1 20170410
              A1 20170411
              A1 20170413
              A1 20170415
              A1 20170416
              A1 20170420
              A1 20170430
              A2 20170120
              A2 20170121
              A2 20170123
              A2 20170125
              A2 20170202 ")
dt[, DATE := as.Date(as.character(DATE), "%Y%m%d")]

答案 2 :(得分:1)

我认为我们需要对当前行号进行额外检查。

使用data.table:

library(data.table)
library(lubridate)

dt[, DATE := ymd(DATE) # convert 'DATE' to Date format by reference
  ][, row := .I        # Add row number using inbuilt var '.I' by reference
   ][ , N := mapply(function(x, y) 
               sum(x <= DATE & DATE < (x + months(1)) & y <= row), DATE, row)]

或使用tidyverse

library(tidyverse)
library(lubridate)
dt %>%
  mutate(DATE = ymd(DATE),
         row = row_number(),
         N = map2_dbl(DATE, row, 
             ~ sum(.x <= DATE & DATE < (.x + months(1)) & .y <= row))) %>%
  select(-row)


#   ID       DATE N
#1  A1 2017-02-20 3
#2  A1 2017-03-08 2
#3  A1 2017-03-11 2
#4  A1 2017-04-10 7
#5  A1 2017-04-11 6
#6  A1 2017-04-13 5
#7  A1 2017-04-15 4
#8  A1 2017-04-16 3
#9  A1 2017-04-20 2
#10 A1 2017-04-30 1
#11 A2 2017-01-20 5
#12 A2 2017-01-21 4
#13 A2 2017-01-23 3
#14 A2 2017-01-25 2
#15 A2 2017-02-02 1