I have a large time-series data file that looks like the one below. The dataset spans years, in 15-minute increments. A small subset looks like:
uniqueid time
a 2014-04-30 23:30:00
a 2014-04-30 23:45:00
a 2014-05-01 00:00:00
a 2014-05-01 00:15:00
a 2014-05-12 13:45:00
a 2014-05-12 14:00:00
b 2014-05-12 13:45:00
b 2014-05-12 14:00:00
b 2014-05-12 14:30:00
To reproduce the above:
time<-c("2014-04-30 23:30:00","2014-04-30 23:45:00","2014-05-01 00:00:00","2014-05-01 00:15:00",
"2014-05-12 13:45:00","2014-05-12 14:00:00","2014-05-12 13:45:00","2014-05-12 14:00:00",
"2014-05-12 14:30:00")
uniqueid<-c("a","a","a","a","a","a","b","b","b")
mydf<-data.frame(uniqueid,time)
My goal is to count the number of rows for each unique id per consecutive time flow. A consecutive flow means a unique id is stamped at every consecutive 15-minute mark (e.g. id a is stamped from 30.04.14 23:30 hrs through 01.05.14 00:15 hrs, hence 4 rows). When this flow of 15-minute steps is interrupted (after 01.05.14 00:15 there is no stamp at 01.05.14 00:30, so the flow breaks), the next timestamp should count as the start of a new consecutive flow, and rows are counted again until that flow breaks in turn. The time column is POSIX.
As you can see in the example above, a consecutive flow can span different days, different months, or even different years. I have many unique ids (and, as mentioned, a very large file), so I'm looking for an approach my computer can handle (a loop probably won't work).
I'm looking for output like this:
uniqueid flow number_rows
a 1 4
a 2 2
b 3 2
b 4 1
I've looked at some time packages (e.g. lubridate), but given my limited knowledge of R I don't even know where to start.
I hope everything is clear; if not, I'm happy to clarify further. Many thanks in advance!
Answer 0 (score: 4)
Another way, using data.table along with the time differences, is to rely on data.table's internal values for the group number (.GRP) and the number of rows in each group (.N):
library(data.table)
res <- setDT(mydf)[, list(number_rows = .N, flow = .GRP),
  by = .(uniqueid, cumsum(as.numeric(difftime(time, shift(time, 1L, type = "lag", fill = 0))) - 15))
  ][, cumsum := NULL]
print(res)
uniqueid number_rows flow
1: a 4 1
2: a 2 2
3: b 2 3
4: b 1 4
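If it isn't obvious why the cumsum expression works as a grouping key, here is a small sketch (my own illustration, not part of the answer; it assumes the POSIXct mydf from the Data block below, and pins the difftime units to minutes for clarity) that materializes the intermediate columns:
library(data.table)
# the per-row gap minus 15 is 0 within a streak and non-zero at every break,
# so its running sum stays constant within a streak and jumps at each break
tmp <- setDT(copy(mydf))
tmp[, gap := as.numeric(difftime(time, shift(time, 1L, type = "lag", fill = 0),
                                 units = "mins")) - 15]
tmp[, key := cumsum(gap)]
print(tmp[, .(uniqueid, time, gap, key)])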
Also, since the sample data you posted isn't consistent with the subset you showed (the time column needs to be POSIXct), I've included the data I used below:
Data:
time<-as.POSIXct(c("2014-04-30 23:30:00","2014-04-30 23:45:00","2014-05-01 00:00:00","2014-05-01 00:15:00",
"2014-05-12 13:45:00","2014-05-12 14:00:00","2014-05-12 13:45:00","2014-05-12 14:00:00",
"2014-05-12 14:30:00"))
uniqueid<-c("a","a","a","a","a","a","b","b","b")
mydf<-data.frame(uniqueid,time)
Answer 1 (score: 2)
You can group by uniqueid together with the cumulative sum of the time differences between rows that are not equal to 15 min, which yields the flow id; counting the rows in each group then gives you what you need.
The rationale behind the logic: whenever the time difference within a uniqueid is not equal to 15, a new flow should be generated, so we mark those positions as TRUE and accumulate them with cumsum, which becomes the new flow id for the consecutive rows that follow:
library(dplyr)
# convert the time column to POSIXct so that diff() computes real time differences
mydf$time <- as.POSIXct(mydf$time, format = "%Y-%m-%d %H:%M:%S")
mydf %>%
  group_by(uniqueid, flow = 1 + cumsum(c(F, diff(time) != 15))) %>%
  summarize(num_rows = n())
# Source: local data frame [4 x 3]
# Groups: uniqueid [?]
#
# uniqueid flow num_rows
# <fctr> <dbl> <int>
# 1 a 1 4
# 2 a 2 2
# 3 b 3 2
# 4 b 4 1
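If you want to inspect the flow ids before collapsing the groups, the same logic can be written with an explicit mutate() step first (my own sketch, equivalent to the pipeline above):
library(dplyr)
# build the flow id as a real column, then count rows per (uniqueid, flow) pair
mydf %>%
  mutate(flow = 1 + cumsum(c(FALSE, diff(time) != 15))) %>%
  group_by(uniqueid, flow) %>%
  summarize(num_rows = n())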
Answer 2 (score: 2)
Base R is pretty fast here. With a crude benchmark I found it finished in half the time of data.table, and I gave up waiting for dplyr.
# estimated size of data, years x days x hours x 15mins x uniqueids
5*365*24*4*1000 # = approx 180M
# make data with posixct and characters of 180M rows, mydf is approx 2.5GB in memory
time<-rep(as.POSIXct(c("2014-04-30 23:30:00","2014-04-30 23:45:00","2014-05-01 00:00:00","2014-05-01 00:15:00",
"2014-05-12 13:45:00","2014-05-12 14:00:00","2014-05-12 13:45:00","2014-05-12 14:00:00",
"2014-05-12 14:30:00")),times = 20000000)
uniqueid<-rep(as.character(c("a","a","a","a","a","a","b","b","b")),times = 20000000)
mydf<-data.frame(uniqueid,time = time)
rm(time,uniqueid);gc()
Base R:
# assumes that uniqueid's are in groups and in order, and there won't be a followed by b that have the 15 minute "flow"
starttime <- Sys.time()
# find failed flows
mydf$diff <- c(0,diff(mydf$time))
mydf$flowstop <- mydf$diff != 15
# give each flow an id
mydf$flowid <- cumsum(mydf$flowstop)
# clean up vars
mydf$time <- mydf$diff <- mydf$flowstop <- NULL
# find flow length
mydfrle <- rle(mydf$flowid)
# get uniqueid/flowid pairs (unique() is too slow)
mydf <- mydf[!duplicated(mydf$flowid), ]
# append rle and remove separate var
mydf$number_rows <- mydfrle$lengths
rm(mydfrle)
print(Sys.time()-starttime)
# Time difference of 30.39437 secs
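As a quick sanity check, the same steps reproduce the expected table on the original 9-row sample (my own addition; it rebuilds a small POSIXct copy first, since the benchmark data replaced mydf):
# rebuild the small sample
time <- as.POSIXct(c("2014-04-30 23:30:00","2014-04-30 23:45:00","2014-05-01 00:00:00",
                     "2014-05-01 00:15:00","2014-05-12 13:45:00","2014-05-12 14:00:00",
                     "2014-05-12 13:45:00","2014-05-12 14:00:00","2014-05-12 14:30:00"))
small <- data.frame(uniqueid = c("a","a","a","a","a","a","b","b","b"), time = time)
# break wherever the gap is not exactly 15 minutes
small$flowid <- cumsum(c(0, as.numeric(diff(small$time), units = "mins")) != 15)
data.frame(uniqueid = small$uniqueid[!duplicated(small$flowid)],
           flow = unique(small$flowid),
           number_rows = rle(small$flowid)$lengths)
#   uniqueid flow number_rows
# 1        a    1           4
# 2        a    2           2
# 3        b    3           2
# 4        b    4           1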
data.table:
library(data.table)
starttime <- Sys.time()
res <- setDT(mydf)[, list(number_rows = .N, flow = .GRP),
  by = .(uniqueid, cumsum(as.numeric(difftime(time, shift(time, 1L, type = "lag", fill = 0))) - 15))
  ][, cumsum := NULL]
print(Sys.time()-starttime)
# Time difference of 57.08156 secs
dplyr:
library(dplyr)
# time is already POSIXct here, so no conversion step is needed
starttime <- Sys.time()
mydf %>%
  group_by(uniqueid, flow = 1 + cumsum(c(F, diff(time) != 15))) %>%
  summarize(num_rows = n())
print(Sys.time()-starttime)
# too long, did not finish after a few minutes
My approach assumes the unique ids and times are already ordered, an assumption the other solutions may be able to take better advantage of. Sorting first is easy with order().
I'm not sure about the memory implications, or how it behaves on datasets that are less regular than this one. If memory is a problem, it should be easy to break the data into chunks and process each one, at the cost of a little more code in base R; a sketch follows.
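A minimal sketch of both points (my own addition, not benchmarked; it assumes a fresh mydf with uniqueid and time columns): sort once with order(), then process one batch of ids at a time so only part of the data sits in the working set. The count_flows helper and the batch size of 100 ids are illustrative choices, not from the answer above.
# ensure the ordering assumption holds
mydf <- mydf[order(mydf$uniqueid, mydf$time), ]

count_flows <- function(d) {
  # a new flow starts wherever the gap is not 15 minutes or the id changes
  brk <- c(TRUE, as.numeric(diff(d$time), units = "mins") != 15 |
                 d$uniqueid[-1] != d$uniqueid[-nrow(d)])
  data.frame(uniqueid = d$uniqueid[brk], number_rows = rle(cumsum(brk))$lengths)
}

# process ids in batches to bound peak memory use
ids <- unique(mydf$uniqueid)
batches <- split(ids, ceiling(seq_along(ids) / 100))
res <- do.call(rbind, lapply(batches, function(g) count_flows(mydf[mydf$uniqueid %in% g, ])))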
Answer 3 (score: 2)
With both the "id" and "time" columns ordered, we can build a single grouping by creating a logical vector that flags wherever either "id" changes or the "time" gap exceeds 15 minutes.
Using:
id = as.character(mydf$uniqueid)
tm = mydf$time
Flag where "id" changes:
id_gr = c(TRUE, id[-1] != id[-length(id)])
and where the "time" gap exceeds 15 minutes:
tm_gr = c(0, difftime(tm[-1], tm[-length(tm)], unit = "mins")) > 15
and combine the two:
gr = id_gr | tm_gr
which is TRUE wherever either "id" changed or the "time" gap is > 15 minutes. Then get the result:
tab = tabulate(cumsum(gr)) ## basically, the only operation per group -- 'n by group'
data.frame(id = id[gr], flow = seq_along(tab), n = tab)
# id flow n
#1 a 1 4
#2 a 2 2
#3 b 3 2
#4 b 4 1
Scaling up:
set.seed(1821); nid = 1e4
dat = replicate(nid, as.POSIXct("2016-07-07 12:00:00 EEST") +
cumsum(sample(c(1, 5, 10, 15, 20, 30, 45, 60, 90, 120, 150, 200, 250, 300), sample(5e2:1e3, 1), TRUE)*60),
simplify = FALSE)
names(dat) = make.unique(rep_len(letters, nid))
dat = data.frame(id = rep(names(dat), lengths(dat)), time = do.call(c, dat))
system.time({
id = as.character(dat$id); tm = dat$time
id_gr = c(TRUE, id[-1] != id[-length(id)])
tm_gr = c(0, difftime(tm[-1], tm[-length(tm)], unit = "mins")) > 15
gr = id_gr | tm_gr
tab = tabulate(cumsum(gr))
ans1 = data.frame(id = id[gr], flow = seq_along(tab), n = tab)
})
# user system elapsed
# 1.44 0.19 1.66
For comparison, here is MikeyMike's answer as well:
library(data.table)
dat2 = copy(dat)
system.time({
ans2 = setDT(dat2)[, list(flow = .GRP, n = .N),
by = .(id, cumsum(as.numeric(difftime(time,
shift(time, 1L, type = "lag", fill = 0),
unit = "mins")) > 15))][, cumsum := NULL]
})
# user system elapsed
# 3.95 0.22 4.26
identical(as.data.table(ans1), ans2)
#[1] TRUE