找出每行最接近特定值的时间

时间:2017-02-21 23:02:37

标签: r time data.table

说我有一个数据表:

dt <- data.table(
        datetime = seq(as.POSIXct("2016-01-01 00:00:00"),as.POSIXct("2016-01-01 10:00:00"), by = "1 hour"),
        ObType = c("A","A","B","B","B","B","A","A","B","A","A")
)

dt
                   datetime ObType
     1: 2016-01-01 00:00:00      A
     2: 2016-01-01 01:00:00      A
     3: 2016-01-01 02:00:00      B
     4: 2016-01-01 03:00:00      B
     5: 2016-01-01 04:00:00      B
     6: 2016-01-01 05:00:00      B
     7: 2016-01-01 06:00:00      A
     8: 2016-01-01 07:00:00      A
     9: 2016-01-01 08:00:00      B
    10: 2016-01-01 09:00:00      A
    11: 2016-01-01 10:00:00      A

我需要做的是ObType是&#34; B&#34;,我需要找到最近的ObType&#34; A&#34;在任何一方。所以结果应该是(几个小时):

               datetime ObType timeLag timeLead
 1: 2016-01-01 00:00:00      A      NA       NA
 2: 2016-01-01 01:00:00      A      NA       NA
 3: 2016-01-01 02:00:00      B       1        4
 4: 2016-01-01 03:00:00      B       2        3
 5: 2016-01-01 04:00:00      B       3        2
 6: 2016-01-01 05:00:00      B       4        1
 7: 2016-01-01 06:00:00      A      NA       NA
 8: 2016-01-01 07:00:00      A      NA       NA
 9: 2016-01-01 08:00:00      B       1        1
10: 2016-01-01 09:00:00      A      NA       NA
11: 2016-01-01 10:00:00      A      NA       NA

我通常使用data.table,但非data.table解决方案也没问题。

谢谢!

Lyss的

3 个答案:

答案 0 :(得分:6)

我使用roll=暗示的方法:

X = dt[ObType=="A"]
X
              datetime ObType
1: 2016-01-01 00:00:00      A
2: 2016-01-01 01:00:00      A
3: 2016-01-01 06:00:00      A
4: 2016-01-01 07:00:00      A
5: 2016-01-01 09:00:00      A
6: 2016-01-01 10:00:00      A

dt[ObType=="B", Lag:=X[.SD,on="datetime",roll=Inf,i.datetime-x.datetime]]
dt[ObType=="B", Lead:=X[.SD,on="datetime",roll=-Inf,x.datetime-i.datetime]]
dt[ObType=="B", Nearest:=X[.SD,on="datetime",roll="nearest",x.datetime-i.datetime]]
dt
               datetime ObType      Lag     Lead     Nearest
 1: 2016-01-01 00:00:00      A NA hours NA hours    NA hours
 2: 2016-01-01 01:00:00      A NA hours NA hours    NA hours
 3: 2016-01-01 02:00:00      B  1 hours  4 hours    -1 hours
 4: 2016-01-01 03:00:00      B  2 hours  3 hours    -2 hours
 5: 2016-01-01 04:00:00      B  3 hours  2 hours     2 hours
 6: 2016-01-01 05:00:00      B  4 hours  1 hours     1 hours
 7: 2016-01-01 06:00:00      A NA hours NA hours    NA hours
 8: 2016-01-01 07:00:00      A NA hours NA hours    NA hours
 9: 2016-01-01 08:00:00      B  1 hours  1 hours    -1 hours
10: 2016-01-01 09:00:00      A NA hours NA hours    NA hours
11: 2016-01-01 10:00:00      A NA hours NA hours    NA hours

roll=的一个优点是,您只需将Inf更改为您希望加入的时间限制即可应用过期限制。它是限制适用的时差,而不是行数。 Inf只是意味着不限制。 roll=符号表示是向前还是向后(超前或滞后)。

另一个优点是roll=很快。

答案 1 :(得分:3)

两种方法,一种使用连接,另一种使用重塑

加入

可能有更好的方法使用滚动连接/非等连接,但这是一种强力方法

dt2 <- dt[, key := 1][ 
    dt, 
    on = "key", 
    allow.cartesian = T
    ][
        ObType != i.ObType
        ][
            , `:=`(lag_min = datetime - i.datetime,
                         lag_max = i.datetime - datetime)
            ]


dt_min <- dt2[ObType == "B" & lag_min > 0, .(timeLag = min(lag_min)), by = .(datetime, ObType)]
dt_max <- dt2[ObType == "B" & lag_max > 0, .(timeLead = min(lag_max)), by = .(datetime, ObType)]


dt_max[ dt_min[ dt, on = c("datetime", "ObType"), nomatch = NA], on = c("datetime", "ObType"), nomatch = NA]

#                datetime ObType  lag_max  lag_min key
#  1: 2016-01-01 00:00:00      A NA hours NA hours   1
#  2: 2016-01-01 01:00:00      A NA hours NA hours   1
#  3: 2016-01-01 02:00:00      B  4 hours  1 hours   1
#  4: 2016-01-01 03:00:00      B  3 hours  2 hours   1
#  5: 2016-01-01 04:00:00      B  2 hours  3 hours   1
#  6: 2016-01-01 05:00:00      B  1 hours  4 hours   1
#  7: 2016-01-01 06:00:00      A NA hours NA hours   1
#  8: 2016-01-01 07:00:00      A NA hours NA hours   1
#  9: 2016-01-01 08:00:00      B  1 hours  1 hours   1
# 10: 2016-01-01 09:00:00      A NA hours NA hours   1
# 11: 2016-01-01 10:00:00      A NA hours NA hours   1

重塑

它非常复杂,有些步骤显然可以简化,但我还是把它全部放在这里所以你可以看到这个过程

dt[, group := rleid(ObType)]
dt_cast <- dcast(dt, formula = datetime + group ~ ObType, value.var = "ObType")

dt_cast[, `:=`(group_before = group - 1,
                             group_after = group + 1)]


dt_min <- dt_cast[ !is.na(B) ][dt_cast[!is.na(A), .(datetime, group)] , on = c(group_before = "group")  , allow.cartesian = T][, max(i.datetime), by = group]
dt_max <- dt_cast[ !is.na(B) ][dt_cast[!is.na(A), .(datetime, group)] , on = c(group_after = "group")  , allow.cartesian = T][, min(i.datetime), by = group]


dt_cast <- rbindlist(list(
    dt_cast[ dt_min, on = c("group"), nomatch = 0],
    dt_cast[ dt_max, on = c("group"), nomatch = 0]
))

dt <- dt_cast[ dt, on = c("datetime", "group"), nomatch = NA][, .(datetime, ObType, lag = V1)]

dt[ObType == "B" , lag_type := c("lag", "lead"), by = .(datetime, ObType)]
dt <- dcast(dt, formula = datetime + ObType ~ lag_type, value.var = "lag")

dt[, `:=`(timeLag = difftime(datetime, lag),
                    timeLead = difftime(lead, datetime),
                    `NA` = NULL)]

dt
#                datetime ObType                 lag                lead  timeLag timeLead
#  1: 2016-01-01 00:00:00      A                <NA>                <NA> NA hours NA hours
#  2: 2016-01-01 01:00:00      A                <NA>                <NA> NA hours NA hours
#  3: 2016-01-01 02:00:00      B 2016-01-01 01:00:00 2016-01-01 06:00:00  1 hours  4 hours
#  4: 2016-01-01 03:00:00      B 2016-01-01 01:00:00 2016-01-01 06:00:00  2 hours  3 hours
#  5: 2016-01-01 04:00:00      B 2016-01-01 01:00:00 2016-01-01 06:00:00  3 hours  2 hours
#  6: 2016-01-01 05:00:00      B 2016-01-01 01:00:00 2016-01-01 06:00:00  4 hours  1 hours
#  7: 2016-01-01 06:00:00      A                <NA>                <NA> NA hours NA hours
#  8: 2016-01-01 07:00:00      A                <NA>                <NA> NA hours NA hours
#  9: 2016-01-01 08:00:00      B 2016-01-01 07:00:00 2016-01-01 09:00:00  1 hours  1 hours
# 10: 2016-01-01 09:00:00      A                <NA>                <NA> NA hours NA hours
# 11: 2016-01-01 10:00:00      A                <NA>                <NA> NA hours NA hours

答案 2 :(得分:2)

dt$timelag = NA
dt$timelead = NA

A = split(dt, dt$ObType)$A
B = split(dt, dt$ObType)$B

A_time_up = sort(A$datetime)
A_time_dn = sort(A$datetime, decreasing = TRUE)

B$timelag = apply(B, 1, function(x) 
    A_time_up[which(x[1] < A_time_up)[1]]
)

B$timelead = apply(B, 1, function(x) 
    A_time_dn[which(x[1] > A_time_dn)[1]]
)

B$timelag = (B$timelag - as.numeric(B$datetime))/(3600)
B$timelead = (as.numeric(B$datetime) - B$timelead)/(3600)

rbind(A,B)