数据

Question

在R中，对于表“ spd”中的每一行，我都有两个变量（“ Account”和“ Asset”）以及“ DateTime”标记。

我想添加一个新列（在此示例中为“Return”），用于判断是否存在另一行与当前行具有完全相同的“Account”和“Asset”，并且其时间戳与当前行的时间戳相差不超过30秒。

例如，如果给定行的DateTime为 7/20/2018 8:05:00

“在30秒内”将介于 7/20/2018 8:04:30和7/20/2018 8:05:30

这是一个示例表，其中显示了根据上述计算得出的Account，Asset，DateTime以及新的Return Column。

Account  Asset  DateTime            Return
100      A100   7/20/2018 8:57:25   TRUE
100      A100   7/20/2018 8:57:55   TRUE
100      A100   7/20/2018 8:58:20   TRUE
100      B100   7/20/2018 8:56:31   FALSE
200      A200   7/21/2018 8:56:30   TRUE
200      A200   7/21/2018 8:57:00   TRUE
200      A200   7/21/2018 8:59:00   FALSE
200      A200   7/21/2018 8:59:45   FALSE

到目前为止，我只能对具有匹配的“帐户”和“资产”的行进行计数。例如，使用data.table包：

# Load data.table. R is case-sensitive: the function is `library`, and
# `Library(data.table)` fails with "could not find function".
library(data.table)
# Convert spd to a data.table in place.
setDT(spd)
# Add, to every row, the number of rows sharing its Account/Asset pair.
spd[, vehicle_count := .N, by = .(Account, Asset)]

我也尝试过dplyr的group_by以及其他几种方法，但始终找不到把“相等匹配”和“±30秒”这两个条件结合起来的办法。

任何帮助将不胜感激！

根据下面的AndersSwanson讨论进行编辑。

最终，我需要对数据取子集，只保留至少有一条匹配记录的行（匹配指：帐户相同、资产相同，且时间戳相差在30秒以内）。

如果一条记录不匹配，则将其删除：（帐户和资产不匹配另一行）或（帐户和资产确实匹配，但时间戳记在30秒内没有匹配项）

Answer 1

在这里，我将“时间戳记”理解为上一个时间戳记，如果您是指该组中的第一个时间戳记，则可以执行diff=DateTime-DateTime[1]

library(dplyr)
# Parse the month/day/year timestamps so gaps can be measured in seconds.
df$DateTime <- as.POSIXct(df$DateTime, format = "%m/%d/%Y %H:%M:%S")

# Flag each row by its gap to the previous row of the same Account/Asset.
# NOTE(review): only the previous row is inspected, so rows within a group
# are assumed to already be in chronological order.
df %>%
  group_by(Account, Asset) %>%
  mutate(
    # Pin the unit to seconds. Plain `DateTime - lag(DateTime)` lets difftime
    # choose the unit separately for every group (secs, mins, hours, ...), so
    # a group whose gaps are all >= 60 s would be measured in minutes and the
    # `<= 30` test below would compare minutes against 30.
    diff = difftime(DateTime, lag(DateTime), units = "secs"),
    Flag = case_when(
      n() == 1 ~ FALSE,       # a lone row has no other row to match
      is.na(diff) ~ TRUE,     # first row of a multi-row group (no previous row)
      abs(diff) <= 30 ~ TRUE, # previous row is at most 30 seconds away
      TRUE ~ FALSE
    )
  )

    # A tibble: 8 x 6
    # Groups:   Account, Asset [3]
  Account Asset DateTime            Return diff       Flag 
     <int> <chr> <dttm>               <int> <time>     <lgl>
1     100 A100  2015-09-20 08:57:25      3 " NA secs" TRUE 
2     100 A100  2015-09-20 08:57:55      3 " 30 secs" TRUE 
3     100 A100  2015-09-20 08:58:20      3 " 25 secs" TRUE 
4     100 B100  2015-09-20 08:56:31      1 " NA secs" FALSE
5     200 A200  2015-09-21 08:56:30      2 " NA secs" TRUE 
6     200 A200  2015-09-21 08:57:00      2 " 30 secs" TRUE 
7     200 A200  2015-09-21 08:59:00      2 120 secs   FALSE
8     200 A200  2015-09-21 08:59:45      2 " 45 secs" FALSE

数据

# Sample data for the answer above: one row per event, with the timestamp
# kept as a quoted character string (parsed to POSIXct later) and `Return`
# holding an integer per row.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables rather than reserved words.
df <- read.table(
  text = "
    Account  Asset  DateTime            Return
    100      A100   '9/20/2015 8:57:25'   3
    100      A100   '9/20/2015 8:57:55'   3
    100      A100   '9/20/2015 8:58:20'   3
    100      B100   '9/20/2015 8:56:31'   1
    200      A200   '9/21/2015 8:56:30'   2
    200      A200   '9/21/2015 8:57:00'   2
    200      A200   '9/21/2015 8:59:00'   2
    200      A200   '9/21/2015 8:59:45'   2
                             ",
  header = TRUE,
  stringsAsFactors = FALSE
)

Answer 2

我认为这就是您要得到的-

library(dplyr)
# Parse the month/day/year timestamps so they can be compared in seconds.
spd$DateTime <- as.POSIXct(strptime(spd$DateTime, format = "%m/%d/%Y %H:%M:%S"))

# Count, for every row, the OTHER rows of the same Account/Asset whose
# timestamp lies within 30 seconds (inclusive) of the row's own timestamp.
#
# The comparison is done inside each group: outer() builds the matrix of
# pairwise second differences for the group's timestamps, rowSums() counts
# the entries within the window, and 1 is subtracted because every row
# matches itself. Comparing against `spd$StartTime` / `spd$EndTime` instead
# would look at the whole table and count rows of other accounts or assets.
# The pairwise matrix is quadratic in the size of a group.
spd <- spd %>%
  group_by(Account, Asset) %>%
  mutate(
    Count = rowSums(
      abs(outer(as.numeric(DateTime), as.numeric(DateTime), "-")) <= 30
    ) - 1
  ) %>%
  ungroup()

# A row is kept-worthy when at least one other matching row exists.
spd$Return <- spd$Count > 0


> spd

  Account Asset DateTime            Count Return
    <int> <chr> <dttm>              <dbl> <lgl> 
1     100 A100  2018-07-20 08:57:25  1.00 T     
2     100 A100  2018-07-20 08:57:55  2.00 T     
3     100 A100  2018-07-20 08:58:20  1.00 T     
4     100 B100  2018-07-20 08:56:31  0    F     
5     200 A200  2018-07-21 08:56:30  1.00 T     
6     200 A200  2018-07-21 08:57:00  1.00 T     
7     200 A200  2018-07-21 08:59:00  0    F     
8     200 A200  2018-07-21 08:59:45  0    F

Answer 3

使用dplyr和lubridate可以很容易地做到这一点。

使用lead()和lag()函数可以很容易地做到这一点。如果我们按DateTime升序排列数据，则可以使用lead()查看下一个值是否在30秒内，使用lag()函数查看上一个值是否在30秒内。这也是矢量化的代码，应该运行得很快。

这种方法是有效的，唯一的例外是“帐户和资产”组中只有一个项目的情况：此时结果为NA。不过这种失败其实很容易处理，我们只需把所有NA替换为FALSE即可。真好！

由于我的方法需要按DateTime安排所有内容，因此我还必须按照开始时的方式重新安排它。

library(dplyr)
library(lubridate)

# Sample data: eight events over three Account/Asset combinations, with the
# month/day/year timestamp strings parsed into date-times while the table
# is being built.
data <- tibble(
  Account = rep(c("100", "200"), each = 4),
  Asset = rep(c("A100", "B100", "A200"), times = c(3, 1, 4)),
  DateTime = mdy_hms(c(
    "7/20/2018 8:57:25",
    "7/20/2018 8:57:55",
    "7/20/2018 8:58:20",
    "7/20/2018 8:56:31",
    "7/21/2018 8:56:30",
    "7/21/2018 8:57:00",
    "7/21/2018 8:59:00",
    "7/21/2018 8:59:45"
  ))
)

# Mark every row that has another row of the same Account/Asset within 30
# seconds. Once the rows are in time order, only the immediate neighbours in
# the group need checking: the row before (lag) and the row after (lead).
output <- data %>%
  group_by(Account, Asset) %>%
  arrange(DateTime) %>%
  mutate(
    # A missing neighbour gives NA for that side; if neither side yields
    # TRUE the result is NA (e.g. a group with a single row), and coalesce()
    # turns that into FALSE.
    Return = coalesce(
      abs(difftime(DateTime, lag(DateTime), units = "secs")) <= 30 |
        abs(difftime(DateTime, lead(DateTime), units = "secs")) <= 30,
      FALSE
    )
  ) %>%
  # Put the rows back in Account / Asset / DateTime order for display.
  arrange(Account, Asset, DateTime) %>%
  ungroup()

output  
# # A tibble: 8 x 4
# # Groups:   Account, Asset [3]
# Account Asset DateTime            Return
#  <chr>   <chr> <dttm>              <lgl> 
# 1 100     A100  2018-07-20 08:57:25 TRUE  
# 2 100     A100  2018-07-20 08:57:55 TRUE  
# 3 100     A100  2018-07-20 08:58:20 TRUE  
# 4 100     B100  2018-07-20 08:56:31 FALSE 
# 5 200     A200  2018-07-21 08:56:30 TRUE  
# 6 200     A200  2018-07-21 08:57:00 TRUE  
# 7 200     A200  2018-07-21 08:59:00 FALSE 
# 8 200     A200  2018-07-21 08:59:45 FALSE

R-给定一行，计算有多少其他行具有相同的两个列值以及在其自己的时间戳记30秒内的时间戳记

3 个答案:

数据