Question

我在按条件计算事件之间的时间间隔时遇到了一些问题。我想确定给客户退款的时间与其此前最近一次购买之间的间隔，也就是说，用退款时间减去同一 ID 下上一次购买的时间。数据中有多个用户，按 ID 分组，每个用户都有若干个带 TIMESTAMP 的事件（购买或退款）。表格的相关行如下所示：

View(df1)
TIMESTAMP   ID  Order_Type
2017-05-04  55  Purchase
2017-05-12  55  Purchase
2017-05-18  55  Purchase
2017-06-16  55  Refund 
2017-05-06  36  Purchase
2017-05-14  36  Purchase
2017-05-22  36  Purchase
2017-06-14  36  Purchase
2017-06-28  36  Refund 
2017-07-10  36  Purchase

如表中所示，有些情况下客户已获得退款，但随后又进行了购买。我只希望计算从退款前最近一次购买到该次退款之间的天数。我认为可以用聚合函数的方法来实现。

输出为：

View(df2)
TIMESTAMP   ID  Days_Since_Last_Purchase
2017-06-16  55     29
2017-06-28  36     14

感谢您的任何意见。

Answer 1

这是基础R中的解决方案：

# Parse the timestamps as Date via the as.Date() generic (not the
# as.Date.character method) so that they can be subtracted.
df$TIMESTAMP <- as.Date(df$TIMESTAMP, format = "%Y-%m-%d")

# Row positions of the refunds. Each refund is assumed to sit directly below
# the purchase it refers to (same ID, rows in chronological order).
inds <- which(df$Order_Type == "Refund")
df2  <- df[inds, ]

# Pairwise difference between each refund and the row just above it; the
# column name matches the expected output shown below.
df2$Days_Since_Last_Purchase <- unlist(
  Map(`-`, df$TIMESTAMP[inds], df$TIMESTAMP[inds - 1])
)
#    TIMESTAMP  ID Order_Type Days_Since_Last_Purchase
#    2017-06-16 55     Refund                       29
#    2017-06-28 36     Refund                       14

您还可以在大多数（所有？）情况下选择mapply而不是Map：

# Same calculation with mapply(); units are pinned so that every element is
# expressed in days instead of whatever unit difftime() picks automatically.
df2$Days_Since_Last_Purchase <- mapply(
  difftime,
  df$TIMESTAMP[inds],
  df$TIMESTAMP[inds - 1],
  MoreArgs = list(units = "days")
)

注意：这种方法的一个好处是只使用了基础 R。但是，正如 Moody_Mudskipper 在评论中指出的那样，这个解决方案只适用于数据已按时间顺序排列、并且每条退款记录的上一行正好是对应购买记录的情况。在大多数实际场景中，这是一个很大的限制！

Answer 2

解决方案使用dplyr和tidyr。

library(dplyr)
library(tidyr)

# For every refund, find the closest earlier purchase of the same ID and
# report the number of days between the two.
dt2 <- dt %>%
  # RowID keeps each event on its own row once Order_Type is spread out
  mutate(RowID = row_number(), TIMESTAMP = as.Date(TIMESTAMP)) %>%
  spread(Order_Type, TIMESTAMP) %>%
  # Fill refund dates upwards *within each ID* so that a purchase can never
  # pick up the refund date of a different customer
  group_by(ID) %>%
  fill(Refund, .direction = "up") %>%
  mutate(Days_Since_Last_Purchase = Refund - Purchase) %>%
  # Drops refund rows (Purchase is NA) and purchases with no later refund
  filter(Days_Since_Last_Purchase > 0) %>%
  arrange(ID, Refund, Days_Since_Last_Purchase) %>%
  # The first row per (ID, Refund) is the purchase closest to that refund
  group_by(ID, Refund) %>%
  slice(1) %>%
  select(TIMESTAMP = Refund, ID, Days_Since_Last_Purchase)
dt2
# A tibble: 2 x 3
# Groups:   ID, TIMESTAMP [2]
   TIMESTAMP    ID Days_Since_Last_Purchase
      <date> <int>                   <time>
1 2017-06-28    36                  14 days
2 2017-06-16    55                  29 days

数据

# Example data: one event per line, so that header = TRUE sees three columns
# (TIMESTAMP, ID, Order_Type) followed by ten data rows.
dt <- read.table(text = "TIMESTAMP ID Order_Type
2017-05-04 55 Purchase
2017-05-12 55 Purchase
2017-05-18 55 Purchase
2017-06-16 55 Refund
2017-05-06 36 Purchase
2017-05-14 36 Purchase
2017-05-22 36 Purchase
2017-06-14 36 Purchase
2017-06-28 36 Refund
2017-07-10 36 Purchase",
                 header = TRUE, stringsAsFactors = FALSE)

Answer 3

另一个dplyr / tidyr解决方案

library(dplyr)
library(lubridate)
library(tidyr)

df %>%
  mutate(TIMESTAMP = as_date(TIMESTAMP)) %>%
  arrange(ID, TIMESTAMP) %>%
  group_by(ID) %>%
  # As the table is sorted and we are inside a given ID, every "Refund"
  # marks the end of a refund_group; lag() keeps the refund row itself in
  # the group that it closes.
  mutate(
    refund_group = lag(cumsum(Order_Type == "Refund"), n = 1, default = 0)
  ) %>%
  group_by(ID, refund_group, Order_Type) %>%
  # Keep the last Purchase and the last Refund of each refund_group
  slice(n()) %>%
  ungroup() %>%
  spread(Order_Type, TIMESTAMP) %>%
  mutate(Days_Since_Last_Purchase = Refund - Purchase) %>%
  # That is basically the final table; strip it down to the expected output
  select(TIMESTAMP = Refund, ID, Days_Since_Last_Purchase) %>%
  filter(!is.na(Days_Since_Last_Purchase))

结果

# A tibble: 2 x 3
   TIMESTAMP    ID Days_Since_Last_Purchase
      <date> <int>                   <time>
1 2017-06-28    36                  14 days
2 2017-06-16    55                  29 days

Answer 4

这是一个 for 循环，它在同一张表中新增一列来显示日期差：

# Start with an empty placeholder in every row; only refund rows that directly
# follow a purchase of the same ID are filled in below.
df1$Days_Since_Last_Purchase <- rep("", nrow(df1))

# seq_len(...)[-1] is 2, 3, ..., nrow(df1) and is empty for tables with fewer
# than two rows (2:nrow(df1) would iterate over c(2, 1) for a one-row table).
for (i in seq_len(nrow(df1))[-1]) {
  is_refund_after_purchase <-
    df1$Order_Type[i] == "Refund" &&
    df1$Order_Type[i - 1] == "Purchase" &&
    df1$ID[i] == df1$ID[i - 1]

  if (is_refund_after_purchase) {
    df1$Days_Since_Last_Purchase[i] <- difftime(
      df1$Timestamp[i], df1$Timestamp[i - 1],
      units = "days"
    )
  }
}

> df1
    Timestamp ID Order_Type Days_Since_Last_Purchase
1  2017-05-04 55   Purchase                         
2  2017-05-12 55   Purchase                         
3  2017-05-18 55   Purchase                         
4  2017-06-16 55     Refund                       29
5  2017-05-06 36   Purchase                         
6  2017-05-14 36   Purchase                         
7  2017-05-22 36   Purchase                         
8  2017-06-14 36   Purchase                         
9  2017-06-28 36     Refund                       14
10 2017-07-10 36   Purchase

R中的条件计算

4 个答案: