我的问题是:如何在 R 中计算自某个事件发生以来的天数。以下是数据的最小示例:
# Minimal example data: seven observation dates (given as day/month/year
# strings) and a 0/1 flag that is 1 when the event occurred on that date.
df <- data.frame(
  date = as.Date(
    c(
      "06/07/2000", "15/09/2000", "15/10/2000", "03/01/2001",
      "17/03/2001", "23/05/2001", "26/08/2001"
    ),
    format = "%d/%m/%Y"
  ),
  event = c(0, 0, 1, 0, 1, 1, 0)
)
date event
1 2000-07-06 0
2 2000-09-15 0
3 2000-10-15 1
4 2001-01-03 0
5 2001-03-17 1
6 2001-05-23 1
7 2001-08-26 0
二元变量 `event` 取值为 1 表示事件发生,否则为 0。重复观测在不同的时间(`date`)进行。
预期输出如下,其中 `tae` 表示自上次事件以来经过的天数:
date event tae
1 2000-07-06 0 NA
2 2000-09-15 0 NA
3 2000-10-15 1 0
4 2001-01-03 0 80
5 2001-03-17 1 153
6 2001-05-23 1 67
7 2001-08-26 0 95
我已经四处寻找过类似问题的答案,但它们都没有解决我的具体问题。我尝试实现的思路来自一个类似的帖子(Calculate elapsed time since last event),下面是我目前得到的最接近的解决方案:
# Asker's attempt, kept exactly as posted. It does not give the desired
# result: the output printed below shows a total that never restarts.
library(dplyr)
df %>%
# tmp_a: days since the previous row (0 for the first row), multiplied by
# !event so that it becomes 0 on rows where the event occurred.
mutate(tmp_a = c(0, diff(date)) * !event,
# tae: running total of tmp_a over all rows. Nothing resets the total when an
# event occurs, so it keeps accumulating instead of counting from the last
# event.
tae = cumsum(tmp_a))
这会得到如下输出,与预期结果不符:
date event tmp_a tae
1 2000-07-06 0 0 0
2 2000-09-15 0 71 71
3 2000-10-15 1 0 71
4 2001-01-03 0 80 151
5 2001-03-17 1 0 151
6 2001-05-23 1 0 151
7 2001-08-26 0 95 246
非常感谢任何有关如何微调这种或不同方法的帮助。
答案 0 :(得分:9)
您可以尝试这样的事情:
# For every row, count the events that happened strictly before it, plus 1.
# cumsum(event) - event is the running count excluding the current row (for a
# 0/1 flag); the "+ 1" leaves slot 1 free for the NA placeholder used below
# for rows that have no earlier event.
last_event_index <- cumsum(df$event) - df$event + 1
# Dates of the events, preceded by an NA standing for "no previous event".
# Looking them up with the index gives the previous event's date for each row.
event_dates <- c(as.Date(NA), df$date[df$event == 1])
last_event_date <- event_dates[last_event_index]
# Days elapsed between each observation and the previous event.
df$tae <- df$date - last_event_date
# The first event has no predecessor, which would leave NA; the expected
# output asks for 0 on that row instead.
first_event <- df$event == 1 & is.na(last_event_date)
df$tae[first_event] <- as.difftime(0, units = "days")
df
#         date event      tae
# 1 2000-07-06     0  NA days
# 2 2000-09-15     0  NA days
# 3 2000-10-15     1   0 days
# 4 2001-01-03     0  80 days
# 5 2001-03-17     1 153 days
# 6 2001-05-23     1  67 days
# 7 2001-08-26     0  95 days
答案 1 :(得分:3)
这种写法比较繁琐,性能也会有所损失,但你可以使用 `for` 循环来实现:
# Example data: one observation per row, `event` is 1 when the event occurred.
datas <- read.table(text = "date event
2000-07-06 0
2000-09-15 0
2000-10-15 1
2001-01-03 0
2001-03-17 1
2001-05-23 1
2001-08-26 0", header = TRUE, stringsAsFactors = FALSE)
datas <- transform(datas, date = as.Date(date))

# tae[i] = days between observation i and the most recent earlier event.
# It stays NA until an event has been seen, and is 0 on the very first event.
n_obs <- nrow(datas)
tae <- rep(NA_real_, n_obs)
lastEvent <- as.Date(NA)
# seq_len() keeps the loop correct for 0- and 1-row inputs (2:length(x) would
# count backwards there), and starting at row 1 covers an event on row 1.
for (i in seq_len(n_obs)) {
  if (!is.na(lastEvent)) {
    tae[i] <- as.numeric(datas$date[i] - lastEvent, units = "days")
  } else if (datas$event[i] == 1) {
    # First event ever observed: report 0 rather than NA.
    tae[i] <- 0
  }
  # Update only after computing tae[i], so that an event row is measured
  # against the previous event and not against itself.
  if (datas$event[i] == 1) {
    lastEvent <- datas$date[i]
  }
}
cbind(datas, tae)
date event tae
1 2000-07-06 0 NA
2 2000-09-15 0 NA
3 2000-10-15 1 0
4 2001-01-03 0 80
5 2001-03-17 1 153
6 2001-05-23 1 67
7 2001-08-26 0 95
答案 2 :(得分:1)
老问题,但我正在尝试滚动连接并发现这很有趣。
library(data.table)
setDT(df)
setkey(df, date)
# Rolling self-join: build a table of event dates keyed by date, then join
# every observation to it with roll = TRUE so that each row picks up the most
# recent event date on or before it (`lastevent`).
df <- df[event == 1, .(lastevent = date), keyby = date][df, roll = TRUE]
# Event rows: `lastevent` is the row's own date, so compare it with the
# previous row's `lastevent` to get the gap since the preceding event.
# `type` is passed by name because the third positional argument of shift()
# is `fill`, not `type`.
df[, tae := difftime(
  lastevent,
  shift(lastevent, 1L, type = "lag"),
  units = "days"
)]
# Non-event rows: simply the gap between the observation and `lastevent`.
df[event == 0, tae := difftime(date, lastevent, units = "days")]
> df
date lastevent event tae
1: 2000-07-06 <NA> 0 NA days
2: 2000-09-15 <NA> 0 NA days
3: 2000-10-15 2000-10-15 1 NA days
4: 2001-01-03 2000-10-15 0 80 days
5: 2001-03-17 2001-03-17 1 153 days
6: 2001-05-23 2001-05-23 1 67 days
7: 2001-08-26 2001-05-23 0 95 days
答案 3 :(得分:1)
我来晚了,不过我用 `tidyr::fill` 让这件事变得更简单。本质上是先把非事件行转换为缺失值,再用 `fill` 以最近一次事件的值填充这些 `NA`,最后用当前日期减去上一次事件的日期。
我只用整数类型的日期列测试过,因此对于 `Date` 类型的日期列可能需要做一些调整(尤其是 `NA_integer_` 的用法。我不确定 `Date` 对象的底层类型是什么;我猜应该用 `NA_real_`)。
df %>%
  mutate(
    event = as.logical(event),
    # Keep the date on event rows and blank it out everywhere else.
    # replace() preserves the column's own type, so no typed NA constant
    # (NA_integer_, NA_real_, ...) has to be matched to the class of `date`;
    # if_else() refuses to combine a Date column with NA_integer_.
    last_event = replace(date, !(event %in% TRUE), NA)
  ) %>%
  # Carry the most recent event date down into the blanked rows.
  tidyr::fill(last_event) %>%
  # Time elapsed since the latest event (0 on the event rows themselves).
  mutate(event_age = date - last_event)
答案 4 :(得分:0)
我遇到了类似的问题,并结合上面的一些思路解决了它。我的情况的主要区别在于:有 n 位顾客,每位顾客都会各自发生事件(对我来说是购买)。我想知道这些购买的累计金额以及上一次事件的日期。我的主要做法是创建一个索引数据框,再与主数据框连接,这与上面得分最高的答案类似。请参见下面的可重复代码。
library(tidyverse)
# NOTE: the snippet as posted started with rm(list = ls()). It is left out
# because clearing the reader's whole workspace is not needed for the example.
# Fix the RNG seed so the sampled example data is identical on every run.
set.seed(42)

# Generate a reproducible sample data frame: 36 purchases spread over 12
# random dates, each with a subtotal and one of six customers.
df <- as.data.frame(
  sample(
    rep(
      sample(seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"), 12),
      each = 4
    ),
    36
  )
)
df$subtotal <- sample(1:100, 36)
df$cust <- sample(rep(c("a", "b", "c", "d", "e", "f"), each = 12), 36)
colnames(df) <- c("dates", "subtotal", "cust")

# Key identifying one visit: a given customer on a given date.
df$datekey <- paste0(df$dates, df$cust)

# The next two tables are specific to the author's own analysis but are kept
# to show depth.
# Number of distinct visit dates per customer, plus a constant `variable`
# column of 1s that is cumulated into a visit counter further down.
df_total_visits <- df %>%
  select(dates, cust) %>%
  distinct() %>%
  group_by(cust) %>%
  tally(name = "total_visits") %>%
  mutate(variable = 1)
# Number of orders per customer per date.
df_order_bydate <- df %>%
  select(dates, cust) %>%
  group_by(dates, cust) %>%
  tally(name = "day_orders")
df <- left_join(df, df_total_visits, by = "cust")
# arrange() is essential when the data is not already ordered by date: the
# cumulative sums below assume chronological order.
df <- left_join(df, df_order_bydate, by = c("dates", "cust")) %>%
  arrange(dates)

# Build the index: one row per visit (datekey) with that visit's total
# spending ...
cummulative_groupping <- df %>%
  select(datekey, cust, variable, subtotal) %>%
  group_by(datekey) %>%
  mutate(spending = sum(subtotal)) %>%
  distinct(datekey, .keep_all = TRUE) %>%
  select(-subtotal)
# ... then number each customer's visits and accumulate their spending.
cummulative_groupping <- cummulative_groupping %>%
  group_by(cust) %>%
  mutate(
    cumulative_visits = cumsum(variable),
    cumulative_spend = cumsum(spending)
  ) %>%
  ungroup()
df <- left_join(
  df,
  cummulative_groupping,
  by = c("datekey", "cust", "variable")
) %>%
  select(-variable)

# Use the cumulative visit number as the index: adding one to it makes visit
# k's date line up with visit k + 1 when joined back onto the data frame.
# distinct() keeps one row per visit; without it a visit with several
# purchases would duplicate the rows of the following visit in the join.
last_date_index <- df %>%
  select(last_visit_date = dates, cust, cumulative_visits) %>%
  distinct() %>%
  mutate(cumulative_visits = cumulative_visits + 1)
df <- left_join(df, last_date_index, by = c("cust", "cumulative_visits"))

# The difference between the date and the last visit answers the original
# poster's question. A customer's first visit has no previous one and gets NA.
df$toa <- df$dates - df$last_visit_date
此答案也适用于同一事件在同一天多次出现的情况(例如数据质量较差,或有多个供应商/客户参与了该事件)。感谢您查看我的答案,这其实是我在 Stack 上的第一个回答。