我有一个开始和结束日期/时间的数据框,如下所示:
# Sample data: four start/end timestamps as "dd/mm/yyyy HH:MM" strings.
start_date <- c(
  "20/09/2016 01:20",
  "22/09/2016 01:20",
  "28/09/2016 22:16",
  "16/09/2016 21:01"
)
end_date <- c(
  "20/09/2016 06:20",
  "24/09/2016 22:40",
  "29/09/2016 03:20",
  "16/09/2016 23:01"
)
# One row per time range; the columns keep the names of the two vectors.
df <- data.frame(start_date = start_date, end_date = end_date)
还有一些时间间隔:
# Daily time window of interest, given as "HH:MM" strings.
# NOTE(review): the end is earlier than the start, so the window presumably
# wraps past midnight (21:00 until 02:00 on the following day).
interval_start <- "21:00"
interval_end <- "02:00"
我想在 df 中新增一列,计算每一行的时间段落在上述时间窗口内的总分钟数。例如,第 1 行有 40 分钟落在该窗口内。
有谁知道如何实现这一目标?感谢。
答案 0 :(得分:1)
lubridate 包可以帮助完成这项工作。需要解决的主要问题是跨越多天的长时间段:时间窗口会在其中出现多次(我用内层 for 循环处理了这一点)。关键函数是 intersect,它直接回答了“求两个区间的交集”这个问题。把所有交集的时长加起来,就得到每一行的结果。
library(lubridate)

# Sample data: start/end timestamps as "dd/mm/yyyy HH:MM" strings.
start_date <- c("20/09/2016 01:20", "22/09/2016 01:20",
                "28/09/2016 22:16", "16/09/2016 21:01")
end_date <- c("20/09/2016 06:20", "24/09/2016 22:40",
              "29/09/2016 03:20", "16/09/2016 23:01")

# Parse the strings into date-times.
start_date <- dmy_hm(start_date)
end_date <- dmy_hm(end_date)
df <- data.frame(start_date, end_date)

# The nightly window opens at 21:00 and closes at 02:00 on the following day.
# Both are expressed as offsets from midnight of the day the window opens.
window_open <- hours(21)
window_close <- days(1) + hours(2)

# One result per row, preallocated instead of grown inside the loop.
time_spent <- numeric(nrow(df))

# Loop through each row; seq_len() keeps the loop empty for a zero-row df.
for (i in seq_len(nrow(df))) {
  row <- df[i, ]
  period <- interval(row$start_date, row$end_date)

  # Candidate windows open on every calendar day from the day before the
  # start (that window may still be open after midnight) up to the end day.
  # The days are derived with date arithmetic rather than by pasting day
  # numbers together, so ranges that cross a month or year boundary work.
  first_day <- floor_date(row$start_date, unit = "day") - days(1)
  last_day <- floor_date(row$end_date, unit = "day")
  n_days <- as.integer(round(
    as.numeric(difftime(last_day, first_day, units = "days"))
  ))

  out <- 0
  for (offset in seq_len(max(n_days, 0L) + 1L) - 1L) {
    window_day <- first_day + days(offset)
    my_interval <- interval(window_day + window_open,
                            window_day + window_close)
    # Overlap between the row's range and this window (NA when disjoint).
    overlap <- intersect(period, my_interval)
    if (!is.na(overlap)) {
      # An interval's numeric value is its length in seconds.
      out <- out + as.numeric(overlap) / 60
    }
  }
  time_spent[i] <- out
}
df$time_spent <- time_spent
运行结果为:
> df$time_spent
[1] 40 740 224 120
答案 1 :(得分:1)
这是我的简短解决方案(与另一个答案相比;-))。我同样使用了 lubridate 包:
library(lubridate)

# Parse the "dd/mm/yyyy HH:MM" strings into date-times.
df$start_date <- dmy_hm(df$start_date)
df$end_date <- dmy_hm(df$end_date)

# For every row, walk the range minute by minute and count the minutes that
# fall inside the nightly window (21:00 until 02:00).
df$diff <- vapply(seq_len(nrow(df)), function(x) {
  minute_marks <- seq(df$start_date[x], df$end_date[x], by = "min")
  clock <- format(minute_marks, format = "%H:%M")
  # TRUE for every minute mark inside the window (both ends inclusive).
  in_window <- clock <= "02:00" | clock >= "21:00"

  # Label consecutive runs of equal values with cumsum(), then count the runs
  # that lie inside the window: each such run is one visit to the window.
  run_id <- cumsum(c(TRUE, diff(in_window) != 0))
  n_intervals <- length(unique(run_id[in_window]))

  # A run of k minute marks spans k - 1 minutes, so subtract one per run.
  # This holds whether the range starts or ends inside or outside the window,
  # and gives 0 when start and end coincide.
  sum(in_window) - n_intervals
}, numeric(1))
# start_date end_date diff
# 1 2016-09-20 01:20:00 2016-09-20 06:20:00 40
# 2 2016-09-22 01:20:00 2016-09-24 22:40:00 740
# 3 2016-09-28 22:16:00 2016-09-29 03:20:00 224
# 4 2016-09-16 21:01:00 2016-09-16 23:01:00 120
思路如下(即 lapply 中的代码):
- 从开始时间到结束时间,按分钟生成一个时间序列。
- 用 sum 和 cumsum 找出该序列中所有落在 “21:00” 到 “02:00” 之间的时刻。
- 统计 cumsum 中的区间个数,以确定该序列中包含多少个不同的区间。
- 当 sum 的计数为 2 时,实际相差的分钟数只有 1,所以总要减去 1;对找到的每个区间都必须这样做。
- 如果 cum_sum 的最后一个值与倒数第二个值不同,说明还有一个额外的区间,需要再减去 1。

看起来相当复杂,但背后的思路应该是清楚的(我希望如此)。
答案 2 :(得分:0)
请参阅代码中的注释。我使用了 lubridate 包。
# Sample data: start/end timestamps as "dd/mm/yyyy HH:MM" strings.
start_date <- c("20/09/2016 01:20", "22/09/2016 01:20",
                "28/09/2016 22:16", "16/09/2016 21:01")
end_date <- c("20/09/2016 06:20", "24/09/2016 22:40",
              "29/09/2016 03:20", "16/09/2016 23:01")
df <- data.frame(start_date, end_date)

# Daily window as "HH:MM" strings; it may wrap past midnight.
interval_start <- "21:00"
interval_end <- "02:00"

# Convert strings to dates
library(lubridate)
df$start_date <- dmy_hm(df$start_date)
df$end_date <- dmy_hm(df$end_date)

# Number of seconds in one day.
seconds_per_day <- 86400

# Window boundaries as seconds after midnight, and the window length in
# seconds. The modulo makes a window that wraps past midnight (end earlier
# than start) come out with a positive length.
window_open <- period_to_seconds(hm(interval_start))
window_close <- period_to_seconds(hm(interval_end))
window_length <- (window_close - window_open) %% seconds_per_day

# Cumulative number of seconds spent inside the daily window up to each
# instant in `x`, measured from a fixed origin. Shifting the clock back by
# `window_open` makes every window start exactly at a day boundary, so each
# complete day contributes `window_length` and the partial day contributes at
# most `window_length`.
# NOTE(review): day boundaries are taken on the numeric (UTC-based) value of
# the date-times, so this assumes they were parsed in UTC - confirm that
# matches how dmy_hm() is being called.
seconds_in_window <- function(x) {
  shifted <- as.numeric(x) - window_open
  (shifted %/% seconds_per_day) * window_length +
    pmin(shifted %% seconds_per_day, window_length)
}

# Time in the window is the difference of the cumulative totals at the two
# ends of each range. This is computed for all rows at once and does not
# depend on calendar day numbers, so month and year boundaries need no
# special handling. The result is kept as a lubridate Period of minutes.
df$time_in_interval <- minutes(
  (seconds_in_window(df$end_date) - seconds_in_window(df$start_date)) / 60
)
df$time_in_interval
请注意,86,400 是一天的秒数,这就是代码中该数字的由来。