R - 计算在某个时间间隔内花费的时间

时间:2016-10-07 11:11:43

标签: r datetime time dataframe

我有一个开始和结束日期/时间的数据框,如下所示:

# Example periods, one per row, written as "dd/mm/yyyy HH:MM" strings.
start_date <- c(
  "20/09/2016 01:20",
  "22/09/2016 01:20",
  "28/09/2016 22:16",
  "16/09/2016 21:01"
)
end_date <- c(
  "20/09/2016 06:20",
  "24/09/2016 22:40",
  "29/09/2016 03:20",
  "16/09/2016 23:01"
)

# Two-column data frame holding the raw start/end strings.
df <- data.frame(start_date = start_date, end_date = end_date)

还有一些时间间隔:

# Daily window of interest as "HH:MM" clock times. The end is earlier in the
# day than the start, so the window runs across midnight.
interval_end   <- "02:00"
interval_start <- "21:00"

我想在df中创建一个新列,它计算每个实例在间隔期内花费的总分钟数。例如,第1行在间隔期内花费了40分钟。

有谁知道如何实现这一目标?感谢。

3 个答案:

答案 0 :(得分:1)

lubridate 可以帮助完成这项工作。需要解决的主要问题是跨越多天的长时间段:在这种情况下,间隔会出现多次(我用内层 for 循环处理)。关键函数是 intersect,它直接回答了“两个区间的交集是什么”这个问题。把所有交集的长度相加,就得到每一行的结果。

library(lubridate)

start_date <- c("20/09/2016 01:20", "22/09/2016 01:20", "28/09/2016 22:16", "16/09/2016 21:01")
end_date <- c("20/09/2016 06:20", "24/09/2016 22:40", "29/09/2016 03:20", "16/09/2016 23:01")

start_date <- dmy_hm(start_date)
end_date <- dmy_hm(end_date)

df <- data.frame(start_date, end_date)

# Minutes that the period [start, end] spends inside a daily time window.
#
# start, end      Single date-times (POSIXct) delimiting the period.
# interval_start  Clock time ("HH:MM") at which the daily window opens.
# interval_end    Clock time ("HH:MM") at which the daily window closes. When
#                 it is not later than interval_start, the window is taken to
#                 close on the following calendar day.
#
# Returns the total overlap in minutes as a single number.
minutes_in_interval <- function(start, end,
                                interval_start = "21:00",
                                interval_end = "02:00") {
  period <- interval(start, end)
  zone <- tz(start)

  # 1. One candidate window per calendar day. Start with the day before
  #    `start`, because a window opened on the previous evening may still be
  #    running. Stepping through real dates (instead of day-of-month numbers)
  #    keeps this correct across month and year boundaries.
  window_days <- seq(as_date(start) - 1, as_date(end), by = "day")

  out <- 0
  for (k in seq_along(window_days)) {
    opens <- ymd_hm(paste(window_days[k], interval_start), tz = zone)
    closes <- ymd_hm(paste(window_days[k], interval_end), tz = zone)
    if (closes <= opens) {
      # the window wraps past midnight
      closes <- closes + days(1)
    }

    # 2. Intersect the period with this window; the result is NA when the
    #    two do not overlap.
    overlap <- intersect(period, interval(opens, closes))

    if (!is.na(overlap)) {
      # as.numeric() of an interval is its length in seconds
      out <- out + as.numeric(overlap) / 60
    }
  }

  out
}

# One result per row of df
time_spent <- vapply(
  seq_len(nrow(df)),
  function(i) minutes_in_interval(df$start_date[i], df$end_date[i]),
  numeric(1)
)

df$time_spent <- time_spent

得到的结果是:

> df$time_spent
[1]  40 740 224 120

答案 1 :(得分:1)

这是我的简短解决方案(与其他答案相比 ;-))。我同样使用了 lubridate 包:

library(lubridate)
df$start_date <- dmy_hm(df$start_date)
df$end_date <- dmy_hm(df$end_date)

# Minutes that the period [start, end] spends inside a daily time window.
#
# start, end    Single date-times (POSIXct) on whole minutes.
# window_start  Clock time ("HH:MM", zero-padded) at which the window opens.
# window_end    Clock time ("HH:MM", zero-padded) at which the window closes.
#               When it is earlier than window_start, the window wraps past
#               midnight.
#
# Returns the number of minutes as a single number.
minutes_in_window <- function(start, end,
                              window_start = "21:00",
                              window_end = "02:00") {
  # One time stamp per minute, from start to end inclusive
  stamps <- seq(start, end, by = "min")
  clock <- format(stamps, format = "%H:%M")

  # Flag every stamp whose clock time lies inside the window (both ends
  # inclusive)
  if (window_start > window_end) {
    in_window <- clock <= window_end | clock >= window_start
  } else {
    in_window <- clock >= window_start & clock <= window_end
  }

  total <- sum(in_window)
  cum_sum <- cumsum(in_window)

  # A stretch of k consecutive flagged stamps covers only k - 1 minutes, so
  # one has to be subtracted for every separate stretch inside the window.
  # Stretches that have already ended show up as plateaus in cum_sum. The
  # plateau at 0 is excluded: it only means the period started outside the
  # window and does not belong to any stretch.
  n_closed <- length(unique(cum_sum[duplicated(cum_sum) & cum_sum > 0]))

  # A stretch that is still running at the last stamp leaves no plateau, so
  # it is accounted for separately.
  ends_inside <- in_window[length(in_window)]

  as.numeric(total - n_closed - ends_inside)
}

df$diff <- vapply(
  seq_len(nrow(df)),
  function(i) minutes_in_window(df$start_date[i], df$end_date[i]),
  numeric(1)
)

#            start_date            end_date diff
# 1 2016-09-20 01:20:00 2016-09-20 06:20:00   40
# 2 2016-09-22 01:20:00 2016-09-24 22:40:00  740
# 3 2016-09-28 22:16:00 2016-09-29 03:20:00  224
# 4 2016-09-16 21:01:00 2016-09-16 23:01:00  120

这个想法如下(lapply中的代码):

  1. 为每一行创建一个从开始时间到结束时间、步长为一分钟的序列。
  2. 针对条件“序列中的时刻落在 21:00 到 02:00 之间”分别计算 sum 和 cumsum。
  3. 计算cumsum中的间隔数,以查看此序列中有多少不同的间隔。
  4. 难点在于:当 sum 为 2 时,实际相差的分钟数只有 1,所以总要减去 1,并且对找到的每个区间都要这样做。如果 cum_sum 的最后一个值与倒数第二个值不同,说明序列结束时仍处于一个额外的区间之内,还需要再减去 1。
  5. 它看起来非常复杂,但背后的想法应该是明确的(我希望)。

答案 2 :(得分:0)

请查看代码注释。我使用了lubridate包。

# Sample periods ("dd/mm/yyyy HH:MM"), one start/end pair per row.
start_date <- c(
  "20/09/2016 01:20", "22/09/2016 01:20",
  "28/09/2016 22:16", "16/09/2016 21:01"
)
end_date <- c(
  "20/09/2016 06:20", "24/09/2016 22:40",
  "29/09/2016 03:20", "16/09/2016 23:01"
)
df <- data.frame(start_date = start_date, end_date = end_date)

# Clock times ("HH:MM") at which the daily window opens and closes.
interval_end   <- "02:00"
interval_start <- "21:00"

# Convert strings to dates
# dmy_hm() parses the "dd/mm/yyyy HH:MM" strings; the parsed date-times
# replace the original columns in place.
library(lubridate)
df$start_date <- dmy_hm(df$start_date)
df$end_date   <- dmy_hm(df$end_date)

# Helper columns
# Calendar parts of the start (day/mo/yr) and of the end (day1/mo1/yr1); they
# are pasted back into date strings below to build the window boundaries.
df$day  <- day(df$start_date)
df$mo   <- month(df$start_date)
df$yr   <- year(df$start_date)
df$day1 <- day(df$end_date)
df$mo1  <- month(df$end_date)
df$yr1  <- year(df$end_date)

# Add custom start/end for first day in row
# interval_start is the window opening time on the day before start_date,
# interval_end the window closing time on the day of start_date.
# NOTE(review): df$day-1 is 0 when start_date falls on the 1st of a month, so
# the pasted string is not a real calendar date -- confirm how dmy_hm() handles
# that case.
df$interval_start <- dmy_hm(paste0(df$day-1,"/",df$mo,"/",df$yr," ", interval_start))
df$interval_end   <- dmy_hm(paste0(df$day,"/",df$mo,"/",df$yr," ", interval_end))


# Add custom start/end for last day in row, if it is different
df$interval_start1 <- df$interval_start # this is just to initialize the column with the proper class
df$interval_end1   <- df$interval_end

# Row by row: when month and day of end_date are not both equal to those of
# start_date, store the opening and closing clock times on the end date;
# otherwise store NA.
# Both boundaries are built on the same calendar day (day1), so with the
# values "21:00" / "02:00" interval_end1 is earlier than interval_start1.
# NOTE(review): the year is not part of the comparison -- confirm that periods
# of a whole year or longer cannot occur.
for(i in 1:nrow(df)){
  if(!(df$mo[i] == df$mo1[i] & df$day[i] == df$day1[i])){
    df$interval_start1[i] <- dmy_hm(paste0(df$day1[i],"/",df$mo1[i],"/",df$yr1[i]," ", interval_start))
    df$interval_end1[i]   <- dmy_hm(paste0(df$day1[i],"/",df$mo1[i],"/",df$yr1[i]," ", interval_end))
  }else{
    df$interval_start1[i] <- NA
    df$interval_end1[i]   <- NA
  } 
} 

# Calculate time in intervals for first day
# time1: signed minutes from df$interval_end to start_date; time1.cap: signed
# minutes from df$interval_end to df$interval_start. Entries of time1 whose
# magnitude exceeds the cap are replaced by the cap.
# These three assignments are repeated inside each branch of the loop below
# before time1 is read there.
time1     <- difftime(df$start_date,df$interval_end, units="mins")
time1.cap <- difftime(df$interval_start, df$interval_end, units="mins")
time1[abs(time1) > abs(time1.cap)] <- time1.cap[abs(time1) > abs(time1.cap)]

# initialize class of new col
# NOTE(review): a row whose start_date equals df$interval_end exactly matches
# neither branch of the loop below and keeps this initial value -- confirm
# that this is intended.
df$time1 <- difftime(df$interval_start, df$interval_end, units="mins")

# Update time1
# Each branch recomputes the whole time1 vector and then uses only element i.
for(i in 1:nrow(df)){
  if(df$start_date[i] < df$interval_end[i]){
    # start_date lies before df$interval_end: capped distance between the two,
    # multiplied by -1 to flip the sign of the difference
    time1     <- difftime(df$start_date,df$interval_end, units="mins")
    time1.cap <- difftime(df$interval_start, df$interval_end, units="mins")
    time1[abs(time1) > abs(time1.cap)] <- time1.cap[abs(time1) > abs(time1.cap)]

    df$time1[i] <- time1[i]*-1
  } else{

    if(df$start_date[i] > df$interval_end[i])  {
      # start_date lies after df$interval_end: measure against
      # df$interval_end + 86400 seconds (one day later) instead
      # NOTE(review): this branch reads neither end_date nor the window
      # opening time on the start day -- confirm the result for periods that
      # start before the window opens or end before it closes.
      time1     <- difftime(df$start_date,df$interval_end+86400, units="mins")
      time1.cap <- difftime(df$interval_start, df$interval_end+86400, units="mins")
      time1[abs(time1) > abs(time1.cap)] <- time1.cap[abs(time1) > abs(time1.cap)]

      df$time1[i] <- time1[i]*-1
    }
  }
}

# initialize class of new col

# df1 keeps only the rows whose interval_start1 is not NA.
df1 <- df[!is.na(df$interval_start1),]
df1$time2 <- difftime(df1$interval_start, df1$interval_end, units="mins")

# create time2 for last day, if different
# Each branch recomputes the whole time2 vector and then uses only element i.
# NOTE(review): 1:nrow(df1) evaluates to c(1, 0) when df1 has no rows --
# confirm that df1 is never empty.
for(i in 1:nrow(df1)){
  if(df1$end_date[i] < df1$interval_end1[i]){
    # end_date lies before interval_end1: signed minutes between the two,
    # capped in magnitude by the distance interval_start1 - interval_end1
    time2     <- difftime(df1$end_date,df1$interval_end1, units="mins")
    time2.cap <- difftime(df1$interval_start1, df1$interval_end1, units="mins")
    time2[abs(time2) > abs(time2.cap)] <- time2.cap[abs(time2) > abs(time2.cap)]

    df1$time2[i] <- time2[i]*-1
  } else{

    if(df1$end_date[i] > df1$interval_end1[i])  {
      # end_date lies after interval_end1: signed minutes from end_date to
      # interval_start1, capped in magnitude by the distance between
      # interval_start1 and interval_end1 + 86400 seconds (one day later)
      time2     <- difftime(df1$interval_start1,df1$end_date, units="mins")
      time2.cap <- difftime(df1$interval_start1, df1$interval_end1+86400, units="mins")
      time2[abs(time2) > abs(time2.cap)] <- time2.cap[abs(time2) > abs(time2.cap)]

      df1$time2[i] <- time2[i]*-1
    }
  }
}

# See if there were any days in between first and last and if so add time
# Adds 300 minutes per day of difference between end_date and start_date
# (rounded to whole days) to the sign-flipped time2.
# NOTE(review): 300 is presumably the window length in minutes (21:00 to
# 02:00) -- confirm.
# NOTE(review): `time2` on the right-hand side is the plain vector left behind
# by whichever branch of the preceding loop ran last, not the df1$time2 column
# filled in that loop -- confirm that this is intended.
time2 <- minutes(300 * round(difftime(df1$end_date,df1$start_date, units = "days")))+minutes(time2)*-1

# Copy time2 back onto the full data frame: rows with a non-NA interval_start1
# receive the computed value, all other rows receive 0.
df$time2 <- as.period(NA)
df$time2[!is.na(df$interval_start1)]  <- time2
df$time2[is.na(df$interval_start1)]   <- 0

# Final result: time1 (converted to a period in minutes) plus time2; the last
# line prints the column.
df$time_in_interval <- minutes(df$time1)+df$time2
df$time_in_interval

请注意,代码中的 86400 是一天的秒数,这就是该数字的来源。