Question

我有一个数据帧（da），其中每一行都有一个升序的时间戳（每个时间戳之间的间隔是随机的）。

我想根据da的行是否落在另外两个向量（first.times和second.times）给出的时间区间内来筛选行。因此，我会遍历first.times和second.times这两个向量，检查da的行是否落在这些时间区间内（min = first.times，max = second.times），其余的行不保留。

我弄清楚如何做到这一点的唯一方法是使用for循环，但这可能需要一段时间。这是带有一些示例数据的代码：

# Endpoints of the sample period (parsed in the session's time zone)
date1 <- as.POSIXct(strptime("1970-01-01 00:00", format = "%Y-%m-%d %H:%M"))
date2 <- as.POSIXct(strptime("1970-01-05 23:00", format = "%Y-%m-%d %H:%M"))

# 250001 timestamps starting at date1, each 20-200 seconds after the previous
# one, followed by date2. The cumulative offsets end well after date2, so the
# appended date2 is the one value that is out of ascending order.
dates <- c(
  date1 + cumsum(c(0, round(runif(250000, min = 20, max = 200)))),
  date2
)

# Sample data frame: one row per timestamp.
# `a` is a single random draw that is recycled to every row;
# `b` cycles through three labels.
da <- data.frame(
  dates = dates,
  a = round(runif(1, min = 1, max = 10)),
  b = rep(c("Hi", "There", "Everyone"), length.out = length(dates))
)
head(da)
dim(da)

# Window starts: every 13 minutes (13 * 60 seconds) from date1 up to date2
first.times <- seq(from = date1, to = date2, by = 13 * 60)

# Window ends: 5 minutes (5 * 60 seconds) after each start
second.times <- first.times + 5 * 60
head(first.times)
length(first.times)
head(second.times)
length(second.times)

#Loop to obtain rows
# Start from a zero-row slice of da so the result has the same columns.
subsetted.dates <- da[0,]
# For each window i, keep the rows with first.times[i] <= dates < second.times[i]
# and append them to the result. Every pass scans all of da$dates again, and
# rbind() returns a new, larger data frame each time; system.time() reports
# how long the whole loop takes.
system.time(for(i in 1:length(first.times)){
  subsetted.dates <- rbind(subsetted.dates, da[da$dates >= first.times[i] & da$dates < second.times[i],])
})
 user  system elapsed 
2.590   0.825   3.520

我想知道是否有比这个for循环更高效、更快捷的方法来完成同样的工作。这个示例数据集运行得非常快，但是我的实际数据集每次迭代可能需要45秒，而要进行1000次迭代，就会花费相当长的时间！

非常感谢任何帮助！

谢谢！

Answer 1

切勿在循环中使用rbind或cbind！这将导致过多的内存复制。参见Patrick Burns的《R Inferno》第2圈：Growing Objects（不断增长的对象）。取而代之的做法是：先建立一个数据帧列表，然后在循环外一次性调用rbind：

由于需要在等长的向量之间逐元素迭代，请考虑使用mapply或其列表包装器Map：

# One data frame per window: first.times and second.times are walked in
# parallel, and each pair selects the rows with start <= dates < end.
df_list <- Map(
  function(t_start, t_end) {
    in_window <- da$dates >= t_start & da$dates < t_end
    da[in_window, ]
  },
  first.times, second.times
)

# EQUIVALENT CALL
# mapply() with SIMPLIFY = FALSE returns the per-window results as a list.
df_list <- mapply(
  function(t_start, t_end) {
    in_window <- da$dates >= t_start & da$dates < t_end
    da[in_window, ]
  },
  first.times, second.times,
  SIMPLIFY = FALSE
)

甚至可以考虑使用transform，把每个区间对应的first.times和second.times的值作为新列添加到数据框中：

# Same per-window subsets, with the window bounds recorded on every row
# as the extra columns first_time and second_time.
df_list <- Map(
  function(t_start, t_end) {
    window_rows <- da[da$dates >= t_start & da$dates < t_end, ]
    transform(window_rows, first_time = t_start, second_time = t_end)
  },
  first.times, second.times
)

从那里开始，使用多种解决方案对数据帧列表进行行绑定：

# Four interchangeable ways to row-bind the list of data frames into one;
# run only one of them. The package functions are namespace-qualified so each
# line works without a prior library() call (the package must be installed).

# BASE
final_df <- do.call(rbind, df_list)

# PLYR
final_df <- plyr::rbind.fill(df_list)

# DPLYR
final_df <- dplyr::bind_rows(df_list)

# DATA TABLE
# NOTE: rbindlist() returns a data.table rather than a plain data.frame.
final_df <- data.table::rbindlist(df_list)

在此处查看基准测试示例：Convert a list of data frames into one data frame

Answer 2

与原始设置相比...

> # Baseline: the loop from the question, timed again for comparison.
> subsetted.dates <- da[0,]
> system.time(for(i in 1:length(first.times)){
+   # Appends the rows of window i to the accumulated result via rbind().
+   subsetted.dates <- rbind(subsetted.dates, da[da$dates >= first.times[i] & da$dates < second.times[i],])
+ })
   user  system elapsed 
   3.97    0.35    4.33

...使用lapply可能会稍微改善性能：

> system.time({
+   # Collect one subset per window in a list instead of growing a data frame.
+   # seq_along() yields no iterations when first.times is empty, whereas
+   # 1:length() would run over c(1, 0).
+   subsetted.dates <- lapply(seq_along(first.times), function(i) {
+     da[da$dates >= first.times[i] & da$dates < second.times[i], ]
+   })
+   # Bind all the pieces together in a single rbind call.
+   subsetted.dates <- do.call(rbind, subsetted.dates)
+ })
   user  system elapsed 
   3.37    0.26    3.75

稍微改变一下算法：先只在日期向量上计算出各区间对应的行索引，最后再用这些索引对数据框一次性取子集，这样性能会更好：

> system.time({
+   # Extract the date column once so the comparisons run on a plain vector.
+   da_dates <- da$dates
+   # For each window, record only the matching row positions.
+   # seq_along() yields no iterations when first.times is empty, whereas
+   # 1:length() would run over c(1, 0).
+   da_inds <- lapply(seq_along(first.times), function(i) {
+     which(da_dates >= first.times[i] & da_dates < second.times[i])
+   })
+   # Subset the data frame a single time with all positions combined.
+   subsetted.dates <- da[unlist(da_inds), ]
+ })
   user  system elapsed 
   2.60    0.31    2.94

假设：如果时间区间可以按时间顺序排序（在本例中它们已经按时间顺序排序），并且互不重叠，那么问题可以解决得更快：

system.time({
  # Select the rows of da whose date falls in any window
  # [first.times[k], second.times[k]), returning them in ascending date order.
  # Assumes first.times is sorted ascending and the windows do not overlap.

  # Sort the dates, remembering which row of da each sorted value came from.
  da_date_order <- order(da$dates)
  da_sorted_dates <- da$dates[da_date_order]

  # For every date, the index of the last window that starts at or before it
  # (0 when the date precedes every window). This replaces the manual scan,
  # which never tested the row that advanced the window pointer against the
  # new window and could advance only one window per row.
  window_index <- findInterval(da_sorted_dates, first.times)

  # Keep a date when it has a candidate window and lies before that window's
  # end. pmax() only keeps the subscript valid where the index is 0; those
  # positions are already rejected by the first condition.
  da_selected_dates <- window_index > 0L &
    da_sorted_dates < second.times[pmax(window_index, 1L)]

  # Map the selected sorted positions back to rows of the original da.
  subsetted.dates <- da[da_date_order[da_selected_dates], ]
})

user  system elapsed 
0.98    0.00    1.01

如果您允许对原始da数据集进行排序，则解决方案甚至更快：

system.time({
  # Select the rows of da whose date falls in any window
  # [first.times[k], second.times[k]).
  # Assumes first.times is sorted ascending and the windows do not overlap.

  # Sort the data frame itself by date; this overwrites da with the sorted copy.
  da <- da[order(da$dates), ]
  da_sorted_dates <- da$dates

  # For every date, the index of the last window that starts at or before it
  # (0 when the date precedes every window). This replaces the manual scan,
  # which never tested the row that advanced the window pointer against the
  # new window and could advance only one window per row.
  window_index <- findInterval(da_sorted_dates, first.times)

  # Keep a date when it has a candidate window and lies before that window's
  # end. pmax() only keeps the subscript valid where the index is 0; those
  # positions are already rejected by the first condition.
  da_selected_dates <- window_index > 0L &
    da_sorted_dates < second.times[pmax(window_index, 1L)]

  subsetted.dates <- da[da_selected_dates, ]
})

user  system elapsed 
0.63    0.00    0.63

保留跨越多个时间范围的行

2 个答案: