我有一个数据帧(`da`),其中每一行都有一个按升序排列的时间戳(相邻时间戳之间的间隔是随机的)。
我想根据 `da` 的行是否落在另外两个向量(`first.times` 和 `second.times`)给出的时间区间内来筛选行。也就是说,我会遍历 `first.times` 和 `second.times` 这两个向量,检查 `da` 中的时间戳是否落在这些区间内(下限为 `first.times`,上限为 `second.times`);落在区间内的行保留,其余的行丢弃。
我目前想到的唯一办法是使用 `for` 循环,但这可能会花不少时间。下面是带有一些示例数据的代码:
# Start and end of the window from which the interval boundaries are taken
date1 <- as.POSIXct('1970-01-01 00:00', format = '%Y-%m-%d %H:%M')
date2 <- as.POSIXct('1970-01-05 23:00', format = '%Y-%m-%d %H:%M')

# 250001 timestamps starting at date1, each 20-200 seconds (rounded) after the
# previous one, followed by date2 as the final element
dates <- local({
  gaps <- round(runif(250000, 20, 200))
  c(date1 + cumsum(c(0, gaps)), date2)
})

# Example data frame.
# NOTE: `a` is built from a single random draw, so the same value is recycled
# onto every row.
da <- data.frame(
  dates = dates,
  a = round(runif(1, 1, 10)),
  b = rep(c('Hi', 'There', 'Everyone'), length.out = length(dates))
)
head(da)
dim(da)

# Interval starts: every 13 minutes (13 min * 60 sec) from date1 up to date2
first.times <- seq(from = date1, to = date2, by = 13 * 60)
# Interval ends: 5 minutes (5 min * 60 sec) after the matching start
second.times <- first.times + 5 * 60
head(first.times)
length(first.times)
head(second.times)
length(second.times)
#Loop to obtain rows
# Baseline approach: start from a zero-row copy of `da` and, for every
# (first.times[i], second.times[i]) pair, append the rows whose date satisfies
# first.times[i] <= date < second.times[i].
# seq_along() is used instead of 1:length() so that an empty `first.times`
# runs the loop zero times (1:0 would iterate over 1 and 0).
# The rbind() inside the loop re-copies the accumulated result on every pass;
# it is kept because this loop is the slow baseline being measured.
subsetted.dates <- da[0, ]
system.time(for (i in seq_along(first.times)) {
  subsetted.dates <- rbind(
    subsetted.dates,
    da[da$dates >= first.times[i] & da$dates < second.times[i], ]
  )
})
user system elapsed
2.590 0.825 3.520
我想知道是否有比 `for` 循环更高效、更快捷的方法来完成这件事。这个示例数据集运行得很快,但我的实际数据集每次迭代可能需要 45 秒,而一共要进行 1000 次迭代,那就要花相当长的时间了!
如能提供任何帮助,我将不胜感激!
谢谢!
答案 0 :(得分:1)
切勿在循环中使用 `rbind` 或 `cbind`!这会导致过多的内存复制。参见 Patrick Burns 的《The R Inferno》:Circle 2 - Growing Objects。取而代之的做法是:先构建一个数据帧列表,然后在循环之外只调用一次 `rbind`:
由于这里是在两个等长向量之间逐元素地迭代,请考虑使用 `mapply` 或其返回列表的包装器 `Map`:
# One data frame per interval: the rows of `da` whose date lies in [from, to).
# Map() walks first.times and second.times in parallel.
df_list <- Map(
  function(from, to) {
    da[da$dates >= from & da$dates < to, ]
  },
  first.times, second.times
)
# EQUIVALENT CALL
df_list <- mapply(
  function(from, to) {
    da[da$dates >= from & da$dates < to, ]
  },
  first.times, second.times,
  SIMPLIFY = FALSE
)
甚至可以考虑用 `transform` 把区间的起始时间和结束时间作为新列添加到数据帧中:
# For each (f, s) pair keep the rows of `da` with f <= dates < s and tag them
# with the interval bounds in two new columns, first_time and second_time.
# The bounds are repeated to nrow(rows) explicitly: transform() cannot recycle
# a length-1 value onto a zero-row data frame, so an interval that matches no
# rows would otherwise stop with "arguments imply differing number of rows".
df_list <- Map(function(f, s) {
  rows <- da[da$dates >= f & da$dates < s, ]
  n <- nrow(rows)
  transform(rows, first_time = rep(f, n), second_time = rep(s, n))
}, first.times, second.times)
从那里开始,使用多种解决方案对数据帧列表进行行绑定:
# The four assignments below are interchangeable alternatives for stacking the
# list of data frames into one; run only ONE of them.
# The package functions are namespace-qualified so each line works without a
# prior library() call (the package still has to be installed).
# BASE
final_df <- do.call(rbind, df_list)
# PLYR
final_df <- plyr::rbind.fill(df_list)
# DPLYR
final_df <- dplyr::bind_rows(df_list)
# DATA TABLE (the result is a data.table rather than a plain data.frame)
final_df <- data.table::rbindlist(df_list)
在此处查看基准测试示例:Convert a list of data frames into one data frame
答案 1 :(得分:0)
与原始设置相比...
> subsetted.dates <- da[0,]  # zero-row frame with the columns of `da`; the loop appends to it
> system.time(for(i in 1:length(first.times)){  # one pass per (first.times[i], second.times[i]) pair
+ subsetted.dates <- rbind(subsetted.dates, da[da$dates >= first.times[i] & da$dates < second.times[i],])  # keep rows with first <= date < second; the result is re-bound on every pass
+ })
user system elapsed
3.97 0.35 4.33
...使用 `lapply` 可能会稍微改善性能:
> # Collect the matching rows of every interval in a list, then bind once.
> # seq_along() replaces 1:length() so an empty `first.times` yields an empty
> # list (do.call(rbind, list()) is then NULL) instead of iterating over 1 and 0.
> system.time({
+   window_rows <- lapply(seq_along(first.times), function(i) {
+     da[da$dates >= first.times[i] & da$dates < second.times[i], ]
+   })
+   subsetted.dates <- do.call(rbind, window_rows)
+ })
user system elapsed
3.37 0.26 3.75
稍微改变一下算法:如果您先只在较小的数据(日期向量)上计算出行索引,然后再用该索引取行,性能会更好:
> # Work on the bare date vector: collect the matching row numbers for every
> # interval, then subset `da` a single time with all of them.
> # seq_along() replaces 1:length() so an empty `first.times` selects no rows.
> system.time({
+   da_dates <- da$dates
+   da_inds <- lapply(seq_along(first.times), function(i) {
+     which(da_dates >= first.times[i] & da_dates < second.times[i])
+   })
+   subsetted.dates <- da[unlist(da_inds), ]
+ })
user system elapsed
2.60 0.31 2.94
前提是:如果时间间隔可以按时间顺序排序(在本例中它们已经按时间顺序排好)并且互不重叠,那么问题可以解决得更快:
# Single sweep over the dates in ascending order, advancing through the
# intervals in step. Requires first.times/second.times to be sorted by time,
# non-overlapping and free of NA; `da` itself is left in its original order.
system.time({
  # Permutation that sorts the dates: element k is the row of `da` holding the
  # k-th smallest date, so it also maps sorted positions back to rows of `da`.
  # na.last = NA leaves rows with an NA date out of the sweep.
  da_date_order <- order(da$dates, na.last = NA)
  da_sorted_dates <- da$dates[da_date_order]
  da_selected_dates <- rep(FALSE, length(da_sorted_dates))
  n_intervals <- length(first.times)
  j <- 1L
  for (i in seq_along(da_sorted_dates)) {
    # Move past every interval that ends at or before the current date, so the
    # date is always tested against the first interval that can still hold it.
    while (j <= n_intervals && da_sorted_dates[i] >= second.times[j]) {
      j <- j + 1L
    }
    if (j > n_intervals) {
      break  # all intervals are exhausted; later dates cannot match
    }
    # The while loop guarantees date < second.times[j]; only the start remains.
    if (da_sorted_dates[i] >= first.times[j]) {
      da_selected_dates[i] <- TRUE
    }
  }
  # Translate the selected sorted positions back into row numbers of `da`.
  subsetted.dates <- da[da_date_order[da_selected_dates], ]
})
user system elapsed
0.98 0.00 1.01
如果您允许对原始 `da` 数据集进行排序,则解决方案甚至更快:
# Same single sweep, but `da` itself is re-ordered by date first, so the
# logical mask can be applied to it directly. Requires first.times/second.times
# to be sorted by time, non-overlapping and free of NA.
system.time({
  # Sort `da` by date; order() puts rows with an NA date last.
  da <- da[order(da$dates), ]
  da_sorted_dates <- da$dates
  da_selected_dates <- rep(FALSE, length(da_sorted_dates))
  n_intervals <- length(first.times)
  j <- 1L
  for (i in seq_along(da_sorted_dates)) {
    current <- da_sorted_dates[i]
    if (is.na(current)) {
      break  # NA dates sort last, so nothing after this point can match
    }
    # Move past every interval that ends at or before the current date, so the
    # date is always tested against the first interval that can still hold it.
    while (j <= n_intervals && current >= second.times[j]) {
      j <- j + 1L
    }
    if (j > n_intervals) {
      break  # all intervals are exhausted; later dates cannot match
    }
    # The while loop guarantees current < second.times[j]; only the start remains.
    if (current >= first.times[j]) {
      da_selected_dates[i] <- TRUE
    }
  }
  subsetted.dates <- da[da_selected_dates, ]
})
user system elapsed
0.63 0.00 0.63