I'm having trouble with how long it takes to run a double for loop with an if statement in R. I have roughly 3,000,000 rows in one dataset (DF1) and roughly 22 in the other (DF2). Below is a sample of the two data frames I have.
DF1
DateTime REG
2018-07-01 12:00:00 NHDG
2018-07-12 11:55:23 NSKR
DF2
StartDateTime EndDateTime Direction
2018-07-01 07:55:11 2018-07-01 12:01:56 W
2018-07-12 11:00:23 2018-07-12 11:45:00 E
I want to flag anything in DF1 where the DateTime falls between StartDateTime and EndDateTime. So the output would look like this:
DF1
DateTime REG Flag
2018-07-01 12:00:00 NHDG 1
2018-07-12 11:55:23 NSKR 0
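For reference, the two sample frames can be reconstructed like this (a sketch; assuming the date columns are meant to be POSIXct):

DF1 <- data.frame(
  DateTime = as.POSIXct(c("2018-07-01 12:00:00", "2018-07-12 11:55:23")),
  REG      = c("NHDG", "NSKR"))
DF2 <- data.frame(
  StartDateTime = as.POSIXct(c("2018-07-01 07:55:11", "2018-07-12 11:00:23")),
  EndDateTime   = as.POSIXct(c("2018-07-01 12:01:56", "2018-07-12 11:45:00")),
  Direction     = c("W", "E"))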
The code I am currently using is:
#Flag if in delay or not
DF1$Flag<-0
for (i in 1:nrow(DF1)){
  for (j in 1:nrow(DF2)){
    if ((DF1$DateTime[i] >= DF2$StartDateTime[j]) & (DF1$DateTime <= DF2$EndDateTime[j])){
      DF1$Flag[i]<-1
    } else {
      DF1$Flag[i]<-DF1$Flag
    }
  }
}
I'm happy to take this code out of a for loop altogether if at all possible.
Answer 0 (score: 2)
If I understood correctly, the value of Flag in DF1 should be set to 1 whenever the DateTime lies within any of the intervals in DF2, right?
Then the following base R code will do the job:
DF1$Flag = sapply(DF1$DateTime,
                  function(x) as.integer(sum(x >= DF2$StartDateTime &
                                             x <= DF2$EndDateTime) > 0))
# DateTime REG Flag
# 1 2018-07-01 12:00:00 NHDG 1
# 2 2018-07-12 11:55:23 NSKR 0
The idea is to vectorise the comparison: for each DateTime in DF1 (the "loop" carried out by sapply), you compare the value with all the intervals in DF2 and take the sum of the results: if the sum is greater than 0, there is at least one row of DF2 whose Start- and EndDateTime enclose that DateTime of DF1. as.integer then converts the boolean output of sum(...) > 0 into 1 or 0.
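With a DF2 this small, the same idea can also be written without sapply by building the whole comparison matrix at once; a sketch (not part of the original answer) that trades memory (an nrow(DF1) x nrow(DF2) logical matrix) for a fully vectorised comparison:

# Each outer() call yields an nrow(DF1) x nrow(DF2) logical matrix;
# a row sum > 0 means that DateTime falls inside at least one interval
hit <- outer(DF1$DateTime, DF2$StartDateTime, ">=") &
       outer(DF1$DateTime, DF2$EndDateTime, "<=")
DF1$Flag <- as.integer(rowSums(hit) > 0)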
And if you want an even faster solution, use dplyr:

library(dplyr)
df1 = full_join(mutate(DF1, foo=1), mutate(DF2, foo=1), by='foo') %>%
  mutate(Flag = as.integer(DateTime >= StartDateTime & DateTime <= EndDateTime)) %>%
  group_by(DateTime) %>% slice(which.max(Flag)) %>%
  select(DateTime, REG, Flag)

Otherwise: your second loop (the j loop) seems to have a problem: for each row of DF1 you compare its date with the successive start and end dates of all rows of DF2, overwriting the resulting Flag value each time, so you only keep the result of the comparison with the last row of DF2...? In other words, the i in DF1$Flag[i] <- ... does not move inside the j loop (and so gets overwritten on every iteration). So, if you only want to compare against the minimum and maximum date range of DF2, you could simply do:
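A minimal sketch of that min/max comparison (my illustration, assuming the intent is to test each DateTime against the overall range spanned by DF2 rather than the individual intervals):

# Flag rows whose DateTime lies between the earliest StartDateTime and
# the latest EndDateTime in DF2 (overall range only)
DF1$Flag <- as.integer(DF1$DateTime >= min(DF2$StartDateTime) &
                       DF1$DateTime <= max(DF2$EndDateTime))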
Answer 1 (score: 0)
What about this?
library(data.table)
DF1$flag <- as.numeric(sapply(seq(nrow(DF1)), function(x)
  DF1[x, "DateTime"] %between% c(min(DF2[x, "StartDateTime"]), max(DF2[x, "EndDateTime"]))))
# DateTime REG flag
# 1 2018-07-01 12:00:00 NHDG 1
# 2 2018-07-12 11:55:23 NSKR 0
Data
> dput(DF1)
structure(list(DateTime = structure(1:2, .Label = c("2018-07-01 12:00:00",
"2018-07-12 11:55:23"), class = "factor"), REG = structure(1:2, .Label = c("NHDG",
"NSKR"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
> dput(DF2)
structure(list(StartDateTime = structure(1:2, .Label = c("2018-07-01 07:55:11",
"2018-07-12 11:00:23"), class = "factor"), EndDateTime = structure(1:2, .Label = c("2018-07-01 12:01:56",
"2018-07-12 11:45:00"), class = "factor"), Direction = structure(2:1, .Label = c("E",
"W"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
DF1$DateTime <- as.POSIXct(DF1$DateTime)
DF2$StartDateTime <- as.POSIXct(DF2$StartDateTime)
DF2$EndDateTime <- as.POSIXct(DF2$EndDateTime)
Answer 2 (score: 0)
You could also use foverlaps:
library(data.table)
setDT(DF1)[, DateTime := as.POSIXct(DateTime)][, EndDateTime := DateTime]
setDT(DF2)[, `:=` (StartDateTime = as.POSIXct(StartDateTime),
EndDateTime = as.POSIXct (EndDateTime))]
setkey(DF1, DateTime, EndDateTime)
setkey(DF2, StartDateTime, EndDateTime)
DF1[, Flag := foverlaps(DF1, DF2, type = "within", which = TRUE, mult = "first")][
is.na(Flag), Flag := 0][, EndDateTime := NULL]
This checks, for each date in DF1, whether it lies within any of the intervals in DF2.
It is also fast, at least in my tests. Benchmarking against the sapply approach:
Unit: milliseconds
expr min lq mean median uq max neval
DT 4.752853 5.247319 18.38787 5.42855 6.950966 311.1944 25
sapply 9413.337014 10598.926908 11206.14866 10892.91751 11746.901293 13568.7995 25
That was on a dataset with 10,000 rows in DF1 and 12 rows in DF2.
I ran it only once on 300,000 / 22 rows, and this is what I got:
Unit: seconds
expr min lq mean median uq max neval
DT 11.60865 11.60865 11.60865 11.60865 11.60865 11.60865 1
sapply 674.05823 674.05823 674.05823 674.05823 674.05823 674.05823 1
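For comparison, a data.table non-equi join produces the same flag without the temporary EndDateTime column; a sketch (not from the original answer), assuming DF1 and DF2 are data.tables with the same POSIXct conversions as above:

# For each DF1 row, look up the first DF2 interval containing its DateTime;
# which = TRUE returns the matching DF2 row index, or NA if there is none
idx <- DF2[DF1, on = .(StartDateTime <= DateTime, EndDateTime >= DateTime),
           mult = "first", which = TRUE]
DF1[, Flag := as.integer(!is.na(idx))]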
Answer 3 (score: 0)
A faster way is to use crossing() from tidyr to cross df1 with df2, set the flag for each row of the resulting data frame, and then use aggregate() to cut the result back down to one row per df1 entry. This method assumes there are no duplicate entries in df1; if there are, they will be merged.
> df1
DateTime REG
1 2018-07-01 12:00:00 NHDG
2 2018-07-12 11:55:23 NSKR
> df2
StartDateTime EndDateTime Direction
1 2018-07-01 07:55:11 2018-07-01 12:01:56 W
2 2018-07-12 11:00:23 2018-07-12 11:45:00 E
> # Create a DF with rows for each combination of df1 rows with df2 rows
> tmp <- crossing(df1, df2)
> tmp
DateTime REG StartDateTime EndDateTime Direction
1 2018-07-01 12:00:00 NHDG 2018-07-01 07:55:11 2018-07-01 12:01:56 W
2 2018-07-01 12:00:00 NHDG 2018-07-12 11:00:23 2018-07-12 11:45:00 E
3 2018-07-12 11:55:23 NSKR 2018-07-01 07:55:11 2018-07-01 12:01:56 W
4 2018-07-12 11:55:23 NSKR 2018-07-12 11:00:23 2018-07-12 11:45:00 E
> # Create a new column for the flag
> tmp$flag <- tmp$DateTime >= tmp$StartDateTime & tmp$DateTime <= tmp$EndDateTime
> tmp
DateTime REG StartDateTime EndDateTime Direction flag
1 2018-07-01 12:00:00 NHDG 2018-07-01 07:55:11 2018-07-01 12:01:56 W TRUE
2 2018-07-01 12:00:00 NHDG 2018-07-12 11:00:23 2018-07-12 11:45:00 E FALSE
3 2018-07-12 11:55:23 NSKR 2018-07-01 07:55:11 2018-07-01 12:01:56 W FALSE
4 2018-07-12 11:55:23 NSKR 2018-07-12 11:00:23 2018-07-12 11:45:00 E FALSE
> # Drop the unwanted columns
> tmp <- tmp[,c("DateTime", "REG", "flag")]
> tmp
DateTime REG flag
1 2018-07-01 12:00:00 NHDG TRUE
2 2018-07-01 12:00:00 NHDG FALSE
3 2018-07-12 11:55:23 NSKR FALSE
4 2018-07-12 11:55:23 NSKR FALSE
> # Sum all flags for a given df1 date and limit total to 1
> df1 <- aggregate(flag ~ DateTime + REG, tmp, FUN = function(x) {min(1, sum(x))})
> df1
DateTime REG flag
1 2018-07-01 12:00:00 NHDG 1
2 2018-07-12 11:55:23 NSKR 0
>
Running with many more dates and comparing against the original for loop and the sapply() method above:
Original for loop method: 6.282 sec elapsed
sapply() method: 1.65 sec elapsed
crossing() and aggregate(): 0.385 sec elapsed
The full script is here:
#!/usr/bin/env Rscript
library(tictoc)
library(tidyr)
# Setup: generate a lot of dates for performance comparison
beg <- as.POSIXct("2018-07-01 12:00:00")
end <- as.POSIXct("2100-12-01 12:00:00")
dates <- seq(beg, end, 60*60*24)
#df1 <- data.frame(c("2018-07-01 12:00:00", "2018-07-12 11:55:23"), c("NHDG","NSKR"))
df1 <- data.frame(dates, rep(c("NHDG","NSKR"), length(dates)/2))
df2 <- data.frame(c("2018-07-01 07:55:11", "2018-07-12 11:00:23"), c("2018-07-01 12:01:56", "2018-07-12 11:45:00"), c("W","E"))
colnames(df1) <- c("DateTime", "REG")
colnames(df2) <- c("StartDateTime","EndDateTime","Direction")
df1$DateTime <- as.POSIXct(df1$DateTime, tz = "America/Los_Angeles")
df2$StartDateTime <- as.POSIXct(df2$StartDateTime, tz = "America/Los_Angeles")
df2$EndDateTime <- as.POSIXct(df2$EndDateTime, tz = "America/Los_Angeles")
# Original (fixed)
tic(sprintf("%30s", "Original for loop method"))
for (i in 1:nrow(df1)){
df1$flag[i] <- 0
for (j in 1:nrow(df2)){
if ((df1$DateTime[i] >= df2$StartDateTime[j]) & (df1$DateTime[i] <= df2$EndDateTime[j])){
df1$flag[i]<-1
break
}
}
}
toc()
result1 <- df1
df1$flag <- NULL
# Sapply
tic(sprintf("%30s", "sapply() method"))
df1$flag = sapply(df1$DateTime,
function(x) as.integer(sum(x >= df2$StartDateTime &
x <= df2$EndDateTime) > 0))
toc()
result2 <- df1
df1$flag <- NULL
# Aggregate
tic(sprintf("%30s", "crossing() and aggregate()"))
# Create a DF with rows for each combination of df1 rows with df2 rows
tmp <- crossing(df1, df2)
# Create a new column for the flag
tmp$flag <- tmp$DateTime >= tmp$StartDateTime & tmp$DateTime <= tmp$EndDateTime
# Drop the unwanted columns
tmp <- tmp[,c("DateTime", "REG", "flag")]
# Sum all flags for a given df1 date and limit total to 1
df1 <- aggregate(flag ~ DateTime + REG, tmp, FUN = function(x) {min(1, sum(x))})
# Sort the rows by date
df1 <- df1[order(df1$DateTime),]
# Reset the row names (for comparison below)
rownames(df1) <- NULL
toc()
result3 <- df1
# Prove that results are the same
if (!isTRUE(all.equal(result1, result2))) {
  print("MISMATCH")
  stop()
}
if (!isTRUE(all.equal(result1, result3))) {
  print("MISMATCH")
  stop()
}
print("PASS")