我需要遍历数据表2 DT2,并且对于每一行检查数据表1 DT1中的某些条件是否匹配。
我目前这样做。这在DT1中查找DT2中的数据。在循环中,您将看到:
subset = DT1[DT1$time == DT2$time[i] & DT1$a == DT2$a[i] & DT1$b == DT2$b[i] & (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i] ) & DT1$Flag ==0]
如果匹配我需要
1 - 标记DT1中已匹配的行,这样它就不会被再次匹配。这是通过 DT1$Flag[match$RowNumber] = 1 完成的
2 - 使用DT1中相应列的数据填充DT2中的列。这在代码中通过 DT2$x[i] = match$x 完成
这几乎就是它的工作原理但是DT1可能是10,000行而DT2可能是100,000行,所以对于每10,000行我是子集。子集10,000次!
还有第二个要求。第一个循环使用精确匹配的时间条件,即您在代码中看到的
DT1$time == DT2$time[i]
完成精确匹配后,放宽时间条件并运行第二个循环,其时间窗口为+/- 7秒,因此第二个循环可以在DT1中找到与DT2中的时间相差不超过7秒的匹配。
DT1[ DT1$time >= DT2$time[i] -7 & DT1$time <= DT2$time[i]+7
这也可以,但是因为数据表有很多行,子集化需要很长时间。这两个循环能以某种方式变得更快吗?
注意子集中的OR条件使得这个很棘手。
(DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i] )
下面是您可以直接运行的示例代码。谢谢。
# ---- Sample data ------------------------------------------------------------
# Ten consecutive one-second timestamps (GMT), each one appearing twice, in
# ascending order.
times <- rep(as.POSIXct("2016-01-01", tz = "GMT") + 1:10, each = 2)

# DT1 is the lookup table. Flag starts at 0 for every row and RowNumber holds
# the position of each row.
DT1 <- data.table(
  time = times,
  a = c(1, 1:19),
  b = c(11, 11:29),
  cat = rep(c("a", "b"), times = c(11, 9)),
  Flag = numeric(20),
  x = as.numeric(201:220)
)
DT1$RowNumber <- as.numeric(seq_len(nrow(DT1)))

# DT2 holds the rows to be matched against DT1. x and MatchType start out as
# 0 and "none".
DT2 <- data.table(
  time = as.POSIXct(
    c("2016-01-01 00:00:01", "2016-01-01 00:00:10", "2016-01-01 00:00:10"),
    tz = "GMT"
  ),
  a = c(1, 19, 10),
  b = c(11, 29, 20),
  cat1 = c("a", "x", "b"),
  x = numeric(3),
  MatchType = rep("none", 3),
  cat2 = c("a", "b", "a")
)
# ---- First pass: exact match on time ----------------------------------------
# For every row of DT2, look for rows of DT1 that are still unused (Flag == 0),
# have the same time, a and b, and whose cat equals either cat1 or cat2.
# When at least one candidate exists, only the first one is used:
#   1 - the DT1 row is flagged so that it is not used again
#   2 - x and MatchType of the DT2 row are filled in from that DT1 row
# seq_len() makes the loop a no-op for an empty DT2, whereas 1:0 would iterate
# over c(1, 0).
for (i in seq_len(nrow(DT2))) {
  # Positions of the candidate rows in DT1; which() also drops NA comparisons.
  hits <- which(
    DT1$time == DT2$time[i] &
      DT1$a == DT2$a[i] &
      DT1$b == DT2$b[i] &
      (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i]) &
      DT1$Flag == 0
  )
  if (length(hits) > 0) {
    first_hit <- hits[1] # if there are multiple matches only use the 1st one
    matched_x <- DT1$x[first_hit]
    # `:=` updates by reference, avoiding the copy that
    # `DT1$Flag[...] <- 1` makes on every match.
    DT1[first_hit, Flag := 1]
    DT2[i, `:=`(MatchType = "First Loop", x = matched_x)]
  }
}
# ---- Between the two passes -------------------------------------------------
# After the first loop some rows of DT2 may still have MatchType == "none".
DT2 # NOTE: the last row has MatchType "none" because no match was found
DT1 # NOTE: Flag is 1 in the first and last rows, set when the matches occurred

# Keep the rows already matched in the first loop aside; only the unmatched
# rows of DT2 and the unused rows of DT1 take part in the second loop.
firstloop <- DT2[MatchType != "none"]
DT2 <- DT2[MatchType == "none"]
DT1 <- DT1[Flag == 0]

# Renumber the remaining DT1 rows. seq_len() also works when no rows are left,
# where seq(1, 0, 1) would stop with an error.
DT1$RowNumber <- as.numeric(seq_len(nrow(DT1)))
# ---- Second pass: match within a time window --------------------------------
# Same matching rules as the exact pass, except that the DT1 time only has to
# lie within +/- window_secs seconds of the DT2 time.
# The original loop body began with a stray `i=1`, which made every iteration
# process row 1 only; it has been removed. seq_len() keeps the loop a no-op
# when no unmatched DT2 rows remain.
window_secs <- 7
for (i in seq_len(nrow(DT2))) {
  # Positions of the still-unused DT1 rows that satisfy all conditions.
  hits <- which(
    DT1$time >= DT2$time[i] - window_secs &
      DT1$time <= DT2$time[i] + window_secs &
      DT1$a == DT2$a[i] &
      DT1$b == DT2$b[i] &
      (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i]) &
      DT1$Flag == 0
  )
  if (length(hits) > 0) {
    first_hit <- hits[1] # if there are multiple matches only use the 1st one
    matched_x <- DT1$x[first_hit]
    # Flag the DT1 row so it is not used again, then fill in the DT2 row;
    # both updates are done by reference with `:=`.
    DT1[first_hit, Flag := 1]
    DT2[i, `:=`(MatchType = "Second Loop", x = matched_x)]
  }
}

# now the process is finished
rbind(firstloop, DT2) # NOTE: the last row now has MatchType "Second Loop"
DT1 # NOTE: the flag in row 10, because that was the row used in the match
答案 0 :(得分:1)
首先我要提一下,应尽量避免用函数名作为数据集中的变量名:`cat` 和 `subset` 都是R中的函数,因此在这个答案中我使用 `cat0` 代替 `cat`,使用 `subs` 代替 `subset`。您的代码有几处可以改进:
创建样本数据:
特别是,使用 `.I` 可以更高效地创建 RowNumber 变量。此外,我还给 DT2 添加了一个行号变量(rn),因为这在接下来的步骤中很有用:
第一个循环:
这可以通过取消
# Sample data. Both tables get a row-number column `rn`, added by reference
# from data.table's `.I`.
times <- sort(
  rep(as.POSIXct("2016-01-01", tz = "GMT") + seq(1, 10, by = 1), 2)
)

DT1 <- data.table(
  time = times,
  a = c(1, seq_len(19)),
  b = c(11, 11:29),
  cat0 = c("a", "a", rep(c("a", "b"), each = 9)),
  Flag = numeric(20),
  x = as.numeric(201:220)
)
DT1[, rn := .I]

DT2 <- data.table(
  time = as.POSIXct(
    c("2016-01-01 00:00:01", "2016-01-01 00:00:10", "2016-01-01 00:00:10"),
    tz = "GMT"
  ),
  a = c(1, 19, 10),
  b = c(11, 29, 20),
  cat1 = c("a", "x", "b"),
  x = numeric(3),
  MatchType = rep("none", 3),
  cat2 = c("a", "b", "a")
)
DT2[, rn := .I]
DT1 的更新(这部分可能还可以简化),并利用 data.table 包的按引用更新功能(`:=`),这比在for循环中更新更有效率:
然后可以通过创建索引并再次使用
# First pass: exact match on time, a and b, with cat0 equal to cat1 or cat2.
# Only DT2 is updated here, by reference with `:=`. DT1$Flag is not changed
# inside this loop, so the `Flag == 0` filter sees the same flags on every
# iteration.
# seq_len() keeps the loop a no-op when DT2 has no rows, whereas 1:nrow(DT2)
# would iterate over c(1, 0).
for (i in seq_len(nrow(DT2))) {
  subs <- DT1[
    time == DT2$time[i] &
      a == DT2$a[i] &
      b == DT2$b[i] &
      (cat0 == DT2$cat1[i] | cat0 == DT2$cat2[i]) &
      Flag == 0
  ]
  if (nrow(subs) > 0) {
    # If there are several candidates only the first one is used.
    DT2[i, `:=`(MatchType = "First Loop", x = subs$x[1])]
  }
}
`:=` 按引用更新来更新 DT1:
第二个循环:
# Flag DT1 rows after the first pass, by reference.
# Candidate rows are unflagged rows whose time, a, b and cat0 values each
# occur somewhere in the corresponding DT2 column(s). Each %in% test looks at
# its column independently of the others.
# Within every (time, a, b, cat0, Flag) group only the first row number is
# kept.
idx1 <- DT1[
  time %in% DT2$time &
    a %in% DT2$a &
    b %in% DT2$b &
    (cat0 %in% DT2$cat1 | cat0 %in% DT2$cat2) &
    Flag == 0,
  rn[1],
  by = .(time, a, b, cat0, Flag)
]$V1
DT1[idx1, Flag := 1]
这些改进消除了创建中间子集和
# Second pass: only DT2 rows that are still unmatched are visited, identified
# by their row number `rn`. The time condition is relaxed to a window of
# +/- 7 seconds around the DT2 time.
for (i in DT2[MatchType == "none", rn]) {
  earliest <- DT2$time[i] - 7
  latest <- DT2$time[i] + 7
  subs <- DT1[
    time >= earliest &
      time <= latest &
      a == DT2$a[i] &
      b == DT2$b[i] &
      (cat0 == DT2$cat1[i] | cat0 == DT2$cat2[i]) &
      Flag == 0
  ]
  if (nrow(subs) == 0) {
    next
  }
  # Use the first candidate only: mark it in DT1 with Flag 2 and fill in the
  # DT2 row, both by reference.
  DT1[subs$rn[1], Flag := 2]
  DT2[i, `:=`(MatchType = "Second Loop", x = subs$x[1])]
}
`rbind` 步骤的需要。最终结果: