在R中,我怎样才能最好地矢量化这个操作?
我有一个参考值表,下限(A)和上限(B)。
我还有一个值表(X)来查找上表。
对于X的每个值,我需要确定它是否位于参考表中A和B的任何值之间。
为了演示上述内容,以下是使用循环的解决方案:
#For Reproduceability,
set.seed(1);
#Set up the Reference and Lookup Tables
nref = 5; nlook = 10
referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5),
B=runif(nref,min=0.50,max=0.75));
lookupTable <- data.frame(X=runif(nlook),IsIn=0)
#Process for each row in the lookup table
#search for at least one match in the reference table where A <= X < B
for(x in 1:nrow(lookupTable)){
v <- lookupTable$X[x]
tmp <- subset(referenceTable,v >= A & v < B)
lookupTable[x,'IsIn'] = as.integer(nrow(tmp) > 0)
}
我正在尝试删除for(x in .... )
组件,因为我现实生活中的表格中有数千条记录。
答案 0 :(得分:6)
我找不到确切的欺骗,所以这是使用data.table::foverlaps
的可能解决方案。首先,我们需要向lookupTable
添加一个额外的列,以便在两侧创建边界。然后key
referenceTable
(foverlaps
工作所必需的)然后只运行一个简单的重叠连接,同时只选择第一个连接,因为您想要任何连接(我已经使用0^
来转换为二进制文件,因为你不想要实际的位置)
library(data.table)
setDT(lookupTable)[, Y := X] # Add an additional boundary column
setkey(setDT(referenceTable)) # Key the referenceTable data set
lookupTable[, IsIn := 0 ^ !foverlaps(lookupTable,
referenceTable,
by.x = c("X", "Y"),
mult = "first",
nomatch = 0L,
which = TRUE)]
# X IsIn Y
# 1: 0.2059746 0 0.2059746
# 2: 0.1765568 0 0.1765568
# 3: 0.6870228 1 0.6870228
# 4: 0.3841037 1 0.3841037
# 5: 0.7698414 0 0.7698414
# 6: 0.4976992 1 0.4976992
# 7: 0.7176185 1 0.7176185
# 8: 0.9919061 0 0.9919061
# 9: 0.3800352 1 0.3800352
# 10: 0.7774452 0 0.7774452
答案 1 :(得分:0)
使用findInterval
referenceTable2 = cbind(-Inf, referenceTable)
for(x in 1:nrow(referenceTable2)){
tmp <- findInterval(lookupTable$X, referenceTable2[x,])
lookupTable[,'IsIn'] = lookupTable[,'IsIn'] + (tmp == 2)
}
lookupTable[,'IsIn'] = sign(lookupTable[,'IsIn'])
正如您所看到的,参考表中仍然存在循环,因此如果您的参考表仍然很小,此解决方案尤其有用。一些基准:
1)样本集:
> microbenchmark(nicholas = {set.seed(1); nref = 5; nlook = 10; referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5), B=runif(nref,min=0.50,max=0.75)); lookupTable <- data.frame(X=runif(nlook),IsIn=0); for(x in 1:nrow(lookupTable)){v <- lookupTable$X[x]; tmp <- subset(referenceTable,v >= A & v < B); lookupTable[x,'IsIn'] = as.integer(nrow(tmp) > 0)}},
+ mts = {set.seed(1); nref = 5; nlook = 10; referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5), B=runif(nref,min=0.50,max=0.75)); lookupTable <- data.frame(X=runif(nlook),IsIn=0); referenceTable2 = cbind(-Inf, referenceTable); for(x in 1:nrow(referenceTable2)){tmp <- findInterval(lookupTable$X, referenceTable2[x,]); lookupTable[,'IsIn'] = lookupTable[,'IsIn'] + (tmp == 2);}; lookupTable[,'IsIn'] = sign(lookupTable[,'IsIn'])},
+ david = {set.seed(1); nref = 5; nlook = 10; referenceTable <- data.frame(A=runif(nref,min=0.25,max=0.5), B=runif(nref,min=0.50,max=0.75)); lookupTable <- data.frame(X=runif(nlook),IsIn=0); setDT(lookupTable)[, Y := X]; setkey(setDT(referenceTable)); lookupTable[, IsIn := 0 ^ !foverlaps(lookupTable, referenceTable, by.x = c("X", "Y"), mult = "first", nomatch = 0L, which = TRUE)]},
+ times = 100)
Unit: milliseconds
expr min lq mean median uq max neval
nicholas 1.948096 2.091311 2.190386 2.150790 2.254352 4.092121 100
mts 2.520489 2.711986 2.883299 2.803421 2.885990 5.165999 100
david 6.466129 7.013798 7.344596 7.197132 7.422916 12.274029 100
2)nref = 10; nlook = 1000
Unit: milliseconds
expr min lq mean median uq max neval
nicholas 152.804680 160.681265 164.601443 163.304849 165.387296 243.250708 100
mts 4.434997 4.720027 5.025555 4.819624 4.991995 11.325172 100
david 6.505314 6.920032 7.181116 7.111529 7.331950 9.318765 100
3)nref = 200; nlook = 1000
Unit: milliseconds
expr min lq mean median uq max neval
nicholas 172.939666 179.672397 183.337253 181.191081 183.694077 264.59672 100
mts 77.836588 81.071752 83.860281 81.991919 83.484246 168.22290 100
david 6.870116 7.404256 7.736445 7.587591 7.836234 11.54349 100
我认为大卫的解决方案是一个明显的赢家。 当参考间隔非常少时,该解决方案具有优势。请注意,在您的示例中,许多都是重叠的,并且事先将它们组合可能会进一步改善结果。