这对我来说很难解释,因此我仅举一个例子。我在下面有两个向量(a和b)。
a <- c("cat","dog","banana","yogurt","dog")
b <- c("salamander","worm","dog","banana","cat","yellow","blue")
我想要的是以下结果:
[1] 0 0 2 1 1 0 0
其中结果的每个元素是b的每个元素出现在向量a中的次数。
do.call("c",lapply(b,function(x){sum(x == a)}))
这给了我我想要的,但是我需要一个矢量化/更快的版本,因为我正在处理> 20,000条记录。任何帮助表示赞赏!
答案 0 :(得分:6)
您可以将colSums
与colSums(outer(a, b, `==`))
[1] 0 0 2 1 1 0 0
一起使用:
SELECT id
FROM your_table
GROUP BY id
HAVING SUM(CASE WHEN value IS NULL OR value = '' THEN 1 ELSE 0 END) = COUNT(ID)
答案 1 :(得分:6)
您可以这样做:
res <- table(factor(b, levels=b)[match(a, b, nomatch=0)])
salamander worm dog banana cat yellow blue
0 0 2 1 1 0 0
如果您想要香草向量,可以使用as.vector(res)
。
评论
==
进行详尽的比较要快。这些步骤与@GKi的双重合并非常相似:找到向量匹配的位置,然后映射回b。基准
必需的软件包:data.table,purrr,microbenchmark
各种选项
library(data.table)
# NelsonGon's answer
purrem <- function() purrr::map_dbl(b, ~sum(.x==a))
# Andrew's answer
vappem <- function() vapply(b, function(x) sum(x == a), FUN.VALUE = integer(1))
# Andrew's answer
collem <- function() colSums(outer(a, b, `==`))
# arg0naut91's answer
lappem <- function() unlist(lapply(b, function(x) sum(x == a)))
# this answer
matchem <- function() table(factor(b, levels=b)[match(a, b, nomatch=0)])
# this answer + data.table
matchem2<- function()
setDT(list(b))[, n := 0L][setDT(list(a))[, .N, by=V1], on=.(V1), n := N]$n
# @GKi's answer
mergem <- function() merge(b, table(merge(a, b, by=1)), by=1, all.x=T)[,2]
示例输入和基准测试代码
nv = 1e4 # values that can appear in a
nb = 1e3 # values to look up, nb <= na
na = 1e5 # length of a
set.seed(1)
a <- sample(nv, na, replace=TRUE)
b <- seq_len(nb)
microbenchmark::microbenchmark(times = 10,
pur_res <- purrem(),
vap_res <- vappem(),
col_res <- collem(),
lap_res <- lappem(),
mat_res <- matchem(),
mat_res2<- matchem2(),
mer_res <- mergem()
)
# make sure results match
# left as an exercise for the cautious user
identical(as.vector(mat_res), lap_res) # ok
identical(as.integer(col_res), lap_res) # ok
# etc
结果
Unit: milliseconds
expr min lq mean median uq max neval
pur_res <- purrem() 373.488498 389.331825 479.039835 430.363183 500.948370 858.77997 10
vap_res <- vappem() 367.247322 397.516902 472.635368 505.782597 532.951841 570.68548 10
col_res <- collem() 1353.356494 1481.029982 1507.536324 1515.966781 1552.886597 1650.93967 10
lap_res <- lappem() 352.197701 394.562073 469.988534 507.935397 525.426475 559.56388 10
mat_res <- matchem() 3.032507 3.230309 5.101941 3.371101 3.874484 15.31595 10
mat_res2 <- matchem2() 7.591947 11.666453 12.809046 12.266796 13.676658 22.04095 10
mer_res <- mergem() 23.448314 23.712974 27.730525 24.547323 24.716967 46.92548 10
如果花费不到一秒钟的时间,它可以装入内存并可以运行一次,则在这些选项中进行选择可能不太重要。在非慢速选项之间的排名可能取决于OP实际问题的参数(希望将nv,na,nb调整为此处的近似值)。
随意编辑更多选项并重新运行,将结果复制到我的此处。例如,我无法使用@NelsonGon的stringi方法来处理这些参数,但是也许其他人有更多的耐心或更强大的计算机。我也很想知道内存使用情况,但是还没有了解支持测量内存的软件包。
如果存在某些nv / na / nb配置,其中一个答案的效果特别好,则可以使用类似的基准突出显示该情况来编辑该答案。
仅供参考:
bench::mark(
pur_res <- purrem(),
vap_res <- vappem(),
col_res <- collem(),
lap_res <- lappem(),
mat_res <- matchem(),
mat_res2<- matchem2(),
mer_res <- mergem(),
stringi <- sapply(b, function(x) sum(stringi::stri_count(x, regex=a))),
check=FALSE
)
# A tibble: 8 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <list> <list>
1 pur_res <- purrem() 421.14ms 424.65ms 424.65ms 428.15ms 2.35 382.21MB 0 2 849.29ms <dbl [1,0~ <Rprofmem [2,1~ <bch:~ <tibble [2 ~
2 vap_res <- vappem() 367.88ms 370.61ms 370.61ms 373.34ms 2.70 381.52MB 0 2 741.23ms <int [1,0~ <Rprofmem [1,0~ <bch:~ <tibble [2 ~
3 col_res <- collem() 1.64s 1.64s 1.64s 1.64s 0.608 1.12GB 2 1 1.64s <dbl [1,0~ <Rprofmem [32 ~ <bch:~ <tibble [1 ~
4 lap_res <- lappem() 411.25ms 506.67ms 506.67ms 602.1ms 1.97 381.53MB 3 2 1.01s <int [1,0~ <Rprofmem [1,0~ <bch:~ <tibble [2 ~
5 mat_res <- matchem() 3.11ms 3.48ms 3.44ms 5.79ms 287. 1.4MB 0 144 501.66ms <S3: tabl~ <Rprofmem [90 ~ <bch:~ <tibble [14~
6 mat_res2 <- matchem2() 5.22ms 6.26ms 5.96ms 27.7ms 160. 4.83MB 1 80 501.18ms <int [1,0~ <Rprofmem [435~ <bch:~ <tibble [80~
7 mer_res <- mergem() 19.88ms 22.75ms 22.02ms 33.6ms 44.0 6.59MB 1 23 523.3ms <int [1,0~ <Rprofmem [410~ <bch:~ <tibble [23~
8 stringi <- sapply(b, function(x) sum(string~ 6.57m 6.57m 6.57m 6.57m 0.00254 1.12GB 1 1 6.57m <int [1,0~ <Rprofmem [2,3~ <bch:~ <tibble [1 ~
答案 2 :(得分:4)
也许这会快一点,但不确定是否有重大改进:
vapply(b, function(x) sum(x == a), FUN.VALUE = integer(1))
输出:
salamander worm dog banana cat yellow blue
0 0 2 1 1 0 0
在unlist
系列中,lapply
与apply
的组合也可能会稍微好一些:
unlist(lapply(b, function(x) sum(x == a)))
输出:
[1] 0 0 2 1 1 0 0
我现在没有机会适当地进行基准测试,但是我相信也不必要使用大括号({}
)会对性能产生负面影响。
答案 3 :(得分:2)
不确定速度,但是可以:
purrr::map_dbl(b, ~sum(.x==a))
[1] 0 0 2 1 1 0 0
一种base
/ stringi
的选择可能会更慢:
sapply(b,function(x) sum(stringi::stri_count(x,
regex=a)))
salamander worm dog banana cat yellow
0 0 2 1 1 0
blue
0
答案 4 :(得分:1)
您可以使用factor
匹配两个向量,并使用table
创建一个计数向量,计算每个元素出现在另一个向量中的次数,假设b是唯一的:
table(factor(a, levels=b))
#salamander worm dog banana cat yellow blue
# 0 0 2 1 1 0 0
要对此进行优化,可以通过match
进行匹配,并通过tabulate
进行计数:
tabulate(match(a,b), length(b))
#[1] 0 0 2 1 1 0 0
如果b不是唯一的,则可以使用:
Ub <- unique(b)
tabulate(match(a,Ub), length(Ub))[match(b,Ub)]
#[1] 0 0 2 1 1 0 0
rm(Ub)
将最常见的案件放在b
的开头时,应该可以加快速度。同样,将tabulate(bin, nbins)
的使用方式更改为.Internal(tabulate(bin, nbins))
也会减少计算时间。
可以使用match
代替fastmatch::fmatch
,这样可以减少计算时间:
library(fastmatch)
tabulate(fmatch(a,b), length(b))
#[1] 0 0 2 1 1 0 0
答案 5 :(得分:1)
当前
tabulate(match(a,b), length(b))
或tabulate(fastmatch::fmatch(a,b), length(b))
是最快的,具有最低的内存使用率。
library(data.table)
library(purrr)
library(fastmatch)
library(microbenchmark)
fun <- alist(ACE = do.call("c",lapply(b,function(x){sum(x == a)}))
, Andrew = colSums(outer(a, b, `==`))
, arg0naut911 = vapply(b, function(x) sum(x == a), FUN.VALUE = integer(1))
, arg0naut912 = unlist(lapply(b, function(x) sum(x == a)))
, NelsonGon1 = purrr::map_dbl(b, ~sum(.x==a))
# , NelsonGon2 = sapply(b,function(x) sum(stringi::stri_count(x, regex=a))) #This is somehow slow
, Frank1 = table(factor(b, levels=b)[match(a, b, nomatch=0)])
, Frank2 = setDT(list(b))[, n := 0L][setDT(list(a))[, .N, by=V1], on=.(V1), n := N]$n
, GKi1 = table(factor(a, levels=b))
, GKi2 = tabulate(match(a,b), length(b))
, GKi3 = {Ub <- unique(b); tabulate(match(a,Ub), length(Ub))[match(b,Ub)]; rm(Ub)}
, GKi4 = tabulate(fmatch(a,b), length(b))
)
memUse <- function(list, setup = "", gctort = FALSE) {
as.data.frame(lapply(list, function(z) {
eval(setup)
ttt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
gctorture(on = gctort)
eval(z)
gctorture(on = FALSE)
sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - ttt
}))
}
nv = 1e4 # values that can appear in a
nb = 1e3 # values to look up, nb <= na
na = 1e5 # length of a
set.seed(42)
a <- sample(nv, na, replace=TRUE)
b <- seq_len(nb)
microbenchmark(list = fun, times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# ACE 269.954636 331.972708 328.789761 344.776136 345.382701 354.785752 10
# Andrew 848.698037 863.489016 876.087567 871.606562 880.389684 925.432033 10
# arg0naut911 269.009657 311.542098 324.791662 338.709570 344.767421 355.313022 10
# arg0naut912 269.993883 323.843154 330.403232 337.707712 345.261788 377.198969 10
# NelsonGon1 271.066344 316.591125 334.548298 341.959808 350.633499 365.647488 10
# Frank1 2.845864 2.880154 3.003895 3.029094 3.085876 3.232025 10
# Frank2 3.928908 4.066095 5.148183 4.162109 4.452070 13.676931 10
# GKi1 31.971671 32.343447 32.626064 32.733487 32.832000 33.282033 10
# GKi2 1.779743 1.859890 1.948823 1.970881 2.018004 2.099922 10
# GKi3 1.882411 1.946231 2.059325 2.055469 2.188922 2.214205 10
# GKi4 1.103117 1.160845 1.243543 1.242525 1.260500 1.500836 10
memUse(list=fun, gctort = FALSE) #in Mb
# ACE Andrew arg0naut911 arg0naut912 NelsonGon1 Frank1 Frank2 GKi1 GKi2 GKi3 GKi4
#1 382.4 1144.4 382.3 382.3 360.2 1.3 3.2 4.6 0.8 0.8 0.4
memUse(list=fun, gctort = TRUE) #in Mb
# ACE Andrew arg0naut911 arg0naut912 NelsonGon1 Frank1 Frank2 GKi1 GKi2 GKi3 GKi4
#1 1.7 1144.5 1.6 1.6 1.2 0.9 2.2 2.9 0.8 0.8 0.4
### Variant B - Mimicking the case of ACE ###
set.seed(42)
nv <- 20
nb <- 15
na <- 50 #max
lengtha <- 20000
xv <- replicate(nv, paste0(sample(LETTERS, sample(3:15, 1), TRUE), collapse=""))
b <- sample(xv, nb)
la <- replicate(lengtha, sample(xv, sample(0:na, 1), TRUE))
fun <- alist(ACE = lapply(la, function(a) {do.call("c",lapply(b,function(x){sum(x == a)}))})
, Andrew = lapply(la, function(a) {colSums(outer(a, b, `==`))})
, arg0naut911 = lapply(la, function(a) {vapply(b, function(x) sum(x == a), FUN.VALUE = integer(1))})
, arg0naut912 = lapply(la, function(a) {unlist(lapply(b, function(x) sum(x == a)))})
, NelsonGon1 = lapply(la, function(a) {purrr::map_dbl(b, ~sum(.x==a))})
# , NelsonGon2 = lapply(la, function(a) {sapply(b,function(x) sum(stringi::stri_count(x, regex=a)))}) #This is somehow slow
, Frank1 = lapply(la, function(a) {table(factor(b, levels=b)[match(a, b, nomatch=0)])})
, Frank2 = lapply(la, function(a) {setDT(list(b))[, n := 0L][setDT(list(a))[, .N, by=V1], on=.(V1), n := N]$n})
, GKi1 = lapply(la, function(a) {table(factor(a, levels=b))})
, GKi2 = lapply(la, function(a) {tabulate(match(a,b), length(b))})
, GKi3 = lapply(la, function(a) {Ub <- unique(b); tabulate(match(a,Ub), length(Ub))[match(b,Ub)]; rm(Ub)})
, GKi4 = lapply(la, function(a) {tabulate(fmatch(a,b), length(b))})
)
microbenchmark(list = fun, times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# ACE 465.81627 473.90476 497.44989 486.15057 530.19484 550.1138 10
# Andrew 434.23044 439.07163 467.63245 447.41847 486.72514 564.0105 10
# arg0naut911 434.10375 453.50480 506.61509 503.49702 547.05514 619.0931 10
# arg0naut912 423.36126 427.58611 472.05053 482.25018 499.00205 534.3943 10
# NelsonGon1 1471.78370 1550.21649 1581.23682 1574.90285 1606.96480 1695.4031 10
# Frank1 1283.42164 1316.24555 1353.04844 1356.99698 1382.43747 1419.8793 10
# Frank2 34208.83565 35393.61614 36239.77059 35568.44068 37873.94184 39361.0081 10
# GKi1 1101.14022 1153.13165 1192.08497 1184.66592 1221.57634 1321.6016 10
# GKi2 77.63488 79.44446 94.12155 82.22419 97.47998 138.5571 10
# GKi3 673.66302 708.49934 728.21153 729.96899 759.65502 773.2909 10
# GKi4 81.43012 83.92463 91.73833 86.39957 92.53420 137.13057 10
memUse(list=fun, gctort = FALSE) #in Mb
# ACE Andrew arg0naut911 arg0naut912 NelsonGon1 Frank1 Frank2 GKi1 GKi2 GKi3 Gki4
#1 28.9 48.6 28.9 29.1 28.5 30.6 41.3 28.9 29.4 25.3 25.4