重复应用不同的cols组合

时间:2019-07-06 13:48:58

标签: r apply

我无法嵌套应用函数来与其他列组合重复多次

对于sp1==1 & s1==1,我需要获得sp2==1 & s1==1s1的百分比,对于s2,s3 ... s1000,也需要以相同的方式获得。这是一个简短的示例:

x <- data.frame("sp1"=rep(0:1, times=5),
                "sp2"=rep(0:1, each=5),
                "s1" = rep(0:1, times=10),
                "s2" = rep(0:1, each=2),
                "s3" = rep(1:0, each=2))
> x
   sp1 sp2 s1 s2 s3
1    0   0  0  0  1
2    1   0  1  0  1
3    0   0  0  1  0
4    1   0  1  1  0
5    0   0  0  0  1
6    1   1  1  0  1
7    0   1  0  1  0
8    1   1  1  1  0
9    0   1  0  0  1
10   1   1  1  0  1
11   0   0  0  1  0
12   1   0  1  1  0
13   0   0  0  0  1
14   1   0  1  0  1
15   0   0  0  1  0
16   1   1  1  1  0
17   0   1  0  0  1
18   1   1  1  0  1
19   0   1  0  1  0
20   1   1  1  1  0

现在,我键入一个函数来计算有关s1的百分比:

r <- as.data.frame(sapply(x[,1:2],
                          function(i) sum(i ==1 & x$s1 == 1)/sum(i ==1)))
> r
    sapply(x[, 1:2], function(i) sum(i == 1 & x$s1 == 1)/sum(i == 1))
sp1                                                               1.0
sp2                                                               0.6

我想建立一个关于s1,s2,s3,... s1000 ...的所有百分比的sp1,sp2,sp3,... sp200的df。

> r
      s1   s2   s3 ... s1000
sp1   1.0  0.5  0.5
sp2   0.6  0.5  0.5
...
sp200

我试图对两个组变量都做一个函数,一个用于sp,另一个用于s:

intento <- as.data.frame(sapply(i=x[,1:2], 
                                j=x[,3:5], 
                                function(i,j)sum(i ==1 & j == 1)/sum(i ==1)))

但是从逻辑上讲不是这样的:

Error in match.fun(FUN) : argument "FUN" is missing, with no default

4 个答案:

答案 0 :(得分:2)

您正在寻找outer。您的功能只需要Vectorize d。

FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))

outer(1:2, 3:5, FUN)
#      [,1] [,2] [,3]
# [1,]  1.0  0.5  0.5
# [2,]  0.6  0.5  0.5

您可以使用grep进行优化以自动查找列

outer(grep("sp", names(x)), grep("s\\d+", names(x)), FUN)

答案 1 :(得分:2)

我们可以根据列名对其进行分隔,并在它们上使用sapply

sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))

sapply(x[sp_cols], function(i) sapply(x[s_cols], 
                        function(j) sum(i == 1 & j == 1)/sum(i == 1)))

如果列中的值只有1和0,则可以减少为

sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))

#     s1  s2  s3
#sp1 1.0 0.5 0.5
#sp2 0.6 0.5 0.5

答案 2 :(得分:1)

一种类似的方法是使用lapply(x, function(x) which(x == 1),然后再使用它。我们的思想过程是我们最好存储信息而不是反复检查。

#as suggested by @Ronak
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))

x_l_zero <- lapply(x, function(x) which(x == 1))
sapply(x_l_zero[s_cols]
       , function(x)  sapply(x_l_zero[sp_cols]
                             , function(y) length(intersect(x,y))/length(y)))

     s1  s2  s3
sp1 1.0 0.5 0.5
sp2 0.6 0.5 0.5

@Ronak具有最快的解决方案,或多或少已解决了OP的代码。

Unit: microseconds
             expr    min      lq     mean  median      uq     max neval
 jay.sf_outer_FUN 1190.8 1240.85 1360.103 1284.50 1337.30  2627.0   100
 cole_which_apply  268.4  289.00  454.609  306.05  322.00  7610.7   100
 ronak_1_unsimple  181.3  193.95  321.863  209.95  233.40  6227.4   100
   ronak_2_simple  228.5  241.25  342.354  250.65  276.05  7478.4   100
      akrun_dplyr 5218.7 5506.05 6108.997 5721.80 6081.65 25147.3   100

性能代码:

library(microbenchmark)
library(tidyverse)
##data set
x <- data.frame("sp1"=rep(0:1, times=5),
                "sp2"=rep(0:1, each=5),
                "s1" = rep(0:1, times=10),
                "s2" = rep(0:1, each=2),
                "s3" = rep(1:0, each=2))

#for jay.sf
FUN <- Vectorize(function(i,j) sum(x[i] == 1 & x[j] == 1)/sum(x[i] == 1))

#names of columns
sp_cols <- grep("^sp", names(x))
s_cols <- grep("^s\\d+", names(x))

sp_cols_nam <- grep("^sp", names(x), value = T)
s_cols_nam <- grep("^s\\d+", names(x), value = T)

#benchmark
microbenchmark(
  outer_FUN = {
    outer(sp_cols, s_cols, FUN)
  }
  , apply_heaven = {
    x_l_zero <- lapply(x, function(x) which(x == 1))
    sapply(x_l_zero[s_cols], function(x)  sapply(x_l_zero[sp_cols] , function(y) length(intersect(x,y))/length(y)))
  }
  , ronak_1_unsimple = {
    sapply(x[sp_cols], function(i) sapply(x[s_cols], 
                                            function(j) sum(i == 1 & j == 1)/sum(i == 1)))
  }
  , ronak_2_simple = {
    sapply(x[s_cols], function(i) sapply(x[sp_cols], function(j) sum(i & j)/sum(j)))
  }
  , akrun_dplyr = {
    crossing(nm1 = sp_cols_nam,  
             nm2 = s_cols_nam) %>%
      mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
      spread(nm2, val)
  }
)

答案 3 :(得分:0)

这里是tidyverse

的一个选项
library(tidyverse)
crossing(nm1 = names(x)[startsWith(names(x), "sp")],  
        nm2 = grep("^s\\d+", names(x), value = TRUE)) %>%
    mutate(val = pmap_dbl(., ~ sum(x[..1] ==1 & x[..2] == 1)/sum(x[..1]))) %>%
    spread(nm2, val)
# A tibble: 2 x 4
#  nm1      s1    s2    s3
#  <chr> <dbl> <dbl> <dbl>
#1 sp1     1     0.5   0.5
#2 sp2     0.6   0.5   0.5