如何使用dplyr计算两个数据框之间所有成对列的相关性

时间:2017-05-09 09:54:34

标签: r dplyr

我有两个数据框:


# Example table A: one row per gene symbol, one count column per sample.
# Built as a bare list with data-frame attributes so that it carries the
# tibble class vector ("tbl_df", "tbl", "data.frame").
setA_df <- structure(
  list(
    gene_symbol = c(
      "Wif1", "Wisp1", "Wnt11", "Wnt6", "Wnt9a",
      "Xxylt1", "Zbp1", "Zbtb7c", "Zc2hc1c", "Zfp300",
      "Zfp36", "Zfp367", "Zfp54", "Zfp612", "Zfp748",
      "Zfp783", "Zfp791", "Zic1", "Zic3", "Zic4"
    ),
    Sample1 = c(
      6, 1420, 7, 2, 47, 290, 13, 34, 4, 16,
      165, 545, 100, 9, 160, 1, 15, 0, 0, 0
    ),
    Sample2 = c(
      0, 2617, 25, 1, 105, 206, 4, 83, 25, 4,
      187, 159, 63, 34, 147, 6, 4, 4, 0, 1
    )
  ),
  # Compact representation of the automatic row names 1..20.
  row.names = .set_row_names(20L),
  class = c("tbl_df", "tbl", "data.frame")
)

setA_df
#>    gene_symbol Sample1 Sample2
#> 1         Wif1       6       0
#> 2        Wisp1    1420    2617
#> 3        Wnt11       7      25
#> 4         Wnt6       2       1
#> 5        Wnt9a      47     105
#> 6       Xxylt1     290     206
#> 7         Zbp1      13       4
#> 8       Zbtb7c      34      83
#> 9      Zc2hc1c       4      25
#> 10      Zfp300      16       4
#> 11       Zfp36     165     187
#> 12      Zfp367     545     159
#> 13       Zfp54     100      63
#> 14      Zfp612       9      34
#> 15      Zfp748     160     147
#> 16      Zfp783       1       6
#> 17      Zfp791      15       4
#> 18        Zic1       0       4
#> 19        Zic3       0       0
#> 20        Zic4       0       1

# Example table B: the same gene symbols as table A, with one numeric
# column per tissue (Blood, Bone). Carries the tibble class vector.
setB_df <- structure(
  list(
    gene_symbol = c(
      "Wif1", "Wisp1", "Wnt11", "Wnt6", "Wnt9a",
      "Xxylt1", "Zbp1", "Zbtb7c", "Zc2hc1c", "Zfp300",
      "Zfp36", "Zfp367", "Zfp54", "Zfp612", "Zfp748",
      "Zfp783", "Zfp791", "Zic1", "Zic3", "Zic4"
    ),
    Blood = c(
      991.833, 104.167, 52.5, 4.833, 33.333,
      163.667, 131.333, 7, 77.667, 52.5,
      222.333, 10.833, 63.5, 0.167, 184.922,
      20.167, 17.333, 0.833, 20.833, 0
    ),
    Bone = c(
      4282.167, 642.333, 5.667, 66.667, 16.667,
      143.167, 112.667, 61.667, 45.833, 24.833,
      1070.333, 41.833, 140.402, 13.667, 156.167,
      84.613, 5.333, 44.667, 3.167, 18.167
    )
  ),
  # Compact representation of the automatic row names 1..20.
  row.names = .set_row_names(20L),
  class = c("tbl_df", "tbl", "data.frame")
)


setB_df
#>    gene_symbol   Blood     Bone
#> 1         Wif1 991.833 4282.167
#> 2        Wisp1 104.167  642.333
#> 3        Wnt11  52.500    5.667
#> 4         Wnt6   4.833   66.667
#> 5        Wnt9a  33.333   16.667
#> 6       Xxylt1 163.667  143.167
#> 7         Zbp1 131.333  112.667
#> 8       Zbtb7c   7.000   61.667
#> 9      Zc2hc1c  77.667   45.833
#> 10      Zfp300  52.500   24.833
#> 11       Zfp36 222.333 1070.333
#> 12      Zfp367  10.833   41.833
#> 13       Zfp54  63.500  140.402
#> 14      Zfp612   0.167   13.667
#> 15      Zfp748 184.922  156.167
#> 16      Zfp783  20.167   84.613
#> 17      Zfp791  17.333    5.333
#> 18        Zic1   0.833   44.667
#> 19        Zic3  20.833    3.167
#> 20        Zic4   0.000   18.167

我想要做的是计算两个数据框之间所有成对列的Pearson相关性(gene_symbol除外)。最终的数据框如下所示:

Sample1  Blood -0.01524773 # cor(setA_df$Sample1, setB_df$Blood)
Sample2  Blood -0.003247626
Sample1  Bone  0.03889127
Sample2  Bone  0.06420447

如何用dplyr实现?实际上,每个数据框都有更多的列。

1 个答案:

答案 0 :(得分:2)

这可以得到你所要求的输出,但如上所述,dplyr在这里并没有起到多大作用。

library(reshape2) 
library(dplyr)

# Correlate every column of setA_df with every column of setB_df
# (gene_symbol dropped from both; cor() defaults to Pearson), then melt the
# resulting matrix into long format: one row per column pair.
# cor() pairs observations by row position, so both tables must list the
# genes in the same order.
melt(
  cor(
    select(setA_df, -gene_symbol),
    select(setB_df, -gene_symbol)
  )
)

#      Var1  Var2        value
# 1 Sample1 Blood -0.015247732
# 2 Sample2 Blood -0.003247626
# 3 Sample1  Bone  0.038891268
# 4 Sample2  Bone  0.064204473