从不同数据中分组元素

时间:2014-08-04 11:25:01

标签: r

在我的工作中,我试图找到哪些基因通常在一起。所以我设置了一些实验,现在尝试分析数据。我已经编写了一个很好的脚本来分析它,但它仍然不够。

这次我想做的是分析几个表并确定哪些基因通常在一起 - 在同一个集群中。

这是我的数据:

第一张表:

    > dput(tbl_col_clu1[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `81` = c(0, 0, 0, 0, 
0, 0, 0.64209043, 0, 0, 0, 0, 0, 0, 0, 0.636411741, 0.183490041, 
0, 0, 0, 0), `110` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `140.5` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `189` = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.84958569, 0, 0, 0, 0, 0), `222.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 0.37119221, 0, 0, 0, 1, 0, 0, 0, 0, 
0), `278` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), `340` = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `397` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `453.5` = c(0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `529` = c(0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `580` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `630.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `683.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `735.5` = c(0, 
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `784` = c(0, 
0, 0, 0, 0, 0, 0, 0.399952462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0.959211661, 1), `832` = c(0, 0.1266780707, 0, 0, 0, 0, 0, 0.2132893016, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.959211661, 1), `882.5` = c(0, 
0.12667807, 0, 0, 0, 1, 0, 0.08480435, 0, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 1, 0.70163097), `926.5` = c(0, 1, 0, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), `973` = c(0, 0.12621196, 0, 
0, 0, 0, 0, 0.11813646, 0, 0, 0, 1, 0, 0, 0.59389934, 1, 0, 0, 
0, 0), `1108` = c(0, 0.092444384, 0, 0, 0, 0, 0, 0.115758222, 
0, 0, 0, 0.925835779, 0, 0, 1, 0.303482426, 0.848464317, 0, 0, 
0), `1200` = c(0, 0.120055749, 0, 1, 0, 0, 0, 0.150055416, 0, 
0, 0, 0.558015841, 0, 0, 0.796949668, 0.276321753, 1, 0, 0, 0
), Clusters = structure(c(1L, 64L, 45L, 102L, 11L, 77L, 170L, 
55L, 59L, 316L, 316L, 98L, 90L, 77L, 232L, 178L, 101L, 50L, 51L, 
51L), .Label = c("10", "10,13,15", "10,15", "10,15,16", "10,20,21,22,23,24", 
"10,22,23,24", "11", "11,12,13,14,15", "11,12,13,14,15,16", "11,12,13,14,15,16,17", 
"12", "12,13", "12,13,14", "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17", 
"12,13,14,15,16,17,18,19,20,21,22,23,24", "12,13,15", "12,13,17", 
"13", "13,14", "13,14,15", "13,14,15,16", "13,14,15,16,17", "13,15", 
"13,15,16,17", "14", "14,15", "14,15,16", "14,15,16,17", "14,15,16,17,18,19,20,21,22,23,24", 
"14,19", "15", "15,16", "15,16,17", "15,16,17,18,19,20,21,22,23,24", 
"15,16,17,19,20,21,22,23,24", "15,17", "15,17,24", "15,22,23,24", 
"15,23", "15,24", "16", "16,17", "17", "17,18,19,20", "17,18,19,20,21,22,23,24", 
"17,21,22,23,24", "18", "18,19", "18,19,20", "18,19,20,21", "18,19,20,21,22", 
"18,19,20,21,22,23", "18,19,20,21,22,23,24", "18,19,21", "18,19,22,23", 
"18,20", "19", "19,20", "19,20,21", "19,20,21,22", "19,20,21,22,23", 
"19,20,21,22,23,24", "19,20,22", "19,20,22,23", "19,20,22,23,24", 
"19,20,23", "19,21", "19,22", "19,23", "19,24", "2", "2,18,19,20", 
"2,19,20", "2,3,4", "20", "20,21", "20,21,22", "20,21,22,23", 
"20,21,22,23,24", "20,21,23", "20,22", "20,22,23", "20,22,23,24", 
"20,22,24", "20,23", "20,23,24", "20,24", "21", "21,22", "21,22,23", 
"21,22,23,24", "21,23,24", "21,24", "22", "22,23", "22,23,24", 
"22,24", "23", "23,24", "24", "3", "3,10", "3,18,19,20", "3,18,19,20,21,22,23,24", 
"3,19,20", "3,19,20,21", "3,19,20,22,23,24", "3,20,21,22,23,24", 
"3,20,22,23,24", "3,21,23,24", "3,22,23,24", "3,22,24", "3,23", 
"3,23,24", "3,24", "3,4", "3,4,10", "3,4,18,19", "3,4,18,19,20", 
"3,4,18,19,20,21,22,23", "3,4,18,19,20,21,22,23,24", "3,4,19,20,21", 
"3,4,21", "3,4,21,22,23", "3,4,21,22,23,24", "3,4,22,23", "3,4,22,23,24", 
"3,4,22,24", "3,4,23,24", "3,4,24", "3,4,5", "3,4,5,10", "3,4,5,10,23,24", 
"3,4,5,20", "3,4,5,22,23,24", "3,4,5,23,24", "3,4,5,24", "3,4,5,6", 
"3,4,5,6,10", "3,4,5,6,20,22,23,24", "3,4,5,6,7", "3,4,5,6,7,10", 
"3,4,5,6,7,24", "3,4,5,6,7,8", "3,4,5,6,7,8,10", "3,4,5,6,7,8,10,13", 
"3,4,5,6,7,8,10,22,23,24", "3,4,5,6,7,8,12", "3,4,5,6,7,8,15", 
"3,4,5,6,7,8,18,19,20,21,22,23,24", "3,4,5,6,7,8,22,23,24", "3,4,5,6,7,8,9,10", 
"3,4,5,6,7,8,9,10,11,12", "3,4,5,6,7,8,9,10,11,12,13,14,15", 
"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"3,4,5,6,7,8,9,10,11,14,15", "3,4,5,6,7,8,9,10,19,20,21,22,23,24", 
"3,4,5,6,7,8,9,10,22,23,24", "3,4,6", "3,4,6,7,20,21,22,23,24", 
"3,4,7", "3,4,7,8", "3,5,6,7,8", "3,5,8", "3,7", "3,7,19,20,22,23", 
"4", "4,10", "4,10,24", "4,18,19,20", "4,19,20", "4,20,21,22", 
"4,20,21,22,23,24", "4,20,22,23,24", "4,22,23,24", "4,23,24", 
"4,24", "4,5", "4,5,10", "4,5,10,21", "4,5,10,23,24", "4,5,19,20,21,22,23", 
"4,5,19,20,22,23,24", "4,5,20,21,22,23,24", "4,5,20,22,23,24", 
"4,5,22,23,24", "4,5,24", "4,5,6", "4,5,6,10", "4,5,6,10,20,22,23,24", 
"4,5,6,19", "4,5,6,22,23,24", "4,5,6,7", "4,5,6,7,10", "4,5,6,7,19,20,21,22,23,24", 
"4,5,6,7,22,23,24", "4,5,6,7,8", "4,5,6,7,8,10", "4,5,6,7,8,10,19,20,21,22,23,24", 
"4,5,6,7,8,10,20,21,22,23,24", "4,5,6,7,8,10,21,22,23,24", "4,5,6,7,8,10,22,23,24", 
"4,5,6,7,8,10,23,24", "4,5,6,7,8,15", "4,5,6,7,8,17,18,19,20,21,22,23,24", 
"4,5,6,7,8,19,20", "4,5,6,7,8,19,20,21,22,23,24", "4,5,6,7,8,20,21,22,23,24", 
"4,5,6,7,8,21,22,23,24", "4,5,6,7,8,22,23,24", "4,5,6,7,8,9,10", 
"4,5,6,7,8,9,10,11,12", "4,5,6,7,8,9,10,11,12,13,14,15", "4,5,6,7,8,9,10,11,12,13,14,15,16,17", 
"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18", "4,5,6,7,8,9,10,12,13", 
"4,5,6,7,8,9,14,15,16", "4,5,7,9", "4,5,8,22", "4,6", "4,6,7,22,23,24", 
"4,6,7,23,24", "4,6,7,8,15,17", "4,6,7,8,23,24", "4,7", "4,7,20,21", 
"4,7,21,22,23,24", "4,7,8", "4,7,8,22,23,24", "5", "5,10", "5,17", 
"5,18,19,20,21,22,23", "5,19,20,21,22,23,24", "5,20", "5,22,23,24", 
"5,24", "5,6", "5,6,10", "5,6,7", "5,6,7,10", "5,6,7,10,19", 
"5,6,7,22,23,24", "5,6,7,8", "5,6,7,8,10", "5,6,7,8,10,15", "5,6,7,8,10,22,23,24", 
"5,6,7,8,15", "5,6,7,8,18,19,20,21,22,23,24", "5,6,7,8,21,22,23,24", 
"5,6,7,8,22,23,24", "5,6,7,8,9", "5,6,7,8,9,10", "5,6,7,8,9,10,11,12,13", 
"5,6,7,8,9,10,11,12,13,14,15", "5,6,7,8,9,12", "5,6,7,8,9,13", 
"5,7", "5,7,8", "5,8", "6", "6,10", "6,21,22,23", "6,22", "6,22,23,24", 
"6,7", "6,7,10,17", "6,7,22,23,24", "6,7,23,24", "6,7,24", "6,7,8", 
"6,7,8,10", "6,7,8,13,14,15,16,17", "6,7,8,15", "6,7,8,19,20", 
"6,7,8,20,21,22,23,24", "6,7,8,21,22,23,24", "6,7,8,23,24", "6,7,8,9", 
"6,7,8,9,10", "6,7,8,9,10,11,12", "6,7,8,9,10,11,12,13,14,15,16,17", 
"6,7,8,9,10,15,16", "6,7,8,9,10,18,19,20,21,22,23,24", "6,7,8,9,15", 
"6,8", "7", "7,15", "7,15,17", "7,16,18,21", "7,17", "7,19,20", 
"7,19,20,21,22", "7,20,21,22,23,24", "7,20,22,23,24", "7,22,23,24", 
"7,24", "7,8", "7,8,10", "7,8,10,22,23,24", "7,8,13,15", "7,8,14", 
"7,8,15", "7,8,15,16", "7,8,15,23", "7,8,20", "7,8,22", "7,8,23", 
"7,8,9", "7,8,9,10", "7,8,9,13", "7,8,9,15,16,17", "8", "8,10", 
"8,15", "8,17", "8,22", "8,24", "8,9", "8,9,10", "9", "9,10,11,12,13,14,15,16,17"
), class = "factor")), .Names = c("10", "20", "52.5", "81", "110", 
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529", 
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5", 
"973", "1108", "1200", "Clusters"), row.names = c("at1g01050.1", 
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1", 
"at1g01710.1", "at1g01800.1", "at1g01920.2", "at1g01940.1", "at1g01960.1", 
"at1g02020.2", "at1g02100.2", "at1g02140.1", "at1g02150.1", "at1g02500.2", 
"at1g02560.1", "at1g02880.3", "at1g02920.1", "at1g02930.2"), class = "data.frame")

第二张表:

> dput(tbl_col_clu2[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `20` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `52.5` = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `81` = c(0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `110` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `140.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `189` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `222.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `278` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), `340` = c(0, 
0, 0, 0, 0, 0, 0.583163048, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 
1, 0.218194067), `397` = c(0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 
0, 0.63953839, 0, 1, 0, 0, 0, 1), `453.5` = c(0, 0.66069369, 
0, 0, 0, 1, 0.57541627, 1, 1, 0, 0, 0, 1, 0.64615661, 0, 0.45209671, 
0, 0, 0, 0.17022498), `529` = c(0, 0.521435654, 0, 0, 1, 0, 0.175996209, 
0, 0, 0, 1, 0, 0, 0, 0, 0.886059888, 0, 0, 0, 0.17022498), `580` = c(0, 
0.437291195, 0, 0, 1, 0, 0.20731698, 0, 0, 0, 1, 0, 0, 0, 0, 
0.719755907, 0, 0, 0, 0.033248127), `630.5` = c(0, 0.52204783, 
0, 0, 0, 0, 0.48815538, 0, 0, 0, 0, 1, 0, 0, 0, 0.82709638, 0, 
0, 0, 0.09539534), `683.5` = c(0, 0.52429838, 0, 0, 0, 0, 0.59605685, 
0, 0, 0, 0, 0, 0, 0, 0, 0.27845748, 0.28224351, 0, 0, 0), `735.5` = c(1, 
0.3768651, 0, 1, 0, 0, 0.51381348, 0, 0, 0, 0, 0, 0, 0, 0, 0.39914361, 
0.22206677, 0, 0, 0), `784` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 0, 0, 0), `832` = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.16189002, 0, 0, 0), `882.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `926.5` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), `973` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.86100786, 0, 0, 0, 0, 
0), `1108` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), `1200` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), Clusters = structure(c(168L, 32L, 246L, 
168L, 81L, 44L, 8L, 44L, 27L, 318L, 81L, 132L, 15L, 3L, 219L, 
32L, 156L, 318L, 1L, 6L), .Label = c("10", "10,11", "10,11,12", 
"10,11,12,13", "10,11,12,13,14", "10,11,12,13,14,15", "10,11,12,13,14,15,16", 
"10,11,12,13,14,15,16,17", "10,11,12,13,14,15,16,17,18,19", "10,11,12,13,14,15,16,17,18,19,20", 
"10,11,12,13,14,15,16,17,18,19,20,21", "10,11,12,13,14,16", "10,11,12,13,15,16,17,18,19,20,21", 
"10,11,12,13,19", "10,12", "10,12,13", "10,12,13,14", "10,12,13,14,15", 
"10,12,13,14,15,16,17", "10,12,13,15", "10,12,21", "10,13", "10,13,14", 
"10,17,18", "10,20", "11", "11,12", "11,12,13", "11,12,13,14", 
"11,12,13,14,15", "11,12,13,14,15,16", "11,12,13,14,15,16,17", 
"11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,19,20", 
"11,12,13,14,15,16,17,18,19,20,21,22,23", "11,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"11,12,13,14,15,16,17,18,19,21,22", "11,12,13,14,15,16,18", "11,12,13,17,18,19", 
"11,12,14", "11,13", "11,13,14,15,16", "11,15", "12", "12,13", 
"12,13,14", "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17", 
"12,13,14,15,16,17,18", "12,13,14,15,16,17,18,19", "12,13,14,15,16,17,18,19,20", 
"12,13,14,15,16,17,18,19,20,21", "12,13,14,15,16,17,18,19,20,21,22", 
"12,13,14,15,16,17,18,19,20,21,22,23", "12,13,14,15,16,17,18,19,20,21,22,23,24", 
"12,13,14,15,16,17,18,19,23,24", "12,13,14,15,16,17,19", "12,13,14,15,16,17,19,20,21", 
"12,13,14,15,16,17,21", "12,13,14,15,16,18", "12,13,14,15,17", 
"12,13,14,16,17,19", "12,13,14,18", "12,13,15", "12,13,16", "12,13,16,17,18,19", 
"12,13,16,19", "12,13,17", "12,13,21,22,23", "12,14", "12,14,15", 
"12,14,15,16", "12,14,15,17,19", "12,15", "12,15,16,17", "12,16,17", 
"12,20", "12,21,23", "13", "13,14", "13,14,15", "13,14,15,16", 
"13,14,15,16,17", "13,14,15,16,17,18", "13,14,15,16,17,18,19", 
"13,14,15,16,17,18,19,20", "13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22", 
"13,14,15,16,17,18,19,20,21,22,23", "13,14,15,16,17,18,19,20,21,22,23,24", 
"13,14,15,16,17,18,19,21", "13,14,15,16,17,18,19,21,22,23", "13,14,15,16,17,19", 
"13,14,15,16,17,21", "13,14,15,16,18,23", "13,14,17", "13,14,19,20,21,22,23", 
"13,14,23,24", "13,15", "13,15,16", "13,15,16,18,19", "13,15,17", 
"13,16,17", "13,17", "13,17,19", "13,19", "13,21", "14", "14,15", 
"14,15,16", "14,15,16,17", "14,15,16,17,18", "14,15,16,17,18,19", 
"14,15,16,17,18,19,20", "14,15,16,17,18,19,20,21", "14,15,16,17,18,19,20,21,22", 
"14,15,16,17,18,19,20,21,22,23", "14,15,16,17,18,19,20,21,22,23,24", 
"14,15,16,17,18,19,20,22,23,24", "14,15,16,17,19", "14,15,16,17,19,20", 
"14,15,16,17,19,20,21", "14,15,16,17,22", "14,15,16,19", "14,15,17", 
"14,15,19", "14,17", "14,17,18,19", "14,19", "14,21", "15", "15,16", 
"15,16,17", "15,16,17,18", "15,16,17,18,19", "15,16,17,18,19,20", 
"15,16,17,18,19,20,21", "15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24", 
"15,16,17,19", "15,16,17,19,20,21", "15,16,17,19,24", "15,16,17,20,21", 
"15,16,17,21", "15,16,17,23", "15,16,18,19", "15,16,19,20", "15,17", 
"15,18,19,20", "15,18,19,20,21", "15,19", "16", "16,17", "16,17,18", 
"16,17,18,19", "16,17,18,19,20", "16,17,18,19,20,21", "16,17,18,19,20,21,22", 
"16,17,18,19,20,21,22,23", "16,17,18,19,20,21,22,23,24", "16,17,19", 
"16,17,19,20", "16,17,19,20,21", "16,17,19,21", "16,17,23", "16,19", 
"17", "17,18", "17,18,19", "17,18,19,20", "17,18,19,20,21", "17,18,19,20,21,22", 
"17,18,19,20,21,22,23", "17,18,19,20,21,22,23,24", "17,18,19,21", 
"17,19", "17,19,20", "17,19,20,21", "17,19,20,21,22,23,24", "17,19,23", 
"17,20,21", "17,20,21,23", "17,21,22", "17,23", "17,24", "18", 
"18,19", "18,19,20", "18,19,20,21", "18,19,20,21,22", "18,19,20,21,22,23", 
"18,19,20,21,22,23,24", "18,19,20,21,23", "18,20", "19", "19,20", 
"19,20,21", "19,20,21,22", "19,20,21,22,23", "19,20,21,22,23,24", 
"19,20,21,23,24", "19,20,22", "19,21", "19,22", "19,23", "2", 
"2,17", "2,3,4,5,6", "2,3,4,5,6,7", "20", "20,21", "20,21,22", 
"20,21,22,23", "20,21,22,23,24", "20,21,23", "20,21,23,24", "21", 
"21,22", "21,22,23", "21,22,23,24", "21,23", "22", "22,23", "22,23,24", 
"23", "23,24", "24", "3", "3,23,24", "3,4", "3,4,23,24", "3,4,5", 
"3,4,5,6", "3,4,5,6,13,14,15,16,17,18,19,20,21,22,23,24", "3,4,5,6,7", 
"3,4,5,6,7,8,9", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"3,4,5,6,7,8,9,20,21,22,23,24", "3,4,5,6,7,8,9,21,22,23,24", 
"3,4,5,6,8,9", "3,4,5,7,8,9,15,16,17,18,19,20,21,22,23", "3,4,6,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"3,8,9,10,11,12,13,14,15,16,17,18,19,20", "4", "4,17,18,19,20,21,22,23,24", 
"4,19,20,21,22,23,24", "4,21", "4,22,23,24", "4,5,17,18,19,20,21,22,23,24", 
"4,5,21,22,23,24", "4,5,6", "4,5,6,22,23,24", "4,5,6,7,8,9", 
"4,5,6,7,8,9,10", "4,5,6,7,8,9,10,15,16,17,18,19,20,21,22,23,24", 
"4,5,6,7,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24", "4,5,6,7,8,9,13", 
"4,5,6,7,8,9,14,15,16,17,18,19,20,21,22,23,24", "4,5,6,7,8,9,17,18,19,20,21,22,23,24", 
"4,5,6,7,8,9,19,20,21,22,23,24", "4,5,6,7,8,9,19,23,24", "4,5,6,7,8,9,23,24", 
"4,5,7,8,9", "4,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24", 
"4,8,9,23,24", "5", "5,22,23", "5,6", "5,6,15,16,17,18,19,20,21,22,23,24", 
"5,6,19,20,21,22,23,24", "5,6,24", "5,6,7", "5,6,7,8", "5,6,7,8,19,20,21,22,23,24", 
"5,6,7,8,9", "5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14,15,16,17", 
"5,6,7,8,9,15,23,24", "5,6,9", "5,7", "5,8,9", "6", "6,15,16,17,18,19,20,21,22,23,24", 
"6,19,20,21,22,23,24", "6,20,21,22,23,24", "6,21,22,23,24", "6,7", 
"6,7,8", "6,7,8,9", "6,7,8,9,15,16,17,18,19,20,21,22,23,24", 
"6,7,8,9,23,24", "6,7,9", "6,8,15,16,17,18,19,20,21,22,23", "6,8,9", 
"6,9", "7", "7,14,24", "7,8,9", "7,8,9,10,11,12,13,14,15", "7,8,9,20,21,22,23,24", 
"7,8,9,23,24", "7,9", "7,9,10", "8", "8,19,20,21", "8,19,20,21,22,23,24", 
"8,9", "8,9,10,11,12,13,14,15,16,17", "8,9,10,17,18,19,20,21,22", 
"8,9,12,13,14,15,16,17,18,19", "8,9,14,15,16,17,18,19,20,21,22,23,24", 
"8,9,15,16,17,18,19,20,21,22", "8,9,19", "8,9,19,20,21,22,23", 
"8,9,21,22", "9", "9,10", "9,10,11,12,13,14", "9,10,11,12,13,14,15,16", 
"9,10,11,12,13,14,15,16,17", "9,10,11,12,13,14,15,16,17,18,19", 
"9,10,11,12,13,14,15,16,17,18,19,20,21", "9,10,11,12,13,14,15,16,17,18,19,20,21,22,23", 
"9,10,11,12,13,14,15,16,17,19", "9,12", "9,12,13", "9,12,13,14", 
"9,13", "9,13,14,15", "9,13,14,15,16,17", "9,13,14,15,18", "9,14", 
"9,14,15,16", "9,15", "9,15,16,17", "9,16", "9,16,17,18,19,21,22", 
"9,16,17,19", "9,17", "9,17,18", "9,19", "9,19,20", "9,19,20,21", 
"9,19,21", "9,20", "9,20,21", "9,20,21,22", "9,21", "9,22", "9,23"
), class = "factor")), .Names = c("10", "20", "52.5", "81", "110", 
"140.5", "189", "222.5", "278", "340", "397", "453.5", "529", 
"580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5", 
"973", "1108", "1200", "Clusters"), row.names = c("at1g01050.1", 
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01420.1", "at1g01470.1", 
"at1g01800.1", "at1g01910.5", "at1g01920.2", "at1g01980.1", "at1g02020.2", 
"at1g02100.2", "at1g02130.1", "at1g02140.1", "at1g02150.1", "at1g02500.2", 
"at1g02560.1", "at1g02780.1", "at1g02880.3", "at1g02920.1"), class = "data.frame")

第三张表:

> dput(tbl_col_clu3[1:20,])
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), `33.95` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `58.66` = c(0, 0, 0, 0, 0.328143363, 
0.552139556, 0.495919686, 0, 0, 0, 0, 0, 0, 0, 0, 0.416266322, 
0.886125103, 1, 1, 0), `84.42` = c(0, 0, 0, 0, 1, 1, 0, 0, 0, 
0, 0, 0.327004551, 0, 0, 0, 0.956778355, 1, 0.175277617, 0.240402438, 
0), `110.21` = c(0, 0, 0, 0, 0, 0.151581882, 0, 0, 0, 0, 0, 1, 
0, 0, 1, 0, 0.091367379, 0.029316359, 0, 0), `134.16` = c(0.190968551, 
0, 0, 0, 0, 0.164736594, 0, 0, 0, 0, 0, 0.650199285, 0, 0, 0, 
0, 0.097800974, 0.007393484, 0, 0), `164.69` = c(0.5342874459, 
0, 0.3619993464, 0, 0, 0.1891527151, 0, 0, 0, 0, 0, 0.4926963182, 
0, 0, 0, 0, 0, 0, 0, 0), `199.1` = c(0.866134859, 0, 0.405387979, 
0, 0, 0.274468991, 0, 0, 0, 0, 0, 0.352737127, 0.170514318, 0, 
0, 0, 0, 0, 0, 0), `234.35` = c(1, 0, 0.446118481, 0, 0, 0.338427523, 
0, 0, 0, 0, 0, 0.204601923, 0.343919727, 0, 0, 0, 0, 0, 0, 0), 
    `257.19` = c(0.732231652, 0, 0.666653103, 0, 0, 0.403078017, 
    0, 0, 0, 0, 0, 0.315665123, 1, 0, 0, 0, 0, 0, 0, 0), `361.84` = c(0.660960044, 
    0, 1, 0, 0, 0.202578329, 0, 0, 0, 0, 0, 0.320183046, 0.424361453, 
    0, 0, 0, 0, 0, 0, 0), `432.74` = c(0.47961801, 0, 0.48323321, 
    0, 0, 0.25926071, 0, 0, 0, 0, 0, 0.36362413, 0.43039587, 
    0, 0, 0, 0, 0, 0, 0), `506.34` = c(0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0.22943212, 0.19354376, 0, 0, 0, 0, 0, 0, 0), `581.46` = c(0, 
    0.52783556, 0, 1, 0, 0, 0, 0.64407392, 0, 0.70701938, 0, 
    0.2596209, 0.29757967, 0, 0, 0, 0, 0, 0, 0), `651.71` = c(0, 
    0.32678969, 0, 0.36428195, 0, 0, 0, 0.64951761, 0, 0.80866933, 
    1, 0.18614028, 0.21567888, 0.32813633, 0, 0, 0, 0, 0, 0), 
    `732.59` = c(0, 0.229023369, 0, 0.312832425, 0, 0, 0, 0.696041374, 
    0, 0.590471454, 0, 0.108699479, 0.187935709, 0.275177957, 
    0, 0, 0, 0, 0, 0.243080694), `817.56` = c(0, 0.25668583, 
    0, 0.4003249, 0, 0, 0, 0.53376606, 0, 0.85524485, 0, 0.22539659, 
    0.27977127, 0.55089774, 0, 0, 0, 0, 0, 1), `896.24` = c(0, 
    0.31675535, 0, 0.50882005, 0, 0, 0, 0.74705458, 0.12936306, 
    1, 0, 0.1949139, 0.21957859, 0.75063327, 0, 0, 0, 0, 0, 0.63346358
    ), `971.77` = c(0, 0.27811949, 0, 0.48419038, 0, 0, 0, 0.8563439, 
    0.39897143, 0.84491933, 0, 0.13935282, 0.17670128, 0.84111004, 
    0, 0, 0, 0, 0, 0), `1038.91` = c(0, 1, 0, 0.52506752, 0, 
    0, 0, 1, 1, 0.85617714, 0, 0.13507463, 0, 1, 0, 0, 0, 0, 
    0, 0), Clusters = structure(c(222L, 88L, 237L, 88L, 145L, 
    155L, 143L, 88L, 122L, 88L, 97L, 180L, 260L, 102L, 186L, 
    145L, 149L, 149L, 145L, 106L), .Label = c("10", "10,11", 
    "10,11,12", "10,11,12,13", "10,11,12,13,14", "10,11,12,13,14,15", 
    "10,11,12,13,14,15,16", "10,11,12,13,14,15,16,17,18", "10,11,12,13,14,15,16,17,18,19", 
    "10,11,12,13,14,15,16,17,18,19,20", "10,11,12,14", "10,11,12,14,15", 
    "10,11,12,14,15,16", "10,11,12,14,15,16,17,18", "10,11,12,14,15,16,17,18,19", 
    "10,11,12,14,15,16,17,18,19,20", "10,11,12,14,15,17,18,19", 
    "10,11,12,15,16,17", "10,11,14", "10,11,15", "10,11,15,16,17", 
    "10,11,16", "10,11,17", "10,11,20", "10,12", "10,14,15,16", 
    "10,14,15,16,17,18,19", "10,15", "10,15,16", "10,15,16,18", 
    "10,16,19", "10,18,19,20", "10,19", "10,19,20", "10,20", 
    "11", "11,12", "11,12,13", "11,12,13,14", "11,12,13,14,15", 
    "11,12,13,14,15,16", "11,12,13,14,15,16,17,18", "11,12,13,14,15,16,17,18,19", 
    "11,12,13,14,15,16,17,18,19,20", "11,12,13,14,15,16,18,19", 
    "11,12,14,15", "11,12,14,15,16,17", "11,12,14,15,16,17,18", 
    "11,12,14,15,16,17,18,19", "11,12,14,15,16,17,18,19,20", 
    "11,12,18", "11,12,19", "11,12,20", "12", "12,13", "12,13,14", 
    "12,13,14,15", "12,13,14,15,16", "12,13,14,15,16,17,18", 
    "12,13,14,15,16,17,18,19,20", "12,14", "12,14,15", "12,14,15,16", 
    "12,14,15,16,17", "12,14,15,16,17,18", "12,14,15,16,17,18,19", 
    "12,14,15,16,17,18,19,20", "12,14,15,16,20", "12,14,15,18,19,20", 
    "12,15", "12,16", "12,16,17,18", "12,18,19,20", "12,19,20", 
    "12,20", "13", "13,14", "13,14,15", "13,14,15,16,17,18,19,20", 
    "13,16", "13,20", "14", "14,15", "14,15,16", "14,15,16,17", 
    "14,15,16,17,18", "14,15,16,17,18,19", "14,15,16,17,18,19,20", 
    "14,15,16,18", "14,15,17", "14,15,18", "14,16", "14,16,17", 
    "14,16,17,18,19,20", "14,18,19,20", "14,19", "15", "15,16", 
    "15,16,17", "15,16,17,18", "15,16,17,18,19", "15,16,17,18,19,20", 
    "15,20", "16", "16,17", "16,17,18", "16,17,18,19", "16,17,18,19,20", 
    "16,17,18,20", "16,17,19", "16,18,19,20", "16,19,20", "17", 
    "17,18", "17,18,19", "17,18,19,20", "17,18,20", "17,19,20", 
    "17,20", "18", "18,19", "18,19,20", "19", "19,20", "2", "2,19,20", 
    "2,3", "2,3,4", "2,3,4,5", "2,3,4,5,11", "2,3,4,5,6", "2,3,4,5,6,7,8", 
    "2,3,4,5,6,7,8,11,12", "2,3,4,5,6,7,8,9", "2,3,4,5,6,7,8,9,10", 
    "2,3,4,5,6,7,8,9,10,11", "2,3,4,5,6,7,8,9,10,11,12", "2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "2,4", "2,5", "2,5,6,7", "20", "3", "3,18", "3,4", "3,4,10", 
    "3,4,20", "3,4,5", "3,4,5,6", "3,4,5,6,7", "3,4,5,6,7,8", 
    "3,4,5,6,7,8,9", "3,4,5,6,7,8,9,10", "3,4,5,6,7,8,9,10,11", 
    "3,4,5,6,7,8,9,10,11,12", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17", 
    "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18", "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "3,4,8", "3,4,8,9", "3,5", "3,7", "3,9", "4", "4,5", "4,5,12,13", 
    "4,5,16", "4,5,6", "4,5,6,16,17,18,19,20", "4,5,6,20", "4,5,6,7", 
    "4,5,6,7,8", "4,5,6,7,8,10,11", "4,5,6,7,8,9", "4,5,6,7,8,9,10", 
    "4,5,6,7,8,9,10,11", "4,5,6,7,8,9,10,11,12", "4,5,6,7,8,9,10,11,12,13,14,15", 
    "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19", "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20", "4,5,6,7,8,9,16,17", 
    "4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20", "4,6,7", "4,7,13", 
    "5", "5,11,12,14,15,16,17,18,19", "5,14", "5,14,15,16", "5,16,19", 
    "5,17,18,19,20", "5,18", "5,6", "5,6,7", "5,6,7,10", "5,6,7,8", 
    "5,6,7,8,10", "5,6,7,8,9", "5,6,7,8,9,10", "5,6,7,8,9,10,11", 
    "5,6,7,8,9,10,11,12", "5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", 
    "5,6,7,8,9,10,11,12,13,14,15,16", "5,6,7,8,9,10,11,12,13,14,15,16,17,18", 
    "5,6,7,8,9,10,11,12,13,14,15,16,17,18,19", "5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "5,6,7,8,9,16,17,18,19,20", "5,6,8", "5,7,8,9,10", "5,7,8,9,10,14,15,16,17,18", 
    "5,8", "6", "6,7", "6,7,16", "6,7,8", "6,7,8,10,11,12,15,16,17,18", 
    "6,7,8,19", "6,7,8,9", "6,7,8,9,10", "6,7,8,9,10,11", "6,7,8,9,10,11,12", 
    "6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,13,14,15,16,17", 
    "6,7,8,9,10,11,12,13,14,15,16,17,18,19", "6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "6,7,8,9,10,11,12,14,15,16", "6,7,8,9,10,18,19", "7", "7,10,11,14,15", 
    "7,12", "7,8", "7,8,12", "7,8,9", "7,8,9,10", "7,8,9,10,11", 
    "7,8,9,10,11,12", "7,8,9,10,11,12,13", "7,8,9,10,11,12,13,14,15,16", 
    "7,8,9,10,11,12,13,14,15,16,17,18", "7,8,9,10,11,12,13,14,15,16,17,18,19", 
    "7,8,9,10,11,12,13,14,15,16,17,18,19,20", "7,8,9,10,11,12,14,15,16,17,18,19", 
    "7,8,9,10,11,12,14,15,16,17,18,19,20", "7,8,9,10,12,15,16,17,18", 
    "7,9,10,11,12,13,14,15,16,17,18,19,20", "8", "8,10", "8,10,20", 
    "8,14,15,16,17,18,19,20", "8,16,17", "8,9", "8,9,10", "8,9,10,11", 
    "8,9,10,11,12", "8,9,10,11,12,13,14", "8,9,10,11,12,13,14,15", 
    "8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17,18", 
    "8,9,10,11,12,13,14,15,16,17,18,19", "8,9,10,11,12,13,14,15,16,17,18,19,20", 
    "8,9,10,11,12,14,15,16", "8,9,10,11,12,14,15,16,17,18,19,20", 
    "8,9,10,14,15,16,17,18,19,20", "8,9,17", "9", "9,10", "9,10,11", 
    "9,10,11,12", "9,10,11,12,13,14,15,16,17", "9,10,11,12,13,14,15,16,17,18", 
    "9,10,11,12,13,14,15,16,17,18,19", "9,10,11,12,13,14,15,16,17,18,19,20", 
    "9,10,11,12,14,15,16", "9,10,11,12,14,15,16,17,18", "9,10,11,12,14,15,16,17,18,19", 
    "9,10,11,12,14,15,16,17,18,19,20", "9,10,11,12,16,17,18,19,20", 
    "9,10,11,14,15,16,17", "9,10,12,14,15,16,17", "9,10,14,15", 
    "9,11,12", "9,11,12,14", "9,12,14", "9,20"), class = "factor")), .Names = c("10", 
"33.95", "58.66", "84.42", "110.21", "134.16", "164.69", "199.1", 
"234.35", "257.19", "361.84", "432.74", "506.34", "581.46", "651.71", 
"732.59", "817.56", "896.24", "971.77", "1038.91", "Clusters"
), row.names = c("at1g01050.1", "at1g01080.1", "at1g01090.1", 
"at1g01320.2", "at1g01470.1", "at1g01800.1", "at1g01910.5", "at1g01960.1", 
"at1g01980.1", "at1g02150.1", "at1g02470.1", "at1g02500.2", "at1g02560.1", 
"at1g02780.1", "at1g02816.1", "at1g02880.2", "at1g02920.1", "at1g02930.2", 
"at1g03030.1", "at1g03090.2"), class = "data.frame")

对我们来说,重要的是最后一列(Clusters)和row.names。Clusters列说明可以在哪些列中找到该基因的丰度。对我来说,基因具体位于哪个集群并不重要,重要的是哪些基因与它聚在一起。

让我们举个例子:

Those genes belong to the same cluster (cluster 5) in data1.
at1g09640.1
at1g07250.1
at1g08200.1
at1g09300.2    ##
at1g09490.2    ## Those
at1g09760.1    ##
at1g09780.1

如果我们分析另一个数据集(data2),可以看到其中一些基因再次一起出现。它们所在的集群可能不同(例如集群20),但它们仍然在一起,这对我来说才是最重要的。

at1g02880.3
at1g01220.1
at1g09300.2   ## 
at1g09490.2   ## Those
at1g09760.1   ## 
at1g02130.1

我有15个类似的数据集,我希望能够问R:给我看一下在全部15个数据集中、或者在15个数据集中的13个里都能一起找到的基因,等等......

有什么想法吗?

2 个答案:

答案 0 :(得分:1)

首先,您需要将这些以逗号分隔的列表转换为列 - 以这种方式使用它们要容易得多。然后,您想要找到哪些基因具有匹配的列。最后,您可以汇总以获得有多少基因与其他基因匹配的总数。

请注意,结果中每一对基因会以两种顺序各出现一次,并且还包含基因与自身的配对。此外,"Clusters"列会告诉您两个基因处于完全相同的一组集群中的次数。

这将在O(n ^ 2)时间内运行,这意味着分析的基因数量翻倍将使时间翻两番。我的快速计时测试估计我的计算机需要15个小时来完成15个2300行的数据帧。

library(plyr)

# Every clustering table to analyse, in the order they will be processed.
frame_list <- list(
  tbl_col_clu1,
  tbl_col_clu2,
  tbl_col_clu3
)

turn_numbers_into_columns <- function(x) {
  # Adds one indicator column per cluster id listed in x$Clusters.
  #
  # Args:
  #   x: a data frame with a `Clusters` column holding a comma-separated
  #      list of cluster ids (character or factor). Only the first row's
  #      list is parsed, so call this on one row at a time.
  #
  # Returns:
  #   `x` with a new column, set to 1, for every cluster id in the list.
  #   `x` is returned unchanged when it has no rows or the list is empty/NA.
  if (nrow(x) == 0) {
    return(x)
  }

  # as.character() lets factor columns through; strsplit() rejects factors.
  cluster_ids <- strsplit(as.character(x$Clusters), ",", fixed = TRUE)[[1]]

  # Drop NA / empty labels, which cannot be used as column names.
  cluster_ids <- cluster_ids[!is.na(cluster_ids) & nzchar(cluster_ids)]

  if (length(cluster_ids) > 0) {
    x[, cluster_ids] <- 1
  }
  x
}

get_comparison <- function(current_table) {
  # Builds the all-pairs comparison data frame for one input table.
  #
  # Args:
  #   current_table: a data frame whose row names are gene ids and which has
  #     a `Clusters` column of comma-separated cluster ids.
  #
  # Returns:
  #   A data frame with one row per ordered (gene, gene2) pair, self-pairs
  #   included. Apart from `gene` and `gene2`, every column holds the result
  #   of comparing the two genes' values in that column with `==`.

  # Keep only what the comparison needs: the gene id and its cluster string.
  gene_clusters <- data.frame(
    gene = row.names(current_table),
    Clusters = as.character(current_table$Clusters),
    stringsAsFactors = FALSE
  )

  # Expand each gene's cluster list into per-cluster indicator columns.
  membership <- adply(gene_clusters, 1, turn_numbers_into_columns)

  # Compares one gene's row against every gene in the table.
  compare_against_all <- function(left) {
    ddply(membership, "gene", function(right) {
      matches <- as.data.frame(left == right)
      # The comparison overwrote the ids with logicals; restore them.
      matches$gene <- left$gene
      matches$gene2 <- right$gene
      matches
    })
  }

  # This is the slow step: every gene is compared against every gene.
  ddply(membership, "gene", compare_against_all)
}

# Run the pairwise comparison on every table and stack the results.
combined_frame <- ldply(frame_list, get_comparison)

# Sum the match flags of every (gene, gene2) pair over all tables.
# na.rm = TRUE skips NA comparison results instead of letting them turn the
# whole sum into NA. Spelled out as TRUE because `T` is an ordinary variable
# that can be reassigned.
sum_frame <- aggregate(
  combined_frame[, !(names(combined_frame) %in% c("gene", "gene2"))],
  by = combined_frame[, c("gene", "gene2")],
  FUN = sum,
  na.rm = TRUE
)

# Open the summed table in the interactive data viewer.
View(sum_frame)

如果各个数据集具有相同的基因和分组,您可以把所有内容转换为数组(array),数组的运行速度比数据框快,可以把运行时间缩短到原来的约六分之一。运行非常慢的那一部分可以替换为类似下面的代码。它返回可以直接相加的三维数组。

# Array-based replacement for the slow nested ddply() comparison.
# NOTE(review): assumes split_f has already been converted to an array with
# one row per gene - confirm before use.
comparison_frame <- aaply(split_f, 1, function(row_a) {
  # Echo the row currently being processed (presumably a progress trace).
  print(row_a)
  aaply(split_f, 1, function(row_b) {
    # 1 x length(row_a) array holding the element-wise `==` comparison.
    array(row_a == row_b, c(1, length(row_a)))
  })
})

答案 1 :(得分:1)

把这些数据交给SPMF,用Apriori或FPGrowth算法处理。SPMF期望的输入是一个由逗号分隔的整数序列组成的文件(您可能需要先转换数据)。每个序列单独占一行:

1,2,4,10
3,2,1,11,12
2,5,14,5

你这样调用它:

java -jar spmf.jar run FPGrowth sequences.txt output.txt 35% 90%

第一个数字是最小支持度(一个组至少要出现在多大比例的集合中,才会被视为一个组)。SPMF包含多种不同的算法,您可以逐一尝试,看看哪一个最适合您。