用匹配的关键字总结dplyr

时间:2019-04-15 12:58:33

标签: r dplyr summarize

我有一个数据表MutationsNumberTable_Neighbours,其中包含不同癌症中每个基因的突变数(HGNC)。另一个表Neighbors_table用于匹配基因及其最接近的基因 染色体上的邻居。

MutationsNumberTable_Neighbours = merge(MutationsNumberTable_Neighbours, Neighbors_table, by = "Gene")

MutationsNumberTable_Neighbours[1:3,]
HGNC ACC BLCA BRCA CESC CHOL COAD/READ_MSI COAD/READ_POLE COAD/READ_REST DLBC ESCA GBM HNSC KICH KIRC KIRP LGG LIHC LUAD_HEAVY LUAD_REST LUSC_HEAVY LUSC_REST MESO OV PAAD PCPG PRAD SARC SKCM_HIGH SKCM_LOW STAD TGCT THCA THYM UCEC_MSI
1: A1BG   1    5    2    1    0             1              5              2    0    0   2    7    0    1    0   2    3          4         3          1         3    0  0    3    0    2    0         2        2    7    0    0    0        6
2: A1CF   1    5    4    4    1             3              8              0    0    0   4   13    0    2    0   4    5          6         5          5        16    0  1    2    0    6    2        58       15   11    0    0    0        0
3:  A2M   0   20    9    9    0             7             23              9    1    6   9   12    0    6    3   5    6         18         5          7        22    0  0    2    1    4    6        47       16   40    1    1    0        2
   UCEC_POLE UCEC_REST UCS UVM_HIGH UVM_LOW
1:        14         1   0        0       0
2:        41         0   1        0       0
3:        72         6   1        0       0
                                                                                                                                                                                                                                                                                                                                                                                         Neighbors
1: ZNF134,ZNF606,SLC27A5,ZNF419,ZSCAN18,ZNF814,C19orf18,RPS5,ZNF548,ZNF135,ZNF324,ZBTB45,ZNF17,ZNF530,ZNF211,ZNF551,ZNF324B,ZNF552,ZNF8,ZNF549,ZNF547,ZNF304,ZSCAN4,ZNF544,ZNF772,ZNF671,ZNF329,ZSCAN22,UBE2M,ZNF417,TRIM28,ZNF418,ZSCAN1,ZNF587,CHMP2A,MZF1,ZNF586,ZNF274,ZNF132,ZNF550,ZNF587B,ZNF497,ZNF749,ZIK1,ZNF446,VN1R1,ZNF776,ZNF154,ZNF256,ZNF773,ZNF586,ZNF416,ZNF584,ZNF837,TRAPPC2P1
2:                                                                                                                                                                                                                                                                                                                            ASAH2,SGMS1,CSTF2T,AGAP6,PRKG1,NCOA4,FAM21A,PRKG1,TIMM23,ASAH2B,MSMB
3:                                                                                                                                                                                                               KLRB1,CLEC1A,NECAP1,ZNF705A,AICDA,A2ML1,CLEC4A,PHC1,FAM90A1,MFAP5,PZP,RIMKLB,KLRG1,CLEC4E,CLEC1B,CLEC4D,CLEC9A,CD69,CLEC6A,CLECL1,KLRF1,CLEC2D,CLEC12B,CLEC12A,CLEC2B,CLEC2A,M6PR

dput(head(MutationsNumberTable_Neighbours))
structure(list(HGNC = c("A1BG", "A1CF", "A2M", "A2ML1", "A3GALT2", 
"A4GALT"), ACC = c(1L, 1L, 0L, 2L, 0L, 0L), BLCA = c(5L, 5L, 
20L, 19L, 1L, 2L), BRCA = c(2L, 4L, 9L, 14L, 0L, 1L), CESC = c(1L, 
4L, 9L, 10L, 1L, 5L), CHOL = c(0L, 1L, 0L, 0L, 0L, 0L), `COAD/READ_MSI` = c(1L, 
3L, 7L, 10L, 3L, 3L), `COAD/READ_POLE` = c(5L, 8L, 23L, 34L, 
3L, 4L), `COAD/READ_REST` = c(2L, 0L, 9L, 9L, 0L, 2L), DLBC = c(0L, 
0L, 1L, 1L, 0L, 0L), ESCA = c(0L, 0L, 6L, 2L, 1L, 2L), GBM = c(2L, 
4L, 9L, 4L, 2L, 4L), HNSC = c(7L, 13L, 12L, 8L, 0L, 1L), KICH = c(0L, 
0L, 0L, 0L, 0L, 0L), KIRC = c(1L, 2L, 6L, 3L, 0L, 0L), KIRP = c(0L, 
0L, 3L, 1L, 0L, 0L), LGG = c(2L, 4L, 5L, 3L, 0L, 0L), LIHC = c(3L, 
5L, 6L, 7L, 0L, 1L), LUAD_HEAVY = c(4L, 6L, 18L, 17L, 1L, 0L), 
    LUAD_REST = c(3L, 5L, 5L, 15L, 0L, 0L), LUSC_HEAVY = c(1L, 
    5L, 7L, 6L, 0L, 0L), LUSC_REST = c(3L, 16L, 22L, 21L, 0L, 
    1L), MESO = c(0L, 0L, 0L, 1L, 0L, 0L), OV = c(0L, 1L, 0L, 
    1L, 0L, 0L), PAAD = c(3L, 2L, 2L, 5L, 2L, 0L), PCPG = c(0L, 
    0L, 1L, 0L, 0L, 0L), PRAD = c(2L, 6L, 4L, 1L, 0L, 4L), SARC = c(0L, 
    2L, 6L, 0L, 0L, 2L), SKCM_HIGH = c(2L, 58L, 47L, 51L, 2L, 
    17L), SKCM_LOW = c(2L, 15L, 16L, 23L, 0L, 5L), STAD = c(7L, 
    11L, 40L, 26L, 3L, 3L), TGCT = c(0L, 0L, 1L, 0L, 0L, 0L), 
    THCA = c(0L, 0L, 1L, 0L, 0L, 0L), THYM = c(0L, 0L, 0L, 0L, 
    0L, 1L), UCEC_MSI = c(6L, 0L, 2L, 1L, 0L, 2L), UCEC_POLE = c(14L, 
    41L, 72L, 73L, 4L, 27L), UCEC_REST = c(1L, 0L, 6L, 9L, 0L, 
    1L), UCS = c(0L, 1L, 1L, 2L, 0L, 0L), UVM_HIGH = c(0L, 0L, 
    0L, 0L, 0L, 0L), UVM_LOW = c(0L, 0L, 0L, 0L, 0L, 0L), Neighbors = c("ZNF134,ZNF606,SLC27A5,ZNF419,ZSCAN18,ZNF814,C19orf18,RPS5,ZNF548,ZNF135,ZNF324,ZBTB45,ZNF17,ZNF530,ZNF211,ZNF551,ZNF324B,ZNF552,ZNF8,ZNF549,ZNF547,ZNF304,ZSCAN4,ZNF544,ZNF772,ZNF671,ZNF329,ZSCAN22,UBE2M,ZNF417,TRIM28,ZNF418,ZSCAN1,ZNF587,CHMP2A,MZF1,ZNF586,ZNF274,ZNF132,ZNF550,ZNF587B,ZNF497,ZNF749,ZIK1,ZNF446,VN1R1,ZNF776,ZNF154,ZNF256,ZNF773,ZNF586,ZNF416,ZNF584,ZNF837,TRAPPC2P1", 
    "ASAH2,SGMS1,CSTF2T,AGAP6,PRKG1,NCOA4,FAM21A,PRKG1,TIMM23,ASAH2B,MSMB", 
    "KLRB1,CLEC1A,NECAP1,ZNF705A,AICDA,A2ML1,CLEC4A,PHC1,FAM90A1,MFAP5,PZP,RIMKLB,KLRG1,CLEC4E,CLEC1B,CLEC4D,CLEC9A,CD69,CLEC6A,CLECL1,KLRF1,CLEC2D,CLEC12B,CLEC12A,CLEC2B,CLEC2A,M6PR", 
    "KLRB1,NECAP1,ZNF705A,AICDA,CLEC4A,PHC1,FAM90A1,C3AR1,MFAP5,PZP,RIMKLB,A2M,KLRG1,CLEC4E,SLC2A3,CLEC4D,SLC2A14,FOXJ2,CD69,CLEC6A,CLECL1,KLRF1,CLEC2D,CLEC2B,M6PR", 
    "PHC2,HDAC1,KIAA1522,ZSCAN20,RBBP4,HMGB4,C1orf94,ADC,S100PBP,ZNF362,BSDC1,AK2,YARS,TMEM54,ZBTB8A,RNF19B,TSSK3,TRIM62,SYNC,HPCA,MARCKSL1,ZBTB8B,ZBTB8OS,FNDC5", 
    "PACSIN2,C22orf46,MEI1,NFAM1,SCUBE1,EFCAB6,FAM109B,TCF20,SREBF2,SERHL2,NAGA,TTLL1,TSPO,POLDIP3,TNFRSF13C,BIK,TTLL12,CYP2D6,MCAT,WBP2NL,RRP7A,CYB5R3,MPPED1,ARFGAP3,SEPT3,CCDC134,NDUFA6,CENPM,SMDT1,ATP5L2"
    )), class = c("data.table", "data.frame"), row.names = c(NA, 
-6L), .internal.selfref = <pointer: 0x16d2358>, sorted = "HGNC", .Names = c("HGNC", 
"ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD/READ_MSI", "COAD/READ_POLE", 
"COAD/READ_REST", "DLBC", "ESCA", "GBM", "HNSC", "KICH", "KIRC", 
"KIRP", "LGG", "LIHC", "LUAD_HEAVY", "LUAD_REST", "LUSC_HEAVY", 
"LUSC_REST", "MESO", "OV", "PAAD", "PCPG", "PRAD", "SARC", "SKCM_HIGH", 
"SKCM_LOW", "STAD", "TGCT", "THCA", "THYM", "UCEC_MSI", "UCEC_POLE", 
"UCEC_REST", "UCS", "UVM_HIGH", "UVM_LOW", "Neighbors"))

我想得到一个新表MutationsNumberTable_Neighbours_summarized,其中将包含每个基因(包括其邻居)的突变计数。我能够产生一个简单的代码:

cancers = colnames(MutationsNumberTable_Neighbours) %>% setdiff(c("HGNC", "Neighbors"))

SummarizeMutationsNeighbours = function(gene){
  Neighbours = c(gene, unlist(strsplit(MutationsNumberTable_Neighbours$Neighbors[MutationsNumberTable_Neighbours$HGNC == gene], ",")))
  MutTable_geneNeighb = MutationsNumberTable_Neighbours[MutationsNumberTable_Neighbours$HGNC %in% Neighbours, ..cancers]
  MutTable_geneNeighb = plyr::numcolwise(sum)(MutTable_geneNeighb)
  MutTable_geneNeighb$Gene = gene
  return((MutTable_geneNeighb))

}

library(pbapply)
MutationsNumberTable_Neighbours_summarized = pblapply(MutationsNumberTable_Neighbours$HGNC, SummarizeMutationsNeighbours)
MutationsNumberTable_Neighbours_summarized = rbindlist(MutationsNumberTable_Neighbours_summarized)

我认为这不是最佳解决方案(根据我的数据,这需要15个小时才能完成)。但是,我想出的dplyr方法不会产生任何效果:

MutationsNumberTable_Neighbours_summ = MutationsNumberTable_Neighbours %>% group_by(HGNC) %>% mutate(N = paste(HGNC, Neighbors, sep=",")) 
MutationsNumberTable_Neighbours_summ = MutationsNumberTable_Neighbours_summ %>% filter(HGNC %in% unlist(strsplit(N, ","))) %>% summarise_at(cancers, sum)

0 个答案:

没有答案