R:在两个表之间按列匹配,并添加另一列中对应的值

时间:2018-02-22 20:40:21

标签: r merge dplyr

我想将小鼠基因与人类基因相匹配。为此,我有两个单独的表,想比较其中的小鼠基因列(表1的 SYMBOL 与表2的 MGI.symbol),并在表1中添加一列对应的人类基因名称(HGNC.symbol)。

我用 dplyr 包尝试过,但没能成功;merge 也没有解决问题。

表1

  

dput(d)

structure(list(SYMBOL = structure(c(55L, 11L, 36L, 64L, 2L, 52L, 
8L, 42L, 63L, 18L, 71L, 72L, 85L, 54L, 82L, 83L, 27L, 84L, 73L, 
4L, 6L, 34L, 68L, 41L, 38L, 57L, 23L, 67L, 28L, 29L, 10L, 35L, 
51L, 7L, 69L, 74L, 39L, 65L, 24L, 79L, 44L, 16L, 33L, 32L, 46L, 
78L, 22L, 66L, 48L, 40L, 37L, 76L, 56L, 12L, 13L, 14L, 5L, 15L, 
1L, 3L, 17L, 62L, 21L, 20L, 45L, 60L, 61L, 77L, 9L, 81L, 70L, 
75L, 49L, 26L, 31L, 53L, 25L, 43L, 80L, 19L, 30L, 50L, 47L, 58L, 
59L), .Label = c("Alox12", "Apoh", "Arvcf", "Axin2", "Bcl6b", 
"Brat1", "Btbd17", "Cav2", "Ccm2", "Ccnd2", "Cdc45", "Cdh1", 
"Cdh4", "Ckmt1", "Clec10a", "Clec2g", "Comt", "Cox5a", "Cttnbp2", 
"Dazap2", "Dbt", "Dgke", "Dlat", "Drp2", "Egfl6", "Fap", "Fer", 
"Fgf23", "Fgf6", "Galnt1", "Gcg", "Glra1", "Gmpr", "Gna12", "Gpr107", 
"H19", "Hddc2", "Igsf5", "Ins2", "Itgb2", "Itgb2l", "Klf6", "Lck", 
"Lhx2", "Mcts1", "Mid2", "Mkrn2", "Mnt", "Mx1", "Myf5", "Nalcn", 
"Narf", "Ndufa9", "Ngfr", "Pbsn", "Pemt", "Pih1d2", "Pparg", 
"Raf1", "Rem1", "Rnf17", "Rtca", "Scmh1", "Scml2", "Scnn1g", 
"Scpep1", "Sdhd", "Slc22a18", "Slfn4", "Tbrg4", "Tbx2", "Tbx4", 
"Tfe3", "Th", "Tmprss2", "Tpd52l1", "Trappc10", "Trim25", "Tspan32", 
"Tssk3", "Wap", "Wnt3", "Wnt9a", "Xpo6", "Zfy2"), class = "factor"), 
    X1 = c(0L, 19L, 491L, 0L, 0L, 58L, 75L, 264L, 90L, 23L, 106L, 
    5L, 0L, 3L, 7L, 145L, 106L, 248L, 108L, 85L, 30L, 299L, 2L, 
    5L, 1L, 0L, 64L, 65L, 0L, 0L, 355L, 173L, 11L, 0L, 84L, 0L, 
    0L, 0L, 18L, 10L, 0L, 2L, 11L, 0L, 70L, 168L, 37L, 155L, 
    86L, 109L, 12L, 3L, 1L, 51L, 0L, 55L, 31L, 14L, 5L, 41L, 
    71L, 14L, 19L, 338L, 40L, 16L, 2L, 152L, 80L, 0L, 74L, 1L, 
    60L, 136L, 1L, 25L, 0L, 3L, 10L, 43L, 292L, 0L, 31L, 234L, 
    160L), X2 = c(0L, 4L, 177L, 0L, 0L, 17L, 29L, 152L, 71L, 
    7L, 111L, 1L, 0L, 13L, 0L, 20L, 71L, 197L, 48L, 30L, 18L, 
    107L, 3L, 1L, 6L, 1L, 26L, 20L, 0L, 0L, 75L, 60L, 26L, 6L, 
    44L, 0L, 30L, 1L, 41L, 5L, 0L, 0L, 7L, 0L, 31L, 99L, 26L, 
    33L, 69L, 44L, 7L, 2L, 1L, 39L, 1L, 251L, 23L, 14L, 1L, 80L, 
    40L, 8L, 15L, 110L, 61L, 15L, 0L, 94L, 45L, 0L, 37L, 16L, 
    36L, 7L, 30L, 7L, 0L, 7L, 15L, 38L, 138L, 0L, 24L, 50L, 95L
    )), .Names = c("SYMBOL", "X1", "X2"), class = "data.frame", row.names = c(NA, 
-85L))

表2

  

dput(mouse_to_human_genes)

structure(list(MGI.symbol = c("Pemt", "Mid2", "Ndufa9", "Ndufa9", 
"Cttnbp2", "Cdh1", "Brat1", "Ccm2", "Cdh4", "Itgb2l", "Tbrg4", 
"Slc22a18", "Itgb2", "Tfe3", "Alox12", "Gna12", "Galnt1", "Rnf17", 
"Igsf5", "Ccnd2", "Rtca", "Dbt", "Fgf23", "Fgf6", "Bcl6b", "Klf6", 
"Myf5", "Fap", "Cav2", "Pparg", "Slfn4", "Slfn4", "Gcg", "Dgke", 
"Apoh", "Raf1", "Cdc45", "Nalcn", "Ckmt1", "Mkrn2", "Tbx2", "Lck", 
"Xpo6", "Lhx2", "Gmpr", "Axin2", "Trim25", "Hddc2", "Trappc10", 
"Trappc10", "Mx1", "Cox5a", "Scml2", "Egfl6", "Comt", "Scpep1", 
"Tmprss2", "Dazap2", "Arvcf", "Tbx4", "Rem1", "Drp2", "Tpd52l1", 
"Tssk3", "Btbd17", "Gpr107", "Ins2", "Wnt9a", "Glra1", "Th", 
"Mnt", "Pih1d2", "Scmh1", "Scnn1g", "Tspan32", "Dlat", "Wnt3", 
"Fer", "Sdhd", "Sdhd", "Ckmt1", "Narf", "Ngfr"), HGNC.symbol = c("PEMT", 
"MID2", "", "NDUFA9", "CTTNBP2", "CDH1", "BRAT1", "CCM2", "CDH4", 
"ITGB2", "TBRG4", "SLC22A18", "ITGB2", "TFE3", "ALOX12", "GNA12", 
"GALNT1", "RNF17", "IGSF5", "CCND2", "RTCA", "DBT", "FGF23", 
"FGF6", "BCL6B", "KLF6", "MYF5", "FAP", "CAV2", "PPARG", "SLFN12", 
"SLFN12L", "GCG", "DGKE", "APOH", "RAF1", "CDC45", "NALCN", "CKMT1B", 
"MKRN2", "TBX2", "LCK", "XPO6", "LHX2", "GMPR", "AXIN2", "TRIM25", 
"HDDC2", "TRAPPC10", "", "MX1", "COX5A", "SCML2", "EGFL6", "COMT", 
"SCPEP1", "TMPRSS2", "DAZAP2", "ARVCF", "TBX4", "REM1", "DRP2", 
"TPD52L1", "TSSK3", "BTBD17", "GPR107", "INS", "WNT9A", "GLRA1", 
"TH", "MNT", "PIH1D2", "SCMH1", "SCNN1G", "TSPAN32", "DLAT", 
"WNT3", "FER", "", "SDHD", "CKMT1A", "NARF", "NGFR"), Chromosome.scaffold.name = c("17", 
"X", "12", "12", "7", "16", "7", "7", "20", "21", "7", "11", 
"21", "X", "17", "7", "18", "13", "21", "12", "1", "1", "12", 
"12", "17", "10", "12", "2", "7", "3", "17", "17", "2", "17", 
"17", "3", "22", "13", "15", "3", "17", "1", "16", "9", "6", 
"17", "17", "6", "21", "21", "21", "15", "X", "X", "22", "17", 
"21", "12", "22", "17", "20", "X", "6", "1", "17", "9", "11", 
"1", "5", "11", "17", "11", "1", "16", "11", "11", "17", "5", 
"11", "11", "15", "17", "17"), Gene.start..bp. = c(17505563L, 
107825755L, 4657634L, 4649095L, 117710651L, 68737225L, 2537877L, 
44999475L, 61252426L, 44885953L, 45100100L, 2899721L, 44885953L, 
49028726L, 6996065L, 2728112L, 35581117L, 24764152L, 39745407L, 
4273772L, 100266207L, 100186919L, 4368227L, 4428155L, 7023020L, 
3775996L, 80716912L, 162170684L, 116287380L, 12287368L, 35411060L, 
35464249L, 162142873L, 56834099L, 66212033L, 12583601L, 19479459L, 
101053776L, 43593054L, 12557014L, 61399896L, 32251239L, 28097979L, 
124001670L, 16238580L, 65528563L, 56887909L, 125219962L, 44012319L, 
5155499L, 41420304L, 74919791L, 18239314L, 13569605L, 19941607L, 
56978105L, 41464551L, 51238292L, 19969896L, 61452404L, 31475293L, 
101219769L, 125119049L, 32351521L, 74356416L, 130053426L, 2159779L, 
227918656L, 151822513L, 2163929L, 2384060L, 112064010L, 41027200L, 
23182715L, 2301997L, 112024814L, 46762506L, 108747822L, 112086824L, 
112086773L, 43692886L, 82458180L, 49495293L)), .Names = c("MGI.symbol", 
"HGNC.symbol", "Chromosome.scaffold.name", "Gene.start..bp."), class = "data.frame", row.names = c(NA, 
-83L))

我尝试用下面的代码进行合并:

added_to_list <- d %>%
                  mutate(mouse_to_human=if_else("d$SYMBOL" == "mouse_to_human_genes$MGI.symbol", c("mouse_to_human_genes$HGNC.symbol"), as.character(NA))

但这样得到的新列中全部是 NA。谢谢你的帮助!!

1 个答案:

答案 0 :(得分:0)

我认为您可以执行以下操作:

a$human = apply(a, 1, function(x) b$HGNC.symbol[tolower(b$HGNC.symbol) == tolower(x[3])])

其中 a 是表1,b 是表2。

如果要按公共列进行连接,可以使用 dplyr 中的 left_join。或者您也可以看看 biomaRt,请参考这篇帖子。