拟南芥基因ID转换(BioMart,CLC基因组学工作台输出)

时间:2019-05-12 13:49:10

标签: r biomart

我从CLC基因组学工作台(CLC Genomics Workbench)导出了拟南芥的RNA-seq读数结果。基因列表中混有基因名称(例如“TRY”,“TMM”,“SVP”,“FLC”)和基因ID(例如“AT1G01390”,“AT1G01310”,“AT1G01240”)。我想把它们全部转换为基因名称,以便用做GO term分析的R程序包来处理(该程序包似乎无法识别AT1G01390这样的ID)。

当我使用biomaRt的getBM()函数时,它返回的基因数比我输入的基因列表少得多。来自CLC的原始列表包含全部拟南芥基因(27,655个),而getBM()的输出通常只有12,085个或更少的基因名称。

有人成功进行过这种转换吗?

提前谢谢!

我尝试了各种类型的属性,但是没有一个起作用。

# Question script: load the read counts and sample metadata, connect to the
# Ensembl Plants BioMart, and try to fetch gene names for the row names.
#data load in and conversions, meta matrix/design creation:
    #reads file was created in CLC Genomics Workbench, then the reads column copied and pasted for
      #each sample

  # Tab-separated table; the first column becomes the row names and the
  # result is converted to a matrix.
  reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
  # Metadata table with a header row, read as UTF-16LE.
  meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")


# Connect to the "plants_mart" BioMart on plants.ensembl.org.
mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
  # Lists the available datasets; note this opens a second mart via a fresh
  # useMart() call instead of reusing `mart`.
  listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))  
  # Select the "athaliana_eg_gene" dataset on the mart created above.
  ensembl = useDataset("athaliana_eg_gene",mart= mart)

  # The gene identifiers are the row names of the reads matrix.
  genes <- row.names(reads)

  # Only `external_gene_name` is requested and no `filters` argument is given.
  # NOTE(review): without `filters`, `values` presumably has no effect on the
  # query, and the output has no ID column to map results back to `genes` --
  # confirm against the biomaRt getBM() documentation.
  test1 <- getBM(attributes='external_gene_name', 
        values = genes, 
        mart = ensembl)

1 个答案:

答案 0 :(得分:0)

好的,至少在我的情况下,我找到了解决此问题的方法。

我使用的gmt文件和fgsea只能识别基因符号(例如“TRY”)或Entrez ID。因此,我编写了一个函数,把所有需要转换的ID转换为基因符号或Entrez ID。代码如下:

  reads <- as.matrix(
    read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE)
  )

# The gene identifiers to convert are the row names of the reads matrix.
genes <- rownames(reads)

# Diagnostic: total number of case-insensitive pattern matches across all
# names, printed before conversion so it can be compared with the count after.
# NOTE(review): the pattern starts with an escaped `A`; how `\\A` is treated
# depends on the regex engine -- confirm it counts what is intended.
sum(
  lengths(
    regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))
  )
)

# Small example input for trying the conversion by hand:
#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")

# Convert ID-like gene names to gene symbols, falling back to Entrez IDs.
#
# Every element of `genes` that contains "AT" followed by a digit is looked
# up with getSYMBOL(); if no symbol is found, getEG() is tried; if neither
# lookup yields a value the element is left unchanged. Elements that do not
# look like an ID (and NA elements) are returned as they are.
#
# Args:
#   genes: character vector of gene names and/or IDs.
#
# Returns:
#   A character vector of the same length and order as `genes`.
IDconvert <- function(genes) {
  # One annotation name for every lookup (the original mixed two spellings).
  annotation <- "org.At.tair"

  # First value returned by `lookup` for a single ID, or NA when unmapped.
  first_hit <- function(lookup, id) {
    hit <- lookup(id, data = annotation)
    if (length(hit) == 0L) {
      return(NA_character_)
    }
    as.character(as.list(hit)[[1]])[1L]
  }

  # Symbol if available, otherwise Entrez ID, otherwise the ID itself.
  convert_one <- function(id) {
    symbol <- first_hit(getSYMBOL, id)
    if (!is.na(symbol)) {
      return(symbol)
    }
    entrez <- first_hit(getEG, id)
    if (!is.na(entrez)) {
      return(entrez)
    }
    id
  }

  looks_like_id <- !is.na(genes) & grepl("AT[0-9]", genes)
  ids <- unique(genes[looks_like_id])
  if (length(ids) == 0L) {
    return(genes)
  }

  # Look each distinct ID up once instead of once per occurrence.
  converted <- vapply(ids, convert_one, character(1), USE.NAMES = FALSE)

  # Replace by exact match. Using the ID as a sub() pattern over the whole
  # vector would also rewrite longer names that merely contain the ID and
  # would treat "." in an ID as a regex wildcard.
  genes[looks_like_id] <- converted[match(genes[looks_like_id], ids)]
  genes
}


# Run the conversion on the row names collected earlier.
genes2 <- IDconvert(genes)

# Same diagnostic count as before the conversion, now on the converted names.
sum(
  lengths(
    regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))
  )
)

# Relabel the reads matrix with the converted names.
rownames(reads) <- genes2


# Read the two GMT gene-set files and concatenate them into one collection
# that is passed to fgsea as `pathways`.
gmt <- read.gmt("GSEA_BIO.gmt")
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")
gmt.combo <- c(gmt, gmt.ids)

# Stage 3 GO terms ----
# Build a named vector from the `stat` column of `sub.break3` (names are its
# row names), sort it from highest to lowest, and run fgsea against the
# combined gene sets in `gmt.combo`.

names3 <- row.names(sub.break3)
sub.break3$names <- names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
sub.break3.rank <- sort(ranks, decreasing = TRUE)

fgseaRes3 <- fgsea(
  pathways = gmt.combo,
  stats = sub.break3.rank,
  minSize = 5,
  maxSize = 500,
  nperm = 100000
)

# Keep rows whose `pval` is below 0.05 and collect their pathway names.
# NOTE(review): the bare `pval` inside `[` presumably relies on fgsea
# returning a data.table; also confirm that filtering on `pval` rather than
# an adjusted column is intended.
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway



# Stage 1 GO terms ----
# Build a named vector from the `stat` column of `sub.break1` (names are its
# row names), sort it from highest to lowest, and run fgsea against the
# combined gene sets in `gmt.combo`.

names1 <- row.names(sub.break1)
sub.break1$names <- names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
sub.break1.rank <- sort(ranks, decreasing = TRUE)

fgseaRes1 <- fgsea(
  pathways = gmt.combo,
  stats = sub.break1.rank,
  minSize = 5,
  maxSize = 500,
  nperm = 100000
)

# Keep rows whose `pval` is below 0.05 and collect their pathway names.
# NOTE(review): the bare `pval` inside `[` presumably relies on fgsea
# returning a data.table; also confirm that filtering on `pval` rather than
# an adjusted column is intended.
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway


# Stage 2 GO terms ----
# Build a named vector from the `stat` column of `sub.break2` (names are its
# row names), sort it from highest to lowest, and run fgsea against the
# combined gene sets in `gmt.combo`.

names2 <- row.names(sub.break2)
sub.break2$names <- names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
sub.break2.rank <- sort(ranks, decreasing = TRUE)

fgseaRes2 <- fgsea(
  pathways = gmt.combo,
  stats = sub.break2.rank,
  minSize = 5,
  maxSize = 500,
  nperm = 100000
)

# Keep rows whose `pval` is below 0.05 and collect their pathway names.
# NOTE(review): the bare `pval` inside `[` presumably relies on fgsea
# returning a data.table; also confirm that filtering on `pval` rather than
# an adjusted column is intended.
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway



# Stage 4 GO terms ----
# Build a named vector from the `stat` column of `sub.break4` (names are its
# row names), sort it from highest to lowest, and run fgsea against the
# combined gene sets in `gmt.combo`.

names4 <- row.names(sub.break4)
sub.break4$names <- names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
sub.break4.rank <- sort(ranks, decreasing = TRUE)

fgseaRes4 <- fgsea(
  pathways = gmt.combo,
  stats = sub.break4.rank,
  minSize = 5,
  maxSize = 500,
  nperm = 100000
)

# Keep rows whose `pval` is below 0.05 and collect their pathway names.
# NOTE(review): the bare `pval` inside `[` presumably relies on fgsea
# returning a data.table; also confirm that filtering on `pval` rather than
# an adjusted column is intended.
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
# Optional export of the filtered stage 4 table (disabled):
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")


# GO Venn ----
# Draw a four-set Venn diagram of the pathway names kept for each stage.
# One fill colour per set, in the same order as the list entries.
venn_fill <- c("orange", "green", "red", "blue")

group.venn(
  list(
    One = pathways.stg1,
    Two = pathways.stg2,
    Three = pathways.stg3,
    Four = pathways.stg4
  ),
  fill = venn_fill
)