R列表中的.attrs和重复条目

时间:2015-04-08 15:24:17

标签: r ncbi rentrez

我正在尝试使用此R脚本从NCBI获取一些信息:

require(rentrez)
require(magrittr)
# Example inputs: one rs identifier and a vector of twelve rs identifiers.
rs <- "rs16891982"
rss <- c(
  "rs16891982", "rs12203592", "rs1408799", "rs10756819",
  "rs35264875", "rs1393350", "rs12821256", "rs17128291",
  "rs1800407", "rs12913832", "rs1805008", "rs4911414"
)
#' Annotate a single dbSNP rs number.
#'
#' Searches the NCBI "snp" database for `rs`, requests the summary record for
#' the ids the search returned, and extracts position, allele, allele
#' frequency and gene fields from it.
#'
#' @param rs An rs identifier used as the search term, e.g. "rs16891982".
#' @return A data.frame with columns `snp`, `chrpos`, `EA`, `fEA` and `genes`.
#'   When nothing is found a warning is raised and `NULL` is returned
#'   invisibly.
annotateGeneName <- function(rs) {
  ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]

  # Check the search result before asking for a summary, so that the summary
  # request is skipped entirely when the search returned no ids.
  if (length(ids) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }

  anno <- rentrez::entrez_summary(db = "snp", id = ids)
  if (length(anno) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }

  # There might be multiple entries: if "snp_id" is not among the names, the
  # result is a list of several summaries rather than a single one, so just
  # take the first hit.
  if (!"snp_id" %in% names(anno)) {
    anno <- anno[[1]]
  }

  chrpos <- anno[["chrpos"]]
  # Drop everything from the first "(" onwards.
  EA <- gsub("\\(.*", "", anno$allele_origin)
  # Drop everything from the first "/" onwards, then everything up to "=".
  fEA <- gsub("^.*=", "", gsub("/.*", "", anno$global_maf))
  genes <- dplyr::first(anno$genes, default = NA)

  data.frame(snp = rs, chrpos = chrpos, EA = EA, fEA = fEA, genes = genes)
}
#' Annotate several rs numbers at once.
#'
#' Applies `annotateGeneName()` to every element of `rss` and row-binds the
#' individual results into one object.
#'
#' @param rss A vector (or list) of rs identifiers.
#' @return The result of `rbind()`-ing all per-identifier results.
annotateGeneNames <- function(rss) {
  per_snp <- lapply(X = rss, FUN = annotateGeneName)
  do.call(what = rbind, args = per_snp)
}
# Fetch the full XML record of the first search hit for `rs`, convert it to a
# nested R list and print it.
ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]
x <- rentrez::entrez_fetch(db = "snp", id = ids[1], rettype = "xml")
# xmlParse() and xmlToList() come from the XML package, which this script
# never attaches, so they are called through their namespace.
snp1xml <- XML::xmlParse(x)
snp1list <- XML::xmlToList(snp1xml)
print(snp1list)

当您打印结果时,您可以看到以下内容:

...
$Rs$Sequence$.attrs
     exemplarSs ancestralAllele 
    "285153617"   "C,C,C,C,C,C" 


$Rs$Ss$.attrs
        ssId       handle      batchId     locSnpId  subSnpClass       orient 
  "23456916"   "PERLEGEN"      "12309" "afd3693051"        "snp"    "forward" 
      strand      molType      buildId  methodClass    validated 
    "bottom"    "genomic"        "123"  "hybridize" "by-cluster" 


$Rs$Ss$.attrs
                          ssId                         handle 
                    "28510204"              "MGC_GENOME_DIFF" 
                       batchId                       locSnpId 
                       "12314" "BC064405x37550355-C16403799G" 
                   subSnpClass                         orient 
                         "snp"                      "forward" 
                        strand                        molType 
                      "bottom"                         "cDNA" 
                       buildId                    methodClass 
                         "126"                     "computed" 


$Rs$Ss
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "TTCCCTTTCATTTTCCAGAGAAACTTGATCAGGAACCCACTGATTCCAAGAGCAAAGTAATCAGTGAGGAAATGACACCTAGAATTCATGATGAAAAAAGGATGCTTTATATGGTCCTTTTTAAGGTGATAGTTTTTCCTGACGTCCATAGATTTATTAAGAATCTGGTATTTTAAACAGTAGGAAATACACATAGAAATATCAAATCCAAGTTGTGCTAGACCAGAAACTTTTAGAAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"

$Rs$Ss$.attrs

$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "AAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"

$Rs$Assembly$Component$MapLoc$FxnSet
      geneId       symbol      mrnaAcc      mrnaVer      protAcc      protVer 
     "51151"    "SLC45A2"  "NM_016180"          "4"  "NP_057264"          "3" 
    fxnClass readingFrame       allele      residue   aaPosition 
 "reference"          "3"          "C"          "F"        "373" 

$Rs$Assembly$Component$MapLoc$FxnSet
                geneId                 symbol                mrnaAcc 
               "51151"              "SLC45A2"            "NM_016180" 
               mrnaVer                protAcc                protVer 
                   "4"            "NP_057264"                    "3" 
              fxnClass           readingFrame                 allele 
            "missense"                    "3"                    "G" 
               residue             aaPosition                 soTerm 
                   "L"                  "373" "non_synonymous_codon" 

此列表中有很多.attrs条目,它们通常是重复的。还有其他重复条目,例如:

$Rs$Ss$Sequence$Seq5
$Rs$Assembly$Component$MapLoc$FxnSet

.attrs是什么意思,我该如何理解这些数据?我不明白一个列表中怎么会有两个同名的条目。

1 个答案:

答案 0 :(得分:2)

在R中,`attributes`和`attr`是用于设置或提取属性的函数,但据我所知,这里的`.attrs`只是一个列表元素的名称。它的含义完全取决于那段负责解析XML并将其转换为R列表的代码的作者是如何定义的。它不是R语言定义的一部分,所以请阅读相应的文档。

我现在明白了,让你困惑的是名字相同的列表项。这在R中是允许的。按名称使用“[”和“[[”时,只会取回树中第一个与该名称匹配的项。要访问其余的同名项,需要使用数字索引,或者借助lapply或sapply遍历树的上层节点,以避免歧义。

> mylist=vector("list", length=2)
> mylist
[[1]]
NULL

[[2]]
NULL

> names(mylist) <- c("a","a")
> mylist
$a
NULL

$a
NULL

> mylist[['a']]
NULL
> mylist['a']
$a
NULL

> lapply( mylist , "[[", "a")
$a
NULL

$a
NULL

(另外,我没有看到你在提取和处理数据的过程中调用了上面定义的任何一个函数。)