如何在R中获取XML中的子项数

时间:2018-06-13 01:12:53

标签: r xml

我是R的新手。现在我想解析一个XML文件(https://da5020.weebly.com/uploads/8/6/5/9/8659576/pubmedsample.jun18.xml),除了每篇文章的作者数量之外,其他字段都已经解析完成。我参考了"Efficiently get the number of children with specific name using XML and R"这个问题中的一段代码:

# The XPath below is absolute (it starts with //) and count() collapses the
# whole matched node-set into one number, so this evaluates to a single total
# for the document rather than one count per article.
authors_number <- xpathSApply(
  xmldata,
  "count(//PubmedArticle/MedlineCitation/Article/AuthorList/Author/LastName)",
  xmlValue
)

但它返回的是整个XML中作者的总数。其余部分的解析由以下代码完成:

library(tidyverse)
library(lubridate) # make_date(); library(tidyverse) only attaches it from 2.0.0 on
library(XML)
library(methods)

xmldata <- xmlParse("pubmedsample.jun18.xml", useInternalNodes = TRUE)

# One MedlineCitation node per article. Every column below is derived from this
# list, so all columns have exactly one entry per article and stay aligned.
citations <- getNodeSet(xmldata, "//PubmedArticle/MedlineCitation")

# For each node, return the text of the first element matching the relative
# XPath `path`, or NA when the node has no such element.
first_value <- function(nodes, path) {
  vapply(nodes, function(node) {
    hits <- xpathSApply(node, path, xmlValue)
    if (length(hits) > 0) hits[[1]] else NA_character_
  }, character(1))
}

# Combine <element>/Year, /Month and /Day of each citation into a date string.
# A citation lacking any of the three parts yields NA.
citation_date <- function(nodes, element) {
  part <- function(unit) {
    as.numeric(first_value(nodes, paste0("./", element, "/", unit)))
  }
  as.character(make_date(part("Year"), part("Month"), part("Day")))
}

publication <- tibble(
  PMID = as.numeric(first_value(citations, "./PMID")),
  ISSN = first_value(citations, "./Article/Journal/ISSN"),
  completed_date = citation_date(citations, "DateCompleted"),
  revised_date = citation_date(citations, "DateRevised"),
  # First publication type of each article. Indexing the document-wide result
  # with [1] would repeat the first article's value on every row.
  publication_type = first_value(
    citations, "./Article/PublicationTypeList/PublicationType[1]"
  ),
  article_title = first_value(citations, "./Article/ArticleTitle")
)

有人可以教我如何获得每篇文章的作者数量吗?非常感谢!

3 个答案:

答案 0 :(得分:0)

为了帮助您入门,我会这样做:

# Turn the parsed document into nested lists and keep the MedlineCitation
# element of every article
lst <- lapply(xmlToList(xmldata), function(article) article$MedlineCitation)

# Pull the AuthorList element out of each citation
lst.author <- lapply(lst, function(citation) citation$Article$AuthorList)

# Tally the entries named "Author" in each author list
n.author <- sapply(lst.author, function(author_list) {
  is_author <- names(author_list) == "Author"
  sum(is_author)
})
#PubmedArticle PubmedArticle PubmedArticle PubmedArticle PubmedArticle
#            1             0            10             1             6
#PubmedArticle
#            2

我发现先把文档转换为list再处理,比直接操作XMLInternalDocument更容易。这样一来,任务就归结为遍历嵌套的list并提取相关信息。

答案 1 :(得分:0)

对于使用xml2和purrr的人:

library(xml2)
library(purrr)

doc <- read_xml("https://da5020.weebly.com/uploads/8/6/5/9/8659576/pubmedsample.jun18.xml")

# One <Article> node per citation
articles <- xml_find_all(doc, ".//MedlineCitation/Article")

# count() evaluates to a number rather than a node, so query it with
# xml_find_num(); xml_find_first() is documented to return nodes.
# The relative path (.//) restricts the count to the current article.
map_dbl(articles, function(article) {
  xml_find_num(article, "count(.//AuthorList/Author/LastName)")
})
##   [1]   1   0  10   1   6   2   4   2   5   4  11   3   4   8   2   3
##  [17]   7  25   5   3   3   0   1   2   2   6   2   4   1   1   4   4
##  [33]   8   2  25   7   6   5   2   3   8   2   6   7   9   2   3  10
##  [49] 256   6   9   5   9   0   4   3   2   9   2   4   4   2   2   1
##  [65]   1   2   1   1   1   1   1   1   4   5   4   1   0   2   1   5
##  [81]   2   2   1   1  11   1   4   1   2   4   2   3   1   1   1   1
##  [97]   0   0   1   1   1   1  10   1 620   4   5   5   1   7   4   3
## [113]   1   2   4   3   9   1   1   3   2   1   3   1   6   4   5   3
## [129]   2   2   5   9   2   2   1  23   3   2   1  14  41  12  12   1
## [145]   4   3   0   2   3   2   7   0   1   1   9   1   2   2   2  18
## [161]   4   2   7   5   1   9   5  14   0   0  10  20   0   0   0   0
## [177]   0

答案 2 :(得分:0)

XPath解决方案是计算每个PubmedArticle中AuthorList节点的xmlChildren数量:

library(plyr) # attached first so that it does not mask dplyr's verbs
library(XML)
library(tidyverse)

doc <- xmlParse("pubmedsample.jun18.xml")

articles <- getNodeSet(doc, "//PubmedArticle")

# Text of the first node matching `path` below `node`, or NA if there is none.
# Returning NA instead of an empty result keeps one row per article; a
# zero-length column would make the article's row disappear.
first_or_na <- function(node, path) {
  hits <- xpathSApply(node, path, xmlValue)
  if (length(hits) > 0) hits[[1]] else NA_character_
}

output <- ldply(articles, function(x) {
  # Number of children (xmlChildren) of the article's AuthorList node(s);
  # 0 when the article has no AuthorList at all
  author_lists <- getNodeSet(x, ".//AuthorList")
  author_count <- sum(vapply(
    author_lists,
    function(author_list) length(xmlChildren(author_list)),
    integer(1)
  ))

  # ... #

  tibble(
    ISSN = first_or_na(x, ".//Article/Journal/ISSN"),
    data_completed_year = first_or_na(x, ".//DateCompleted/Year"),
    # Articles without authors are reported as NA rather than 0
    authors = if (author_count > 0) author_count else NA_integer_
  )
}) %>%
  as_tibble()

head(output)

输出:

# A tibble: 161 x 3
  ISSN      data_completed_year authors
  <chr>     <chr>                 <int>
1 0095-3814 1976                      1
2 0377-8231 1993                     NA
3 0022-2623 1991                     10
4 0021-8820 1987                      6
5 0014-2956 1994                      2
6 1051-0443 1993                      4
7 0017-0011 1996                      2
8 0026-895X 1996                      5
9 1059-2725 1996                      4
10 0009-7322 1997                     11
# ... with 151 more rows