我是 R 的新手。现在我想解析一个 XML 文件(https://da5020.weebly.com/uploads/8/6/5/9/8659576/pubmedsample.jun18.xml),除了每篇文章的作者数量之外,其余字段的解析都已完成。我参考了 "Efficiently get the number of children with specific name using XML and R" 这个问题中的一段代码:
# The XPath below is absolute (starts with //), so count() is evaluated once
# against the whole document and yields a single document-wide total rather
# than one value per article.
authors_number <- xpathSApply(
  xmldata,
  "count(//PubmedArticle/MedlineCitation/Article/AuthorList/Author/LastName)",
  xmlValue
)
但它返回的是整个 XML 中作者的总数,而不是每篇文章的作者数量。其余的解析由以下代码完成:
library(tidyverse)
library(XML)
library(methods)
# Parse the PubMed sample into an internal-node tree so XPath can be used.
xmldata <- xmlParse("pubmedsample.jun18.xml", useInternalNodes = TRUE)

# One MedlineCitation node per article. Every column below is derived from
# these nodes, so all columns have the same length and stay aligned row by row.
citations <- getNodeSet(xmldata, "//PubmedArticle/MedlineCitation")

# Return the text of the first node matching `path` relative to `node`,
# or NA_character_ when nothing matches.
first_xml_value <- function(node, path) {
  hits <- xpathSApply(node, path, xmlValue)
  if (length(hits) == 0L) NA_character_ else hits[[1L]]
}

# Evaluate a citation-relative XPath once per article; returns a character
# vector with exactly one element (possibly NA) per citation.
citation_values <- function(path) {
  vapply(citations, first_xml_value, character(1), path = path)
}

publication <- tibble(
  PMID = as.numeric(citation_values("./PMID")),
  ISSN = citation_values("./Article/Journal/ISSN"), # parse ISSN
  data_completed_year = as.numeric(citation_values("./DateCompleted/Year")),
  data_completed_month = as.numeric(citation_values("./DateCompleted/Month")),
  data_completed_day = as.numeric(citation_values("./DateCompleted/Day")),
  data_revised_year = as.numeric(citation_values("./DateRevised/Year")),
  data_revised_month = as.numeric(citation_values("./DateRevised/Month")),
  data_revised_day = as.numeric(citation_values("./DateRevised/Day")),
  # Parse the first type of each article, if more than one. The lookup is done
  # per article; indexing the document-wide result with [1] would recycle the
  # first article's value into every row.
  publication_type = citation_values(
    "./Article/PublicationTypeList/PublicationType[1]"
  ),
  article_title = citation_values("./Article/ArticleTitle")
) %>%
  mutate(
    # make_date() comes from lubridate; it is namespaced here because
    # library(tidyverse) does not attach lubridate in every tidyverse version.
    # A missing year/month/day propagates as an NA date.
    completed_date = as.character(lubridate::make_date(
      data_completed_year, data_completed_month, data_completed_day
    )),
    revised_date = as.character(lubridate::make_date(
      data_revised_year, data_revised_month, data_revised_day
    ))
  ) %>%
  select(
    PMID, ISSN, completed_date, revised_date, publication_type, article_title
  )
有人可以教我如何获得每篇文章的作者数量吗?非常感谢!
答案 0 :(得分:0)
为了帮助您入门,我会这样做:
# Convert the parsed document to a nested list and keep the MedlineCitation
# element of each top-level child. `[[` is used instead of `$` so that names
# are matched exactly (no partial matching on lists).
lst <- lapply(xmlToList(xmldata), function(x) x[["MedlineCitation"]])
# Extract the AuthorList element (NULL when an article has none)
lst.author <- lapply(lst, function(x) x[["Article"]][["AuthorList"]])
# Count the elements named "Author"; a NULL AuthorList yields 0.
# vapply() guarantees an integer vector even when there are no articles.
n.author <- vapply(
  lst.author,
  function(x) sum(names(x) == "Author"),
  integer(1)
)
#PubmedArticle PubmedArticle PubmedArticle PubmedArticle PubmedArticle
#            1             0            10             1             6
#PubmedArticle
#            2
我发现使用 list 比使用 XMLInternalDocument 更容易,因此先做了转换。
这样一来,任务就归结为遍历嵌套的 list 并提取相关的信息。
答案 1 :(得分:0)
对于使用 xml2 和 purrr 的人:
library(xml2)
library(purrr)
# Download and parse the sample PubMed XML.
doc <- read_xml("https://da5020.weebly.com/uploads/8/6/5/9/8659576/pubmedsample.jun18.xml")

# Visit each Article node and evaluate count() relative to it (note the
# leading "."), which gives one author count per article. xml_find_num() is
# the xml2 function for XPath expressions that evaluate to a number.
xml_find_all(doc, ".//MedlineCitation/Article") %>%
  map_dbl(function(article) {
    xml_find_num(article, "count(.//AuthorList/Author/LastName)")
  })
## [1] 1 0 10 1 6 2 4 2 5 4 11 3 4 8 2 3
## [17] 7 25 5 3 3 0 1 2 2 6 2 4 1 1 4 4
## [33] 8 2 25 7 6 5 2 3 8 2 6 7 9 2 3 10
## [49] 256 6 9 5 9 0 4 3 2 9 2 4 4 2 2 1
## [65] 1 2 1 1 1 1 1 1 4 5 4 1 0 2 1 5
## [81] 2 2 1 1 11 1 4 1 2 4 2 3 1 1 1 1
## [97] 0 0 1 1 1 1 10 1 620 4 5 5 1 7 4 3
## [113] 1 2 4 3 9 1 1 3 2 1 3 1 6 4 5 3
## [129] 2 2 5 9 2 2 1 23 3 2 1 14 41 12 12 1
## [145] 4 3 0 2 3 2 7 0 1 1 9 1 2 2 2 18
## [161] 4 2 7 5 1 9 5 14 0 0 10 20 0 0 0 0
## [177] 0
答案 2 :(得分:0)
XPath解决方案是计算每个PubmedArticle中AuthorList节点的xmlChildren数量:
library(XML)
library(tidyverse)
library(plyr)
# Parse the file and collect one node per article.
doc <- xmlParse("pubmedsample.jun18.xml")
articles <- getNodeSet(doc, "//PubmedArticle")

# Text of the first node matching `path` below `node`, or NA_character_ when
# there is no match. Returning NA (instead of a zero-length result) keeps the
# article's row: a zero-length column would produce a zero-row tibble and the
# article would silently disappear from the combined output.
first_value <- function(node, path) {
  hits <- xpathSApply(node, path, xmlValue)
  if (length(hits) == 0L) NA_character_ else hits[[1L]]
}

# Build a one-row tibble for a single PubmedArticle node.
article_row <- function(article) {
  # Count the Author elements directly rather than relying on how the
  # children of AuthorList happen to be simplified.
  author_count <- length(getNodeSet(article, ".//AuthorList/Author"))
  # ... #
  tibble(
    ISSN = first_value(article, ".//Article/Journal/ISSN"),
    data_completed_year = first_value(article, ".//DateCompleted/Year"),
    # Articles without any author are reported as NA, not 0.
    authors = if (author_count > 0L) author_count else NA_integer_
  )
}

# One row per PubmedArticle node; articles lacking an ISSN or a completion
# year are kept with NA in that column, so nrow(output) == length(articles).
output <- dplyr::bind_rows(lapply(articles, article_row))
head(output)
输出:
# A tibble: 161 x 3
ISSN data_completed_year authors
<chr> <chr> <int>
1 0095-3814 1976 1
2 0377-8231 1993 NA
3 0022-2623 1991 10
4 0021-8820 1987 6
5 0014-2956 1994 2
6 1051-0443 1993 4
7 0017-0011 1996 2
8 0026-895X 1996 5
9 1059-2725 1996 4
10 0009-7322 1997 11
# ... with 151 more rows