我是R的新手。我昨天抓了一个需要登录的网站,页面是xml格式,如下所示。
<result status="success">
<code>1</code>
<note>success</note>
<teacherList>
<teacher id="D95">
<name>Mary</name>
<department id="420">
<name>Math</name>
</department>
<department id="421">
<name>Statistics</name>
</department>
</teacher>
<teacher id="D73">
<name>Adam</name>
<department id="412">
<name>English</name>
</department>
</teacher>
</teacherList>
</result>
最近我刚刚将XML转换为列表。
library(XML)
library(rvest)
library(plyr)
library(dplyr)
library(httr)
library(pipeR)
library(xml2)
# Log in through the site's HTML form and fetch the XML payload.
url.address <- "http://xxxxxxxxxxxxxxxxx"
session <- html_session(url.address)

# The login form is taken to be the first form found on the page.
form <- html_form(read_html(url.address))[[1]]
filled_form <- set_values(
  form,
  userid = "id",
  Password = "password"
)

# Submit the filled-in form within the session and read the response as XML.
s <- submit_form(session, filled_form)
z <- read_xml(s$response)

# Convert the XML document to a nested list and keep the teacherList element.
z1 <- as_list(z)
z2 <- z1$teacherList
现在我需要从列表中提取数据并将其整理成数据框。顺便说一下,有些人属于2个部门,但有些人只属于1个部门。列表z2的一部分如下所示:
z2[[1]]
$name
$name[[1]]
[1] "Mary"
$department
$department$name
$department$name[[1]]
[1] "Math"
attr(,"id")
[1] "420"
$department
$department$name
$department$name[[1]]
[1] "statistics"
attr(,"id")
[1] "421"
attr(,"id")
[1] "D95"
当我逐一提取它们时,花了太长时间:
attr(z2[[1]],"id")
“D95”
z2[[1]][[1]][[1]]
“Mary”
z2[[1]][[2]][[1]][[1]]
“Math”
attr(z2[[1]][[2]], "id")
“420”
z2[[1]][[3]][[1]][[1]]
“statistics”
attr(z2[[1]][[3]], "id")
“421”
attr(z2[[2]],"id")
“D73”
z2[[2]][[1]][[1]]
“Adam”
z2[[2]][[2]][[1]][[1]]
“English”
attr(z2[[2]][[2]],"id")
“412”
所以我试着写一个循环:
# Attempt to collect (teacher id, teacher name, department name, department id)
# by walking fixed positions in z2.
# NOTE(review): both ranges are hard-coded -- x visits only teachers 1 and 2,
# and y assumes every teacher has elements 2 and 3 (i.e. two departments).
for (x in 1:2){
for (y in 2:3){
# id attribute of teacher x
a <- attr(z2[[x]],"id")
# first element of teacher x (the name in the sample data)
b <- z2[[x]][[1]][[1]]
# department name at position y; z2[[x]][[y]] does not exist when y is 3 and
# the teacher has only one department
d <- z2[[x]][[y]][[1]][[1]]
# id attribute of the department at position y
e <- attr(z2[[x]][[y]],"id")
# print() echoes each value and hands it on to cbind(); g is reassigned on
# every pass, so earlier combinations are not kept.
g <- cbind(print(a),print(b),print(d),print(e))
}}
但它根本不起作用,因为有些人只属于一个部门。我预期的结果是:
任何建议将不胜感激!
dput(head(z2, 10))
structure(list(teacher = structure(list(name = list("Mary"),
department = structure(list(name = list("Math")), .Names = "name", id = "420"),
department = structure(list(name = list("statistics")), .Names = "name", id = "421")), .Names = c("name",
"department", "department"), id = "D95"), teacher = structure(list(
name = list("Adam"), department = structure(list(name = list(
"English")), .Names = "name", id = "412")), .Names = c("name",
"department"), id = "D73"), teacher = structure(list(name = list(
"Kevin"), department = structure(list(name = list("Chinese")), .Names = "name", id = "201")), .Names = c("name",
"department"), id = "D101"), teacher = structure(list(name = list(
"Nana"), department = structure(list(name = list("Science")), .Names = "name", id = "205")), .Names = c("name",
"department"), id = "D58"), teacher = structure(list(name = list(
"Nelson"), department = structure(list(name = list("Music")), .Names = "name", id = "370")), .Names = c("name",
"department"), id = "D14"), teacher = structure(list(name = list(
"Esther"), department = structure(list(name = list("Medicine")), .Names = "name", id = "361")), .Names = c("name",
"department"), id = "D28"), teacher = structure(list(name = list(
"Mia"), department = structure(list(name = list("Chemistry")), .Names = "name", id = "326")), .Names = c("name",
"department"), id = "D17"), teacher = structure(list(name = list(
"Jack"), department = structure(list(name = list("German")), .Names = "name", id = "306")), .Names = c("name",
"department"), id = "D80"), teacher = structure(list(name = list(
"Tom"), department = structure(list(name = list("French")), .Names = "name", id = "360")), .Names = c("name",
"department"), id = "D53"), teacher = structure(list(name = list(
"Allen"), department = structure(list(name = list("Spanish")), .Names = "name", id = "322")), .Names = c("name",
"department"), id = "D18")), .Names = c("teacher", "teacher",
"teacher", "teacher", "teacher", "teacher", "teacher", "teacher", "teacher",
"teacher"))
答案 0 :(得分:2)
构建起来有点绕,但我认为它大致符合帖子早期版本中给出的期望输出。我必须在lapply
函数内部使用sapply
来提取第二个ID变量。
# Build one data.frame per teacher and stack them with rbind.
do.call(
  rbind,
  lapply(
    unname(z2),  # drop the repeated outer "teacher" names first
    function(teacher) {
      # Flatten the nested teacher node; the flattened names ("name",
      # "department.name") are used to pick out each field.
      flat <- unlist(teacher)
      labels <- names(flat)

      # Children without an "id" attribute yield NULL, which unlist() drops,
      # leaving only the department ids.
      dept_ids <- unlist(sapply(teacher, attr, "id"))

      data.frame(
        name = flat[labels == "name"],
        dept = flat[labels == "department.name"],
        id = attr(teacher, "id"),  # the teacher's own id attribute
        id2 = dept_ids,
        row.names = NULL
      )
    }
  )
)
返回
name dept id id2
1 Mary Math D95 420
2 Mary statistics D95 421
3 Adam English D73 412
4 Kevin Chinese D101 201
5 Nana Science D58 205
6 Nelson Music D14 370
7 Esther Medicine D28 361
8 Mia Chemistry D17 326
9 Jack German D80 306
10 Tom French D53 360
11 Allen Spanish D18 322
第二个列表的结构在很多方面与最初的例子不同。首先:少了一层嵌套。也就是说,新列表的深度比初始示例少一层,就好像你提供的是初始列表中的z2[[1]]。其次,第二个例子缺少我最初称为id的值(D95和D101等值)。
通过对原始代码的一些操作,我可以使用
# Wrap z3 in a list so the per-teacher extractor can be applied to it; the
# result is a one-element list holding a data.frame.
lapply(
  list(z3),
  function(x) {
    # Flatten the nested node; the flattened names select each field.
    flat <- unlist(x)
    labels <- names(flat)

    # Children without an "id" attribute yield NULL, which unlist() drops.
    dept_ids <- unlist(sapply(x, attr, "id"))

    data.frame(
      name = flat[labels == "name"],
      dept = flat[labels == "department.name"],
      # id=attr(x, "id"),  # intentionally left commented out
      id2 = dept_ids,
      row.names = NULL
    )
  }
)
这段代码针对我前面提到的两点做了修改:z2被list(z3)
替换,作为lapply
的第一个参数,从而构造出所需的列表深度。此外,内部函数中id=attr(x, "id"),
这一行已被注释掉,因为id不存在。
答案 1 :(得分:0)
在R
中,通常可以很方便地使用library(XML)
和library(plyr)
来处理XML,而无需编写循环:
第一步是阅读XML
我将示例XML保存为名为Demo.xml
的.xml文件。您也可以向xmlParse传入一个URL。
# Parse the XML file saved as Demo.xml.
rawXML <- xmlParse("Demo.xml")
然后将XML转换为list:
# Convert the parsed XML into a nested R list.
xmlList <- xmlToList(rawXML)
然后使用plyr
# Apply data.frame() to each top-level element of the list and combine the
# results into a single data frame (plyr's ldply).
df1 <- ldply(xmlList, data.frame)
这是一般过程,如果您提供样本数据,我们可以对其进行优化以匹配您的特定用例。
这里是结果摘要输出。这是你正在寻找的吗?
str(df1)
'data.frame': 4 obs. of 12 variables:
$ .id : chr "code" "note" "teacherList" ".attrs"
$ X..i.. : Factor w/ 2 levels "1","success": 1 2 NA 2
$ teacher.name : Factor w/ 1 level "Mary": NA NA 1 NA
$ teacher.department.name : Factor w/ 1 level "Math": NA NA 1 NA
$ teacher.department..attrs : Factor w/ 1 level "420": NA NA 1 NA
$ teacher.department.name.1 : Factor w/ 1 level "Statistics": NA NA 1 NA
$ teacher.department..attrs.1: Factor w/ 1 level "421": NA NA 1 NA
$ teacher..attrs : Factor w/ 1 level "D95": NA NA 1 NA
$ teacher.name.1 : Factor w/ 1 level "Adam": NA NA 1 NA
$ teacher.department.name.2 : Factor w/ 1 level "English": NA NA 1 NA
$ teacher.department..attrs.2: Factor w/ 1 level "412": NA NA 1 NA
$ teacher..attrs.1 : Factor w/ 1 level "D73": NA NA 1 NA