我从一个需要登录、返回 XML 格式数据的网站上抓取了数据,并将其转换为列表(list)。现在我很难从这个嵌套列表中提取数据,因为它的结构非常复杂。
这是我的z2结构的一部分:
dput(z2)
structure(list(scheduleList = structure(list(
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("2"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011c", status = "-2"),
class = structure(list(name = list("013"), people = list("0"), teacher = structure(list(name = list("B")), .Names = "name", id = "D14")), .Names = c("name", "people", "teacher"), id = "602d", status = "-4"),
class = structure(list(name = list("603"), people = list("6"), teacher = structure(list(name = list("C")), .Names = "name", id = "D31")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("4"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011", status = "-2"),
class = structure(list(name = list("015c"), people = list("51"), teacher = structure(list(name = list("D")), .Names = "name", id = "D23")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class","class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("1"), teacher = structure(list(name = list("E")), .Names = "name", id = "D15")), .Names = c("name", "people", "teacher"), id = "017", status = "-2"),
class = structure(list(name = list("019c"), people = list("22"), teacher = structure(list(name = list("F")), .Names = "name", id = "D28")), .Names = c("name", "people", "teacher"), id = "561", status = "-4"),
class = structure(list(name = list("562d"), people = list("28"), teacher = structure(list(name = list("G")), .Names = "name", id = "D21")), .Names = c("name", "people", "teacher"), id = "562", status = "-4")),
.Names = c("class", "class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-25"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("80"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("37"), teacher = structure(list(name = list("I")), .Names = "name", id = "D18")), .Names = c("name", "people", "teacher"), id = "669", status = "-4"),
class = structure(list(name = list("751d"), people = list("15"), teacher = structure(list(name = list("J")), .Names = "name", id = "D61")), .Names = c("name", "people", "teacher"), id = "751", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("29"), teacher = structure(list(name = list("K")), .Names = "name", id = "D13")), .Names = c("name", "people", "teacher"), id = "567", status = "-2"),
class = structure(list(name = list("666d"), people = list("14"), teacher = structure(list(name = list("L")), .Names = "name", id = "D16")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("21"), teacher = structure(list(name = list("M")), .Names = "name", id = "D22")), .Names = c("name", "people", "teacher"), id = "015", status = "-4"),
class = structure(list(name = list("602d"), people = list("18"), teacher = structure(list(name = list("N")), .Names = "name", id = "D10")), .Names = c("name", "people", "teacher"), id = "602", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-26"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("33"), teacher = structure(list(name = list("O")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("70"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "601", status = "-2"),
class = structure(list(name = list("603d"), people = list("0"), teacher = structure(list(name = list("P")), .Names = "name", id = "D27")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("56"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "602", status = "-4"),
class = structure(list(name = list("666d"), people = list("8"), teacher = structure(list(name = list("Q")), .Names = "name", id = "D20")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("5"), teacher = structure(list(name = list("R")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "017", status = "-4"),
class = structure(list(name = list("021c"), people = list("6"), teacher = structure(list(name = list("S")), .Names = "name", id = "D19")), .Names = c("name", "people", "teacher"), id = "561", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-27")),
.Names = c("schedule", "schedule", "schedule"), from = "2017-01-25", to = "2017-01-27")),
.Names = "scheduleList")
这是z2的一部分:
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "017C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "5"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "R"
attr(,"id")
[1] "D30"
attr(,"id")
[1] "017"
attr(,"status")
[1] "-4"
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "021C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "6"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "S"
attr(,"id")
[1] "D19"
attr(,"id")
[1] "561"
attr(,"status")
[1] "-4"
attr(,"id")
[1] "3"
attr(,"date")
[1] "2017-01-27"
attr(,"from")
[1] "2017-01-25"
attr(,"to")
[1] "2017-01-27"
我需要从嵌套列表中提取我需要的信息,因为我是新手,所以我使用效率最低的方法:
# Asker's attempt: visit every schedule (i), every score inside it (j) and
# every class inside that score (k), and gather one record per class.
for (i in 1:length(z2[[1]])){ # asker's note: length(z2[[1]])=7
for (j in 1:length(z2[[1]][[i]])){ # asker's note: length(z2[[1]][[i]])=3
# NOTE(review): this bound reads `z`, while every other access reads `z2` --
# presumably a typo; confirm.
for (k in 1:length(z[[1]][[i]][[j]])){
# NOTE(review): `cbind=( ... )` is not a call to cbind(); it assigns to a
# variable named `cbind`, and a comma-separated list inside plain parentheses
# does not parse. Nothing is accumulated across iterations either.
cbind=(
Date=attr(z2[[1]][[i]],"date"), # "date" attribute of schedule i
Score=attr(z2[[1]][[i]][[j]],"id"), # "id" attribute of score j
People=z2[[1]][[i]][[j]][[k]][[2]][[1]], # people: 2nd child of class k
TName=z2[[1]][[i]][[j]][[k]][[3]][[1]][[1]], # teacher name: inside 3rd child
TID=attr(z2[[1]][[i]][[j]][[k]][[3]],"id"), # teacher ID: "id" attribute of 3rd child
CName=z2[[1]][[i]][[j]][[k]][[1]][[1]], # class name: 1st child of class k
CID=attr(z2[[1]][[i]][[j]][[k]],"id"), # class ID: "id" attribute of class k
CSta=attr(z2[[1]][[i]][[j]][[k]],"status") # class status: "status" attribute of class k
)
}
}
}
但我的循环无法正常工作。我想把结果输出为数据框(data frame)或数组。我期望的结果是:
Date Score TID TName CName CID CSta People
2017-01-25 1 D14 B 013c 602 -4 0
2017-01-26 2 D16 L 666d 666 -4 14
XML格式网站示例:
<result status="success">
<code>1</code>
<note>success</note>
<scheduleList from="2017-01-25" to="2017-01-26">
<schedule date="2017-01-25">
<score id="1">
<class id="011" status="-4">
<name>011c</name>
<people>116</people>
<teacher id="D47">
<name>A</name>
</teacher>
</class>
<class id="669" status="-4">
<name>669d</name>
<people>10</people>
<teacher id="D29">
<name>B</name>
</teacher>
</class>
</score>
<score id="2">
<class id="013" status="-4">
<name>013c</name>
<people>9</people>
<teacher id="D9">
<name>C</name>
</teacher>
</class>
</score>
<score id="3">
<class id="016" status="-4">
<name>016c</name>
<people>36</people>
<teacher id="D18">
<name>D</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>9</people>
<teacher id="D30">
<name>E</name>
</teacher>
</class>
</score>
</schedule>
<schedule date="2017-01-26">
<score id="1">
<class id="011" status="-2">
<name>011c</name>
<people>2</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
<class id="013" status="-2">
<name>013c</name>
<people>0</people>
<teacher id="D14">
<name>G</name>
</teacher>
</class>
</score>
<score id="2">
<class id="011" status="-2">
<name>011c</name>
<people>4</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
</score>
<score id="3">
<class id="017" status="-2">
<name>017c</name>
<people>1</people>
<teacher id="D141">
<name>H</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>22</people>
<teacher id="D291">
<name>I</name>
</teacher>
</class>
<class id="020" status="-4">
<name>020c</name>
<people>8</people>
<teacher id="D143">
<name>J</name>
</teacher>
</class>
</score>
</schedule>
</scheduleList>
</result>
代码:
# Log in through the site's HTML form and parse the XML response.
url <- "xxxxxxx"
session <- html_session(url)

# Parse the form from the page the session has already fetched, so the form
# (and any hidden fields in it) belongs to the same session and the page is
# not downloaded a second time.
form <- html_form(read_html(session))[[1]]

# Fill in the date range and the credentials (placeholders here).
filled_form <- set_values(
  form,
  "fromDate" = "2017-01-25",
  "toDate" = "2017-01-26",
  "userid" = "xxx",
  "Password" = "aaa"
)

# Submit the form within the session and parse the returned body as XML.
s <- submit_form(session, filled_form)
z <- read_xml(s$response)
答案 0 :(得分:3)
您没有把 cbind 的结果赋值给任何变量。(而且它的用法也不对:不要写 cbind=something,这里的等号是错误的。)
这是一种快速但效率可能不高的方法。
# Flatten the nested schedule -> score -> class list into a data frame with
# one row per class.
schedules <- z2[[1]]

# Collect the rows in a list and bind them once at the end, instead of
# re-copying the whole data frame with rbind() on every iteration.
rows <- list()

# seq_along() yields an empty sequence for an empty list, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(schedules)) {
  schedule <- schedules[[i]]
  for (j in seq_along(schedule)) {
    score <- schedule[[j]]
    for (k in seq_along(score)) {
      cls <- score[[k]]
      teacher <- cls[["teacher"]]
      rows[[length(rows) + 1L]] <- data.frame(
        Date = attr(schedule, "date"),   # schedule date
        Score = attr(score, "id"),       # score id
        People = cls[["people"]][[1]],   # number of people
        TName = teacher[["name"]][[1]],  # teacher name
        TID = attr(teacher, "id"),       # teacher ID
        CName = cls[["name"]][[1]],      # class name
        CID = attr(cls, "id"),           # class ID
        CSta = attr(cls, "status"),      # class status
        stringsAsFactors = FALSE
      )
    }
  }
}

# do.call(rbind, list()) returns NULL, so fall back to an empty data frame.
result <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()
head(result)
Date Score People TName TID CName CID CSta
1 2017-01-25 1 2 A D29 011c 011c -2
2 2017-01-25 1 0 B D14 013 602d -4
3 2017-01-25 1 6 C D31 603 603 -4
4 2017-01-25 2 4 A D29 011c 011 -2
5 2017-01-25 2 51 D D23 015c 666 -4
6 2017-01-25 3 1 E D15 017c 017 -2
答案 1 :(得分:1)
使用 tidyverse 中的 purrr 和 dplyr 包可以帮助完成此任务:
# Build one tibble row per class, tagging each row with the id of its
# enclosing score and the date of its enclosing schedule.
z2$scheduleList %>%
  map_df(function(schedule) {
    # One tibble per score of this schedule, row-bound by map_df().
    map_df(schedule, function(score) {
      # Each map_chr() walks every class of the score; a list accessor
      # plucks by name/position, attr_getter() reads an attribute.
      tibble(
        TID = map_chr(score, list("teacher", attr_getter("id"))),
        TName = map_chr(score, list("teacher", "name", 1)),
        CName = map_chr(score, list("name", 1)),
        CID = map_chr(score, list(attr_getter("id"))),
        Csta = map_chr(score, list(attr_getter("status"))),
        People = map_chr(score, list("people", 1))
      ) %>%
        mutate(Score = attr(score, "id"))
    }) %>%
      mutate(Date = attr(schedule, "date"))
  }) %>%
  # Put the two grouping columns first.
  select(Date, Score, everything())
#> # A tibble: 22 x 8
#> Date Score TID TName CName CID Csta People
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 2017-01-25 1 D29 A 011c 011c -2 2
#> 2 2017-01-25 1 D14 B 013 602d -4 0
#> 3 2017-01-25 1 D31 C 603 603 -4 6
#> 4 2017-01-25 2 D29 A 011c 011 -2 4
#> 5 2017-01-25 2 D23 D 015c 666 -4 51
#> 6 2017-01-25 3 D15 E 017c 017 -2 1
#> 7 2017-01-25 3 D28 F 019c 561 -4 22
#> 8 2017-01-25 3 D21 G 562d 562 -4 28
#> 9 2017-01-26 1 D47 H 011c 011 -4 80
#> 10 2017-01-26 1 D18 I 013c 669 -4 37
#> # ... with 12 more rows
不确定它是否更高效,但读起来、理解起来可能更清晰。
这也是一个用来理解 purrr 的好例子。