有一个网站收录了一些书评,我想用 rvest 抓取。目前可以用下面的代码获取数据:
library(rvest)
library(purrr)
library(tibble)
library(tidyr)
library(dplyr)
# Scrape the first two listing pages. Each page becomes a 1 x 4 list-matrix
# (page, date, text, stringsAsFactors) because the field list is piped into
# rbind(); the pipe is also what gives the single row its "." name.
result_list <-
  lapply(1:2, function(page_no) {
    page_url <- paste0("http://www.deutschlandradiokultur.de/buchkritik.949.de.html?drbm:page=", page_no)
    doc <- xml2::read_html(page_url)
    # Both per-entry fields are looked up inside the same container nodes.
    containers <- html_nodes(doc, ".drk-container")
    fields <- list(
      page = html_text(html_nodes(doc, "span.drk-paginationanzahl")),
      date = html_text(html_nodes(containers, ".drk-sendungdatum")),
      text = html_text(html_nodes(containers, ".drk-overline")),
      # Kept as in the original: inside list() this is just a fourth element.
      stringsAsFactors = FALSE
    )
    fields %>%
      rbind()
  })
“date”的长度有时与“text”不同,所以我使用了列表。现在我很难将这个列表转换为数据框。对于如何转换这个列表,你有什么建议吗?或许还有更优雅的抓取方式可以避免这些问题……数据框应该包含“page”、“date”和“text”三列。(下一步我会把 text 中的作者和标题拆分开。)
我尝试了这些方法:
# Attempt 1: stack the per-page results with rbind, then coerce to a data frame.
result_df1 <-
as.data.frame(do.call(rbind, result_list))
# Attempt 2: turn each page into a data.frame first, then rbind the pieces.
result_df2 <-
as.data.frame(do.call(rbind, lapply(result_list, data.frame, stringsAsFactors=FALSE)))
# Attempt 3: flatten each page with unlist, then fold the vectors with rbind.
result_df3 <-
as.data.frame(Reduce( rbind, lapply(result_list, unlist) ))
# Attempt 4: flatten each page with unlist and coerce the resulting list directly.
result_df4 <-
as.data.frame(lapply(result_list, unlist))
# Attempt 5: apply tidyr::unnest to each page separately.
result_df5 <-
lapply(result_list, tidyr::unnest)
# Attempt 6: map unlist over the pages with purrr::dmap.
result_df6 <-
result_list %>% purrr::dmap(unlist)
# Attempt 7: remove one level of nesting, convert the list to a tibble with
# enframe, then unnest the result.
result_df7 <-
result_list %>%
unlist(recursive = FALSE) %>%
tibble::enframe() %>%
unnest()
在 result_df1 和 result_df2 中,数据框的每个单元格里都是一个列表。如何按列把这些列表展开?我认为主要的问题在于各个列表元素的长度不同。这种情况该如何处理?
示例 1 与我的问题类似,列表中各元素的长度不同。即使在长度相等的情况下(示例 2),我也无法顺利转换为数据框。
# Reproducible stand-in for the scraped result: two 1 x 4 list-matrices whose
# single row is named ".". On the second page `date` has three entries while
# `text` has four, mirroring the unequal lengths seen on the real site.
example1 <- local({
  col_names <- c("page", "date", "text", "stringsAsFactors")
  # Arrange the given cells as a one-row list-matrix with the expected names.
  as_page_row <- function(...) {
    cells <- list(...)
    dim(cells) <- c(1L, length(cells))
    dimnames(cells) <- list(".", col_names)
    cells
  }
  list(
    as_page_row(
      "page 1/490",
      c("a", "b", "c", "d"),
      c("author1: \"title1\"", "author2: \"title2\"",
        "author3: \"title3\"", "author4: \"title4\""),
      FALSE
    ),
    as_page_row(
      "page 2/490",
      c("e", "f", "g"),
      c("author5: \"title5\"", "author6: \"title6\"",
        "author7: \"title7\"", "author8: \"title8\""),
      FALSE
    )
  )
})
# Equal-length variant of the example: two 1 x 3 list-matrices (no `page`
# cell) in which `date` and `text` both hold four entries on every page.
example2 <- local({
  col_names <- c("date", "text", "stringsAsFactors")
  # Arrange the given cells as a one-row list-matrix with the expected names.
  as_page_row <- function(...) {
    cells <- list(...)
    dim(cells) <- c(1L, length(cells))
    dimnames(cells) <- list(".", col_names)
    cells
  }
  list(
    as_page_row(
      c("a", "b", "c", "d"),
      c("author1: \"title1\"", "author2: \"title2\"",
        "author3: \"title3\"", "author4: \"title4\""),
      FALSE
    ),
    as_page_row(
      c("e", "f", "g", "h"),
      c("author5: \"title5\"", "author6: \"title6\"",
        "author7: \"title7\"", "author8: \"title8\""),
      FALSE
    )
  )
})
答案 0 :(得分:3)
在第二页上,有一个条目缺少对应的 `.drk-sendungdatum` 项,因此各字段的元素数量不一致,无法构造 data.frame。由于没有封闭标签,很难确定 `NA` 应该放在哪个位置;不过,只选取紧跟在 `span` 之后(`+`)的封闭 `article` 标签,就可以比较容易地丢弃多余的 `.drk-overline` 项:
library(tidyverse)
library(rvest)
# Download and parse the first two listing pages once, so that later steps
# can query the parsed documents without fetching them again.
pages <- map(
  paste0("http://www.deutschlandradiokultur.de/buchkritik.949.de.html?drbm:page=", 1:2),
  read_html
)
# Build one tibble per page from its ".drk-container" nodes and row-bind them.
# `text` is restricted to overlines inside an <article> that directly follows
# a <span> ("span + article"), so articles without that preceding span are
# left out; this is intended to keep `date` and `text` the same length.
# tibble() replaces the deprecated data_frame().
result_list <- pages %>%
  map(html_nodes, ".drk-container") %>%
  map_df(~ tibble(
    page = .x %>% html_node("span.drk-paginationanzahl") %>% html_text(),
    date = .x %>% html_nodes(".drk-sendungdatum") %>% html_text(),
    text = .x %>% html_nodes("span + article .drk-overline") %>% html_text()
  ))
如果确实需要,也可以重新解析 `pages`,把那条有问题的观测补回来,不过要多费一些功夫:
# Add back entries whose <article> is not directly preceded by a <span>.
# html_node() yields at most one match per page (NA when there is none), so
# pages without such an entry are removed again by drop_na(text). The
# recovered rows are placed on top of the existing result_list.
result_list <- pages %>%
  map_df(~ list(
    page = .x %>% html_node("span.drk-paginationanzahl") %>% html_text(),
    # Typed NA keeps `date` a character column, so binding it to the scraped
    # (character) dates does not rely on implicit logical-to-character coercion.
    date = NA_character_,
    text = .x %>%
      html_node(".drk-container :not(span) + article .drk-overline") %>%
      html_text()
  )) %>%
  drop_na(text) %>%
  bind_rows(result_list)
现在请注意结果顶部 `date` 列的 `NA` 值。