Question

某个网站上有一些书评，我想用rvest来抓取。我可以像下面这样获取数据：

library(rvest)
library(purrr)
library(tibble)
library(tidyr)
library(dplyr)

# Scrape listing pages 1 and 2; each iteration returns one page's fields.
result_list <-
  lapply(1:2, function(i) {
    # Build the URL for page i of the listing.
    url <- paste0("http://www.deutschlandradiokultur.de/buchkritik.949.de.html?drbm:page=",         i)
    # Fetch and parse the page HTML.
    parse_url <- 
      url %>%
      xml2::read_html()
    # Collect the pagination label, the dates and the overline texts as
    # separate list elements rather than data-frame columns.
    # NOTE(review): `stringsAsFactors=FALSE` is passed to list(), so it simply
    # becomes a fourth list element named "stringsAsFactors".
    # rbind() on this single list then yields a one-row list-matrix.
    parse_page <-
      list(page = parse_url %>% html_nodes("span.drk-paginationanzahl") %>% html_text(),
           date = parse_url %>% html_nodes(".drk-container") %>% html_nodes(".drk-sendungdatum") %>% html_text(),
           text = parse_url %>% html_nodes(".drk-container") %>% html_nodes(".drk-overline") %>% html_text(),
           stringsAsFactors=FALSE) %>%
      rbind()
  })

“date”的长度有时与“text”不同，所以我使用了列表。现在我很难将这个列表转换为数据帧。对于如何转换这个列表，你有什么建议吗？也许有一种更优雅的抓取方式可以避免这些问题……数据框应该有“page”、“date”和“text”三列。（下一步我会把text中的作者和标题拆分开）

我尝试了这些方法：

# Attempt 1: row-bind the per-page results, then coerce to a data frame.
result_df1 <-
  as.data.frame(do.call(rbind, result_list))

# Attempt 2: pass each page result through data.frame() first, then row-bind.
result_df2 <-
  as.data.frame(do.call(rbind, lapply(result_list, data.frame, stringsAsFactors=FALSE)))

# Attempt 3: unlist each page result and fold the vectors together with rbind.
result_df3 <-
  as.data.frame(Reduce( rbind, lapply(result_list, unlist) ))

# Attempt 4: unlist each page result and hand the vectors to as.data.frame().
result_df4 <-
  as.data.frame(lapply(result_list, unlist))

# Attempt 5: apply tidyr::unnest() to each page result.
result_df5 <-
  lapply(result_list, tidyr::unnest)

# Attempt 6: map unlist() over the results with dmap().
# NOTE(review): dmap() appears to have moved from purrr to purrrlyr in later
# releases -- confirm against the installed version.
result_df6 <-
  result_list %>% purrr::dmap(unlist)

# Attempt 7: flatten one level, convert to a name/value tibble, then unnest.
result_df7 <-
  result_list %>%
  unlist(recursive = FALSE) %>%
  tibble::enframe() %>%
  unnest()

在result_df1和result_df2中，数据框的每个单元格里都是一个列表。如何按列把这些列表展开（unlist）？我认为主要问题在于各列表元素的长度不同。我该如何处理？

示例1类似于我的问题，列表中的长度不同。在长度相等的情况下（例子2），我也在努力转换为数据帧。

# Mock of the scraped result with unequal lengths: on the second page "date"
# has three entries while "text" has four. Each page is a one-row list-matrix
# with columns page, date, text and stringsAsFactors.
example1 <- lapply(
  list(
    list(
      "page 1/490",
      c("a", "b", "c", "d"),
      c("author1: \"title1\"", "author2: \"title2\"",
        "author3: \"title3\"", "author4: \"title4\""),
      FALSE
    ),
    list(
      "page 2/490",
      c("e", "f", "g"),
      c("author5: \"title5\"", "author6: \"title6\"",
        "author7: \"title7\"", "author8: \"title8\""),
      FALSE
    )
  ),
  matrix,
  nrow = 1L,
  dimnames = list(".", c("page", "date", "text", "stringsAsFactors"))
)


# Mock of the scraped result with equal lengths: both pages carry four dates
# and four texts. Each page is a one-row list-matrix with columns date, text
# and stringsAsFactors.
example2 <- lapply(
  list(
    list(
      c("a", "b", "c", "d"),
      c("author1: \"title1\"", "author2: \"title2\"",
        "author3: \"title3\"", "author4: \"title4\""),
      FALSE
    ),
    list(
      c("e", "f", "g", "h"),
      c("author5: \"title5\"", "author6: \"title6\"",
        "author7: \"title7\"", "author8: \"title8\""),
      FALSE
    )
  ),
  matrix,
  nrow = 1L,
  dimnames = list(".", c("date", "text", "stringsAsFactors"))
)

Answer 1

在第二页上，有一个条目缺少对应的.drk-sendungdatum项，因此元素数量不一致，无法构造data.frame。由于没有封闭标记，很难确定NA应该放在哪里；但只要要求封闭的article标签之前（+）紧邻一个span，就可以相对容易地丢弃多余的.drk-overline项：

library(tidyverse)
library(rvest)

# Download and parse listing pages 1 and 2 once, so the parsed documents can
# be reused by later steps.
pages <- 1:2 %>%
    paste0("http://www.deutschlandradiokultur.de/buchkritik.949.de.html?drbm:page=", .) %>%
    map(read_html)

# Build one tibble per page from its `.drk-container` nodes and row-bind them.
# The `span + article .drk-overline` selector keeps only overlines whose
# enclosing article directly follows a span; per the accompanying explanation,
# this drops the entry that has no date so `date` and `text` line up.
# `tibble()` replaces the deprecated `data_frame()`.
result_list <- pages %>%
    map(html_nodes, '.drk-container') %>%
    map_df(~tibble(page = .x %>% html_node("span.drk-paginationanzahl") %>% html_text(),
                   date = .x %>% html_nodes(".drk-sendungdatum") %>% html_text(),
                   text = .x %>% html_nodes("span + article .drk-overline") %>% html_text()))

如果你确实需要，也可以重新解析pages，把那条有问题的观测（缺少日期的那条）补回来，不过这需要多做一些工作：

# Re-parse each page for an overline whose enclosing article is NOT directly
# preceded by a span, and give that entry an NA date.
# html_node() yields a single result per page, so pages without such an entry
# produce an NA `text`, which drop_na() removes.
# The recovered rows are bound ahead of the existing `result_list` rows.
result_list <- pages %>% 
    map_df(~list(page = .x %>% html_node('span.drk-paginationanzahl') %>% html_text(), 
                 date = NA, 
                 text = .x %>% html_node('.drk-container :not(span) + article .drk-overline') %>% html_text())) %>% 
    drop_na(text) %>% 
    bind_rows(result_list)

现在请注意顶部date列的NA值：

将包含列表的列表转换为数据帧

1 个答案: