我抓取了一个网页,得到了一个嵌套列表。下一步我想把它转换为数据框(data frame),但我的解决方案不起作用。我认为问题在于列表元素(list elements)带有名称,而我无法在转换之前把这些名称去掉。希望大家能给点提示。
require(tidyverse)
require(rvest)
# Build a list shaped like the web-scraping result (scraping code is at the end).
# Each item is a two-row data frame: row 1 holds the labels, row 2 the values.
# The repeated `courseN` argument names are intentional; data.frame() makes
# them unique (courseN, courseN.1, ...).
item1 <- data.frame(
  id_course1 = rep("id_course1", 2),
  course1 = c("participants", 15),
  course1 = c("mark1", 1),
  course1 = c("mark2", 2),
  course1 = c("mark3", 3),
  course1 = c("mark4", 4),
  course1 = c("mark5", 5)
)
item2 <- data.frame(
  id_course2 = rep("id_course2", 2),
  course2 = c("participants", 30),
  course2 = c("mark1", 10),
  course2 = c("mark2", 8),
  course2 = c("mark3", 6),
  course2 = c("mark4", 4),
  course2 = c("mark5", 2)
)
item3 <- data.frame(
  id_course3 = rep("id_course3", 2),
  course3 = c("participants", 15),
  course3 = c("mark1", 2),
  course3 = c("mark2", 4),
  course3 = c("mark3", 5),
  course3 = c("mark4", 3),
  course3 = c("mark5", 1)
)
my.list <- list(item1, item2, item3)
# Attempts at binding the list into one data frame; none gives the wanted table.
# Note: the items do not share column names (id_course1/course1..., 
# id_course2/course2..., ...), and `my.list` itself carries no element names,
# so unname() on the list changes nothing here.
require(data.table)
data.table::rbindlist(l = my.list, fill = TRUE)
dplyr::bind_rows(my.list)
dplyr::bind_rows(unname(my.list))
# Attempts based on element 2 of each item. On a data frame `[[`(x, 2) returns
# the second COLUMN (c("participants", <n>)), not the second row, so the
# result is still not the wanted table.
do.call(cbind, lapply(my.list, function(item) item[[2]]))
do.call(rbind, lapply(my.list, function(item) item[[2]]))
dplyr::bind_rows(lapply(my.list, function(item) item[[2]]))
# Target layout: one row per course, one column per label.
# Mixing strings and numbers in c() makes every cell a character string.
df_what_i_want <- as.data.frame(
  rbind(
    c("id_course1", 15, 1, 2, 3, 4, 5),
    c("id_course2", 30, 10, 8, 6, 4, 2),
    c("id_course3", 15, 2, 4, 5, 3, 1)
  )
)
rownames(df_what_i_want) <- NULL
colnames(df_what_i_want) <- c(
  "id_course1", "participants",
  "mark1", "mark2", "mark3", "mark4", "mark5"
)
# Scrape the page: take every <table> found inside an <li> and parse each one
# into a data frame; `courses_list` is the resulting list of tables.
url <- "https://www.fernuni-hagen.de/wirtschaftswissenschaft/studium/klausurstatistik.shtml"
courses_list <- html_table(
  html_nodes(
    html_nodes(read_html(url), "li"),
    "table"
  ),
  fill = TRUE
)
答案 0 :(得分:2)
这也行
library(janitor)
library(tidyverse)
# For each item: promote row 1 to the header (dropping that row), then rename
# the columns with the labels from the first item's first row so that all
# items share the same names and can be row-bound into one data frame.
map_dfr(my.list, function(item) {
  setNames(
    janitor::row_to_names(as.data.frame(item), 1),
    my.list[[1]][1, ]
  )
})
id_course1 participants mark1 mark2 mark3 mark4 mark5
1 id_course1 15 1 2 3 4 5
2 id_course2 30 10 8 6 4 2
3 id_course3 15 2 4 5 3 1
答案 1 :(得分:1)
我认为在抓取的同时就把表格整理好会更好。试试这个:
library(rvest)
# Scrape the exam-statistics page and combine the tables into one data frame.
# Every scraped table carries its labels in row 1, so for each table that row
# is used as the column names and then dropped before the tables are bound.
url <- "https://www.fernuni-hagen.de/wirtschaftswissenschaft/studium/klausurstatistik.shtml"
result <- read_html(url) %>%
  html_nodes("li") %>%
  html_nodes("table") %>%
  head() %>% # keeps only the first few tables while testing; remove this later
  html_table(fill = TRUE) %>%
  purrr::map_df(function(tbl) {
    # dplyr::slice is namespace-qualified because only rvest is attached here;
    # a bare slice() would fail unless dplyr had been attached elsewhere.
    dplyr::slice(setNames(tbl, tbl[1, ]), -1)
  })
result