Question

我正在使用 RStudio 0.99.879 版本和 rvest 包 0.3.2 版本（我是网页抓取技术的初学者）：

对于一个研究项目，我想抓取期刊网站，以提取特定文章的作者、所属机构等信息。
我的做法如下：

#Specifying the url for desired website to be scrapped (article)
webpage001 <- read_html("https://link.springer.com/article/10.1007/s12286-017-0325-1")
#Using CSS selectors to scrap the rankings section (with Abstract)
abstract_html001 <- html_node(webpage001,".Para")
authors_html001 <- html_nodes(webpage001,".authors__name")
affiliation_html001 <- html_nodes(webpage001, ".affiliation__item")

#Converting the title data to text 
abstract001 <- html_text(abstract_html001)
authors001 <- html_text(authors_html001)
affiliation001 <- html_text(affiliation_html001)
# creating a data frame
text01 <- data.frame(Abstract = abstract001, Author = authors001, 
Institution = affiliation001)
text01

这很好用，我得到一个包含一条观测（obs.）和三个变量的数据框。但也有一些学术文章（例如书评）不包含摘要。如果我对这样的文章做同样的尝试，

webpage002 <- read_html("https://link.springer.com/article/10.1007/s12286-017-0324-2")
authors_html002 <- html_nodes(webpage002,".authors__name")
affiliation_html002 <- html_nodes(webpage002, ".affiliation__item")
authors002 <- html_text(authors_html002)
affiliation002 <- html_text(affiliation_html002)
# creating a data frame
text02 <- data.frame(Author = authors001, Institution = affiliation001)
text02

我得到一条观测（obs.）和两个变量。最后，我想合并这两个数据框，但由于列数不等，rbind 函数不起作用。
对于少量文本，我可以像下面这样手动赋值：

abstract002 <- NA
text02 <- data.frame(Abstract = abstract002, Author = authors002, 
Institution = affiliation002)
text02

total <- rbind(text01, text02)

然而，对于大量文本来说，这样做工作量太大。我想知道是否有办法换一种方式来做，甚至将其半自动化（例如，每当某篇文本没有摘要时，自动将 NA 赋给相应的列）。

有谁知道如何做到这一点或解决它？

提前致谢！

Answer 1

使用rvest和purrr，我们可以：

library(rvest)
library(purrr)

url <- 'https://link.springer.com/article/10.1007/s12286-017-0325-1'
url2 <- 'https://link.springer.com/article/10.1007/s12286-017-0324-2'

l <- list(url, url2)

l %>% 
    map_df( ~{

        h <- read_html(.x)

        abstract <- html_node(h, '.Para') %>%
            html_text()

        author <- html_node(h, '.authors__name') %>% 
            html_text()

        affiliation <- html_node(h, '.affiliation__item') %>% 
            html_text()

        data.frame(abstract, author, affiliation, stringsAsFactors = FALSE)
    })
#>  abstract
#> 1 When the Cold War ended, many non-democratic...
#> 2 <NA>
#>               author
#> 1       Marlene Mauk
#> 2 Christina Forsbach
#>                                                                        affiliation
#> 1 Institut für PolitikwissenschaftJohannes Gutenberg-Universität MainzMainzGermany
#> 2     Institut für SozialwissenschaftenUniversität HildesheimHildesheimDeutschland

Answer 2

这是假数据：

df1 <- data.frame( Abstract = letters[1:3], Author = letters[4:6], Institution = letters[7:9] )
df2 <- data.frame( Author = letters[10:12], Institution = letters[13:15] )

df1
  Abstract Author Institution
1        a      d           g
2        b      e           h
3        c      f           i

df2
  Author Institution
1      j           m
2      k           n
3      l           o

我建议添加一列Obs以保留每个观察的唯一标识符：

df1 <- df1 %>% mutate( Obs = 1:nrow(df1) )
df2 <- df2 %>% mutate( Obs = (nrow(df1)+1):(nrow(df1)+nrow(df2)) )

  Abstract Author Institution Obs
1        a      d           g   1
2        b      e           h   2
3        c      f           i   3

然后先用 gather 将两份数据分别转换为长格式，再把它们合并（下面仅显示 head 的输出）：

df3 <- df1 %>%
       gather(key,value,-Obs) %>% 
       rbind(gather(df2,key,value,-Obs))

   Obs         key value
1    1    Abstract     a
2    2    Abstract     b
3    3    Abstract     c
4    1      Author     d
5    2      Author     e

然后再用 spread 将数据转换回宽格式：

df3 <- df3 %>% spread(key=key,value=value)

  Obs Abstract Author Institution
1   1        a      d           g
2   2        b      e           h
3   3        c      f           i
4   4     <NA>      j           m
5   5     <NA>      k           n
6   6     <NA>      l           o

网页抓取不同数量的变量（rvest）

2 个答案: