使用 R 抓取 pdf 文件

时间:2021-03-30 19:17:51

标签: html r web-scraping

我一直在网上从牛津期刊中抓取 R 语言的文章,并想获取特定文章的全文。所有文章都有指向它们的 pdf 链接,所以我一直在尝试拉出 pdf 链接并将整个文本刮到 csv 上。全文应全部放入 1 行,但 csv 文件中的输出显示一篇 11 行的文章。我该如何解决这个问题?

代码如下:

####install.packages("rvest")
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#for Fulltext to read pdf
####install.packages("pdftools")
library(pdftools)


fullText <- function(parsedDocument){
  endLink <- parsedDocument %>%
    html_node('.article-pdfLink') %>% html_attr('href')
  frontLink <- "https://academic.oup.com"
  #link  of pdf
  pdfLink <- paste(frontLink,endLink,sep = "")
  #extract full text  from pdfLink
  pdfFullText <- pdf_text(pdfLink)
  fulltext <- paste(pdfFullText, sep = "\n")
  return(fulltext)
}
#############################################

#main function with input as parameter year
testFullText <- function(DOIurl){
  parsedDocument <- read_html(DOIurl)
  DNAresearch <- data.frame()
  allData <- data.frame("Full Text" = fullText(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <-  rbind(DNAresearch, allData)
  write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
}
testFullText("https://doi.org/10.1093/dnares/dsm026")

1 个答案:

答案 0 :(得分:0)

这就是我处理这项任务的方式。

library(tidyverse)
library(rvest)

df <- data.frame(
  # you have a data.frame with a column where there are links to html research articles
  links_to_articles = c("https://doi.org/10.1093/dnares/dsm026", "https://doi.org/10.1093/dnares/dsm027")
) %>%
  # telling R to process each row separately (it is useful because functions such as read_html process one link rather than a vector of links)
  rowwise() %>%
  mutate(
    pdf_link =  read_html(links_to_articles) %>%
      html_node('.article-pdfLink') %>%
      html_attr('href') %>%
      paste0("https://academic.oup.com", .),
    articles_txt = pdf_text(pdf_link) %>%
      paste0(collapse = " ")
  ) %>%
  ungroup()

# writing the csv
df %>%
  write_csv(file = "DNAresearch.csv")

使用您的代码,我会这样做:

####install.packages("rvest")
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#for Fulltext to read pdf
####install.packages("pdftools")
library(pdftools)


fullText <- function(parsedDocument){
  endLink <- parsedDocument %>%
    html_node('.article-pdfLink') %>% html_attr('href')
  frontLink <- "https://academic.oup.com"
  #link  of pdf
  pdfLink <- paste(frontLink,endLink,sep = "")
  #extract full text  from pdfLink
  pdfFullText <- pdf_text(pdfLink)
  fulltext <- paste(pdfFullText, collapse = " ") # here I changed sep to collapse
  return(fulltext)
}
#############################################

#main function with input as parameter year
testFullText <- function(DOIurl){
  parsedDocument <- read_html(DOIurl)
  DNAresearch <- data.frame()
  allData <- data.frame("Full Text" = fullText(parsedDocument) %>% str_squish(), stringsAsFactors = FALSE) # here I used str_squish to remove extra spaces
  DNAresearch <-  rbind(DNAresearch, allData)
  write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
}
testFullText("https://doi.org/10.1093/dnares/dsm026")