我正在尝试将网络上的pdf导入R:
# Question's reproduction script: tries to read a remote PDF with tm.
library(tm)
# HTTPS URL of the PDF to import.
webpdf <- "https://www.lme.com/~/media/Files/Market%20data/COTR/2015/2015_01/Cotr%2019%20Jan%202015.pdf"
# NOTE(review): system.file() resolves paths inside an installed package
# (here "tm"); a web URL used as the path presumably matches nothing and
# yields "", which would leave `uri` as a bare "file://" -- confirm.
uri <- sprintf("file://%s", system.file(file.path("doc", webpdf), package = "tm"))
# Run the extraction only when Sys.which() finds both the pdfinfo and
# pdftotext executables.
if(all(file.exists(Sys.which(c("pdfinfo", "pdftotext"))))) {
# Build a reader with the "-layout" text option and apply it to `uri`.
pdf <- readPDF(control = list(text = "-layout"))(elem = list(uri = uri),
language = "en",
id = "id1")
# Show the first 13 elements of the extracted content.
content(pdf)[1:13]
}
# Build a corpus from `uri` using the Ghostscript-based reader.
# NOTE(review): the reported system2(gs_cmd, ...) error mentions an empty
# command name, so presumably no Ghostscript executable was found -- confirm.
VCorpus(URISource(uri, mode = ""),
readerControl = list(reader = readPDF(engine = "ghostscript")))
这段代码无法正常运行,并报出以下错误:
Error in system2(gs_cmd, c("-dNODISPLAY -q", sprintf("-sFile=%s", shQuote(file)), :
'""' not found
答案 0 :(得分:1)
初始设置存在很多问题。下面的代码可以获取PDF的内容,但对于接下来会遇到的 tm `Corpus` 问题,您应该另外提一个问题。
# Download a remote PDF over HTTPS, extract its text with tm, and inspect
# the resulting document object.
library(tm)
library(httr) # makes it easier to fetch content over https

webpdf <- "https://www.lme.com/~/media/Files/Market%20data/COTR/2015/2015_01/Cotr%2019%20Jan%202015.pdf"
doc <- "cotr.pdf"

# Save the file locally and reuse it on later runs. write_disk() refuses to
# overwrite an existing file by default, so only issue the request when the
# local copy is missing.
if (!file.exists(doc)) {
  resp <- GET(webpdf, write_disk(doc))
  # Do not keep an error page around as if it were the cached PDF.
  if (http_error(resp)) {
    unlink(doc)
  }
  stop_for_status(resp)
}

# Only attempt extraction when the pdfinfo and pdftotext executables are
# found on the search path.
if (all(file.exists(Sys.which(c("pdfinfo", "pdftotext"))))) {
  pdf <- readPDF(control = list(text = "-layout"))(
    elem = list(uri = doc),
    language = "en",
    id = "id1"
  )
  # httr also has a "content()" so make the call explicit
  print(NLP::content(pdf)[1:13])
  # str() prints the structure itself and returns NULL, so it is not wrapped
  # in print(); it stays inside the guard because `pdf` only exists here.
  str(pdf)
}

# Sample output of str(pdf):
## List of 2
## $ content: chr [1:113] "Commitment of Trader Report - Market Report as of 2015/01/21" "" "Metal" "AA" ...
## $ meta :List of 7
## ..$ author : NULL
## ..$ datetimestamp: POSIXlt[1:1], format: "2015-01-21 08:59:10"
## ..$ description : NULL
## ..$ heading : NULL
## ..$ id : chr "cotr.pdf"
## ..$ language : chr "en"
## ..$ origin : NULL
## ..- attr(*, "class")= chr "TextDocumentMeta"
## - attr(*, "class")= chr [1:2] "PlainTextDocument" "TextDocument"