我学习 R 才一周,一直在尝试从多个 PDF 文件中提取特定的单词,但始终没有成功。有人知道解决办法吗?我到处查找过,仍然没有找到答案。
library(tidyverse)
library(pdftools)
library(dplyr)
library(tm)
library(RWeka)
library(magrittr)
library(slam)
library(quanteda)
library(SnowballC)
# Read in all the .pdf files ----
# List every PDF in the folder. `pattern` is a regular expression, so the dot
# is escaped and the match is anchored to the end of the file name; a bare
# ".pdf" would also match names such as "mypdfnotes.txt".
files <- list.files(
  "C:../Desktop/pdfs",
  full.names = TRUE,
  pattern = "\\.pdf$"
)

# pdf_text() returns one character string per page, so `files_text` is a list
# with one character vector (the pages) per PDF.
files_text <- lapply(files, pdf_text)

# Inspect a single document. `[[` extracts the page vector itself; `[` would
# return a length-1 list, so length() would always be 1. The guard avoids a
# subscript error when the folder holds fewer PDFs than `doc_index`.
doc_index <- 7L
if (length(files_text) >= doc_index) {
  print(length(files_text[[doc_index]]))
  print(head(files_text[[doc_index]]))
}

# Number of pages in each PDF.
lengths(files_text)
# Create a term-document matrix ----
# Build a tm corpus from the PDF files. mode = "" tells URISource to hand only
# the file URI to the reader; the default mode = "text" would first read each
# binary PDF with readLines(), which is wasted work and emits warnings, since
# readPDF extracts the text from the URI itself.
corp <- Corpus(
  URISource(files, mode = ""),
  readerControl = list(reader = readPDF)
)

# Strip punctuation, including Unicode punctuation (ucp = TRUE).
corp <- tm_map(corp, removePunctuation, ucp = TRUE)

# Term-document matrix: lower-cased, stemmed, stop words and numbers removed,
# keeping only terms that occur in at least 3 documents.
files_text.tdm <- TermDocumentMatrix(
  corp,
  control = list(
    stopwords = TRUE,
    tolower = TRUE,
    stemming = TRUE,
    removeNumbers = TRUE,
    bounds = list(global = c(3, Inf))
  )
)
# Search for specific words ----
# quanteda::corpus() accepts a character vector (one element per document),
# not a list, so collapse the pages of each PDF into a single string first.
doc_text <- vapply(files_text, paste, character(1), collapse = "\n")

# Name each document after its source file so matches can be traced back.
corp <- corpus(doc_text, docnames = basename(files))

# Split every document into sentences.
files_text <- tokens(corp, what = "sentence")

# Flatten the tokens object into a named character vector of sentences before
# calling grep(); element names carry the originating file name.
sentences <- unlist(as.list(files_text))

# Return every sentence that mentions one of the target words.
grep("risk|upside|downside", sentences, value = TRUE)