我有一个pdf列表和关键字列表,每个关键字都应贯穿每个pdf,如果pdf中存在,则返回TRUE
尝试了两个for
循环,但是它仅返回last关键字,并且结果也不正确,所有都返回TRUE。只有一个pdf包含该单词,但所有pdf都显示TRUE
library(pdftools)
library(stringr)
library(tm)
library(filesstrings)
library(RODBC)
setwd("C:/RProject/ReadPDF/InputFiles/")
SelectFirstKeyword <- list("new formula" , "new research", "morning up")
ID <- list.files("C:/RProject/ReadPDF/InputFiles/", full.names = T)
ID_ <- ID[ID != ""]
files <- ID_
for(i in 1:length(files)){
for(j in 1:length(SelectFirstKeyword)){
filename <- files[i]
read <- readPDF(control = list(text = "-layout"))
mystring <- Corpus(URISource(filename), readerControl = list(reader = read))
lower_string <- tolower(mystring)
CleanData <- gsub("\n", " ",lower_string)
second_string <- tolower(SelectFirstKeyword[j])
print(second_string)
mystring <- paste(CleanData,sep=" ")
mystring_vector <- str_split(CleanData, "!")[[1]]
FirstMatch <- second_string
Match1 <- grepl(FirstMatch,mystring_vector[1])
mystring <- paste(CleanData,sep=" ")
mystring_vector <- str_split(CleanData, "!")[[1]]
FirstMatch <- second_string
Match1 <- grepl(FirstMatch,mystring_vector[1])
FinalOutput <- paste(pdf_list,Outid, Match1)
View(FinalOutput)
if (Match1 == TRUE)
{
DATA1 <- paste(ID = pdf_list, Outid = FirstMatch , Keywordinnote = Match1)
}
}
}
View(DATA1)
### I shall email you the pdf files
预期输出为:
ID outid keywordinnote
1 news TRUE
2 new formula TRUE