我试图从我从pdf文件中提取的文本中搜索一个单词,这是OCR&#d; d格式。此pdf文件有多个页面,因此对于每个页面,我都会搜索该单词,如果找到该单词,则写入文件名,状态(是否存在)现在),找到它的页面以及它在数据框中找到的单词。但数据框正在给出状态"现在"对于所有文件,我只想这样
file_name Status Page words
test1.pdf "Present" test1_2,test1_4 gym,school
test2.pdf "Not Present" - -
test3.pdf "Present" test3_1 gym
我在这段代码中遗漏了什么。
这是代码
All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
file_name <- All_files[i]
cnt <- pdf_info(All_files[i])$pages
print(cnt)
for(j in seq_len(cnt)){
img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
text <- ocr(img_file)
ocr_text <- capture.output(cat(text))
check <- sapply(ocr_text, paste, collapse="")
junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
file.remove(junk)
br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"
else "Present"
print(br)
if(br=="Present") {
v1[i] <- j
break}
for(k in chk_words){
br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
if(br == "Present")
ps=k
x[[k]]=ps
tc=unlist(unique(x))
}
}
print(tc)
Status <- if(v1[i] == 0) "Not Present" else "Present"
pages <- if(v1[i] == 0) "-" else
paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
words <- if(v1[i] == 0) "-" else word
df <- rbind(df, cbind(file_name = basename(file_name),
Status, pages = pages, words = words,tc))
}
任何建议都值得赞赏。
由于
答案 0 :(得分:2)
这是单词
的选项v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
file_name <- All_files[i]
cnt <- pdf_info(All_files[i])$pages
print(cnt)
for(j in seq_len(cnt)){
img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
text <- ocr(img_file)
ocr_text <- capture.output(cat(text))
check <- sapply(ocr_text, paste, collapse="")
junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
file.remove(junk)
br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"
else "Present"
print(br)
if(br=="Present") {
v1[i] <- j
break}
}
Status <- if(v1[i] == 0) "Not Present" else "Present"
pages <- if(v1[i] == 0) "-" else
paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
words <- if(v1[i] == 0) "-" else word
df <- rbind(df, cbind(file_name = basename(file_name),
Status, pages = pages, words = words))
}
-output
df
# file_name Status pages words
#1 Amenities.pdf Not Present - -
#2 test.pdf Present test_2 school