在R中写入数据帧时出错

时间:2017-09-25 05:24:57

标签: r for-loop if-statement dataframe break

我试图在从 PDF 文件中提取的文本里搜索一个单词,该 PDF 是经过 OCR 处理的。此 PDF 文件有多个页面,因此我会在每个页面中搜索该单词;如果找到该单词,就把文件名、状态("Present" 或 "Not Present")、找到该单词的页面以及找到的单词写入数据框。但数据框对所有文件给出的状态都是 "Present",而我想要的结果是这样的:

file_name       Status        Page              words
test1.pdf    "Present"       test1_2,test1_4    gym,school
test2.pdf    "Not Present"     -                 -
test3.pdf    "Present"       test3_1            gym

我在这段代码中遗漏了什么?

这是代码

    # Scan every PDF in the working directory page by page with OCR, looking
    # for `word` (and, separately, for each entry of `chk_words`), and append
    # the outcome for each file to `df`.
    All_files=Sys.glob("*.pdf")
# v1[i] holds the page number on which `word` was found in file i; it stays 0
# when the word was not found (see the v1[i] == 0 checks further down).
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
# tc, ps and x are initialised once here and never reset inside the file loop,
# so whatever they accumulate for one file is still present for the next one.
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"

for (i in seq_along(All_files)){


  file_name <- All_files[i]

  # Number of pages in the current PDF.
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)

  for(j in seq_len(cnt)){
    # Render page j to a TIFF image at 400 dpi and run OCR on that image.
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    # Print the OCR text and capture the printed output as a vector of lines.
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    # List entries matching "tiff" in the hard-coded folder and remove them.
    # NOTE(review): dir() returns bare file names unless full.names = TRUE, so
    # file.remove() resolves them against the working directory - confirm that
    # the working directory is this folder.
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    # Case-insensitive fixed-string search for `word` in the lines of this page.
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"  
    else "Present" 
    print(br)       
    # On the first page that contains `word`, record the page number and leave
    # the page loop. Because of the break, later pages are not scanned and the
    # chk_words loop below does not run for the page where `word` was found.
    if(br=="Present") {
      v1[i] <- j
      break}

    # Repeat the search for every entry of chk_words on this page.
    for(k in chk_words){ 
      # print() returns its argument, so br ends up as the printed string.
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
      # Only `ps=k` is governed by this if (there are no braces); the two
      # assignments after it run on every iteration, so when k is absent ps
      # keeps whatever value it had from an earlier iteration or file.
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }




  }

  print(tc)
  # Build the per-file summary from v1[i]: 0 means `word` was not found.
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else 
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  # NOTE(review): tc may hold several words; cbind() would then recycle the
  # scalar columns into one row per element of tc - confirm this is intended.
  df <- rbind(df, cbind(file_name = basename(file_name),
                        Status, pages = pages, words = words,tc))


}

任何建议都值得赞赏。

谢谢

1 个答案:

答案 0 :(得分:2)

这是针对单个单词的一种做法:
# Search every PDF listed in `All_files` for a single word using OCR and
# record, per file, whether the word occurs and on which page it first occurs.
#
# Requires: `All_files`, a character vector of PDF paths, defined beforehand.
# Produces: `v1` (first matching page per file, 0 when the word is absent) and
#           `df` (one row per file: file_name, Status, pages, words).
v1 <- numeric(length(All_files))
word <- "school"
# Collect one row per file and bind them once after the loop instead of
# growing `df` with rbind() on every iteration.
rows <- vector("list", length(All_files))

for (i in seq_along(All_files)) {
  file_name <- All_files[i]

  cnt <- pdf_info(file_name)$pages
  print(cnt)

  for (j in seq_len(cnt)) {
    # Render page j to a TIFF image and OCR it.
    img_file <- pdftools::pdf_convert(
      file_name,
      format = "tiff", pages = j, dpi = 400
    )
    text <- ocr(img_file)
    # Delete exactly the image that was just rendered. The previous cleanup
    # listed a folder via an undefined `path` and passed bare file names to
    # file.remove(), which only works if the working directory is that folder.
    file.remove(img_file)

    # Split the OCR text into lines and look for the word, ignoring case.
    ocr_text <- capture.output(cat(text))
    found <- any(
      stri_detect_fixed(tolower(ocr_text), tolower(word)),
      na.rm = TRUE
    )
    br <- if (found) "Present" else "Not Present"
    print(br)

    # Stop at the first page that contains the word; only that page is kept.
    if (found) {
      v1[i] <- j
      break
    }
  }

  # v1[i] is still 0 when no page of this file contained the word.
  hit <- v1[i] != 0
  Status <- if (hit) "Present" else "Not Present"
  pages <- if (hit) {
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  } else {
    "-"
  }
  words <- if (hit) word else "-"

  rows[[i]] <- data.frame(
    file_name = basename(file_name),
    Status = Status,
    pages = pages,
    words = words,
    stringsAsFactors = FALSE
  )
}

# With no input files there is nothing to bind; keep `df` an empty data frame.
df <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()

-output

df
#     file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school