Question

我正在尝试将一些pdf文件转换为txt，并且在解析时出现意外的EOF。我还尝试使用pypdf2库，因此不必写txt文件，但它只打印空格。因此，我尝试以这种方式转换pdf文件。而且，我得到的错误是在api的psparser.py文件中。（https://github.com/euske/pdfminer/blob/master/pdfminer/psparser.py）

pdf文件包含文本和表格。

Traceback (most recent call last):
  File "<ipython-input-1-32d167ab52db>", line 1, in <module>
    runfile('C:/Users/av/Desktop/Data/Preprocessing.py', wdir='C:/Users/av/Desktop/Data')
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
    execfile(filename, namespace)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)
  File "C:/Users/av/Desktop/Data/Preprocessing.py", line 102, in <module>
    main()
  File "C:/Users/av/Desktop/Data/Preprocessing.py", line 98, in main
    preproc.convertMultiple(txtDir)
  File "C:/Users/av/Desktop/Data/Preprocessing.py", line 20, in convertMultiple
    text = self.convert(pdfFilename) #get string of text content of pdf
  File "C:/Users/av/Desktop/Data/Preprocessing.py", line 39, in convert
    for page in PDFPage.get_pages(infile, pagenums):
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 134, in get_pages
    for (pageno, page) in enumerate(klass.create_pages(doc)):
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 107, in create_pages
    for (objid, tree) in search(document.catalog['Pages'], document.catalog):
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 100, in search
    for x in search(c, tree):
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 88, in search
    tree = dict_value(obj).copy()
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdftypes.py", line 164, in dict_value
    x = resolve1(x)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdftypes.py", line 84, in resolve1
    x = x.resolve(default=default)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdftypes.py", line 71, in resolve
    return self.doc.getobj(self.objid)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfdocument.py", line 691, in getobj
    obj = self._getobj_parse(index, objid)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfdocument.py", line 647, in _getobj_parse
    self._parser.seek(pos)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\psparser.py", line 524, in seek
    PSBaseParser.seek(self, pos)
  File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\psparser.py", line 211, in seek
    self.fp.seek(pos)
ValueError: seek of closed file

代码：

from pathlib import *
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
class Preprocessing:

    def convertMultiple(self, txtDir):
        dir_path = Path('C:\\Users\\av\\Desktop\\Data\\pdf\\test')
        pdf_files = dir_path.glob('*.pdf')

        for pdfFilename in pdf_files: #iterate through pdfs in pdf directory
            text = self.convert(pdfFilename) #get string of text content of pdf
            textFilename = txtDir + pdf + ".txt"
            textFile = open(textFilename, "w") #make text file
            textFile.write(text) #write text to text file

    #converts pdf, returns its text content as a string
    def convert(self, fname, pages=None):
        #print(fname)
        if not pages:
            pagenums = set()
        else:
            pagenums = set(pages)

        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        #print("Output", output)
        interpreter = PDFPageInterpreter(manager, converter)
        infile = open(fname, 'rb')
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
            infile.close()
            converter.close()
            text = output.getvalue()
            output.close
            print(text)
        return text 
def main():
    txtDir = "C:/Users/av/Desktop/Data/pdf/txt"
    preproc = Preprocessing()
    preproc.convertMultiple(txtDir)

if __name__ == "__main__":
    #call main function
    main()

Answer 1

问题在于，您仍在文档中时，为每个页面调用zip的时间过早。因此，当您的infile.close()尝试进入下一页时，输入文件将不再打开。

在完全完成文档处理之前，请不要关闭文件。

也就是说，get_pages()的底部应该更像：

convert()

pdfminer PDFPage.get_pages（）中的“查找关闭的文件”

1 个答案: