我正在尝试将一些pdf文件转换为txt,并且在解析时出现意外的EOF。我还尝试使用pypdf2库,因此不必写txt文件,但它只打印空格。因此,我尝试以这种方式转换pdf文件。而且,我得到的错误是在api的psparser.py文件中。 (https://github.com/euske/pdfminer/blob/master/pdfminer/psparser.py)
pdf文件包含文本和表格。
Traceback (most recent call last):
File "<ipython-input-1-32d167ab52db>", line 1, in <module>
runfile('C:/Users/av/Desktop/Data/Preprocessing.py', wdir='C:/Users/av/Desktop/Data')
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 786, in runfile
execfile(filename, namespace)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/av/Desktop/Data/Preprocessing.py", line 102, in <module>
main()
File "C:/Users/av/Desktop/Data/Preprocessing.py", line 98, in main
preproc.convertMultiple(txtDir)
File "C:/Users/av/Desktop/Data/Preprocessing.py", line 20, in convertMultiple
text = self.convert(pdfFilename) #get string of text content of pdf
File "C:/Users/av/Desktop/Data/Preprocessing.py", line 39, in convert
for page in PDFPage.get_pages(infile, pagenums):
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 134, in get_pages
for (pageno, page) in enumerate(klass.create_pages(doc)):
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 107, in create_pages
for (objid, tree) in search(document.catalog['Pages'], document.catalog):
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 100, in search
for x in search(c, tree):
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfpage.py", line 88, in search
tree = dict_value(obj).copy()
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdftypes.py", line 164, in dict_value
x = resolve1(x)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdftypes.py", line 84, in resolve1
x = x.resolve(default=default)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdftypes.py", line 71, in resolve
return self.doc.getobj(self.objid)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfdocument.py", line 691, in getobj
obj = self._getobj_parse(index, objid)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\pdfdocument.py", line 647, in _getobj_parse
self._parser.seek(pos)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\psparser.py", line 524, in seek
PSBaseParser.seek(self, pos)
File "C:\Users\av\AppData\Local\Continuum\anaconda3\lib\site-packages\pdfminer\psparser.py", line 211, in seek
self.fp.seek(pos)
ValueError: seek of closed file
代码:
from pathlib import *
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
class Preprocessing:
def convertMultiple(self, txtDir):
dir_path = Path('C:\\Users\\av\\Desktop\\Data\\pdf\\test')
pdf_files = dir_path.glob('*.pdf')
for pdfFilename in pdf_files: #iterate through pdfs in pdf directory
text = self.convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
#converts pdf, returns its text content as a string
def convert(self, fname, pages=None):
#print(fname)
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
#print("Output", output)
interpreter = PDFPageInterpreter(manager, converter)
infile = open(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close
print(text)
return text
def main():
txtDir = "C:/Users/av/Desktop/Data/pdf/txt"
preproc = Preprocessing()
preproc.convertMultiple(txtDir)
if __name__ == "__main__":
#call main function
main()
答案 0 :(得分:0)
问题在于,您仍在文档中时,为每个页面调用zip
的时间过早。因此,当您的infile.close()
尝试进入下一页时,输入文件将不再打开。
在完全完成文档处理之前,请不要关闭文件。
也就是说,get_pages()
的底部应该更像:
convert()