我写了一个函数,用来把目录中的每个 PDF 转换为文本,并希望把转换后的文本保存到与各 PDF 对应的 txt 文件中。运行时出现如下错误:
"TypeError: expected str, bytes or os.PathLike object, not tuple"。
有人可以帮我吗?代码附在下面:
import io
import os
import os.path
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of one PDF file.

    Args:
        pdf_path: Path of a single PDF file. It is handed to ``open``, so it
            must be a str, bytes or os.PathLike object (not a tuple such as
            the items yielded by ``os.walk``).

    Returns:
        The contents accumulated in the in-memory ``io.BytesIO`` buffer that
        the converter writes to, or ``None`` when nothing was extracted.
        NOTE(review): the bytes are presumably encoded with pdfminer's
        default codec -- confirm against the installed pdfminer version.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        # Read the buffer before it is closed below.
        text = fake_file_handle.getvalue()
    finally:
        # Close open handles even when opening or parsing the PDF fails.
        converter.close()
        fake_file_handle.close()
    # An empty result is reported as None, as callers of the original expect.
    return text if text else None
def parse_pdf(filename):
    """Parse a pdf into text and save it beside the PDF as a ``.txt`` file.

    The output path is ``filename`` with its extension replaced by ``.txt``.

    Args:
        filename: Path of the PDF file to convert.

    Raises:
        OSError: If the PDF cannot be read or the text file cannot be written.
    """
    # The standard ``io`` module has no ``getPDFContent``; use this file's
    # own extractor instead.
    content = extract_text_from_pdf(filename)
    if content is None:
        # Nothing could be extracted: still produce an (empty) text file.
        content = b""
    elif isinstance(content, str):
        content = content.encode("utf-8")
    # Split off the pdf extension to add .txt instead.
    root, _ = os.path.splitext(filename)
    # Binary mode, because the content is bytes at this point; writing bytes
    # to a text-mode file raises TypeError on Python 3.
    with open(root + ".txt", "wb") as text_file:
        text_file.write(content)
if __name__ == '__main__':
    pdf_path = 'C:\\trainingdata_eng'
    # os.walk yields (dirpath, dirnames, filenames) tuples rather than file
    # paths. Passing such a tuple on to open() is what raised
    # "expected str, bytes or os.PathLike object, not tuple".
    for dirpath, _dirnames, filenames in os.walk(pdf_path):
        for name in filenames:
            # Only convert PDFs; the generated .txt files live in the same
            # directories and must not be fed back into the parser.
            if name.lower().endswith('.pdf'):
                parse_pdf(os.path.join(dirpath, name))