我正在将PDF解析为本地CSV文件。解析器可以正常工作,但是生成的CSV不包含任何信息。我该如何解决?
我需要pandas
DataFrame
中不同列上的文档名称和文档正文。当前,生成的CSV除了列名之外,不包含任何信息。
#Import packages
import io
import os
import glob
from os import walk
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import pandas as pd
pdf_path = r'D:/Users/JAY/Downloads/ABC'
#Create a list named result to store parsed data
result = []
#PDF parser function
def extract_text_from_pdf(pdf_path):
resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle)
page_interpreter = PDFPageInterpreter(resource_manager, converter)
with open(pdf_path, 'rb') as fh:
for page in PDFPage.get_pages(fh, caching=True,check_extractable=True):
page_interpreter.process_page(page)
text = fake_file_handle.getvalue()
converter.close()
fake_file_handle.close()
if text:
return text
if __name__ == '__main__':
files = [i for i in os.listdir(pdf_path) if i.endswith('pdf')]
name = [i for i in os.listdir(pdf_path)]
#Test on first 20 docs append parsed data to list result
for case in files[1:20]:
a=extract_text_from_pdf(case)
result.append([a, name])
#Create dataframe and assign column name: name and summary
df=pd.DataFrame(results, columns = ['name','summary'])
#Export to CSV
export_csv = df.to_csv (r'D:\Users\JAY\Downloads\ABC.csv', index = None, header=True)