Wrong output: text extracted from pdf, docx and pptx will not be output in its own row

Date: 2019-04-19 17:07:54

Tags: python pandas anaconda pdfminer pathlib

I created a function that opens each file in a directory, extracts the text from each one, and then uses Pandas to output it to an Excel sheet. The indexing for each file type seems to work fine. However, the text extracted from the files appears bunched together in one list instead of being separated out next to its corresponding file.

See the bottom of the script for the current output and the output I want.

I believe the problem lies in the loader() function, which takes a path, iterates over each file in the directory, checks the file extension, and extracts the text.

Thanks!

from pathlib import Path
import time
from io import StringIO

import pandas as pd
import docx2txt
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pptx import Presentation



p = Path('C:/Users/XXXX/Desktop/test')


txt_files = list(p.rglob('*.txt'))
PDF_files = list(p.rglob('*.pdf'))
csv_files = list(p.rglob('*.csv'))
docx_files = list(p.rglob('*.docx'))
pptx_files = list(p.rglob('*.pptx'))
#excel_files = list(p.rglob('*.xls'))




def pdf_to_text(x):

    # PDFMiner 
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = open(x, 'rb')
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text
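
# Optional sanity check (not in the original script; assumes PDF_files
# is non-empty): extract one PDF and print the first 200 characters.
#if PDF_files:
#    print(pdf_to_text(PDF_files[0])[:200])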

#-------------------------------------------------------------------------------

def loader(path):
    with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
        docx_out,pptx_out,pdf_out = [],[],[]
        if path.suffix == ".pdf":
            for name1 in PDF_files:
                pdf_out.append(pdf_to_text(name1))
            return pdf_out
        elif path.suffix == ".docx":
            for name2 in docx_files:
                docx_out.append(docx2txt.process(name2))
            return docx_out
        elif path.suffix == ".pptx":
            for file in pptx_files:
                prs = Presentation(file)
                for slide in prs.slides:
                    for shape in slide.shapes:
                        if not shape.has_text_frame:
                            continue
                        for paragraph in shape.text_frame.paragraphs:
                            for run in paragraph.runs:
                                pptx_out.append(run.text)
            return pptx_out
        else:
            return f.readlines()

#print(pdf_out)  # pdf_out is local to loader(); printing it here raises NameError



def file_generator():
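    # Yield one record per file; loader() supplies the "content" field.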
    files = txt_files+PDF_files+csv_files+docx_files+pptx_files
    for item in files:
        yield {
            "path": item,
            "name": item.name[0:],
            "created": time.ctime(item.stat().st_ctime),
            "modified": time.ctime(item.stat().st_mtime),
            "content": loader(item) 
        }


def to_xlsx():
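    # One row per record from file_generator(); write the table to Excel.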
    df = pd.DataFrame.from_dict(file_generator())
    df.head()
    df.to_excel("tester4.xlsx")

if __name__ == "__main__":
    to_xlsx()
#------------------------------------------------------------
OUTPUT EXAMPLE

current output:
content
["content_test1","content_test2"]  test1.pdf
["content_test1","content_test2"]  test2.pdf

What I want:
["content_test1"]  test1.pdf
["content_test2"]  test2.pdf

1 Answer:

Answer 0 (score: 1)

The append in each filetype_out call looks like it adds the contents of every file to the end of the list for that file type. If you want to produce a distinct list of each file's contents, I suggest creating a separate dict for each file type, and then a separate list for each file processed. Using PDF as an example:

def loader(path):
    with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
        docx_out,pptx_out,pdf_out = {},{},{}
        if path.suffix == ".pdf":
            for name1 in PDF_files:
                name1_contents = []
                name1_contents.append(pdf_to_text(name1))
                pdf_out[name1] = name1_contents
            return pdf_out

Then print the results in a similar fashion to before:

for name, contents in pdf_out.items():
    print(str(contents) + '  ' + str(name))
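
As a further note beyond the original answer: file_generator() already calls loader() once per file, so another way to get one row per file is to have loader() extract only the file it was given. A minimal sketch under that assumption, reusing pdf_to_text, docx2txt and Presentation from the question:

# A sketch, not from the original answer: extract text only from the
# single file passed in, so each row built by file_generator() holds
# that file's own content.
def loader(path):
    if path.suffix == ".pdf":
        return [pdf_to_text(path)]
    elif path.suffix == ".docx":
        return [docx2txt.process(path)]
    elif path.suffix == ".pptx":
        prs = Presentation(path)
        return [run.text
                for slide in prs.slides
                for shape in slide.shapes if shape.has_text_frame
                for paragraph in shape.text_frame.paragraphs
                for run in paragraph.runs]
    else:
        with open(str(path.resolve()), "r", encoding="ISO-8859-1") as f:
            return f.readlines()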