我创建了一个函数,该函数将打开目录中的每个文件,并从每个文件中提取文本,然后使用Pandas将其输出到excel工作表中。每种文件类型的索引似乎都可以正常工作。但是,从每个文件中提取的文本在列表中紧挨着出现,并且没有分开并且紧挨着它们的对应文件。
有关当前输出和我想要的输出,请参见脚本底部。
**我相信问题出在loader()函数,该函数采用路径,遍历每个目录文件,检查文件.ext并提取文本。
谢谢!
import re
#import PyPDF4
import pathlib
from pathlib import Path
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
import more_itertools as mit
p = Path('C:/Users/XXXX/Desktop/test')
txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
#excel_files = list(p.rglob('xls'))
def pdf_to_text(x):
# PDFMiner
rsrcmgr = PDFResourceManager()
sio = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Extract text
fp = open(x, 'rb')
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
fp.close()
# Get text from StringIO
text = sio.getvalue()
# Cleanup
device.close()
sio.close()
return text
#-------------------------------------------------------------------------------
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out,pdf_out = [],[],[]
if path.suffix == ".pdf":
for name1 in PDF_files:
pdf_out.append(pdf_to_text(name1))
return pdf_out
elif path.suffix == ".docx":
for name2 in docx_files:
docx_out.append(docx2txt.process(name2))
return docx_out
elif path.suffix == ".pptx":
for file in pptx_files:
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
pptx_out.append(run.text)
return pptx_out
else:
return f.readlines()
print(pdf_out)
def file_generator():
files = txt_files+PDF_files+csv_files+docx_files+pptx_files
for item in files:
yield {
"path": item,
"name": item.name[0:],
"created": time.ctime(item.stat().st_ctime),
"modified": time.ctime(item.stat().st_mtime),
"content": loader(item)
}
def to_xlsx():
df = pd.DataFrame.from_dict(file_generator())
df.head()
df.to_excel("tester4.xlsx")
if __name__ == "__main__":
to_xlsx()
#------------------------------------------------------------
OUTPUT EXAMPLE
current output:
content
["content_test1","content_test2"] test1.pdf
["content_test1","content_test2"] test2.pdf
What I want:
["content_test1"] test1.pdf
["content_test2"] test2.pdf
答案 0 :(得分:1)
每个filetype_out函数调用的追加看起来就像是将每个文件的内容添加到与该文件类型有关的列表的末尾。如果要使用每个文件的内容生成一个唯一列表,建议您为每个文件类型创建一个单独的dict,然后为每个处理的文件包括一个单独的列表。以PDF为例:
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out,pdf_out = {},{},{}
if path.suffix == ".pdf":
for name1 in PDF_files:
name1_contents = []
name1_contents.append(pdf_to_text(name1))
pdf_out[name1] = name1_contents
return pdf_out
然后以与以前相似的方式打印结果:
for name, contents in pdf_out:
print(contents + ' ' + name)