我有一个从 PowerPoint 文件中提取文本的函数。但是,它的输出把所有 PowerPoint 文件的全部文本都放进了同一个大列表。如何把文本分开,使我提取的两个 PowerPoint 文件各自得到一个文本列表?
# Module-level accumulator: every call to pptx_collect() appends to this
# same list, so its contents persist from one call to the next.
text_runs = []


def pptx_collect(x):
    """Append the text of every run of every presentation to ``text_runs``.

    The argument ``x`` is accepted but never read: each call walks the
    module-level ``pptx_files`` collection, and all run texts end up in the
    single shared ``text_runs`` list.

    Returns:
        The shared module-level ``text_runs`` list (the same object on
        every call).
    """
    for pptx_path in pptx_files:
        deck = Presentation(pptx_path)
        for slide in deck.slides:
            for shape in slide.shapes:
                # Shapes without a text frame are skipped.
                if shape.has_text_frame:
                    for para in shape.text_frame.paragraphs:
                        text_runs.extend(piece.text for piece in para.runs)
    return text_runs
def Powerpoint(pptx_files):
    """Record metadata, extracted text and keyword matches for each file.

    For every entry of ``pptx_files`` the text runs are obtained from
    ``pptx_collect`` and searched with the module-level patterns ``inp``,
    ``inp1`` and ``word_search``. One entry per file is then appended to
    each column of the module-level ``file_dict``.

    Args:
        pptx_files: Iterable of path objects exposing ``.name`` and
            ``.stat()`` (presumably ``pathlib.Path`` -- confirm with caller).

    Returns:
        The module-level ``file_dict`` after the new entries were appended.
    """
    patterns = (inp, inp1, word_search)
    for name in pptx_files:
        IP_list = pptx_collect(name)

        # Collect the matches of every run. Rebinding the findall() results
        # on each iteration would keep only the last run's matches and
        # raise NameError for a file that yields no text at all.
        keywords = []
        for item in IP_list:
            for pattern in patterns:
                keywords.extend(re.findall(pattern, item))

        file_stat = name.stat()  # one stat() call serves both timestamps
        file_dict['keyword'].append(keywords)
        file_dict['name'].append(name.name)
        file_dict['created'].append(time.ctime(file_stat.st_ctime))
        file_dict['modified'].append(time.ctime(file_stat.st_mtime))
        file_dict['path'].append(name)
        file_dict["content"].append(IP_list)  # <--- This is where the
        # problem is.
    return file_dict


Powerpoint(pptx_files)
我得到的输出是:
['Billy’s ', 'pii', 'Just a test', '04/15/1991', '04.15.1991', '234-23-6456-billys ', 'SSN', 'Address: 58 bonnie ', 'rd', ', Boston, mass 07037', 'Text from second 2 ', 'Text from second ', 'powerpoint', ' ', '(second page)', 'Text from second 2 ', 'Text from second ', 'powerpoint', ' ', '(second page)', 'FOUO Test', 'Secret', 'This is a test to check ', 'for keywords']
我想得到:
['Billy’s ', 'pii', 'Just a test', '04/15/1991', '04.15.1991', '234-23-6456-billys ', 'SSN', 'Address: 58 bonnie ', 'rd', ', Boston, mass 07037', 'Text from second 2 ']
['Text from second ', 'powerpoint', ' ', '(second page)', 'Text from second 2 ', 'Text from second ', 'powerpoint', ' ', '(second page)', 'FOUO Test', 'Secret', 'This is a test to check ', 'for keywords']
答案 0 :(得分:0)
你的 pptx_collect() 函数忽略了参数 x,每次调用都会遍历所有文件。试试这个,让它只处理传入的那一个文件:
def pptx_collect(x):
    """Return the text of every run in the single presentation ``x``.

    Args:
        x: The one file to open; passed straight to ``Presentation()``.

    Returns:
        A new list of run text strings in slide, shape, paragraph, run
        order.
    """
    # Build a fresh list on every call. Appending to a shared module-level
    # list would make a later call return the earlier files' text as well.
    text_runs = []
    prs = Presentation(x)
    for slide in prs.slides:
        for shape in slide.shapes:
            # Shapes without a text frame are skipped.
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text_runs.extend(run.text for run in paragraph.runs)
    return text_runs
答案 1 :(得分:0)
def pptx_collect(x):
    """Return one list of run texts per presentation in ``pptx_files``.

    Args:
        x: Not read by this function; kept so existing callers that pass an
            argument keep working.

    Returns:
        A new list with one inner list per entry of the module-level
        ``pptx_files``, each holding that file's run text strings.
    """
    # Start from an empty result on every call. A shared module-level list
    # would gain a duplicate set of per-file lists each time this runs.
    text_runs = []
    for file in pptx_files:
        inner_list = []
        prs = Presentation(file)
        for slide in prs.slides:
            for shape in slide.shapes:
                # Shapes without a text frame are skipped.
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    inner_list.extend(run.text for run in paragraph.runs)
        # One inner list per file keeps the files' text separated.
        text_runs.append(inner_list)
    return text_runs
我还建议在函数内部定义 text_runs,这样每次调用都会从一个空列表开始,而不会在多次调用之间累积结果。