我有一个可在pdf文档中查找文本的应用程序。在完整的应用程序中,我包括了多处理过程,因为读取某些pdf可能非常慢,并且具有超时功能。
作为脚本('python main.py')运行时,此过程可以正常工作。
我在Windows上使用pyinstaller创建一个exe文件。
如果我将这一行注释掉:
document_text = self._get_pages_through_process(path)
有效。
但是,如果我使用此行并注释掉
document_text = self._get_pages(path),
单击“查找”按钮会生成框架的第二个副本,并且该过程无法运行。
有人可以提出解决方案吗?
import wx
import os
import PyPDF4
from multiprocessing import Process, Queue
class SearchScreen(wx.Frame):
def __init__(self, *args, **kwargs):
super(SearchScreen, self).__init__(None, *args, **kwargs)
self.Title = 'Text search'
panel = MainPanel(self)
sizer = wx.BoxSizer()
sizer.Add(panel)
self.SetSizerAndFit(sizer)
self.Show()
self.Center()
def on_cmd_find_click(self, event):
del event
search = Search().search(path='.', search_text='anything')
label = '{} documents found'.format(len(search))
self.lbl_found.SetLabel(label)
class MainPanel(wx.Panel):
def __init__(self, parent, *args, **kwargs):
super(MainPanel, self).__init__(parent, *args, **kwargs)
cmd_find = wx.Button(self, id=wx.ID_FIND)
cmd_find.Bind(wx.EVT_BUTTON, parent.on_cmd_find_click)
parent.lbl_found = wx.StaticText(self, label='')
sizer = wx.BoxSizer(wx.VERTICAL)
sizer.Add((200, 0))
sizer.Add(cmd_find, flag=wx.ALL, border=5)
sizer.Add(parent.lbl_found, flag=wx.LEFT, border=5)
self.SetSizer(sizer)
class Search(object):
def __init__(self):
self.time_out = 5
def search(self, path, search_text):
documents_searched = 0
documents_with_text = []
self.document_paths = self._get_all_docs(path)
for path in self.document_paths:
#document_text = self._get_pages_through_process(path)
document_text = self._get_pages(path)
for page_text in document_text:
if search_text in page_text:
documents_with_text.append(path)
break
documents_searched += 1
return documents_with_text
def _get_pages_through_process(self, path, pfd_output=None):
document_text = []
pfd_output = Queue()
process = Process(target=self._get_pages, args=(path, pfd_output))
process.start()
process.join(self.time_out)
if process.is_alive():
document_text = pfd_output.get()
process.terminate()
if not document_text:
self.timed_out.append(path)
else:
document_text = pfd_output.get()
return document_text
def _get_pages(self, path, pfd_output=None):
full_text = []
doc = PyPDF4.PdfFileReader(path)
pages = doc.getNumPages()
for page_number in range(pages):
page = doc.getPage(page_number)
text = page.extractText()
full_text.append(text)
if pfd_output:
pfd_output.put(full_text)
return full_text
def _get_all_docs(self, path):
documents = []
for root, dirs, files in os.walk(path):
for file_name in files:
extension = os.path.splitext(file_name)[1]
if extension.replace('.', '') == 'pdf':
file_path = root + os.sep + file_name
documents.append(file_path)
return documents
if __name__ == '__main__':
wx_app = wx.App()
SearchScreen()
wx_app.MainLoop()