Question

我有一个可在pdf文档中查找文本的应用程序。在完整的应用程序中，我包括了多处理过程，因为读取某些pdf可能非常慢，并且具有超时功能。

作为脚本（'python main.py'）运行时，此过程可以正常工作。

我在Windows上使用pyinstaller创建一个exe文件。

如果我将这一行注释掉：

document_text = self._get_pages_through_process(path)

有效。

但是，如果我使用此行并注释掉

document_text = self._get_pages(path),

单击“查找”按钮会生成框架的第二个副本，并且该过程无法运行。

有人可以提出解决方案吗？

    import wx
    import os
    import PyPDF4
    from multiprocessing import Process, Queue


    class SearchScreen(wx.Frame):
        def __init__(self, *args, **kwargs):
            super(SearchScreen, self).__init__(None, *args, **kwargs)
            self.Title = 'Text search'
            panel = MainPanel(self)
            sizer = wx.BoxSizer()
            sizer.Add(panel)
            self.SetSizerAndFit(sizer)
            self.Show()
            self.Center()

        def on_cmd_find_click(self, event):
            del event
            search = Search().search(path='.', search_text='anything')
            label = '{} documents found'.format(len(search))
            self.lbl_found.SetLabel(label)


    class MainPanel(wx.Panel):
        def __init__(self, parent, *args, **kwargs):
            super(MainPanel, self).__init__(parent, *args, **kwargs)

            cmd_find = wx.Button(self, id=wx.ID_FIND)
            cmd_find.Bind(wx.EVT_BUTTON, parent.on_cmd_find_click)
            parent.lbl_found = wx.StaticText(self, label='')

            sizer = wx.BoxSizer(wx.VERTICAL)
            sizer.Add((200, 0))
            sizer.Add(cmd_find, flag=wx.ALL, border=5)
            sizer.Add(parent.lbl_found, flag=wx.LEFT, border=5)
            self.SetSizer(sizer)


    class Search(object):
        def __init__(self):
            self.time_out = 5

        def search(self, path, search_text):
            documents_searched = 0
            documents_with_text = []
            self.document_paths = self._get_all_docs(path)
            for path in self.document_paths:
                #document_text = self._get_pages_through_process(path)
                document_text = self._get_pages(path)
                for page_text in document_text:
                    if search_text in page_text:
                        documents_with_text.append(path)
                        break
                documents_searched += 1
            return documents_with_text

        def _get_pages_through_process(self, path, pfd_output=None):
            document_text = []
            pfd_output = Queue()
            process = Process(target=self._get_pages, args=(path, pfd_output))
            process.start()
            process.join(self.time_out)
            if process.is_alive():
                document_text = pfd_output.get()
                process.terminate()
                if not document_text:
                    self.timed_out.append(path)
            else:
                document_text = pfd_output.get()
            return document_text

        def _get_pages(self, path, pfd_output=None):
            full_text = []
            doc = PyPDF4.PdfFileReader(path)
            pages = doc.getNumPages()
            for page_number in range(pages):
                page = doc.getPage(page_number)
                text = page.extractText()
                full_text.append(text)
            if pfd_output:
                pfd_output.put(full_text)
            return full_text

        def _get_all_docs(self, path):
            documents = []
            for root, dirs, files in os.walk(path):
                for file_name in files:
                    extension = os.path.splitext(file_name)[1]
                    if extension.replace('.', '') == 'pdf':
                        file_path = root + os.sep + file_name
                        documents.append(file_path)
            return documents


    if __name__ == '__main__':
        wx_app = wx.App()
        SearchScreen()
        wx_app.MainLoop()

pyinstaller通过多处理程序产生了额外的进程

0 个答案: