为什么多线程处理时我的PyQt代码无法完全执行?

时间:2019-07-26 13:29:39

标签: python multithreading web-scraping pyqt

我正在尝试使用 PyQt5 和多线程编写一个网络抓取工具,以便并行抓取多个 url(我知道已有这个问题:Scrape multiple urls using QWebPage,但我确实想编写一个并行版本,而且我实在不明白为什么它不起作用)。我已经编写了这段代码:

python
import sys
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *

from PyQt5.QtWebEngineWidgets import QWebEnginePage

import time

# URL handed to every Worker created in MainWindow.__init__.
urlb = "https://www.google.fr/"


class Worker(QRunnable, QWebEnginePage):
    '''
    Runnable that is meant to load one URL and keep the resulting HTML.

    Inherits from QRunnable so instances can be passed to QThreadPool.start(),
    and from QWebEnginePage so the instance itself performs the page load.
    '''
    def __init__(self, url):
        # NOTE(review): QWebEnginePage.__init__ is called again explicitly in
        # run(); which base initialisers this super() call reaches is not
        # evident from this code alone - confirm.
        super(Worker, self).__init__()
        # Address this worker is supposed to load.
        self.url = url

    def _on_load_finished(self):
        # Connected to the loadFinished signal in run().
        print("tfouuu")
        # NOTE(review): toHtml() is given a callback (self.Callable), which
        # suggests the HTML is delivered through that callback; the value
        # assigned here is presumably not the page HTML - confirm against the
        # QWebEnginePage.toHtml documentation.
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        # Callback passed to toHtml(); stores the received HTML string.
        self.html = html_str

    @pyqtSlot()
    def run(self):
        '''
        Body executed when the thread pool runs this worker.

        Prints progress markers "a".."d" around each step so the point where
        execution stops can be seen on the console.
        '''
        print("a") 
        time.sleep(2)
        print(self.url)
        print("b")
        # Second, explicit initialisation of the QWebEnginePage base, this time
        # from inside run().
        # NOTE(review): run() is started via QThreadPool, so this presumably
        # executes outside the main thread - verify that QWebEnginePage
        # supports that.
        QWebEnginePage.__init__(self)
        print("c")
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        # NOTE(review): `url` is neither a parameter nor a local of run(), and
        # the only module-level URL variable visible here is `urlb`; unless a
        # star-import supplies `url`, this lookup raises NameError and "d" is
        # never printed. `self.url` was presumably intended.
        self.load(QUrl(url))
        print("d")

class MainWindow(QMainWindow):
    '''Main window whose constructor creates a thread pool and starts two Workers.'''


    def __init__(self, *args, **kwargs):

        # NOTE(review): this attribute is assigned before super().__init__()
        # has run - confirm that PyQt allows this ordering.
        self.threadpool = QThreadPool()
        print("Multithreading with maximum %d threads" % self.threadpool.maxThreadCount())

        super(MainWindow, self).__init__(*args, **kwargs)

        # Two workers for the same URL (urlb), both handed to the pool.
        # They are only held in local variables here, not stored on self.
        worker = Worker(urlb)
        worker2 = Worker(urlb)
        self.threadpool.start(worker)
        self.threadpool.start(worker2)




# Create the application, build the window (its constructor starts the
# workers), then enter the Qt event loop.
# NOTE(review): the window is never shown and nothing in this snippet calls
# quit(), which is consistent with the "keeps running" symptom described below.
app = QApplication([])
window = MainWindow()
app.exec_()

但是我有两个问题:

  • 第一个是我的代码不断运行而不停止(我想这与缺少app.quit()行有关,但我真的不知道将其放在哪里)

  • 第二个问题是,大多数情况下我的代码只打印 'a'、'b'、'c',也就是说它没有执行 connect 和 load 部分

1 个答案:

答案 0 :(得分:1)

QWebEngineView无法并且不应在另一个线程上运行。

相反,如果要异步获取html,则应使用Qt信号:

from functools import partial
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets


class WebManager(QtCore.QObject):
    """Load several pages through QWebEnginePage and collect their HTML.

    Every page is created as a child of this object and loads asynchronously;
    completion is reported through the ``loadFinished`` signal, so no extra
    threads are involved. Once the last pending page has either delivered its
    HTML or failed to load, the application is asked to quit.

    Attributes:
        pages: pages whose load (or HTML extraction) is still pending.
        results: ``(requested_url, html)`` tuples for pages that loaded
            successfully, in completion order.
    """

    def __init__(self, parent=None):
        super(WebManager, self).__init__(parent)
        self.pages = []
        self.results = []

    def load(self, url):
        """Start loading *url* in a new page and track it as pending."""
        page = QtWebEngineWidgets.QWebEnginePage(self)
        page.loadFinished.connect(self._on_load_finished)
        self.pages.append(page)
        page.load(QtCore.QUrl(url))

    @QtCore.pyqtSlot(bool)
    def _on_load_finished(self, ok):
        """Request the HTML of the page that finished, or drop it on failure."""
        page = self.sender()
        if not isinstance(page, QtWebEngineWidgets.QWebEnginePage):
            return
        if ok:
            # toHtml() hands the HTML to a callback; bind the page so the
            # callback knows which page the HTML belongs to.
            page.toHtml(partial(self.callable, page))
        else:
            # A failed load must also count as "done"; otherwise a failure of
            # the last pending page would leave the event loop running forever.
            self._finish(page)

    def callable(self, page, html):
        """Store the HTML received for *page* and mark the page as done."""
        url = page.requestedUrl().toString()
        self.results.append((url, html))
        self._finish(page)

    def _finish(self, page):
        """Stop tracking *page*, schedule its deletion, quit if none are left."""
        self.pages.remove(page)
        page.deleteLater()
        if not self.pages:
            QtWidgets.QApplication.quit()


if __name__ == "__main__":
    import sys

    app = QtWidgets.QApplication(sys.argv)

    manager = WebManager()

    # Queue one documentation page per Q* name exported by QtWebEngineWidgets.
    format_url = "http://pyqt.sourceforge.net/Docs/PyQt5/%s.html"
    for name in dir(QtWebEngineWidgets):
        if name.startswith("Q"):
            manager.load(format_url % name.lower())

    # Run the event loop; it returns after the manager calls
    # QApplication.quit().
    app.exec_()

    for url, html in manager.results:
        print(url)
        print(html)