我正在尝试使用 PyQt5 和多线程编写一个网络抓取工具,以便并行抓取多个 URL(我知道已有这个问题:Scrape multiple urls using QWebPage,但我确实想编写一个并行版本,而且实在不明白为什么它不起作用)。我已经编写了这段代码:
python
import sys
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import time
# URL handed to every Worker that MainWindow creates.
urlb = "https://www.google.fr/"
class Worker(QRunnable, QWebEnginePage):
    """Thread-pool task that loads one URL in a web page and keeps its HTML.

    The instance is submitted to a ``QThreadPool``; ``run`` then initialises
    the ``QWebEnginePage`` part of the object and starts loading ``url``.
    Once the page reports that loading finished, its markup is requested and
    stored in ``html``.

    NOTE(review): the page is initialised and driven from a pool thread, and
    the class inherits from two Qt classes at once. Qt WebEngine objects are
    presumably expected to live on the GUI thread, so the load may never
    complete -- confirm against the Qt documentation before relying on this.

    Attributes:
        url: Address passed to the constructor; loaded by ``run``.
        html: Markup of the loaded page, ``''`` until the callback has fired.
    """

    def __init__(self, url):
        """Remember ``url``; the web-page part is initialised later in ``run``."""
        super(Worker, self).__init__()
        self.url = url
        # Defined up front so the attribute exists even if run() never finishes.
        self.html = ''

    def _on_load_finished(self):
        """Request the page markup once loading has finished."""
        print("tfouuu")
        # toHtml() delivers its result through the callback; its return value
        # is not the markup, so it must not be assigned to self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Receive the markup produced by ``toHtml`` and store it."""
        self.html = html_str

    @pyqtSlot()
    def run(self):
        """Entry point executed by the thread pool: start loading ``self.url``."""
        print("a")
        time.sleep(2)
        print(self.url)
        print("b")
        QWebEnginePage.__init__(self)
        print("c")
        self.loadFinished.connect(self._on_load_finished)
        # Use the URL stored on the instance; there is no bare ``url`` name in
        # scope here, so referring to it would raise NameError.
        self.load(QUrl(self.url))
        print("d")
class MainWindow(QMainWindow):
    """Main window that submits two ``Worker`` tasks as soon as it is built."""

    def __init__(self, *args, **kwargs):
        """Create a thread pool, report its size, and start two workers.

        All positional and keyword arguments are forwarded to ``QMainWindow``.
        """
        pool = QThreadPool()
        self.threadpool = pool
        max_threads = pool.maxThreadCount()
        print("Multithreading with maximum %d threads" % max_threads)
        super().__init__(*args, **kwargs)
        # Both workers are built first, then started in creation order.
        for job in (Worker(urlb), Worker(urlb)):
            pool.start(job)
# Create the application object first, then the window.
app = QApplication([])
# Building the window is what submits the two workers to the thread pool.
window = MainWindow()
# Enter the Qt event loop.
# NOTE(review): nothing in this script calls quit() and the window is never
# shown, so this call presumably never returns -- confirm.
app.exec_()
但是我有两个问题:
第一个是我的代码不断运行而不停止(我想这与缺少app.quit()行有关,但我真的不知道将其放在哪里)
第二个问题是:大多数情况下,我的代码只打印 'a'、'b'、'c',也就是说它没有执行 connect 和 load 这两步
答案 0 :(得分:1)
QWebEngineView无法并且不应在另一个线程上运行。
相反,如果要异步获取html,则应使用Qt信号:
from functools import partial
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebManager(QtCore.QObject):
    """Load several URLs concurrently and collect their HTML.

    Each call to ``load`` creates one ``QWebEnginePage`` and starts loading
    it. When a page finishes, its markup is requested and stored in
    ``results``. When no pages are outstanding any more -- whether the last
    one succeeded or failed -- the application is asked to quit.

    Attributes:
        pages: Pages whose load (or HTML retrieval) is still in progress.
        results: ``(url, html)`` tuples for every page that loaded
            successfully, in completion order.
    """

    def __init__(self, parent=None):
        """Create an empty manager, optionally owned by ``parent``."""
        super(WebManager, self).__init__(parent)
        self.pages = []
        self.results = []

    def load(self, url):
        """Start loading ``url`` in a new page and track it as pending."""
        page = QtWebEngineWidgets.QWebEnginePage(self)
        page.loadFinished.connect(self._on_load_finished)
        self.pages.append(page)
        page.load(QtCore.QUrl(url))

    @QtCore.pyqtSlot(bool)
    def _on_load_finished(self, ok):
        """Handle ``loadFinished`` from one of the tracked pages.

        Args:
            ok: ``True`` if the page loaded successfully.
        """
        page = self.sender()
        if not isinstance(page, QtWebEngineWidgets.QWebEnginePage):
            return
        if ok:
            # The markup arrives through the callback; bind the page so the
            # callback knows which request it belongs to.
            page.toHtml(partial(self.callable, page))
        else:
            # A failed page must go through the same completion path as a
            # successful one; otherwise a failure on the last outstanding
            # page would leave the event loop running forever.
            self._finish(page)

    def callable(self, page, html):
        """Store the markup of ``page`` and mark the page as done.

        Args:
            page: The page the markup belongs to.
            html: The markup delivered by ``QWebEnginePage.toHtml``.
        """
        url = page.requestedUrl().toString()
        self.results.append((url, html))
        self._finish(page)

    def _finish(self, page):
        """Drop ``page`` from the pending list; quit once none remain."""
        self.pages.remove(page)
        page.deleteLater()
        if not self.pages:
            QtWidgets.QApplication.quit()
if __name__ == "__main__":
    import sys

    app = QtWidgets.QApplication(sys.argv)
    manager = WebManager()

    # Request one documentation page per class exported by QtWebEngineWidgets.
    format_url = "http://pyqt.sourceforge.net/Docs/PyQt5/%s.html"
    for name in dir(QtWebEngineWidgets):
        if name.startswith("Q"):
            manager.load(format_url % name.lower())

    # Blocks until the manager calls QApplication.quit().
    app.exec_()

    # The event loop has ended; print everything that was collected.
    for url, html in manager.results:
        print(url)
        print(html)