我刚开始使用 PyQt5 抓取由 JavaScript 动态生成的页面,就遇到了麻烦。当需要抓取多个 URL 时,无论域名是什么,Python 都会在处理完第一个或第二个 URL 之后崩溃:我能拿到第一页的数据,却拿不到第二页的数据。Windows 的错误日志显示出错的模块是 Qt5WebEngineCore.dll,但我不知道该如何解决,在网上其他地方也没有找到任何有用的信息。代码如下:
import sys
import requests
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from bs4 import BeautifulSoup
import re
class Client(QWebEnginePage):
    """Load one URL in a QWebEnginePage and capture its HTML.

    Construction is blocking: it starts the load, runs the Qt event loop,
    and returns once the page's HTML has been delivered to ``Callable``.
    After that the markup is available as ``self.html``.
    """

    def __init__(self, url):
        """Load *url* and block until ``self.html`` has been filled in.

        Args:
            url: Address to load; it is passed to ``QUrl`` unchanged.
        """
        # Qt allows only ONE QApplication per process. Building a fresh one
        # for every Client (e.g. when scraping several URLs in a loop) is
        # what crashes inside QtWebEngine, so reuse the existing instance.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        # The application object must exist before the page is created.
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        # PyQt5 pages load directly; PyQt4's mainFrame() is not needed.
        self.load(QUrl(url))
        # Run the event loop until Callable() quits it.
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page's HTML once loading has finished."""
        # toHtml() is asynchronous and returns nothing; the markup arrives
        # later through the callback, so its return value must not be
        # assigned to self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and stop the event loop.

        Args:
            html_str: The page's HTML as a string.
        """
        self.html = html_str
        self.app.quit()
def scrape_pyqt5():
    """Print every iframe found in the media-player div of each page.

    Each hard-coded URL is loaded through ``Client``, the resulting HTML is
    parsed with BeautifulSoup, and every ``iframe`` tag nested inside a
    ``div`` whose id is ``media-player`` is printed.
    """
    page_urls = ['example1.com/a', 'example1.com/b', 'example1.com/c']
    for page_url in page_urls:
        client = Client(page_url)
        soup = BeautifulSoup(client.html, 'html.parser')
        player_divs = soup.find_all('div', {'id': 'media-player'})
        for player_div in player_divs:
            for iframe in player_div.find_all('iframe'):
                print(iframe)