我刚接触 Python,想写一个脚本从网站上抓取一些信息。众所周知,有些网站依靠 JavaScript 来生成页面内容,所以我先尝试了 PhantomJS,但没有成功,于是改用 PyQt5。现在我遇到了一个问题,希望各位能帮我解决。
我的 Python 脚本只执行一次时可以正常工作,但一旦把它放进循环里,就会导致 Python 崩溃。
我试过 time.sleep(),但它似乎不起作用。
我的问题被标记为与“Scrape multiple urls using QWebPage”重复,但那个回答解决的并不是同一个问题:它是在类的内部对一个事先给定的 URL 列表进行循环,而在我的情况下,循环应该放在类的外部。
这是我的脚本:
"""Render HTML for scraping"""
# -*- coding: utf-8 -*-
import os
import sys
import requests
from contextlib import contextmanager
from multiprocessing import Pool
import bs4
import time
# Compatibility shim: when the bare name ``TimeoutError`` is not defined,
# bind it to the class exported by ``multiprocessing`` instead.
try:
    TimeoutError
except NameError:
    from multiprocessing import TimeoutError # Python 2
def _render(url):
    """Return the HTML of *url* after Qt has finished loading the page.

    ``QWebEngineView`` is used when PyQt5's QtWebEngine bindings can be
    imported; otherwise the QtWebKit ``QWebPage`` class is used.  Output
    written while rendering is silenced with ``devnull()``.

    Args:
        url: Address of the page to load, as a string.

    Returns:
        The page markup reported by Qt once loading has finished.
    """

    def _application(qapplication_cls):
        """Return the existing QApplication, creating it on first use."""
        app = qapplication_cls.instance()
        if app is None:
            app = qapplication_cls(sys.argv)
        else:
            print('QApplication instance already exists: %s' % str(app))
        # Keep a reference that outlives this call.
        # NOTE(review): presumably the crash seen when calling this in a loop
        # came from the application being owned only by the short-lived
        # Render object and torn down between iterations -- confirm.
        _render.app = app
        return app

    try:
        from PyQt5.QtCore import QEventLoop, QUrl
        from PyQt5.QtWebEngineWidgets import QWebEngineView
        from PyQt5.QtWidgets import QApplication

        class Render(QWebEngineView):
            """Load a URL and expose the resulting markup as ``self.html``."""

            def __init__(self, url):
                # The application object has to exist before the widget.
                app = _application(QApplication)
                QWebEngineView.__init__(self)
                self.app = app
                self.html = None
                self.loadFinished.connect(self._loadFinished)
                self.load(QUrl(url))
                wait_flags = (
                    QEventLoop.ExcludeUserInputEvents |
                    QEventLoop.ExcludeSocketNotifiers |
                    QEventLoop.WaitForMoreEvents)
                # toHtml() delivers its result through a callback, so keep
                # processing events until that callback has stored the markup.
                while self.html is None:
                    self.app.processEvents(wait_flags)
                self.app.quit()

            def _callable(self, data):
                """Store the markup handed over by ``toHtml``."""
                self.html = data

            def _loadFinished(self, result):
                """Request the page markup once loading has finished."""
                self.page().toHtml(self._callable)

    except ImportError:
        from PyQt5.QtCore import QUrl
        from PyQt5.QtWebKitWidgets import QWebPage
        from PyQt5.QtWidgets import QApplication

        class Render(QWebPage):
            """QtWebKit fallback: load a URL and expose ``self.html``."""

            def __init__(self, url):
                app = _application(QApplication)
                QWebPage.__init__(self)
                self.app = app
                self.html = None
                self.loadFinished.connect(self._loadFinished)
                # The argument is an address, so it is loaded rather than
                # passed to setHtml() as if it were markup.
                self.mainFrame().load(QUrl(url))
                self.app.exec_()

            def _loadFinished(self, result):
                """Capture the markup and leave the event loop."""
                self.html = self.mainFrame().toHtml()
                self.app.quit()

    with devnull():
        return Render(url).html
@contextmanager
def devnull():
    """Temporarily redirect stdout and stderr to /dev/null.

    The redirection is applied to the file descriptors behind ``sys.stdout``
    and ``sys.stderr`` with ``os.dup2``.  The original descriptors are put
    back when the block exits, including when it raises, and every
    descriptor opened here is closed again.
    """
    # Start from None so the cleanup can tell which setup steps completed
    # and an error during setup is not masked by an unbound name.
    original_stderr = None
    original_stdout = None
    null = None
    try:
        # Emit anything already buffered before the descriptors are swapped.
        sys.stdout.flush()
        sys.stderr.flush()
        original_stderr = os.dup(sys.stderr.fileno())
        original_stdout = os.dup(sys.stdout.fileno())
        null = open(os.devnull, 'w')
        os.dup2(null.fileno(), sys.stderr.fileno())
        os.dup2(null.fileno(), sys.stdout.fileno())
        yield
    finally:
        if original_stderr is not None:
            # Flush while still redirected so suppressed text stays suppressed.
            sys.stderr.flush()
            os.dup2(original_stderr, sys.stderr.fileno())
            os.close(original_stderr)
        if original_stdout is not None:
            sys.stdout.flush()
            os.dup2(original_stdout, sys.stdout.fileno())
            os.close(original_stdout)
        if null is not None:
            null.close()
# The same article is requested four times to exercise repeated rendering.
urllist = [
    "https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/",
] * 4

for url in urllist:
    # Wait one second before each render.
    time.sleep(1)
    rendered_html = _render(url)
    # Parse the rendered markup and report the page title.
    soup = bs4.BeautifulSoup(rendered_html, 'html.parser')
    title_text = soup.select_one('title').text
    print('title is %r' % title_text)
虽然这个问题被标记为重复(感谢各位的处理),我还是想分享一下我最终可以正常运行的脚本,希望将来对其他人有用。另外,我现在想知道如何为这个 QWebEnginePage 设置一个虚拟的窗口大小,希望能得到一些帮助。
import bs4
import os
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
from contextlib import contextmanager
from multiprocessing import Pool
def _render(url):
    """Return the HTML of *url* once QtWebEngine has finished loading it.

    This function does not create a ``QApplication``; it drives the event
    loop through ``QtWidgets.qApp``.  Output written while rendering is
    silenced with ``devnull()``.

    Args:
        url: Address of the page to load, as a string.

    Returns:
        The markup passed to the ``toHtml`` callback.
    """

    class WebPage(QtWebEngineWidgets.QWebEnginePage):
        """Page that stores its markup in ``self.html`` after loading."""

        def __init__(self):
            super(WebPage, self).__init__()
            self.html = None
            self.loadFinished.connect(self.handleLoadFinished)

        def start(self, url):
            """Load *url* and block until the markup has been captured."""
            self.load(QtCore.QUrl(url))
            wait_flags = (
                QtCore.QEventLoop.ExcludeUserInputEvents |
                QtCore.QEventLoop.ExcludeSocketNotifiers |
                QtCore.QEventLoop.WaitForMoreEvents)
            # toHtml() reports through a callback, so keep processing events
            # until that callback has stored the markup.
            while self.html is None:
                QtWidgets.qApp.processEvents(wait_flags)
            QtWidgets.qApp.quit()

        def processCurrentPage(self, data):
            """Store the markup handed over by ``toHtml``."""
            self.html = data

        def handleLoadFinished(self):
            """Request the page markup once loading has finished."""
            self.toHtml(self.processCurrentPage)

    with devnull():
        webpage = WebPage()
        webpage.start(url)
        return webpage.html
@contextmanager
def devnull():
    """Temporarily redirect stdout and stderr to /dev/null.

    The redirection is applied to the file descriptors behind ``sys.stdout``
    and ``sys.stderr`` with ``os.dup2``.  The original descriptors are put
    back when the block exits, including when it raises, and every
    descriptor opened here is closed again.
    """
    # Start from None so the cleanup can tell which setup steps completed
    # and an error during setup is not masked by an unbound name.
    original_stderr = None
    original_stdout = None
    null = None
    try:
        # Emit anything already buffered before the descriptors are swapped.
        sys.stdout.flush()
        sys.stderr.flush()
        original_stderr = os.dup(sys.stderr.fileno())
        original_stdout = os.dup(sys.stdout.fileno())
        null = open(os.devnull, 'w')
        os.dup2(null.fileno(), sys.stderr.fileno())
        os.dup2(null.fileno(), sys.stdout.fileno())
        yield
    finally:
        if original_stderr is not None:
            # Flush while still redirected so suppressed text stays suppressed.
            sys.stderr.flush()
            os.dup2(original_stderr, sys.stderr.fileno())
            os.close(original_stderr)
        if original_stdout is not None:
            sys.stdout.flush()
            os.dup2(original_stdout, sys.stdout.fileno())
            os.close(original_stdout)
        if null is not None:
            null.close()
# The same page is requested four times to exercise repeated rendering.
urllist = ["https://www.tutorialspoint.com/python/index.htm"] * 4

# A single QApplication is created up front, before any page is rendered.
app = QtWidgets.QApplication(sys.argv)

for url in urllist:
    rendered_html = _render(url)
    # Parse the rendered markup and report the page title.
    soup = bs4.BeautifulSoup(rendered_html, 'html.parser')
    title_text = soup.select_one('title').text
    print('title is %r' % title_text)