我用下面这个类来获取任意给定网页的 HTML:
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal
class Render(QWebPage):
    """Load a URL in a QWebPage and expose the resulting HTML.

    Constructing an instance starts loading ``url`` and runs the Qt event
    loop until the page emits ``loadFinished``. After construction:

    * ``html`` holds the main frame's markup as returned by ``toHtml()``.
    * ``soup`` holds the BeautifulSoup tree built from that markup.
    """

    def __init__(self, url):
        """Load ``url`` and block until the page has finished loading."""
        # Qt allows only one QApplication per process. Creating a new one
        # for every Render instance breaks rendering of later pages, so
        # reuse the existing application object when there is one.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        self.html = None
        self.soup = None
        # Restore the default SIGINT handler so Ctrl+C can interrupt the
        # event loop started below.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
        self.mainFrame().load(QUrl(url))
        # Runs until _finished_loading() calls quit().
        self.app.exec_()

    def _finished_loading(self, result):
        """Store the loaded markup, parse it, and stop the event loop.

        ``result`` is the boolean carried by ``loadFinished``; it is not
        inspected here.
        """
        self.html = self.mainFrame().toHtml()
        self.soup = BeautifulSoup(UnicodeDammit(self.html).unicode_markup)
        self.app.quit()
我用一个循环遍历一组网页,这些网页都需要执行 JavaScript 才能得到完整内容,例如:
# Pages that have to be rendered before their content can be read.
l = [
    "http://host.com/page1",
    "http://host.com/page2",
]
for page in l:
    # Render(page) loads the page and exposes the parsed tree as .soup.
    soup = Render(page).soup
    #Do-something
问题是:JavaScript 代码只在加载的第一个页面中被执行,之后加载的页面中都没有被执行。
答案 0 :(得分:0)
页面可能已经成功加载,但它包含多个框架。更准确地说,有时 page.mainFrame().childFrames() 的返回值
不是空的。因此您不仅需要处理主框架,还需要处理它的各个子框架。
例如:
def _finished_loading(self, result):
    """Capture markup from the main frame and its child frames, then quit.

    Sets ``html``/``soup`` for the main frame and ``htmls``/``soups`` for
    the frames returned by ``mainFrame().childFrames()``. ``result`` is
    not inspected.
    """
    main_frame = self.mainFrame()
    self.html = main_frame.toHtml()
    main_markup = UnicodeDammit(self.html).unicode_markup
    self.soup = BeautifulSoup(main_markup)

    # Collect the markup of every child frame first ...
    frame_markups = []
    for child in self.mainFrame().childFrames():
        frame_markups.append(child.toHtml())
    self.htmls = frame_markups

    # ... then parse each one, in the same order.
    parsed_frames = []
    for markup in self.htmls:
        parsed_frames.append(BeautifulSoup(UnicodeDammit(markup).unicode_markup))
    self.soups = parsed_frames

    self.app.quit()