我正在尝试使用PyQt4和Beautifulsoup使用以下代码遍历URL列表:
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, pyqtSignal
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
def __init__(self, urls, cb):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.urls = urls
self.cb = cb
self.crawl()
self.app.exec_()
def crawl(self):
if self.urls:
url = self.urls.pop(0)
print ('Downloading', url)
self.mainFrame().load(QUrl(url))
else:
self.app.quit()
def _loadFinished(self, result):
frame = self.mainFrame()
url = str(frame.url().toString())
html = frame.toHtml()
self.cb(url, html)
self.crawl()
def scrape(url, html):
pass
soup = BeautifulSoup(unicode(html), "lxml")
t = soup.findAll("div", {"class": "detalhamento_label_valor hidden-print ng-binding"})[0].text
print t
urls = ["http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal" ,
"http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal" ,
"http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000004?modulo=0&sistema=portal" ]
r = Render(urls, cb=scrape)
如果网址相同[0,1]
,它似乎运作良好,但一旦网址更改[2]
就会卡住。我对PyQt4并不熟悉,所以我想知道是否有一些我可能会遗漏的微不足道的东西。
修改
程序在此操作上运行url列表的第三项时挂起:
self.mainFrame().load(QUrl(url))
除此之外,我得到的唯一警告是:
libpng警告:iCCP:已知错误的sRGB配置文件
虽然我不确定这意味着什么,但它似乎与这个问题无关。