我想从一个网站的多个页面中提取一些数据,这些页面的内容是由 JavaScript 生成的。因此我使用 PyQt4 和 Beautiful Soup 来遍历这些页面并提取一些数据字段。
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
class Client(QWebPage):
    """Load a single URL in a QWebPage and block until loading finishes.

    Once the constructor returns, the page content can be read through
    ``mainFrame().toHtml()``.
    """

    def __init__(self, url):
        """Create a QApplication, start loading *url* and run the event loop.

        url -- address of the page to load, as a string.
        """
        # NOTE(review): every Client constructs its own QApplication, so
        # instantiating Client more than once creates several QApplication
        # objects in one process. Qt expects a single application object;
        # this is presumably what triggers the crash -- confirm.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self.on_page_load)
        self.mainFrame().load(QUrl(url))
        # Blocks here until on_page_load() calls quit().
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop once loadFinished has been emitted."""
        self.app.quit()
products_titles = []
# Placeholder addresses; substitute the real page URLs.
urls = ['url1', 'url2', 'url3']
for url in urls:
    print("Parsing URL: " + url + '\n')
    # Each iteration builds a new Client, and therefore a new QApplication.
    client_response = Client(url)
    source = client_response.mainFrame().toHtml()
    soup = BeautifulSoup(source, "html.parser")
    # NOTE(review): get_product_category is not defined in this snippet;
    # it is assumed to be provided elsewhere.
    print(get_product_category(soup))
但是运行时程序会崩溃,并输出以下错误:
QObject::connect: Cannot connect (null)::configurationAdded(QNetworkConfiguration) to QNetworkConfigurationManager::configurationAdded(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationRemoved(QNetworkConfiguration) to QNetworkConfigurationManager::configurationRemoved(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationChanged(QNetworkConfiguration) to QNetworkConfigurationManager::configurationChanged(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::onlineStateChanged(bool) to QNetworkConfigurationManager::onlineStateChanged(bool)
QObject::connect: Cannot connect (null)::configurationUpdateComplete() to QNetworkConfigurationManager::updateCompleted()
[1] 14809 segmentation fault python products.py
我不知道我做错了什么,如果你知道发生了什么,请帮忙。
答案 0 :(得分:2)
我会把整个 URL 列表交给同一个 QApplication 实例,让它逐个依次加载,而不是反复创建并销毁多个 QApplication。
换句话说,可以尝试类似下面这样的写法……
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, pyqtSignal
from PyQt4.QtWebKit import QWebPage
class Client(QWebPage):
    """Load a list of URLs one after another using a single QApplication.

    The HTML of every loaded page is stored in ``self.pages``, keyed by the
    URL it was requested with. The constructor blocks in the Qt event loop
    until the whole queue has been processed.
    """

    # Emitted with the next URL to load; connected to load_url().
    new_url = pyqtSignal(['QString'], name='new_url')

    def __init__(self, urls):
        """Queue *urls* and run the event loop until all of them are loaded.

        urls -- sequence of URL strings. It is copied, so the caller's
                sequence is left unmodified. URLs are taken from the end
                of the sequence first.
        """
        # Reuse the existing application object when there is one, so that
        # creating a second Client does not construct a second QApplication.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        self.urls = list(urls)
        self.pages = dict()
        QWebPage.__init__(self)
        self.new_url.connect(self.load_url)
        self.loadFinished.connect(self.on_page_load)
        # Only enter the event loop when there is something to load: with an
        # empty queue on_page_load() never runs, so quit() would never be
        # called and exec_() would block forever.
        if self.urls:
            self.new_url.emit(self.urls.pop())
            self.app.exec_()

    def load_url(self, url):
        """Remember *url* as the current one and start loading it."""
        self.current_url = url
        print("Loading: {0}".format(url))
        self.mainFrame().load(QUrl(url))

    def on_page_load(self):
        """Store the current page's HTML, then load the next URL or quit."""
        print("Retrieved: {0}".format(self.current_url))
        self.pages[self.current_url] = unicode(self.mainFrame().toHtml())
        if self.urls:
            self.new_url.emit(self.urls.pop())
        else:
            # Queue exhausted: leave the event loop started in __init__.
            self.app.quit()
urls = ['http://www.google.com', 'http://www.yahoo.com', 'http://www.bing.com']
# Constructing the client blocks until every URL has been processed.
client = Client(urls)
for (url, page) in client.pages.items():
    soup = BeautifulSoup(page, "html.parser")
    # soup.title is None when the document has no <title> element; print an
    # empty title in that case instead of raising AttributeError.
    title = soup.title.text if soup.title is not None else ""
    print("{0}\t{1}".format(url, title))
反复实例化多个 QApplication 似乎是个非常糟糕的主意,在这种情况下出现段错误我可以理解。不过,段错误之前出现的那些网络相关错误让我觉得有点奇怪。试试上面的代码,看看情况是否有所改善。它在我这里运行正常。