我正在从一系列网页中抓取链接;这些网页的地址最适合用 for 循环生成(例如 for n in range(1, N)),再把页码与基础网址拼接起来。
...
因为页面内容是由 JavaScript 生成的,所以我参照这里(here)给出的模板,尝试用 PyQt4 构建一个简单的 Python 网页抓取器:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Headless QWebPage that loads *url* and blocks until rendering is done.

    After construction, the fully rendered main frame is available as
    ``self.frame``.
    """

    def __init__(self, url):
        # A QApplication must exist before any QWebPage is constructed.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._on_load_finished)
        self.mainFrame().load(QUrl(url))
        # Spin the Qt event loop; exec_() returns once the slot calls quit().
        self.app.exec_()

    def _on_load_finished(self, ok):
        # Keep a handle on the rendered frame, then stop the event loop.
        self.frame = self.mainFrame()
        self.app.quit()
[更新] 我可以使用 ekhumoro 的方法,但按下面这种方式编写时,运行结果有些不稳定:
import sys, signal, time
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Load a queue of (url, callback) pairs one at a time in a headless
    QWebPage and hand each rendered page's HTML to its callback.

    Call ``process(items)`` once, then start the Qt event loop
    (``app.exec_()``); the pages are fetched sequentially and the
    application quits when the queue is exhausted.
    """

    def __init__(self):
        QtWebKit.QWebPage.__init__(self)
        # loadFinished(bool) fires when the main frame (including any
        # JavaScript-generated content Qt managed to render) is done.
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start working through an iterable of (url, callback) pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Begin loading the next queued URL.

        Returns False when the queue is exhausted, True otherwise.
        """
        # Keep the try body minimal: only next() is expected to raise.
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.mainFrame().load(QtCore.QUrl(self._url))
        return True

    def handleLoadFinished(self, ok=True):
        """Dispatch the rendered HTML to the callback, then fetch the next URL.

        *ok* is the success flag emitted by loadFinished; the original code
        ignored it, which silently hid failed loads (a likely cause of the
        intermittent "captured 0 urls" results). The default keeps the slot
        callable with no argument, as before.
        """
        if not ok:
            print('# load failed: %s' % self._url)
        self._func(self._url, self.mainFrame().toHtml())
        if not self.fetchNext():
            print('# processing complete')
            QtGui.qApp.quit()
def funcA(url, html):
print '# processing:', url
soup = BeautifulSoup(unicode(html), "lxml")
newslinks = soup.select('dt a')
if len(newslinks) == 0:
print 'captured', len(newslinks), 'urls - this does not work'
else:
print 'captured', len(newslinks), 'urls'
# Base search URL; each request appends a page number (1-29 inclusive).
url = "http://english.yonhapnews.co.kr/search1/2601000000.html?query=japan&page_no="

# NOTE(review): these three lists are never used anywhere below —
# confirm they are dead before removing them.
failedtargets = []
futurelinks = []
newslinks = []

# One (page_url, callback) pair per results page. The original built these
# with an append loop through a variable misleadingly named `newurl`
# (it actually held a tuple); a comprehension makes the shape explicit.
targets = [(url + str(n), funcA) for n in range(1, 30)]

# Restore the default SIGINT handler so Ctrl-C can stop the Qt event loop.
signal.signal(signal.SIGINT, signal.SIG_DFL)
app = QtGui.QApplication(sys.argv)
webpage = WebPage()
webpage.process(targets)
sys.exit(app.exec_())
每轮迭代,我期望收集到十个地址。然而,当迭代进行到 20 页左右时,程序开始大量得到零(终端上显示 "captured 0 urls - this does not work",这里引用程序实际输出的英文信息)。而且这些零出现的位置在不同的运行中并不相同。