我正在从一系列网页中抓取链接;这些网页的地址最适合用 for 循环生成(例如 for n in range(1, N)),再把页码与基础网址拼接起来。
...
因为页面内容是由 JavaScript 生成的,所以我参照这里(here)给出的模板,尝试用 PyQt4 构建一个简单的 Python 网页抓取器:
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Headless QWebPage that loads *url* and blocks until rendering is done.

    After construction, the fully rendered main frame is available as
    ``self.frame``.
    """

    def __init__(self, url):
        # A QApplication must exist before any QWebPage is constructed.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._on_load_finished)
        self.mainFrame().load(QUrl(url))
        # Spin the Qt event loop; exec_() returns once the slot calls quit().
        self.app.exec_()

    def _on_load_finished(self, ok):
        # Keep a handle on the rendered frame, then stop the event loop.
        self.frame = self.mainFrame()
        self.app.quit()
[更新] 我可以使用 ekhumoro 的方法,但按下面这种方式编写时,运行结果有些不稳定:
import sys, signal, time
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Load a queue of (url, callback) pairs one at a time in a headless
    QWebPage and hand each rendered page's HTML to its callback.

    Call ``process(items)`` once, then start the Qt event loop
    (``app.exec_()``); the pages are fetched sequentially and the
    application quits when the queue is exhausted.
    """

    def __init__(self):
        QtWebKit.QWebPage.__init__(self)
        # loadFinished(bool) fires when the main frame (including any
        # JavaScript-generated content Qt managed to render) is done.
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start working through an iterable of (url, callback) pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Begin loading the next queued URL.

        Returns False when the queue is exhausted, True otherwise.
        """
        # Keep the try body minimal: only next() is expected to raise.
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.mainFrame().load(QtCore.QUrl(self._url))
        return True

    def handleLoadFinished(self, ok=True):
        """Dispatch the rendered HTML to the callback, then fetch the next URL.

        *ok* is the success flag emitted by loadFinished; the original code
        ignored it, which silently hid failed loads (a likely cause of the
        intermittent "captured 0 urls" results). The default keeps the slot
        callable with no argument, as before.
        """
        if not ok:
            print('# load failed: %s' % self._url)
        self._func(self._url, self.mainFrame().toHtml())
        if not self.fetchNext():
            print('# processing complete')
            QtGui.qApp.quit()
def funcA(url, html):
print '# processing:', url
soup = BeautifulSoup(unicode(html), "lxml")
newslinks = soup.select('dt a')
if len(newslinks) == 0:
print 'captured', len(newslinks), 'urls - this does not work'
else:
print 'captured', len(newslinks), 'urls'
# Base search URL; each request appends a page number (1-29 inclusive).
url = "http://english.yonhapnews.co.kr/search1/2601000000.html?query=japan&page_no="

# NOTE(review): these three lists are never used anywhere below —
# confirm they are dead before removing them.
failedtargets = []
futurelinks = []
newslinks = []

# One (page_url, callback) pair per results page. The original built these
# with an append loop through a variable misleadingly named `newurl`
# (it actually held a tuple); a comprehension makes the shape explicit.
targets = [(url + str(n), funcA) for n in range(1, 30)]

# Restore the default SIGINT handler so Ctrl-C can stop the Qt event loop.
signal.signal(signal.SIGINT, signal.SIG_DFL)
app = QtGui.QApplication(sys.argv)
webpage = WebPage()
webpage.process(targets)
sys.exit(app.exec_())
每轮迭代,我期望收集到十个地址。然而,当迭代进行到 20 页左右时,程序开始大量得到零(终端上显示 "captured 0 urls - this does not work",这里引用程序实际输出的英文信息)。而且这些零出现的位置在不同的运行中并不相同。