I am trying to change this code so it crawls several URLs at once and captures a specific JavaScript tag. Here is my code:
#!/usr/bin/env python
import sys
import signal
import urllib
import re
import csv
import time
from optparse import OptionParser
from PyQt4 import QtCore
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import QWebPage

class Crawler( QWebPage ):
    def __init__(self, url, file):
        QWebPage.__init__( self )
        self._url = url
        #print(url)
        self._file = file
        #print(file)

    def crawl( self ):
        signal.signal( signal.SIGINT, signal.SIG_DFL )
        self.connect( self, SIGNAL( 'loadFinished(bool)' ), self._finished_loading )
        #print(self._url)
        self.mainFrame().load( QUrl( self._url ) )

    def _finished_loading( self, result ):
        print(self._url)
        file = open( self._file, 'a' )
        s = QtCore.QString('aJavascriptKeyword')
        val = (self.mainFrame().toHtml().indexOf(s) >= 0)
        #print val
        file.write( self._url + '^' + str(val) )
        #print( self._url + '^' + str(val) )
        file.close()
        sys.exit( 0 )

def main():
    app = QApplication( sys.argv )
    urls = open(r'urls.txt','r')
    #output=open(r'C:\Users\me\output.txt','w')
    for url in urls:
        #print(url)
        crawler = Crawler( url, "output.txt" )
        crawler.crawl()
    sys.exit( app.exec_() )

if __name__ == '__main__':
    main()
Here, urls.txt is just a text file with one URL per line. When I run the script with a file containing a single URL it works fine, but when I run it with more than one URL, only the final URL is passed to _finished_loading(). What am I doing wrong?
Answer 0 (score: 1)
The problem is that

for url in urls:
    ...
    crawler = Crawler( url, "output.txt" )
    crawler.crawl()

runs to completion before the main PyQt application (app.exec_()) is started. This means the name crawler is rebound on every iteration of the loop, so by the time the event loop runs, only the Crawler for the last URL still exists; the earlier instances have no remaining references and are garbage-collected before their pages ever finish loading.
One solution might be to keep references to all of the instantiated Crawlers:
def main():
    app = QApplication( sys.argv )

    # Get all the urls
    with open(r'urls.txt','r') as urls:
        my_urls = [u.rstrip("\n") for u in urls]

    # Instantiate all the Crawlers
    crawlers = [Crawler(u, "output.txt") for u in my_urls]

    # Call all of the crawlers' crawl methods
    for crawler in crawlers:
        crawler.crawl()

    sys.exit( app.exec_() )
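
Note that the single-URL version also calls sys.exit( 0 ) inside _finished_loading, which is no longer a sensible place to decide when to stop once several pages are loading at the same time. The answer above does not address this, so the following is only a minimal sketch of one possible way to handle it: the on_done callback and the shared counter are hypothetical additions, and the application quits only after every page has reported back.

#!/usr/bin/env python
# Hypothetical sketch: instead of exiting inside _finished_loading, each
# Crawler reports completion through a callback, and the application quits
# once all pages have been processed.
import sys
import signal
from PyQt4 import QtCore
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class Crawler( QWebPage ):
    def __init__(self, url, file, on_done):
        QWebPage.__init__( self )
        self._url = url
        self._file = file
        self._on_done = on_done   # called once this page has been handled

    def crawl( self ):
        self.connect( self, SIGNAL( 'loadFinished(bool)' ), self._finished_loading )
        self.mainFrame().load( QUrl( self._url ) )

    def _finished_loading( self, result ):
        # Search the rendered HTML for the keyword, as in the original code
        s = QtCore.QString('aJavascriptKeyword')
        val = (self.mainFrame().toHtml().indexOf(s) >= 0)
        with open( self._file, 'a' ) as f:
            f.write( self._url + '^' + str(val) + '\n' )
        self._on_done()           # report back instead of calling sys.exit()

def main():
    signal.signal( signal.SIGINT, signal.SIG_DFL )
    app = QApplication( sys.argv )

    with open(r'urls.txt','r') as urls:
        my_urls = [u.rstrip("\n") for u in urls]

    remaining = [len(my_urls)]    # mutable counter shared by the callback

    def one_done():
        remaining[0] -= 1
        if remaining[0] == 0:     # last page finished -> stop the event loop
            app.quit()

    crawlers = [Crawler(u, "output.txt", one_done) for u in my_urls]
    for crawler in crawlers:
        crawler.crawl()

    sys.exit( app.exec_() )

if __name__ == '__main__':
    main()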