这是我的代码:
class AAA(scrapy.Spider):
name = 'aaa'
start_urls = [
'https://forum.lowyat.net/topic/377400/all'
]
COOKIES_ENABLED = False
count = 0
check = 0
item = AAAItem()
toDownload = []
def parse(self, response):
for sel in response.xpath('//*[@id="contentmiddle"]/div[3]/ol/li'):
self.item['name'] = sel.xpath('div/div/div[1]/p[1]/a/text()').extract()
self.item['date'] = sel.xpath('div/div/div[2]/p[4]/text()').extract()
lastUpdateDate = self.getLastUpdateDate()
date_object1 = self.convertToDate(self.item['date'][0]+"")
date_object2 = self.convertToDate(lastUpdateDate)
if date_object1 <= date_object2:
self.haha2(response)
self.stopSpider()
self.item['link'] = sel.xpath('div/div/div[4]/p[3]/a/@href').extract()
self.arrangeDownloadUrl()
yield self.item
def arrangeDownloadUrl(self):
try:
downloadUrl = "http://AAA.com"+self.item['link'][0]+""
self.toDownload.append(downloadUrl)
except IndexError:
print 'file not downloaded, link dead'
def haha2(self, response):
for i in range (len(self.toDownload)):
Request(self.toDownload[i], self.haha3)
def haha3(self, response):
print 'haha3.................................................................'
def stopSpider(self):
raise scrapy.exceptions.CloseSpider('done')
def getLastUpdateDate(self):
date = "Nov 5, 2001 - 1:06 PM"
return date
def convertToDate(self, value):
result = datetime.strptime(value, '%b %d, %Y - %I:%S %p')
return result
def convertToString(self, value):
result = value.strftime("%b %w, %Y - %I:%S %p")
return result
出于隐私目的,我更改了网页的网址。
无论如何,问题是haha2
函数中的请求未能请求回调,haha3
...
它不会进入haha3函数,除非我用这样的self.haha3(response)
调用它...但是这会打败目的,因为我想打开链接并使响应成为我想要的链接打开......我知道哪里出错了?
答案 0 :(得分:0)
尝试
def haha2(self, response):
for i in range (len(self.toDownload)):
yield Request(self.toDownload[i], callback=self.haha3)