Scrapy请求回调无效

时间:2014-11-09 03:36:25

标签: python web-scraping scrapy

这是我的代码:

class AAA(scrapy.Spider):   
    name = 'aaa'
    start_urls = [
        'https://forum.lowyat.net/topic/377400/all'
    ]
    COOKIES_ENABLED = False
    count = 0
    check = 0
    item = AAAItem()
    toDownload = []


    def parse(self, response):
        for sel in response.xpath('//*[@id="contentmiddle"]/div[3]/ol/li'):
            self.item['name'] = sel.xpath('div/div/div[1]/p[1]/a/text()').extract()
            self.item['date'] = sel.xpath('div/div/div[2]/p[4]/text()').extract()
            lastUpdateDate = self.getLastUpdateDate()
            date_object1 = self.convertToDate(self.item['date'][0]+"")
            date_object2 = self.convertToDate(lastUpdateDate)
            if date_object1 <= date_object2:
                self.haha2(response)
                self.stopSpider()
            self.item['link'] = sel.xpath('div/div/div[4]/p[3]/a/@href').extract()
            self.arrangeDownloadUrl()
            yield self.item                 

    def arrangeDownloadUrl(self):
        try:
            downloadUrl = "http://AAA.com"+self.item['link'][0]+""
            self.toDownload.append(downloadUrl)
        except IndexError:
            print 'file not downloaded, link dead'

    def haha2(self, response):
        for i in range (len(self.toDownload)):
            Request(self.toDownload[i], self.haha3)

    def haha3(self, response):
        print 'haha3.................................................................'


    def stopSpider(self):
        raise scrapy.exceptions.CloseSpider('done')


    def getLastUpdateDate(self):
            date = "Nov 5, 2001 - 1:06 PM"
            return date

    def convertToDate(self, value):
        result = datetime.strptime(value, '%b %d, %Y - %I:%S %p')
        return result

    def convertToString(self, value):
        result = value.strftime("%b %w, %Y - %I:%S %p")
        return result

出于隐私目的,我更改了网页的网址。 无论如何,问题是haha2函数中的请求未能请求回调,haha3 ... 它不会进入haha3函数,除非我用这样的self.haha3(response)调用它...但是这会打败目的,因为我想打开链接并使响应成为我想要的链接打开......我知道哪里出错了?

1 个答案:

答案 0 :(得分:0)

尝试

def haha2(self, response):
    for i in range (len(self.toDownload)):
        yield Request(self.toDownload[i], callback=self.haha3)