我不确定问题出在哪里。我使用 Scrapy 抓取数据并将其保存。
这是我的做法:
def parse(self, response):
    """Schedule one detail-page request per movie found on the listing page.

    For every ``div.release_foto`` node a ``Request`` to the movie's detail
    page is yielded (handled by ``self.parse_page``) and the poster URL is
    appended to the module-level ``moviePhotoArray``. Requests get decreasing
    priorities starting at 1000 and are sent with ``dont_filter=True``.

    After the loop the module-level ``detailDict`` is updated with the shared
    result lists and two debug lines are printed.
    """
    # parse_page compares the module-level ``dataLen`` against ``countLen``;
    # without this declaration the assignment below would only create a
    # local variable that parse_page never sees.
    global dataLen

    # Evaluate the XPath once and reuse the result for both the count and
    # the iteration.
    photoNodes = response.xpath('//div[@class="release_foto"]')
    dataLen = len(photoNodes)

    priority = 1000
    for photoNode in photoNodes:
        contentHref = photoNode.xpath('.//a/@href').extract_first()
        yield Request(contentHref, callback=self.parse_page,
                      priority=priority, dont_filter=True)
        priority -= 1
        photoHref = photoNode.xpath('.//a/img/@src').extract_first()
        moviePhotoArray.append(photoHref)

    # The dict stores references to the list objects themselves, so items
    # appended later by parse_page are visible through detailDict as well.
    detailDict.update({
        'cnName': cnNameArray,
        'enName': enNameArray,
        'movieContent': movieContentArray,
        'versionType': versionTypeArray,
        'movieStyle': movieStyleArray,
        'releasedTime': releasedTimeArray,
        'moviePhoto': moviePhotoArray,
        'movieDate': movieDateArray,
        'movieTime': movieTimeArray,
        'movieStills': movieStillsArray,
        'movieActorCn': movieActorCnArray,
        'imdbScore': imdbArray,
        'rottenScore': rottenArray,
        'videoId': videoIdArray,
        'movieActorPhoto': movieActorPhotoArray,
    })

    # NOTE: the requests yielded above have only been scheduled at this
    # point; their parse_page callbacks run later, once the responses have
    # been downloaded. Lists that are filled in parse_page (movieDateArray
    # among them) are therefore expected to still be empty here.
    print('Test enNameArray=>')
    print(', '.join(enNameArray))
    print('Test movieDateArray=>')
    print(', '.join(movieDateArray))
我发现无法获取 movieDateArray 的值:打印出来是空的,但把它保存到数据库时它却有值。
这是打印 movieDateArray 的结果,为空:
Test enNameArray=>
Pacific Rim Uprising, Tomb Raider, Ready Player One
Test movieDateArray=>
我发现函数中用到的所有全局列表都是空的:
def parse_page(self, response):
    """Collect the detail fields of one movie page into the shared lists.

    Each extracted field is appended to its module-level list. ``countLen``
    is incremented once per call; when it equals ``dataLen`` the module-level
    ``tainan`` object is yielded.

    A missing release date or running time is stored as ``None`` (the same
    convention ``extract_first()`` uses for ``movieContent``) instead of
    raising ``IndexError``. An exception here would abort the callback before
    ``countLen`` is incremented, so the final ``yield`` would never fire.
    """
    global movieContentArray, countLen, dataLen
    global movieDateArray, movieTimeArray, movieStillsArray, movieStyleArray, movieActorCnArray, movieActorPhotoArray

    movieContent = response.xpath('//div[@class="gray_infobox_inner"]/span/text()').extract_first()

    # One query for both values: the first span text is used as the date,
    # the second as the running time.
    introInfo = response.xpath('//*[@class="movie_intro_info_r"]/span/text()').extract()
    movieDate = introInfo[0] if len(introInfo) > 0 else None
    movieTime = introInfo[1] if len(introInfo) > 1 else None

    movieStills = response.xpath('//ul[@class="trailer_list imglist"]//div[@class="foto"]/img/@src').extract()
    movieStyle = response.xpath('//div[@class="level_name_box"]//div[@class="level_name"]/a/text()').extract()
    movieActorCn = response.xpath('//ul[@class="trailer_list alist starlist"]/li/a//div[@class="fotoinner"]/img/@title').extract()
    movieActorPhoto = response.xpath('//ul[@class="trailer_list alist starlist"]/li/a//div[@class="fotoinner"]/img/@src').extract()

    movieContentArray.append(movieContent)
    movieDateArray.append(movieDate)
    movieTimeArray.append(movieTime)
    movieStillsArray.append(movieStills)
    movieStyleArray.append(movieStyle)
    movieActorCnArray.append(movieActorCn)
    movieActorPhotoArray.append(movieActorPhoto)

    # The list is populated here, inside the callback.
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('Test=>inside movieDateArray=> %s' % (movieDateArray,))

    countLen += 1
    if countLen == dataLen:
        # NOTE(review): ``tainan`` is defined outside this block; presumably
        # it is the item wrapping the collected lists - confirm.
        yield tainan
我不知道为什么列表 movieDateArray 为空。我把列表保存到数据库时它并不为空,但当我尝试 print 它或用 for in 遍历它时,它却是空的。
太奇怪了。有人能告诉我漏掉了哪一步吗?提前谢谢。
答案 0 :(得分:2)
我认为问题在于,每次调用 parse_page 时,都会定义一个名为 movieDateArray 的新全局变量。也许在最后一次调用期间,该变量被初始化为 None,这就是您什么都打印不出来的原因。请尝试在每次循环迭代中打印 movieDateArray 来进行调试。