# -*- coding: utf-8 -*-
from scrapy_redis.spiders import RedisSpider
from scrapy import Request
from scrapy_redis_slaver.items import MzituSlaverItem

class MzituSpider(RedisSpider):
    name = 'mzitu'
    redis_key = 'mzitu:start_urls'  # get start urls from redis

    def __init__(self, *args, **kwargs):
        super(MzituSpider, self).__init__(*args, **kwargs)
        # NOTE: one item instance shared by every response the spider handles
        self.item = MzituSlaverItem()
    def parse(self, response):
        # max_page: how many pages (one picture per page) the album has
        max_page = response.xpath(
            "descendant::div[@class='main']/div[@class='content']/div[@class='pagenavi']/a[last()-1]/span/text()"
        ).extract_first(default="N/A")
        max_page = int(max_page)
        # name: the album's title
        name = response.xpath("./*//div[@class='main']/div[1]/h2/text()").extract_first(default="N/A")
        self.item['name'] = name
        # url: the album's first-page url
        self.item['url'] = response.url
        # item_id: the album's id
        item_id = response.url.split('/')[-1]
        self.item['item_id'] = item_id
        for num in range(1, max_page + 1):  # loop over the album's pages
            # page_url is the address of one picture page
            page_url = response.url + '/' + str(num)
            yield Request(page_url, callback=self.img_url,
                          meta={"name": name,
                                "item_id": item_id,
                                "max_page": max_page})
    def img_url(self, response):
        # extract this page's picture url from the response
        img_url = response.xpath("descendant::div[@class='main-image']/descendant::img/@src").extract_first()
        redis_set_key = '{}:{}:images'.format(response.meta['name'], response.meta['item_id'])
        # add the picture url to a redis set
        self.server.sadd(redis_set_key, img_url)
        # get the current size of that redis set
        len_redis_img_list = self.server.scard(redis_set_key)
        if len_redis_img_list == response.meta['max_page']:
            self.item['img_urls'] = self.server.smembers(redis_set_key)
            print("yield item", response.meta['item_id'])
            yield self.item
            # What I expect: once len_redis_img_list equals max_page, the item is yielded a single time.
            # What actually happens: the item is yielded max_page times (very, very many).
> Output:
"yield item" is called many times, as many times as max_page:
yield item 148762
yield item 148762
yield item 148762
yield item 148762
... very, very many times, the count equal to max_page
> What I thought:
"yield item" would be called only once,
but in fact "yield item" is called many times.
> Question:
I don't know why the code works this way.
Answer (score: 1)
I'm also having a hard time following your crawler.
Your current flow is this:
1. Go to product page
2. Find some item data
3. Split the crawl into `max_page` forks
3.1. Carry over the data from #2 to every fork
4. Yield the item from every fork (see the sketch below)
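
A minimal sketch of that fork problem outside Scrapy, with a plain dict and set standing in for `self.item` and redis (all names here are made up; the pre-filled set is one assumption about how every fork can pass the length check, e.g. when the redis set still holds all `max_page` urls from an earlier run):

```python
# Hypothetical stand-ins: a plain dict for self.item, a set() for redis.
shared_item = {}
redis_set = set()

def img_url(num, max_page):
    redis_set.add('img-%d.jpg' % num)      # sadd
    if len(redis_set) == max_page:         # the scard == max_page check
        shared_item['img_urls'] = set(redis_set)
        yield shared_item                  # the SAME object, every time

def parse(max_page):
    for num in range(1, max_page + 1):     # the max_page forks
        yield from img_url(num, max_page)

# Simulate a re-run where the redis set already holds all the urls:
redis_set.update('img-%d.jpg' % n for n in range(1, 5))
items = list(parse(max_page=4))
print(len(items))                          # 4 -- one yield per fork, not 1
print(all(i is items[0] for i in items))   # True -- all the same shared item
```

Because every fork mutates and yields the one shared item instance, any condition that passes in more than one fork re-emits the same object.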
I think what you want is:
1. Go to product page
2. Find some item data
3. Find image urls
4. Chain a loop through the image urls to complete a single item
Your crawler should look something like this:
class MzituSpider(RedisSpider):
    name = 'mzitu'
    redis_key = 'mzitu:start_urls'  # get start urls from redis

    def parse(self, response):
        # make item
        item = MzituSlaverItem()
        item['name'] = response.xpath("./*//div[@class='main']/div[1]/h2/text()").extract_first(default="N/A")
        item['url'] = response.url
        item['item_id'] = response.url.split('/')[-1]
        item['img_urls'] = []
        # find the page count (same xpath as in your spider)
        max_page = int(response.xpath(
            "descendant::div[@class='main']/div[@class='content']/div[@class='pagenavi']/a[last()-1]/span/text()"
        ).extract_first(default="N/A"))
        # build the page urls, then start a chain through them with one request
        image_urls = []
        for num in range(1, max_page + 1):
            image_urls.append(response.url + '/' + str(num))
        yield Request(
            image_urls.pop(),
            self.parse_images,
            meta={'item': item, 'image_urls': image_urls}
        )
    def parse_images(self, response):
        # add this page's image url to the item carried in meta
        image = response.xpath("descendant::div[@class='main-image']/descendant::img/@src").extract_first()
        item = response.meta['item']
        item['img_urls'].append(image)
        image_urls = response.meta['image_urls']
        if image_urls:  # if more pages are left, continue the chain
            yield Request(
                image_urls.pop(),
                self.parse_images,
                meta={'item': item, 'image_urls': image_urls}
            )
        else:  # else return the completed item, exactly once
            yield item
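
With the chained version, each album's pages are fetched one after another, so the item is completed and yielded exactly once; the trade-off is that pages of one album are no longer fetched in parallel. To try either spider, remember that a scrapy_redis `RedisSpider` idles until a start URL is pushed under its `redis_key`. A quick way to seed it from redis-py (the album URL below is made up for illustration):

```python
import redis

r = redis.StrictRedis(host='localhost', port=6379)
# RedisSpider pops its start urls from the key configured as redis_key
r.lpush('mzitu:start_urls', 'http://www.mzitu.com/148762')  # example url
```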