Paginating with Scrapy and getting the image URL from each page, but the callback method does not work the way I expect

Asked: 2018-08-19 09:28:06

Tags: scrapy

# -*- coding: utf-8 -*-
from scrapy import Request    # Request lives in the top-level scrapy package, not scrapy.spider
from scrapy_redis.spiders import RedisSpider


from scrapy_redis_slaver.items import MzituSlaverItem

class MzituSpider(RedisSpider):
    name = 'mzitu'
    redis_key = 'mzitu:start_urls'    # get start url from redis
    def __init__(self, *args, **kwargs):
        super(MzituSpider, self).__init__(*args, **kwargs)
        self.item = MzituSlaverItem()    # a single item instance stored on the spider

    def parse(self, response):
        max_page = response.xpath(
            "descendant::div[@class='main']/div[@class='content']/div[@class='pagenavi']/a[last()-1]/span/text()").extract_first(default="N/A")
        max_page = int(max_page)
        name = response.xpath("./*//div[@class='main']/div[1]/h2/text()").extract_first(default="N/A")
        self.item['name'] = name
        self.item['url'] = response.url
        item_id = response.url.split('/')[-1]
        self.item['item_id'] = item_id
        # name:      the title of the picture set
        # url:       the first url of the picture set
        # item_id:   the id of the picture set
        # max_page:  the number of pages in the picture set

        for num in range(1, max_page + 1):    # this loop turns the pages
            # page_url is the page address of each picture
            page_url = response.url + '/' + str(num)
            yield Request(page_url, callback=self.img_url, meta={"name": name,
                                                                 "item_id": item_id,
                                                                 "max_page": max_page
                                                                 })

    def img_url(self, response):
        # this function extracts one picture url from the response
        img_urls = response.xpath("descendant::div[@class='main-image']/descendant::img/@src").extract_first()
        # a single img_url
        self.server.sadd('{}:{}:images'.format(response.meta['name'], response.meta['item_id']), img_urls)
        # add the img_url to a set in redis

        len_redis_img_list = self.server.scard('{}:{}:images'.format(response.meta['name'], response.meta['item_id']))
        # get the size of the img_url set from redis

        if len_redis_img_list == response.meta['max_page']:
            self.item['img_urls'] = self.server.smembers('{}:{}:images'.format(response.meta['name'], response.meta['item_id']))
            print("yield item", response.meta['item_id'])
            yield self.item
        # in my mind, when len_redis_img_list equals max_page, the item should be yielded only once
        # but actually the item is yielded max_page times (far too many)

> Output:

yield item is printed many times, and the number of prints equals max_page

    yield item 148762

    yield item 148762

    yield item 148762

    yield item 148762

    ... many, many more, as many times as max_page

> My thoughts:

"yield item" should only be executed once,
but in fact it is executed many times.

> Question:

I don't understand why the code behaves this way.

1 Answer:

Answer 0 (score: 1)

I'm also having a hard time understanding your crawler.

Your current flow is:

1. Go to product page
2. Find some item data
3. Split the item into `max_page` forks
3.1. Carry over data from #2 to every fork
4. Yield the item from every fork

I think what you want is:

1. Go to product page
2. Find some item data
3. Find image urls
4. Chain loop through image urls to complete single item

Your spider should look something like this:

# (imports stay the same as in your spider: Request, RedisSpider, MzituSlaverItem)
class MzituSpider(RedisSpider):
    name = 'mzitu'
    redis_key = 'mzitu:start_urls'    # get start url from redis

    def parse(self, response):
        # make a fresh item for this picture set (not one shared spider attribute)
        item = MzituSlaverItem()
        item['name'] = response.xpath("./*//div[@class='main']/div[1]/h2/text()").extract_first(default="N/A")
        item['url'] = response.url
        item['item_id'] = response.url.split('/')[-1]
        item['img_urls'] = []
        # build the list of page urls, then chain requests through them
        max_page = ...  # extract max_page here, same xpath as in your parse()
        page_urls = []
        for num in range(1, max_page + 1):    # this loop builds the page urls
            page_urls.append(response.url + '/' + str(num))
        yield Request(
            page_urls.pop(),
            self.parse_images,
            meta={'item': item, 'page_urls': page_urls}
        )

    def parse_images(self, response):
        img_url = response.xpath("descendant::div[@class='main-image']/descendant::img/@src").extract_first()
        item = response.meta['item']
        item['img_urls'].append(img_url)

        # if more pages are left, continue the chain
        page_urls = response.meta['page_urls']
        if page_urls:
            yield Request(
                page_urls.pop(),
                self.parse_images,
                meta={'item': item, 'page_urls': page_urls}
            )
        else:  # otherwise the item is complete, return it
            yield item
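One more note on getting this running: a RedisSpider sits idle until something is pushed to its redis_key. A minimal way to seed it, assuming a local Redis instance and a purely hypothetical start URL (substitute a real album URL from the site):

import redis

# connect to the same Redis instance your scrapy-redis settings point at
r = redis.StrictRedis(host='localhost', port=6379, db=0)

# push a start url onto the list the spider listens on
# (by default scrapy-redis reads start urls from a Redis list);
# 'http://www.mzitu.com/148762' is only a placeholder example
r.lpush('mzitu:start_urls', 'http://www.mzitu.com/148762')

The key point of the chained approach above is that the partially built item rides along in meta from one request to the next, so exactly one complete item is yielded, and only when the response for the last page has been processed.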