Scrapy fails to download images to local storage

Date: 2014-12-21 05:13:21

Tags: python scrapy pipeline

I am crawling a website with Scrapy (0.22). I need to do three things:

  1. I need the category and subcategory of the images
  2. I need to download the images and store them locally
  3. I need to store the category, subcategory and image URL in Mongo

But right now I am stuck. I am using 'pipelines' to download the images, but my code does not work: it fails to download the images to local storage.

Also, since I want to store the information in Mongo, can anyone give me some advice on the "Mongo table structure"?

My code is as follows:

settings.py

    BOT_NAME = 'tutorial'
    
    SPIDER_MODULES = ['tutorial.spiders']
    NEWSPIDER_MODULE = 'tutorial.spiders'
    
    ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 1}
    IMAGES_STORE = '/ttt'
    

items.py

    from scrapy.item import Item, Field
    
    class TutorialItem(Item):
        # define the fields for your item here like:
        # name = Field()
        catname=Field()
        caturl=Field()
        image_urls = Field()
        images = Field()
        pass
    

pipelines.py

    from scrapy.contrib.pipeline.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from scrapy.http import Request
    from pprint import pprint as pp
    
    class TutorialPipeline(object):
        # def get_media_requests(self, item, info):
        #     for image_url in item['image_urls']:
        #         yield Request(image_url)
    
        # def process_item(self, item, spider):
            # print '**********************===================*******************'
            # return item
            # pp(item)
            # pass
    
        def get_media_requests(self,item,info):
            # pass
            pp('**********************===================*******************')
    
            # yield Request(item['image_urls'])
            for image_url in item['image_urls']:
                # pass
                # print image_url
                yield Request(image_url)
    

spider.py

    import scrapy
    import os
    from pprint import pprint as pp
    from scrapy import log
    from scrapy.http import Request
    from scrapy.selector import Selector
    from scrapy.spider import Spider
    
    from scrapy.spider import Spider
    from scrapy.selector import Selector
    
    from tutorial.items import TutorialItem
    from pprint import pprint as pp
    
    class BaiduSpider(scrapy.spider.Spider):
        name='baidu'
        start_urls=[
            # 'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/'
            'http://giphy.com/categories'
        ]
    
        domain='http://giphy.com'
    
        def parse(self,response):
            selector=Selector(response)
    
            topCategorys=selector.xpath('//div[@id="None-list"]/a')
    
            # pp(topCategorys)
            items=[]
            for tc in topCategorys:
                item=TutorialItem()
                item['catname']=tc.xpath('./text()').extract()[0]
                item['caturl']=tc.xpath('./@href').extract()[0]
                if item['catname']==u'ALL':
                    continue
                reqUrl=self.domain+'/'+item['caturl']
                # pp(reqUrl)
                yield Request(url=reqUrl,meta={'caturl':reqUrl},callback=self.getSecondCategory)
        def getSecondCategory(self,response):
            selector=Selector(response)
            # pp(response.meta['caturl'])
            # pp('*****************=================**************')
    
            secondCategorys=selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')
    
            # pp(secondCategorys)
            items=[]
            for sc in secondCategorys:
                item=TutorialItem()
                item['catname']=sc.xpath('./div/h4/text()').extract()[0]
                item['caturl']=sc.xpath('./@href').extract()[0]
                items.append(item)
    
                reqUrl=self.domain+item['caturl']
            # pp(items)
                # pp(item)
                # pp(reqUrl)
                yield Request(url=reqUrl,meta={'caturl':reqUrl},callback=self.getImages)
    
        def getImages(self,response):
            selector=Selector(response)
            # pp(response.meta['caturl'])
            # pp('*****************=================**************')
    
    
            # images=selector.xpath('//ul[@class="gifs  freeform grid_12"]/div[position()=3]')
            images=selector.xpath('//*[contains (@class,"hoverable-gif")]')
            # images=selector.xpath('//ul[@class="gifs  freeform grid_12"]//div[@class="hoverable-gif"]')
            # pp(len(images))
            items=[]
            for image in images:
                item=TutorialItem()
                item['image_urls']=image.xpath('./a/figure/img/@src').extract()[0]
                # item['imgName']=image.xpath('./a/figure/img/@alt').extract()[0]
                items.append(item)
                # pp(item)
                # pp(items)
                # pp('==============************==============')
    
            # pp(items)
            # items=[{'images':"hello world"}]
            return items
    

Also, there are no errors in the output; it just looks like this:

    2014-12-21 13:49:56+0800 [scrapy] INFO: Enabled item pipelines: TutorialPipeline
    2014-12-21 13:49:56+0800 [baidu] INFO: Spider opened
    2014-12-21 13:49:56+0800 [baidu] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
    2014-12-21 13:49:56+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
    2014-12-21 13:49:56+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
    2014-12-21 13:50:07+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/categories> (referer: None)
    2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/science/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/sports/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/news-politics/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/transportation/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/interests/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/memes/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/tv/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/gaming/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/nature/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/emotions/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/movies/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/holiday/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/reactions/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/music/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/decades/> (referer: http://giphy.com/categories)
    2014-12-21 13:50:12+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/search/the-colbert-report/> (referer: http://giphy.com//categories/news-politics/)
    2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
        {'image_urls': u'http://media1.giphy.com/media/2BDLDXFaEiuBy/200_s.gif'}
    2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
        {'image_urls': u'http://media2.giphy.com/media/WisjAI5QGgsrC/200_s.gif'}
    2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
        {'image_urls': u'http://media3.giphy.com/media/ZgDGEMihlZXCo/200_s.gif'}
    .............
    

1 Answer:

Answer 0 (score: 2)

As far as I can tell, you don't need to override ImagesPipeline at all, since you are not modifying its behavior. But since you are doing it, you should do it properly. When overriding ImagesPipeline, two methods should be overridden:

  • get_media_requests(item, info) should return a Request for every URL in image_urls. This part you are doing correctly.

  • item_completed(results, item, info) is called when all the image requests for a single item have completed (finished downloading, or failed for some reason). From the official documentation:

      The item_completed() method must return the output that will be sent to subsequent item pipeline stages, so you must return (or drop) the item, as you do in any pipeline.

Therefore, to make your custom images pipeline work, you need to override the item_completed() method, like this:

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
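
For completeness, here is a rough, untested sketch of how the whole pipelines.py could look once both methods are in place. Note two assumptions: the class must subclass ImagesPipeline rather than object (otherwise Scrapy's media-download machinery never calls these methods), and image_paths must be declared as a Field on TutorialItem (or you can reuse the images field you already declared):

    from scrapy.contrib.pipeline.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from scrapy.http import Request

    class TutorialPipeline(ImagesPipeline):
        # subclassing ImagesPipeline is what makes Scrapy actually
        # schedule the image downloads for each item

        def get_media_requests(self, item, info):
            # image_urls is expected to be a list of URL strings
            for image_url in item['image_urls']:
                yield Request(image_url)

        def item_completed(self, results, item, info):
            # results is a list of (success, info) tuples, one per request
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            item['image_paths'] = image_paths
            return item

Alternatively, since you are not changing the default behavior, you could drop the custom class entirely and enable the built-in pipeline in settings.py with ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}; together with IMAGES_STORE it downloads the images and records the download results in the images field you already declared.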

Next, there are other problems in your code that keep it from working as expected:

  1. You are not actually creating any useful items. If you look at your parse() and getSecondCategory() functions, you will notice that you neither return nor yield any items. Although you prepare an items list, which you apparently intended to use for storing your items, it is never used to actually pass the items further down the processing path. At one point you simply yield a Request for the next page, and when the function completes, your items are discarded.

  2. You are not using the caturl information you pass through the meta dictionary. You pass it in both parse() and getSecondCategory(), but you never read it in the callback functions, so it is also being ignored.

So the only thing that will work properly is the images pipeline, provided you fix it as suggested above. To resolve these issues in your code, follow the guideline below (note that it is not tested; it is just a guideline for you to think about):

    def parse(self,response):
        selector=Selector(response)
        topCategorys=selector.xpath('//div[@id="None-list"]/a')
    
        for tc in topCategorys:
            # no need to create the item just yet,
            # only get the category and the url so we can
            # continue the work in our callback
            catname = tc.xpath('./text()').extract()[0]
            caturl = tc.xpath('./@href').extract()[0]
            if catname == u'ALL':
                continue
            reqUrl=self.domain + '/' + caturl
    
            # pass the category name in the meta so we can retrieve it
            # from the response in the callback function
            yield Request(url=reqUrl,meta={'catname': catname},
                          callback=self.getSecondCategory)
    
    def getSecondCategory(self,response):
        selector=Selector(response)
        secondCategorys=selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')
    
        # retrieve the category name from the response
        # meta dictionary, which was copied from our request
        catname = response.meta['catname']
    
        for sc in secondCategorys:
            # still no need to create the item, 
            # since we are just trying to get to 
            # the subcategory
            subcatname = sc.xpath('./div/h4/text()').extract()[0]
            subcaturl = sc.xpath('./@href').extract()[0]
    
            reqUrl=self.domain + '/' + subcaturl
    
            # this time pass both the category and the subcategory
            # so we can read them both in the callback function
            yield Request(url=reqUrl,meta={'catname':catname, 'subcatname':subcatname},
                            callback=self.getImages)
    
    def getImages(self,response):
        selector=Selector(response)
    
        # retrieve the category and subcategory name
        catname = response.meta['catname']
        subcatname = response.meta['subcatname']
    
        images = selector.xpath('//*[contains (@class,"hoverable-gif")]')
    
        for image in images:
            # now could be a good time to create the items
            item=TutorialItem()
    
            # fill the items category information. You can concatenate
            # the category and subcategory if you like, or you can 
            # add another field in your TutorialItem called subcatname
            item['catname'] = catname + ":" + subcatname
            # or alternatively:
            # item['catname'] = catname
            # item['subcatname'] = subcatname
    
            # image_urls must be a list of URLs for the images pipeline to iterate over
            item['image_urls'] = image.xpath('./a/figure/img/@src').extract()
    
            # no need to store the items in the list to return
            # it later, we can just yield the items as they are created
            yield item
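
As for the "Mongo table structure" part of the question: once each item carries the category, subcategory, image URL and local path, the simplest structure is one MongoDB document per item. Below is a minimal, untested sketch of a second pipeline using pymongo; the connection details, database and collection names are placeholders, and the class name MongoPipeline is just an example:

    import pymongo

    class MongoPipeline(object):

        def open_spider(self, spider):
            # connection details are placeholders; adjust them to your setup
            self.client = pymongo.MongoClient('localhost', 27017)
            self.collection = self.client['tutorial']['images']

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            # one document per item, e.g.
            # {"catname": ..., "caturl": ...,
            #  "image_urls": [...], "image_paths": ["full/<sha1>.jpg"]}
            self.collection.insert(dict(item))  # insert_one() in pymongo >= 3.0
            return item

It would also need to be enabled after the images pipeline, e.g. ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 1, 'tutorial.pipelines.MongoPipeline': 2}, so that image_paths is already filled in by the time the document is written.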