I am using Scrapy (0.22) to crawl a website. I need to do three things: scrape the category pages, download the images through a pipeline, and store the scraped information in MongoDB.
Right now I am stuck: I am using a pipeline to download the images, but my code does not work and nothing gets saved locally.
Also, since I want to store the information in Mongo, can anyone give me some advice on the "Mongo table structure"?
My code is as follows:
settings.py
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 1}
IMAGES_STORE = '/ttt'
items.py
from scrapy.item import Item, Field

class TutorialItem(Item):
    # define the fields for your item here like:
    # name = Field()
    catname = Field()
    caturl = Field()
    image_urls = Field()
    images = Field()
pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
from pprint import pprint as pp

class TutorialPipeline(object):
    # def get_media_requests(self, item, info):
    #     for image_url in item['image_urls']:
    #         yield Request(image_url)

    # def process_item(self, item, spider):
    #     print '**********************===================*******************'
    #     return item
    #     pp(item)

    def get_media_requests(self, item, info):
        pp('**********************===================*******************')
        # yield Request(item['image_urls'])
        for image_url in item['image_urls']:
            # print image_url
            yield Request(image_url)
spider.py
import scrapy
import os
from pprint import pprint as pp
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import Spider
from tutorial.items import TutorialItem

class BaiduSpider(scrapy.spider.Spider):
    name = 'baidu'
    start_urls = [
        # 'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/'
        'http://giphy.com/categories'
    ]
    domain = 'http://giphy.com'

    def parse(self, response):
        selector = Selector(response)
        topCategorys = selector.xpath('//div[@id="None-list"]/a')
        items = []
        for tc in topCategorys:
            item = TutorialItem()
            item['catname'] = tc.xpath('./text()').extract()[0]
            item['caturl'] = tc.xpath('./@href').extract()[0]
            if item['catname'] == u'ALL':
                continue
            reqUrl = self.domain + '/' + item['caturl']
            yield Request(url=reqUrl, meta={'caturl': reqUrl},
                          callback=self.getSecondCategory)

    def getSecondCategory(self, response):
        selector = Selector(response)
        secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')
        items = []
        for sc in secondCategorys:
            item = TutorialItem()
            item['catname'] = sc.xpath('./div/h4/text()').extract()[0]
            item['caturl'] = sc.xpath('./@href').extract()[0]
            items.append(item)
            reqUrl = self.domain + item['caturl']
            yield Request(url=reqUrl, meta={'caturl': reqUrl},
                          callback=self.getImages)

    def getImages(self, response):
        selector = Selector(response)
        # images = selector.xpath('//ul[@class="gifs freeform grid_12"]/div[position()=3]')
        images = selector.xpath('//*[contains(@class, "hoverable-gif")]')
        # images = selector.xpath('//ul[@class="gifs freeform grid_12"]//div[@class="hoverable-gif"]')
        items = []
        for image in images:
            item = TutorialItem()
            item['image_urls'] = image.xpath('./a/figure/img/@src').extract()[0]
            # item['imgName'] = image.xpath('./a/figure/img/@alt').extract()[0]
            items.append(item)
        return items
Also, there are no errors in the output; it just looks like this:
2014-12-21 13:49:56+0800 [scrapy] INFO: Enabled item pipelines: TutorialPipeline
2014-12-21 13:49:56+0800 [baidu] INFO: Spider opened
2014-12-21 13:49:56+0800 [baidu] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2014-12-21 13:50:07+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/categories> (referer: None)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/science/> (referer: http://giphy.com/categories)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/sports/> (referer: http://giphy.com/categories)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/news-politics/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/transportation/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/interests/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/memes/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/tv/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/gaming/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/nature/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/emotions/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/movies/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/holiday/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/reactions/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/music/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/decades/> (referer: http://giphy.com/categories)
2014-12-21 13:50:12+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/search/the-colbert-report/> (referer: http://giphy.com//categories/news-politics/)
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media1.giphy.com/media/2BDLDXFaEiuBy/200_s.gif'}
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media2.giphy.com/media/WisjAI5QGgsrC/200_s.gif'}
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media3.giphy.com/media/ZgDGEMihlZXCo/200_s.gif'}
.............
Answer (score: 2):
As far as I can tell, you don't need to override ImagesPipeline at all, since you are not modifying its behavior. But since you are doing it, you should do it properly.
When overriding ImagesPipeline, two methods should be overridden:
get_media_requests(item, info) should return a Request for every URL in image_urls. This part you are doing correctly.
item_completed(results, item, info) is called when all the image requests for a single item have completed (either finished downloading, or failed for some reason). From the official documentation:
The item_completed() method must return the output that will be sent to subsequent item pipeline stages, so you must return (or drop) the item, as you would in any pipeline.
So, to make your custom images pipeline work, you need to override the item_completed() method, like this:
def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = image_paths
    return item
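For reference, the two overrides combined into one pipeline would look like the sketch below. Two things to note about it, based on the code you posted: the class has to inherit from ImagesPipeline (as posted, yours inherits from object, so nothing ever calls its get_media_requests()), and image_paths has to be declared as a Field on TutorialItem, because Scrapy items reject keys that were not declared:

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class TutorialPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule one download request per URL in the item's image_urls list
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # keep only the paths of the images that downloaded successfully
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item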
Next, regarding the other problems in your code that keep it from working as expected:
You are not actually creating any useful items. If you look at your parse() and getSecondCategory() functions, you'll notice that you neither return nor yield any items. Although you prepare an items list, which you apparently intend to use to store your items, it is never used to actually pass the items further along the processing path. At each step you only yield a Request for the next page, and when the function completes, your items are simply discarded (see the sketch below).
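A minimal sketch of the difference (made-up XPath, not your actual spider):

# items appended to a local list are lost when the method returns:
def parse(self, response):
    items = []
    for href in response.xpath('//a/@href').extract():
        item = TutorialItem()
        item['caturl'] = href
        items.append(item)  # never returned or yielded -> discarded

# yielding each item (or returning the list) is what sends them on:
def parse(self, response):
    for href in response.xpath('//a/@href').extract():
        item = TutorialItem()
        item['caturl'] = href
        yield item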
You are not using the caturl information you pass through the meta dictionary. You pass it in both parse() and getSecondCategory(), but you never read it in the callback functions, so it is ignored as well.
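The meta mechanism itself is only two lines (a generic sketch): whatever you attach to the request shows up on the response inside the callback.

# in the originating callback:
yield Request(url=reqUrl, meta={'catname': catname}, callback=self.getSecondCategory)

# inside getSecondCategory:
catname = response.meta['catname']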
So the only thing that will basically work is the images pipeline, if you fix it as suggested above. To fix the remaining issues in your code, follow the guidelines below (note that this is not tested; it is just a guideline for you to think through):
def parse(self, response):
    selector = Selector(response)
    topCategorys = selector.xpath('//div[@id="None-list"]/a')

    for tc in topCategorys:
        # no need to create the item just yet,
        # only get the category and the url so we can
        # continue the work in our callback
        catname = tc.xpath('./text()').extract()[0]
        caturl = tc.xpath('./@href').extract()[0]

        if catname == u'ALL':
            continue

        reqUrl = self.domain + '/' + caturl

        # pass the category name in the meta so we can retrieve it
        # from the response in the callback function
        yield Request(url=reqUrl, meta={'catname': catname},
                      callback=self.getSecondCategory)

def getSecondCategory(self, response):
    selector = Selector(response)
    secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')

    # retrieve the category name from the response
    # meta dictionary, which was copied from our request
    catname = response.meta['catname']

    for sc in secondCategorys:
        # still no need to create the item,
        # since we are just trying to get to
        # the subcategory
        subcatname = sc.xpath('./div/h4/text()').extract()[0]
        subcaturl = sc.xpath('./@href').extract()[0]

        reqUrl = self.domain + '/' + subcaturl

        # this time pass both the category and the subcategory
        # so we can read them both in the callback function
        yield Request(url=reqUrl, meta={'catname': catname, 'subcatname': subcatname},
                      callback=self.getImages)

def getImages(self, response):
    selector = Selector(response)

    # retrieve the category and subcategory names
    catname = response.meta['catname']
    subcatname = response.meta['subcatname']

    images = selector.xpath('//*[contains(@class, "hoverable-gif")]')

    for image in images:
        # now could be a good time to create the items
        item = TutorialItem()

        # fill in the item's category information. You can concatenate
        # the category and subcategory if you like, or you can
        # add another field in your TutorialItem called subcatname
        item['catname'] = catname + ":" + subcatname
        # or alternatively:
        # item['catname'] = catname
        # item['subcatname'] = subcatname

        # note: the images pipeline expects image_urls to be a *list*
        # of URLs, so keep the full extract() result instead of [0]
        item['image_urls'] = image.xpath('./a/figure/img/@src').extract()

        # no need to store the items in a list and return it later;
        # we can just yield each item as it is created
        yield item
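Finally, about the "Mongo table structure" you asked for: Mongo stores documents rather than tables, so the simplest layout is one document per scraped item, with the category fields stored alongside the image URLs and the downloaded paths. Below is a rough sketch using pymongo (an illustration only; the database and collection names are made up). Register it in ITEM_PIPELINES with a higher number than the images pipeline so it runs afterwards, once image_paths has been filled in:

import pymongo

class MongoPipeline(object):
    """Hypothetical pipeline stage that stores finished items in MongoDB."""

    def __init__(self):
        # assumes a mongod running locally; names below are placeholders
        client = pymongo.MongoClient('localhost', 27017)
        self.collection = client['tutorial']['images']

    def process_item(self, item, spider):
        # one document per image, e.g.:
        # {"catname": "tv:the-colbert-report",
        #  "image_urls": [...], "image_paths": [...]}
        self.collection.insert(dict(item))
        return item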