I am using Scrapy (0.22) to crawl a website. I need to do three things: scrape the category pages, download the images through a pipeline, and store the scraped information in MongoDB.
Right now I am stuck: I am using a pipeline to download the images, but my code does not work and nothing gets saved locally.
Also, since I want to store the information in Mongo, can anyone give me some advice on the "Mongo table structure"?
My code is as follows:
settings.py
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 1}
IMAGES_STORE = '/ttt'
items.py
from scrapy.item import Item, Field

class TutorialItem(Item):
    # define the fields for your item here like:
    # name = Field()
    catname = Field()
    caturl = Field()
    image_urls = Field()
    images = Field()
pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
from pprint import pprint as pp

class TutorialPipeline(object):
    # def get_media_requests(self, item, info):
    #     for image_url in item['image_urls']:
    #         yield Request(image_url)

    # def process_item(self, item, spider):
    #     print '**********************===================*******************'
    #     return item
    #     pp(item)

    def get_media_requests(self, item, info):
        pp('**********************===================*******************')
        # yield Request(item['image_urls'])
        for image_url in item['image_urls']:
            # print image_url
            yield Request(image_url)
spider.py
import scrapy
import os
from pprint import pprint as pp
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import Spider
from tutorial.items import TutorialItem

class BaiduSpider(scrapy.spider.Spider):
    name = 'baidu'
    start_urls = [
        # 'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/'
        'http://giphy.com/categories'
    ]
    domain = 'http://giphy.com'

    def parse(self, response):
        selector = Selector(response)
        topCategorys = selector.xpath('//div[@id="None-list"]/a')
        items = []
        for tc in topCategorys:
            item = TutorialItem()
            item['catname'] = tc.xpath('./text()').extract()[0]
            item['caturl'] = tc.xpath('./@href').extract()[0]
            if item['catname'] == u'ALL':
                continue
            reqUrl = self.domain + '/' + item['caturl']
            yield Request(url=reqUrl, meta={'caturl': reqUrl},
                          callback=self.getSecondCategory)

    def getSecondCategory(self, response):
        selector = Selector(response)
        secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')
        items = []
        for sc in secondCategorys:
            item = TutorialItem()
            item['catname'] = sc.xpath('./div/h4/text()').extract()[0]
            item['caturl'] = sc.xpath('./@href').extract()[0]
            items.append(item)
            reqUrl = self.domain + item['caturl']
            yield Request(url=reqUrl, meta={'caturl': reqUrl},
                          callback=self.getImages)

    def getImages(self, response):
        selector = Selector(response)
        # images = selector.xpath('//ul[@class="gifs freeform grid_12"]/div[position()=3]')
        images = selector.xpath('//*[contains(@class, "hoverable-gif")]')
        # images = selector.xpath('//ul[@class="gifs freeform grid_12"]//div[@class="hoverable-gif"]')
        items = []
        for image in images:
            item = TutorialItem()
            item['image_urls'] = image.xpath('./a/figure/img/@src').extract()[0]
            # item['imgName'] = image.xpath('./a/figure/img/@alt').extract()[0]
            items.append(item)
        return items
Also, there are no errors in the output; it just looks like this:
2014-12-21 13:49:56+0800 [scrapy] INFO: Enabled item pipelines: TutorialPipeline
2014-12-21 13:49:56+0800 [baidu] INFO: Spider opened
2014-12-21 13:49:56+0800 [baidu] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2014-12-21 13:50:07+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/categories> (referer: None)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/science/> (referer: http://giphy.com/categories)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/sports/> (referer: http://giphy.com/categories)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/news-politics/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/transportation/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/interests/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/memes/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/tv/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/gaming/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/nature/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/emotions/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/movies/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/holiday/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/reactions/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/music/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/decades/> (referer: http://giphy.com/categories)
2014-12-21 13:50:12+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/search/the-colbert-report/> (referer: http://giphy.com//categories/news-politics/)
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media1.giphy.com/media/2BDLDXFaEiuBy/200_s.gif'}
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media2.giphy.com/media/WisjAI5QGgsrC/200_s.gif'}
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media3.giphy.com/media/ZgDGEMihlZXCo/200_s.gif'}
.............
Answer (score: 2):
As far as I can tell, you don't need to override ImagesPipeline at all, since you are not modifying its behavior. But since you are doing it, you should do it properly.
When overriding ImagesPipeline, two methods should be overridden:
get_media_requests(item, info) should return a Request for every URL in image_urls. This part you are doing correctly.
item_completed(results, item, info) is called when all the image requests for a single item have completed (either finished downloading, or failed for some reason). From the official documentation:
The item_completed() method must return the output that will be sent to subsequent item pipeline stages, so you must return (or drop) the item, as you would in any pipeline.
So, to make your custom images pipeline work, you need to override the item_completed() method, like this:
def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = image_paths
    return item
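For reference, the two overrides combined into one pipeline would look like the sketch below. Two things to note about it, based on the code you posted: the class has to inherit from ImagesPipeline (as posted, yours inherits from object, so nothing ever calls its get_media_requests()), and image_paths has to be declared as a Field on TutorialItem, because Scrapy items reject keys that were not declared:

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class TutorialPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule one download request per URL in the item's image_urls list
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # keep only the paths of the images that downloaded successfully
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item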
Next, regarding the other problems in your code that keep it from working as expected:
You are not actually creating any useful items. If you look at your parse() and getSecondCategory() functions, you'll notice that you neither return nor yield any items. Although you prepare an items list, which you apparently intend to use to store your items, it is never used to actually pass the items further along the processing path. At each step you only yield a Request for the next page, and when the function completes, your items are simply discarded (see the sketch below).
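A minimal sketch of the difference (made-up XPath, not your actual spider):

# items appended to a local list are lost when the method returns:
def parse(self, response):
    items = []
    for href in response.xpath('//a/@href').extract():
        item = TutorialItem()
        item['caturl'] = href
        items.append(item)  # never returned or yielded -> discarded

# yielding each item (or returning the list) is what sends them on:
def parse(self, response):
    for href in response.xpath('//a/@href').extract():
        item = TutorialItem()
        item['caturl'] = href
        yield item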
You are not using the caturl information you pass through the meta dictionary. You pass it in both parse() and getSecondCategory(), but you never read it in the callback functions, so it is ignored as well.
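The meta mechanism itself is only two lines (a generic sketch): whatever you attach to the request shows up on the response inside the callback.

# in the originating callback:
yield Request(url=reqUrl, meta={'catname': catname}, callback=self.getSecondCategory)

# inside getSecondCategory:
catname = response.meta['catname']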
So the only thing that will basically work is the images pipeline, if you fix it as suggested above. To fix the remaining issues in your code, follow the guidelines below (note that this is not tested; it is just a guideline for you to think through):
def parse(self, response):
    selector = Selector(response)
    topCategorys = selector.xpath('//div[@id="None-list"]/a')

    for tc in topCategorys:
        # no need to create the item just yet,
        # only get the category and the url so we can
        # continue the work in our callback
        catname = tc.xpath('./text()').extract()[0]
        caturl = tc.xpath('./@href').extract()[0]

        if catname == u'ALL':
            continue

        reqUrl = self.domain + '/' + caturl

        # pass the category name in the meta so we can retrieve it
        # from the response in the callback function
        yield Request(url=reqUrl, meta={'catname': catname},
                      callback=self.getSecondCategory)

def getSecondCategory(self, response):
    selector = Selector(response)
    secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')

    # retrieve the category name from the response
    # meta dictionary, which was copied from our request
    catname = response.meta['catname']

    for sc in secondCategorys:
        # still no need to create the item,
        # since we are just trying to get to
        # the subcategory
        subcatname = sc.xpath('./div/h4/text()').extract()[0]
        subcaturl = sc.xpath('./@href').extract()[0]

        reqUrl = self.domain + '/' + subcaturl

        # this time pass both the category and the subcategory
        # so we can read them both in the callback function
        yield Request(url=reqUrl, meta={'catname': catname, 'subcatname': subcatname},
                      callback=self.getImages)

def getImages(self, response):
    selector = Selector(response)

    # retrieve the category and subcategory names
    catname = response.meta['catname']
    subcatname = response.meta['subcatname']

    images = selector.xpath('//*[contains(@class, "hoverable-gif")]')

    for image in images:
        # now could be a good time to create the items
        item = TutorialItem()

        # fill in the item's category information. You can concatenate
        # the category and subcategory if you like, or you can
        # add another field in your TutorialItem called subcatname
        item['catname'] = catname + ":" + subcatname
        # or alternatively:
        # item['catname'] = catname
        # item['subcatname'] = subcatname

        # note: the images pipeline expects image_urls to be a *list*
        # of URLs, so keep the full extract() result instead of [0]
        item['image_urls'] = image.xpath('./a/figure/img/@src').extract()

        # no need to store the items in a list and return it later;
        # we can just yield each item as it is created
        yield item
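Finally, about the "Mongo table structure" you asked for: Mongo stores documents rather than tables, so the simplest layout is one document per scraped item, with the category fields stored alongside the image URLs and the downloaded paths. Below is a rough sketch using pymongo (an illustration only; the database and collection names are made up). Register it in ITEM_PIPELINES with a higher number than the images pipeline so it runs afterwards, once image_paths has been filled in:

import pymongo

class MongoPipeline(object):
    """Hypothetical pipeline stage that stores finished items in MongoDB."""

    def __init__(self):
        # assumes a mongod running locally; names below are placeholders
        client = pymongo.MongoClient('localhost', 27017)
        self.collection = client['tutorial']['images']

    def process_item(self, item, spider):
        # one document per image, e.g.:
        # {"catname": "tv:the-colbert-report",
        #  "image_urls": [...], "image_paths": [...]}
        self.collection.insert(dict(item))
        return item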