My goal is to scrape image URLs and image alt tags using Scrapy. I have tried many combinations, but still haven't achieved it.

Here is what I tried:
def parse_item(self, response):
    sel = Selector(response)
    item = imageItem()
    item['crawl_time'] = time.asctime(time.localtime(time.time()))
    item['crawl_date'] = time.asctime(time.localtime(time.strftime("%Y%m%d")))
    item['url'] = response.url
    for img in hxs.select('//img'):
        item['title'] = node.xpath("@alt").extract()
        item['iurl'] = node.xpath("@src").extract()
    if response.meta['depth'] == 1:
        exit
    return item
Answer 0 (score: 2)
There are a few problems there: you define a Selector named sel, but the loop uses hxs, and inside it you reference node instead of img.
Here is the code I tested:
def parse_item(self, response):
    sel = Selector(response)
    images = sel.xpath('//img')
    for img in images:
        item = imageItem()
        item['url'] = response.url
        title = img.xpath('./@alt').extract()
        item['title'] = title[0] if title else ''
        iurl = img.xpath('./@src').extract()
        item['iurl'] = iurl[0] if iurl else ''
        yield item
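For completeness: the snippet assumes imageItem is an Item class defined elsewhere in the project. A minimal sketch of what it might look like, with field names taken from the code above (the definition itself is an assumption, not part of the original post):

import scrapy

class imageItem(scrapy.Item):
    # Fields inferred from the parse_item code in this thread
    crawl_time = scrapy.Field()
    crawl_date = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    iurl = scrapy.Field()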
Answer 1 (score: 1)
Here is the code I got results with, but the crawl depth is still stuck at 1:
class MySpider(CrawlSpider):
    name = 'imageaggr'
    start_urls = ['http://www.dmoz.org/', 'http://timesofindia.indiatimes.com/',
                  'http://www.nytimes.com', 'http://www.washingtonpost.com/',
                  'http://www.jpost.com', 'http://www.rediff.com/']
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(SgmlLinkExtractor(allow=('', ), deny=('defghi\.txt')), callback='parse_item'),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        # Rule(SgmlLinkExtractor(allow=('\.cms', '\.html')), deny=('parse_item\.html')),
        # Rule(SgmlLinkExtractor(allow=('news', )), callback='parse_item'),
    )
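
    # Editorial note on the depth-1 symptom: in Scrapy, a Rule that specifies a
    # callback defaults to follow=False, so links on pages handed to parse_item
    # are not extracted further. Passing follow=True explicitly, e.g.
    #     Rule(SgmlLinkExtractor(allow=('', ), deny=('defghi\.txt')),
    #          callback='parse_item', follow=True),
    # lets the crawl continue past depth 1. (This is a suggested fix, not part
    # of the original answer.)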
    def parse_item(self, response):
        sel = Selector(response)
        images = sel.xpath('//img')
        image_count = len(images)
        count = 0
        while count < image_count:
            item = imageItem()
            item['url'] = response.url
            # Caution: indexing the flat @alt list by image position assumes every
            # <img> has an alt attribute; images without one shorten the list and
            # can raise an IndexError. Iterating per img, as in Answer 0, avoids this.
            title = sel.xpath('//img/@alt').extract()[count] or ''
            if title == '':
                break
            item['title'] = title
            iurl = sel.xpath('//img/@src').extract()[count] or ''
            item['iurl'] = iurl
            item['crawl_time'] = time.asctime(time.localtime(time.time()))
            item['crawl_date'] = time.strftime("%Y%m%d")
            count = count + 1
            yield item  # a plain 'return item' here would emit only a single item
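For reference, assuming the usual Scrapy project layout and the imageItem definition sketched above, the spider can be run and its items dumped to a feed with something like:

scrapy crawl imageaggr -o images.json

If the crawl depth needs to be capped or widened beyond what the rules allow, the DEPTH_LIMIT setting in settings.py controls it (0 means no limit).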