在一个页面中,我想抓取两个链接,然后进入每个链接抓取一些信息,然后在一个项目中收集它们,我的代码是:
def parse(self, response):
    """Yield one request per song link found on the listing page.

    Each request targets ``self.host`` + the link's href, is handled by
    ``parse_single_song``, and carries a fresh ``BdmmsItem`` (created with
    ``singer`` set to the link text) in ``meta['item']``.
    """
    anchor_xpath = '/html/body/div[3]/div/div/div[3]/ul/li[position()>1]/ul/li/a/'
    # query(xpath, response) returns
    # HtmlXPathSelector(response).select(xpath).extract()
    link_texts = query(anchor_xpath + 'text()', response)
    link_hrefs = query(anchor_xpath + '@href', response)
    # The two lists are walked in lockstep; izip stops at the shorter one.
    for link_text, href in izip(link_texts, link_hrefs):
        song_url = self.host + href
        request_meta = {'item': BdmmsItem(singer=link_text)}
        yield Request(
            url=song_url,
            meta=request_meta,
            callback=self.parse_single_song)
def parse_single_song(self, response):
    """Schedule the lyric and album requests for a single song page.

    Reads the item from ``response.meta['item']`` and yields up to two
    requests: one for the lyric link (handled by ``parse_lrc``) and one
    for the album link (handled by ``parse_album``). Both requests carry
    the same item object in ``meta['item']``. A link that is not found on
    the page is skipped instead of raising ``IndexError``.
    """
    item = response.meta['item']
    # Keep the full result lists: indexing with [0] here and again at the
    # Request would pass only the first character of the URL, and [0] on
    # an empty result would raise before the checks below are reached.
    # NOTE(review): this XPath has no leading '//', unlike the lyric one
    # below -- confirm it actually matches the album anchor.
    album_links = query('a[contains(@href, "/album/")]/@href', response)
    lrc_links = query('//a[@lyricdata]/@lyricdata', response)
    # Here I want to go into the two different pages to get different
    # information.
    if lrc_links:
        yield Request(
            url=lrc_links[0],
            meta={'item': item},
            callback=self.parse_lrc)
    if album_links:
        # NOTE(review): parse() prefixes hrefs with self.host; confirm
        # whether the album href is absolute or needs the same prefix.
        yield Request(
            url=album_links[0],
            meta={'item': item},
            callback=self.parse_album)
    # With urllib2 it would look like this, but how to do that in scrapy?
    #     item['lrc'] = urllib2.urlopen(lrc_link).read()
    #     item['album'] = some_other_func(urllib2.urlopen(album_link).read())
def parse_lrc(self, response):
    """Store the response body under the item's 'lrc' key and emit the item.

    The item is taken from ``response.meta['item']``.
    """
    song = response.meta['item']
    lyric_body = response.body
    song['lrc'] = lyric_body
    yield song
def parse_album(self, response):
    """Store the album-info query result on the item and emit the item.

    The item is taken from ``response.meta['item']``; the value stored
    under 'album' is whatever ``query`` returns for the XPath below.
    """
    song = response.meta['item']
    # NOTE(review): the XPath has no leading '//' -- confirm it matches.
    album_info = query('div[@id="album-info"]', response)
    song['album'] = album_info
    yield song
这样会生成两个项目(item)。如何把这些信息合并到同一个项目中再输出?
答案 0 :(得分:1)
我会做这样的事情:
def parse_single_song(self, response):
    """Request the album page, passing the lyric link along for later.

    Reads the item from ``response.meta['item']``. When an album link is
    found, yields one request handled by ``parse_album`` whose meta holds
    the item and, if a lyric link was found, that link under
    ``'lrc_link'`` so the next callback can chain the lyric request.

    When no album link is found nothing is yielded, so the item is not
    emitted for that song.
    """
    item = response.meta['item']
    # Keep the full result lists: indexing with [0] here and again below
    # would pass only the first character of the URL, and [0] on an empty
    # result would raise IndexError.
    # NOTE(review): this XPath has no leading '//', unlike the lyric one
    # below -- confirm it actually matches the album anchor.
    album_links = query('a[contains(@href, "/album/")]/@href', response)
    lrc_links = query('//a[@lyricdata]/@lyricdata', response)
    if album_links:
        meta = {'item': item}
        if lrc_links:
            meta['lrc_link'] = lrc_links[0]
        # Pass the dict built above; a fresh {'item': item} here would
        # drop 'lrc_link' and the lyric page would never be requested.
        yield Request(
            url=album_links[0],
            meta=meta,
            callback=self.parse_album)
def parse_album(self, response):
    """Fill in the item's 'album' field, then chain to the lyric page.

    The item is taken from ``response.meta['item']``. If
    ``response.meta`` holds a truthy ``'lrc_link'``, a request for it
    (handled by ``parse_lrc``, carrying the item) is yielded; otherwise
    the item itself is yielded.
    """
    song = response.meta['item']
    # NOTE(review): the XPath has no leading '//' -- confirm it matches.
    song['album'] = query('div[@id="album-info"]', response)
    pending_lrc = response.meta.get('lrc_link')
    if not pending_lrc:
        # No lyric page to fetch: the item is complete as it stands.
        yield song
        return
    yield Request(
        url=pending_lrc,
        meta={'item': song},
        callback=self.parse_lrc)
def parse_lrc(self, response):
    """Store the response body under the item's 'lrc' key and emit the item.

    The item is taken from ``response.meta['item']``.
    """
    song = response.meta['item']
    lyric_body = response.body
    song['lrc'] = lyric_body
    yield song
如果没有专辑链接,这段代码将无法工作,但您应该能明白大致思路。