我想抓取这样一个网站的多个页面:
A B C D ...........
| | | | ...........
E F G H ...........
| | | | ...........
a b c d ...........
| | | | ...........
e f g h ...........
这是我的代码:
allowed_domains = ["ko.pokemon.wikia.com"]
start_urls = ["http://ko.pokemon.wikia.com/wiki/%EC%A0%84%EA%B5%AD%EB%8F%84%EA%B0%90/1%EC%84%B8%EB%8C%80"]
def parse(self, response):
sel = response.xpath('//table[@class="centered aligncenter wikitable"]/tr')
urlList = list()
for i in range(1, 4):
urlList.append('http://ko.pokemon.wikia.com'+sel[i].xpath('td[3]/a/@href').extract()[0])
for url in urlList:
yield Request(url, self.parse_pokemon)
print '*'*50
print len(urlList)
def parse_pokemon(self, response):
item = CrawlerItem()
item['evol_img1'] = []
item['evol_img2'] = []
item['evol_img3'] = []
item['url1'] = 'http://ko.pokemon.wikia.com'
item['url2'] = 'http://ko.pokemon.wikia.com'
item['url3'] = 'http://ko.pokemon.wikia.com'
item['index'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[1]/td/table/tr/td[2]/big/big/b/text()').extract()
item['name'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[1]/td/table/tr/td[1]/big/big/b/text()').extract()
item['img'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
item['detail'] = response.xpath('//*[@id="mw-content-text"]/table[3]/tr[2]/td[2]/text()').extract()
item['type'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[5]/td[1]/span/a/span/text()').extract()
item['evol_name1'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/a/text()').extract()
item['evol_name2'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[2]/a/text()').extract()
item['evol_name3'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[3]/a/text()').extract()
if not item['evol_name1']:
item['evol_name1'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/strong/text()').extract()
if item['evol_name1']:
item['evol_img1'] = item['img']
if not item['evol_name2']:
item['evol_name2'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[2]/strong/text()').extract()
if item['evol_name2']:
item['evol_img2'] = item['img']
if not item['evol_name3']:
item['evol_name3'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[3]/strong/text()').extract()
if item['evol_name3']:
item['evol_img3'] = item['img']
url1 = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/a/@href').extract()
if url1:
item['url1'] = 'http://ko.pokemon.wikia.com'+url1[0]
url2 = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[2]/a/@href').extract()
if url2:
item['url2'] = 'http://ko.pokemon.wikia.com'+url2[0]
url3 = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[3]/a/@href').extract()
if url3:
item['url3'] = 'http://ko.pokemon.wikia.com'+url3[0]
if item['url1']:
request = Request(url=item['url1'], callback=self.parse_url1)
request.meta['item'] = item
yield request
else:
yield item
def parse_url1(self, response):
print "here\n"
item = response.meta['item']
if not item['evol_img1']:
item['evol_img1'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
if item['url2']:
request = Request(url=item['url2'], callback=self.parse_url2)
request.meta['item'] = item
yield request
else:
yield item
def parse_url2(self, response):
item = response.meta['item']
if not item['evol_img2']:
item['evol_img2'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
if item['url3']:
request = Request(url=item['url3'], callback=self.parse_url3)
request.meta['item'] = item
yield request
else:
yield item
def parse_url3(self, response):
item = response.meta['item']
if not item['evol_img3']:
item['evol_img3'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
yield item
它没有给出错误,但它不起作用。我尝试更改单词yield
- > return
,然后它有效......但它只抓取一组多页,然后停止。我想爬151套。