How do I crawl multiple pages of a website?

Asked: 2016-07-22 23:15:50

Tags: python, scrapy

I want to crawl multiple pages of a website that is structured like this (each column is a chain of pages, one linking down to the next):

A B C D ...........
| | | | ...........
E F G H ...........
| | | | ...........
a b c d ...........
| | | | ...........
e f g h ...........
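In Scrapy terms, the crawl I am trying to express looks roughly like the toy spider below: each callback collects the links for the next level down and yields one Request per link, until the leaf pages are reached. Everything in this sketch (spider name, start URL, selectors) is a placeholder, not my real code:

import scrapy

class LayerSketchSpider(scrapy.Spider):
    # toy spider that only shows the shape of the crawl; all names and
    # selectors here are placeholders
    name = "layer_sketch"
    start_urls = ["http://example.com/"]        # stands in for the real index page

    def parse(self, response):                  # row A B C D ...
        for href in response.xpath('//a[@class="next-level"]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_middle)

    def parse_middle(self, response):           # row E F G H ...
        for href in response.xpath('//a[@class="next-level"]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_leaf)

    def parse_leaf(self, response):             # the deeper rows (a, e, ...)
        yield {"url": response.url}             # the data I actually want, one item per leaf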

Here is my actual code:

from scrapy import Request   # CrawlerItem is imported from my project's items.py

# spider class attributes (the surrounding scrapy.Spider subclass definition is omitted here)
allowed_domains = ["ko.pokemon.wikia.com"]
start_urls = ["http://ko.pokemon.wikia.com/wiki/%EC%A0%84%EA%B5%AD%EB%8F%84%EA%B0%90/1%EC%84%B8%EB%8C%80"]

def parse(self, response):

    sel = response.xpath('//table[@class="centered aligncenter wikitable"]/tr')
    urlList = list()
    # NOTE: only rows 1-3 of the table for now; eventually I want all 151 entries
    for i in range(1, 4):
        urlList.append('http://ko.pokemon.wikia.com' + sel[i].xpath('td[3]/a/@href').extract()[0])

    for url in urlList:
        yield Request(url, self.parse_pokemon)

    print('*' * 50)
    print(len(urlList))

def parse_pokemon(self, response):
    item = CrawlerItem()

    item['evol_img1'] = []
    item['evol_img2'] = []
    item['evol_img3'] = []
    item['url1'] = 'http://ko.pokemon.wikia.com'
    item['url2'] = 'http://ko.pokemon.wikia.com'
    item['url3'] = 'http://ko.pokemon.wikia.com'

    item['index'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[1]/td/table/tr/td[2]/big/big/b/text()').extract()
    item['name'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[1]/td/table/tr/td[1]/big/big/b/text()').extract()
    item['img'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
    item['detail'] = response.xpath('//*[@id="mw-content-text"]/table[3]/tr[2]/td[2]/text()').extract()
    item['type'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[5]/td[1]/span/a/span/text()').extract()

    item['evol_name1'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/a/text()').extract()
    item['evol_name2'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[2]/a/text()').extract()
    item['evol_name3'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[3]/a/text()').extract()

    if not item['evol_name1']:
        item['evol_name1'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/strong/text()').extract()
        if item['evol_name1']:
            item['evol_img1'] = item['img']

    if not item['evol_name2']:
        item['evol_name2'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[2]/strong/text()').extract()
        if item['evol_name2']:
            item['evol_img2'] = item['img']

    if not item['evol_name3']:
        item['evol_name3'] = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[3]/strong/text()').extract()
        if item['evol_name3']:
            item['evol_img3'] = item['img']

    url1 = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/a/@href').extract()
    if url1:
        item['url1'] = 'http://ko.pokemon.wikia.com'+url1[0]
    url2 = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[2]/a/@href').extract()
    if url2:
        item['url2'] = 'http://ko.pokemon.wikia.com'+url2[0]
    url3 = response.xpath('//*[@id="mw-content-text"]/table[4]/tr[2]/td[3]/a/@href').extract()
    if url3:
        item['url3'] = 'http://ko.pokemon.wikia.com'+url3[0]

    if item['url1']:
        request = Request(url=item['url1'], callback=self.parse_url1)
        request.meta['item'] = item
        yield request
    else:
        yield item

def parse_url1(self, response):
    print("here\n")
    item = response.meta['item']
    if not item['evol_img1']:
        item['evol_img1'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
    if item['url2']:
        request = Request(url=item['url2'], callback=self.parse_url2)
        request.meta['item'] = item
        yield request
    else:
        yield item

def parse_url2(self, response):
    item = response.meta['item']
    if not item['evol_img2']:
        item['evol_img2'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
    if item['url3']:
        request = Request(url=item['url3'], callback=self.parse_url3)
        request.meta['item'] = item
        yield request
    else:
        yield item

def parse_url3(self, response):
    item = response.meta['item']
    if not item['evol_img3']:
        item['evol_img3'] = response.xpath('//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract()
    yield item

It doesn't raise any error, but it also doesn't work. I tried changing yield to return, and then it does work... but it only crawls one set of pages and then stops. I want to crawl all 151 sets.
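To make it clearer what I expect yield to do, here is a trimmed-down sketch of the same spider, reduced to a single evolution step: parse yields one Request per table row, so every entry should get scheduled (a return would exit after the first one, which matches the "one set and then stop" behaviour I see), and the partially filled item is carried along through response.meta. The dont_filter=True flag is only a guess on my part, in case the duplicate filter is silently dropping chained requests to evolution pages that several entries share:

import scrapy

class PokedexSketchSpider(scrapy.Spider):
    name = "pokedex_sketch"
    allowed_domains = ["ko.pokemon.wikia.com"]
    start_urls = ["http://ko.pokemon.wikia.com/wiki/%EC%A0%84%EA%B5%AD%EB%8F%84%EA%B0%90/1%EC%84%B8%EB%8C%80"]

    def parse(self, response):
        rows = response.xpath('//table[@class="centered aligncenter wikitable"]/tr')
        # yield (not return) one Request per row so all entries get scheduled
        for row in rows[1:]:                              # skip the header row
            href = row.xpath('td[3]/a/@href').extract_first()
            if href:
                yield scrapy.Request(response.urljoin(href),
                                     callback=self.parse_pokemon_page)

    def parse_pokemon_page(self, response):
        item = {"page": response.url}                     # stand-in for my real CrawlerItem
        next_href = response.xpath(
            '//*[@id="mw-content-text"]/table[4]/tr[2]/td[1]/a/@href').extract_first()
        if next_href:
            # dont_filter=True is a guess: evolution pages are shared between
            # entries, so the default duplicate filter might be dropping them
            request = scrapy.Request(response.urljoin(next_href),
                                     callback=self.parse_evolution_page,
                                     dont_filter=True)
            request.meta["item"] = item
            yield request
        else:
            yield item

    def parse_evolution_page(self, response):
        item = response.meta["item"]
        item["evol_img"] = response.xpath(
            '//*[@id="mw-content-text"]/table[2]/tr[2]/td/a/@href').extract_first()
        yield item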

0 Answers

There are no answers yet.