Scrapy蜘蛛似乎找不到下一页的xpath

时间:2015-07-03 07:53:03

标签: python xpath scrapy

我的蜘蛛可以在第一页抓取我想要的任何东西,但是当它试图找到下一页的xpath时,我得到索引超出范围的错误。我在shell中测试过,xpath看起来很好,所以现在我迷失了该做什么。

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import HtmlResponse, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Spider

from lrrytas.items import LrrytasItem


class LrrytasSpider(Spider):
    """Scrape comments from an lrytas.lt article and follow pagination."""

    name = "lrrytas"
    # BUG FIX: allowed_domains must hold bare domain names, not full URLs.
    # With 'http://www.lrytas.lt/' every request is rejected by the offsite
    # middleware, so no follow-up page is ever fetched.
    allowed_domains = ['www.lrytas.lt']
    start_urls = ["http://www.lrytas.lt/?id=14355922181434706286&view=6"]
    # NOTE(review): `rules` are only honoured by CrawlSpider, never by plain
    # Spider, so these rules have no effect here. They are kept unchanged for
    # backward compatibility; pagination is handled manually in
    # parse_comments_follow_next_page instead.
    rules = (
        Rule(LinkExtractor(allow=r'Items'), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths=('//*[@class="comment-box-head"]/*')),
             callback='parse_comments_follow_next_page', follow=True),
    )

    def parse(self, response):
        """Yield one LrrytasItem per comment found on the page.

        Comment headers and bodies alternate in the node list, hence the
        step of 2 in the loop below.
        """
        sel = Selector(response)
        site = sel.xpath('//*[@class="comment"]/*')
        node = sel.xpath('//*[@class="comments"]/*')

        for i in range(0, len(site), 2):
            item = LrrytasItem()
            item['name'] = node[i].xpath('*/div[contains(@class, "comment-nr")]/text()').extract()[0]
            item['ip'] = node[i].xpath('*/*/div[contains(@class, "comment-ip")]/text()').extract()[0]
            item['time'] = node[i].xpath('*/*/div[contains(@class, "comment-time")]/text()').extract()[0]
            # Body follows its header node, hence the i + 1 offset.
            item['comment'] = site[i + 1].xpath('descendant-or-self::text()').extract()[0]
            yield item

    def parse_comments_follow_next_page(self, response):
        """Follow the 'Kitas >>' (next page) link when one is present."""
        # BUG FIX: the original called a bare, undefined `xpath(...)`;
        # the selector must be created from the response object.
        next_page = response.xpath('//*[contains(text(), "Kitas >>") and contains(@href, "id")]/@href')
        if next_page:
            # urljoin resolves relative hrefs against the current page URL.
            url = response.urljoin(next_page[0].extract())
            yield Request(url, self.parse)

编辑:我已经通过使用 len() 让循环范围自动确定,而不再手动指定上限。

1 个答案:

答案 0 :(得分:0)

我的CrawlSpider规则以及XPath的next_page检查似乎与我不合适。因此,我建议您使用简单的Spider并手动处理下一页请求。我编写了一些代码来说明如何做到这一点:

import scrapy

class Comment(scrapy.Item):
    """Item holding a single scraped comment from lrytas.lt."""
    name = scrapy.Field()  # filled from the "comment-nr" div text
    ip = scrapy.Field()    # declared but not populated in the example parse()
    time = scrapy.Field()  # filled from the "comment-time" div text

class MySpider(scrapy.Spider):
    """Scrape lrytas.lt comments page by page with manual pagination."""

    name = 'lrytas'
    allowed_domains = ['www.lrytas.lt']
    start_urls = ['http://www.lrytas.lt/?id=14355922181434706286&view=6']

    def parse(self, response):
        """Yield one Comment per comment div, then request the next page."""
        xpath_comments = '//div[@class="comments"]/div[@class="comment"]'
        for sel in response.xpath(xpath_comments):
            item = Comment()
            item['name'] = ' '.join(sel.xpath('.//div[@class="comment-nr"]//text()').extract())
            item['time'] = ' '.join(sel.xpath('.//div[@class="comment-time"]//text()').extract())
            # Other item fields go here ...
            yield item

        # Check if there is a next page link ...
        xpath_NextPage = './/a[contains(.,"Kitas >>")][1]/@href'  # Take one of the two links
        # Evaluate the XPath once instead of twice (once to test, once to extract).
        next_href = response.xpath(xpath_NextPage).extract_first()
        if next_href:
            # urljoin resolves both relative and absolute hrefs correctly,
            # unlike hard-coding 'http://www.lrytas.lt' as a string prefix.
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)