我的蜘蛛可以在第一页抓取我想要的任何东西,但是当它试图找到下一页的xpath时,我得到索引超出范围的错误。我在shell中测试过,xpath看起来很好,所以现在我迷失了该做什么。
rom scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from lrrytas.items import LrrytasItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class LrrytasSpider(Spider):
name = "lrrytas"
allowed_domains = ['http://www.lrytas.lt/']
start_urls = ["http://www.lrytas.lt/?id=14355922181434706286&view=6"]
rules = (
Rule(LinkExtractor(allow=r'Items'), callback='parse_item', follow=True),
Rule(LinkExtractor(restrict_xpaths=('//*[@class="comment-box-head"]/*')), callback='parse_comments_follow_next_page', follow=True)
)
def parse(self, response):
sel = Selector(response)
site = sel.xpath('//*[@class="comment"]/*')
node = sel.xpath('//*[@class="comments"]/*')
for i in range(0, len(site), 2):
item = LrrytasItem()
item['name'] = node[i].xpath('*/div[contains(@class, "comment-nr")]/text()').extract()[0]
item['ip'] = node[i].xpath('*/*/div[contains(@class, "comment-ip")]/text()').extract()[0]
item['time'] = node[i].xpath('*/*/div[contains(@class, "comment-time")]/text()').extract()[0]
item ['comment'] = site[i + 1].xpath('descendant-or-self::text()').extract()[0]
yield item
def parse_comments_follow_next_page(self, response):
next_page = xpath('//*[contains(text(), "Kitas >>") and contains(@href, "id")]/@href')
if next_page:
url = response.urljoin(next_page[0].extract())
yield Request(url, self.parse)
编辑:我通过len()
答案 0 :(得分:0)
我的CrawlSpider
规则以及XPath
的next_page检查似乎与我不合适。因此,我建议您使用简单的Spider
并手动处理下一页请求。我编写了一些代码来说明如何做到这一点:
import scrapy
class Comment(scrapy.Item):
name = scrapy.Field()
ip = scrapy.Field()
time = scrapy.Field()
class MySpider(scrapy.Spider):
name = 'lrytas'
allowed_domains = ['www.lrytas.lt']
start_urls = ['http://www.lrytas.lt/?id=14355922181434706286&view=6']
def parse(self, response):
xpath_comments = '//div[@class="comments"]/div[@class="comment"]'
sel_comments = response.xpath(xpath_comments)
for sel in sel_comments:
item = Comment()
item['name'] = ' '.join(sel.xpath('.//div[@class="comment-nr"]//text()').extract())
item['time'] = ' '.join(sel.xpath('.//div[@class="comment-time"]//text()').extract())
# Other item fields go here ...
yield item
# Check if there is a next page link ...
xpath_NextPage = './/a[contains(.,"Kitas >>")][1]/@href' # Take on of the two links
if response.xpath(xpath_NextPage):
# If YES: Create and submit request
url_NextPage = 'http://www.lrytas.lt' + response.xpath(xpath_NextPage).extract()[0]
request = scrapy.Request(url_NextPage, callback=self.parse)
yield request