我正在使用scrapy收集意大利国家警察的新闻稿。我遇到的问题是刮刀没有跟随“下一个”链接,即使我有一个规则集来查找“下一个”或意大利语“Successiva”按钮并按照该链接。
这是我的代码。
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from items import ScrapyCrimeScraperItem
from string import replace
class ItalyScraper(CrawlSpider):
name = 'italy_crawler_test'
allowed_domains = ['poliziadistato.it']
start_urls = [
'http://www.poliziadistato.it/archivio/category/1298/2015/',
'http://www.poliziadistato.it/archivio/category/1298/2015/9/'
]
rules = (Rule(LinkExtractor(allow=('http://.*/articolo/view/*.....')), callback='parse_article', follow=False),
Rule(LinkExtractor(restrict_xpaths=("/html/body/div[@class='container'[1]/div[@class='row']/div[@class='col-md-6 col-md-push-3 padding0']/div[@class='trecolonne']/div[@class='center']/div[@class='bar']/ul[@class='paginazione']/li/a[contains(""@title,""'Successiva')]",)), follow=True))
# def generate_article_links(self, response):
# for href in response.css('a'):
# url = href.extract()
# yield scrapy.Request(url, callback=self.parse_article)
def parse_article(self, response):
self.logger.info('Hi, this is an item page! %s', response.url)
item = ScrapyCrimeScraperItem()
item['city'] = response.selector.css('h1').extract()[0]
item['country'] = 'italy'
item['site_link'] = response.url
item['article_link'] = response.url
item['article_raw_text'] = self.remove_carriage_returns(response.selector.css('.resetfont '
'p').extract(
)[0])
item['article_raw_date'] = response.selector.css('.data').extract()[0]
item['article_translated_text'] = ''
item['article_translated_date'] = ''
item['article_raw_markup'] = ''
item['crimes'] = ''
item['locations'] = ''
item['dateformat'] = ''
item['reserved1'] = ''
item['reserved2'] = ''
yield item
def remove_carriage_returns(self,item):
return(item.replace("\n", " "))
我查看了一些类似问题的其他回复,但我在第二条规则上使用了明确的follow=True
。我是否需要回调来生成新请求 - 或者后续变量是否应该负责生成新请求?
答案 0 :(得分:0)
我认为你刚搞砸了XPath表达式中的引号。请改用一个更简单的方法:
//ul[@class="paginazione"]/li/a[contains(@title, "Successiva")]