我没有从下一页获取数据(第一页就可以了。)
我尝试了如下所示的几种方法(首先,我执行robots_obey = false; download_delay = 8;并更改了用户代理。在第二种方法中,再次尝试根据网站的身份来更改用户代理,然后尝试使用该用户代理覆盖请求标头,每次将前一个注释掉,然后robots_obey再次设置为false。平台是Python v 3.6。第一种方法在Windows 10和Ubuntu 18上尝试过。第二种方法仅在Windows上尝试过。)< / p>
方法1
# -*- coding: utf-8 -*-
import scrapy
class ScrapeDfo2Spider(scrapy.Spider):
name = 'scrape-dfo2'
allowed_domains = ['canada.ca']
start_urls = [
'https://www.canada.ca/en/news/advanced-news-search/news-results.html?typ=newsreleases&dprtmnt=fisheriesoceans&start=&end=']
def parse(self, response):
quotes = response.xpath('//*[@class="h5"]')
for quote in quotes:
title = quote.xpath('.//a/text()').extract_first()
link = quote.xpath('.//a/@href').extract_first()
yield {'Title': title,
'Link': link}
next_page_url = response.xpath('//a[@rel="next"]/@href').extract()
if next_page_url:
yield scrapy.Request(response.urljoin(next_page_url))
方法2
# -*- coding: utf-8 -*-
import scrapy
class ScrapeDfo2Spider(scrapy.Spider):
name = 'scrape-dfo2'
allowed_domains = ['canada.ca']
# start_urls = ['https://www.canada.ca/en/news/advanced-news-search/news-results.html?typ=newsreleases&dprtmnt=fisheriesoceans&start=&end=']
def start_requests(self):
yield scrapy.Request(url='https://www.canada.ca/en/news/advanced-news-search/news-results.html?typ=newsreleases&dprtmnt=fisheriesoceans&start=&end=', callback=self.parse, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'})
def parse(self, response):
for quote in response.xpath('//*[@class="h5"]'):
yield{
'Title': quote.xpath('.//a/text()').get(),
'Link': quote.xpath('.//a/@href').get(),
'User-Agent': response.request.headers['User-Agent']}
next_page_url = response.xpath('//a[@rel="next"]/@href').extract()
if next_page_url:
yield scrapy.Request(response.urljoin(next_page_url), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'})
# def parse(self, response):
# quotes
答案 0 :(得分:0)
我认为它可以为您提供帮助。
# -*- coding: utf-8 -*-
import scrapy
class CanadaSpider(scrapy.Spider):
name = 'canada'
allowed_domains = ['canada.ca']
start_urls = ['https://www.canada.ca/en/news/advanced-news-search/news-results.html?start=&typ=newsreleases&end=&idx=0&dprtmnt=fisheriesoceans']
page_count = 0
def start_requests(self):
for i in range(self.page_count, 690, 10):
yield scrapy.Request('https://www.canada.ca/en/news/advanced-news-search/news-results.html?start=&typ=newsreleases&end=&idx=%d&dprtmnt=fisheriesoceans'%i, callback=self.parse
)
def parse(self, response):
quotes = response.xpath('//*[@class="h5"]')
for quote in quotes:
title = quote.xpath('.//a/text()').extract_first()
link = quote.xpath('.//a/@href').extract_first()
yield {'Title': title,
'Link': link}
{'Title': 'Canadian small businesses create innovative solutions to help reduce plastic pollution in our oceans', 'Link': 'https://www.canada.ca/en/fisheries-oceans/news/2020/06/canadian-small-businesses-create-innovative-solutions-to-help-reduce-plastic-pollution-in-our-oceans.html'}
2020-06-15 05:57:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.canada.ca/en/news/advanced-news-search/news-results.html?start=&typ=newsreleases&end=&idx=0&dprtmnt=fisheriesoceans>
{'Title': 'Government of Canada takes the fight against illegal fishing to outer space', 'Link': 'https://www.canada.ca/en/fisheries-oceans/news/2020/06/government-of-canada-takes-the-fight-against-illegal-fishing-to-outer-space.html'}
2020-06-15 05:57:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.canada.ca/en/news/advanced-news-search/news-results.html?start=&typ=newsreleases&end=&idx=0&dprtmnt=fisheriesoceans>
{'Title': 'Closed areas for shellfish harvesting on the North Shore', 'Link': 'https://www.canada.ca/en/fisheries-oceans/news/2020/06/closed-areas-for-shellfish-harvesting-on-the-north-shore.html'}
答案 1 :(得分:0)
在下面回答我自己的问题(尚未尝试上述操作)
以下是方法2的答案。它的一个关键部分是,确保最后处理分页的行在for循环之外。
import scrapy
class ScrapeDfo2Spider(scrapy.Spider):
name = 'scrape-dfo2'
allowed_domains = ['www.canada.ca']
# start_urls = ['https://www.canada.ca/en/news/advanced-news-search/news-results.html?typ=newsreleases&dprtmnt=fisheriesoceans&start=&end=']
def start_requests(self):
yield scrapy.Request(url='https://www.canada.ca/en/news/advanced-news-search/news-results.html?typ=newsreleases&dprtmnt=fisheriesoceans&start=&end=', callback=self.parse, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'})
def parse(self, response):
for quote in response.xpath('//*[@class="h5"]'):
yield{
'Title': quote.xpath('.//a/text()').get(),
'Link': quote.xpath('.//a/@href').get(),
'User-Agent': response.request.headers['User-Agent']}
next_page_url = response.xpath('//a[@rel="next"]/@href').get()
if next_page_url:
yield scrapy.Request(url=response.urljoin(next_page_url), headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'})