我正在使用Scrapy制作刮刀。这是代码:
import scrapy
from scrapy.exceptions import CloseSpider
class IrnaSpider(scrapy.Spider):
name = 'irna'
base_url = 'http://www.irna.ir/en/services/161'
next_page = 162
def start_requests(self):
yield scrapy.Request(self.base_url, meta={'page_number': 1})
def parse(self, response):
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
yield response.follow(next_article, callback=self.parse)
def parse_article(self, response):
with open("irnadate.txt", "rt") as in_file:
irnadate = in_file.read()
articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
articlestamp = articleyear + articlemonth + articleday + articletime
articlestampint = int(articlestamp)
irnadateint = int(irnadate)
if articlestampint <= irnadateint:
raise CloseSpider('duplicate article')
yield {
'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
}
我希望它只删除自上次运行以来放置的链接,因此每次读取文章时都会将其发布日期与程序运行的最后一次进行比较,如果文章较旧,则会不要刮掉它并杀死程序。
这里的问题是,有多个类别都在使用此代码同时被删除,并且在我浏览另一个类别中的所有新文章之前,我可能会在一个类别中找到较旧的文章
是否有可能提出某些内容以便只杀死一个函数实例,以便刮刀能够继续查看其他类别?
编辑:
import scrapy
from scrapy.exceptions import CloseSpider
class IrnaSpider(scrapy.Spider):
name = 'irna'
base_urls = [
'http://www.irna.ir/en/services/161',
'http://www.irna.ir/en/services/162',
'http://www.irna.ir/en/services/163',
'http://www.irna.ir/en/services/164',
]
def start_requests(self):
for base_url in self.base_urls:
yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
def parse(self, response):
with open("irnadate.txt", "rt") as in_file:
irnadate = in_file.read()
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
articlestamp = articleyear + articlemonth + articleday + articletime
articlestampint = int(articlestamp)
irnadateint = int(irnadate)
if articlestampint <= irnadateint:
break
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
base_url = response.meta['base_url']
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
def parse_article(self, response):
yield {
'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
}
这个问题是,在抓取文章以确定其日期之前,我似乎无法加载文章。
答案 0 :(得分:0)
您需要对蜘蛛进行一些重组。一个是你不应该使用
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
yield response.follow(next_article, callback=self.parse)
因为每次获得结果页面时,您都会反复运行相同的网址。因此,在下次请求后,它们将被过滤掉。所以你应该在base_urls中使用它
base_urls = [
'http://www.irna.ir/en/services/161',
'http://www.irna.ir/en/services/162',
'http://www.irna.ir/en/services/163',
'http://www.irna.ir/en/services/164',
]
def start_requests(self):
for base_url in self.base_urls:
yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
接下来,在您的文章解析中,您应该从结果中获取日期
def parse(self, response):
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
# get the date for this article
# if the date is already extracted
date_already_processed = <-Get the date from result page->
if date_already_processed:
break
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
base_url = response.meta['base_url']
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(base_url, page_number),
callback=self.parse, meta={'page_number': page_number})