我正在尝试使用Scrapy抓取网站。这是我的代码:
import scrapy
class ArticleSpider(scrapy.Spider):
    """Spider as posted in the question (its logic is what the answer critiques).

    Tries to do everything in one ``parse`` callback: follow article links,
    scrape body text and tags, and re-follow the listing page.
    """

    name = "article"
    start_urls = [
        'http://www.irna.ir/en/services/161',
    ]

    def parse(self, response):
        # Follow the first 14 article links by positional XPath index.
        for linknum in range(1, 15):
            href = response.xpath(
                '//*[@id="NewsImageVerticalItems"]/div[%d]/div[2]/h3/a/@href' % linknum
            ).extract_first()
            yield scrapy.Request(response.urljoin(href))

        # Article body text — only matches on detail pages.
        body_sel = response.xpath(
            '//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]'
        )
        for text in body_sel:
            yield {
                'article': text.xpath('./text()').extract()
            }

        # Up to four tags — only matches on detail pages.
        tag_sel = response.xpath(
            '//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_bodytext"]'
        )
        for tag in tag_sel:
            yield {
                'tag1': tag.xpath('./div[3]/p[1]/a/text()').extract(),
                'tag2': tag.xpath('./div[3]/p[2]/a/text()').extract(),
                'tag3': tag.xpath('./div[3]/p[3]/a/text()').extract(),
                'tag4': tag.xpath('./div[3]/p[4]/a/text()').extract()
            }

        # Re-request the listing page with the same callback (cause of the
        # duplicated / out-of-order output the question describes).
        yield response.follow('http://www.irna.ir/en/services/161', callback=self.parse)
但是这会在JSON中返回重复项目的奇怪混合,不按顺序并经常跳过链接:https://pastebin.com/LVkjHrRt
但是,当我将linknum设置为单个数字时,代码工作正常。
为什么迭代会改变我的结果?
答案 0（得分：0）：
正如@TarunLalwani已经说过,你目前的方法是不对的。基本上你应该:
在 parse 方法中，提取指向页面上所有文章的链接，并通过一个回调（例如命名为 parse_article）为抓取这些文章生成请求。
同样在 parse 方法中，检查是否存在用于加载更多文章的按钮；如果存在，则对形如 http://www.irna.ir/en/services/161/pageN 的 URL 生成请求。（这个模式可以在浏览器开发人员工具的"网络"选项卡下的 XHR 请求中找到。）
在 parse_article 方法中，从详细信息页面提取文章文本和标签，最后将其作为项目生成。下面是最终的蜘蛛:
import scrapy
class IrnaSpider(scrapy.Spider):
    """Crawl IRNA English service 161: one callback per concern.

    ``parse`` handles the listing page (article links + pagination);
    ``parse_article`` scrapes a single article's text and tags.
    """

    name = 'irna'
    base_url = 'http://www.irna.ir/en/services/161'

    def start_requests(self):
        # Seed with page 1; the current page number rides along in meta.
        yield scrapy.Request(self.base_url, meta={'page_number': 1})

    def parse(self, response):
        # One detail-page request per article link on the listing page.
        links = response.css('.DataListContainer h3 a::attr(href)').extract()
        for article_url in links:
            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

        # Paginate only while the "more" button exists on the page.
        page_number = response.meta['page_number'] + 1
        if response.css('#MoreButton'):
            next_url = '{}/page{}'.format(self.base_url, page_number)
            yield scrapy.Request(next_url, callback=self.parse,
                                 meta={'page_number': page_number})

    def parse_article(self, response):
        # Join body paragraphs into one string; keep only non-empty, stripped tags.
        text_parts = response.xpath(
            '//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()'
        ).extract()
        raw_tags = response.xpath('//div[@class="Tags"]/p/a/text()').extract()
        yield {
            'text': ' '.join(text_parts),
            'tags': [tag.strip() for tag in raw_tags if tag.strip()]
        }