I'm trying to scrape multiple pages. The structure is as follows:
-> Page 1 - scrape links
-------> Page 2 - scrape more links (some pages are paginated) and data
------------> Page 3 - scrape data
The spider returns only 18 items, even though step 2 has 127 pages with 18 items per page. It also does not return author and author_link in the item.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as lext
from scrapy.selector import Selector
from scrapy.http import Request
from ror.items import RorItem


class RorSpiderSpider(CrawlSpider):
    name = "ror_spider"
    allowed_domains = ["example.com"]
    start_urls = (
        'http://www.example.com/',
    )

    rules = [
        Rule(lext(allow=("http://www.example.com/$"), restrict_xpaths=('//a[@class="nextpostslink"]',)), callback='parse', follow=True),
    ]

    def parse(self, response):
        # level 1: collect the menu links
        links = Selector(response).xpath('//ul[@id="nav"]/li')
        for link in links:
            item = RorItem()
            item['menu_link'] = link.xpath('a/@href').extract()[0]
            item['menu_title'] = link.xpath('a/text()').extract()[0]
            if "http" not in item['menu_link']:
                item['menu_link'] = "http://www.reviewofreligions.org" + item['menu_link']
            yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)

    def parse_articles(self, response):
        # level 2: walk the paginated article listings
        sel = Selector(response)
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            pg = 1
            maxPgs = 124
            while pg <= maxPgs:
                item['article_pg_link'] = item['menu_link'] + "page/" + str(pg) + "/"
                article_links = sel.xpath('//div[@id="rightcol"]/div[@class="articlebox"]')
                for art_link in article_links:
                    item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
                    item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract()[0].replace('\n\t\t\t\t', '').replace('\t\t\t\t', '')
                    # article_txt_1 = art_link.xpath('text()').extract()[1].replace('\n \n\t\t\t\t', '').replace('\t\t\t\t', '').replace('\n \n', '')
                    # article_txt_2 = art_link.xpath('text()').extract()[2].replace('\n \n\t\t\t\t', '') if art_link.xpath('text()').extract()[2] else ''
                    # item['article_txt'] = article_txt_1 + '\n'.join(article_txt_2).replace('\n\n\n \n\n\n \n \n \n \n\n\n\t\n\t\n\t', '')
                    yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)
                pg += 1

    def article_page(self, response):
        # level 3: pull author and author_link off the article page
        select = Selector(response)
        item = response.meta['item']
        item['author'] = select.xpath('//div[@id="author"]/a/text()').extract()
        item['author_link'] = select.xpath('//div[@id="author"]/a/@href').extract()
        return item
What is wrong with the code?
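For reference, a minimal sketch of the direction a fix could take, assuming the ".../page/N/" URL pattern and the XPaths from the code above are correct: the while loop in parse_articles never fetches the later listing pages (it re-reads the one response it already has), so each page needs its own Request, and the single item is shared by every request spawned from it, so it should be copied per article. The callback name parse_article_list and the use of copy.deepcopy are my own placeholders, not part of the original spider; these two methods would replace parse_articles inside the class above, with the copy import added at module level.

import copy  # at module level, alongside the other imports

    def parse_articles(self, response):
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            # one Request per listing page; the original while loop only
            # ever re-parsed the response it already had in hand
            for pg in range(1, 128):  # 127 listing pages, per the description above
                page_url = item['menu_link'] + "page/" + str(pg) + "/"
                yield Request(url=page_url, meta={'item': item}, callback=self.parse_article_list)

    def parse_article_list(self, response):
        # hypothetical helper, not in the original spider
        base_item = response.meta['item']
        for art_link in Selector(response).xpath('//div[@id="rightcol"]/div[@class="articlebox"]'):
            # deep-copy so the concurrent article requests don't all
            # mutate (and overwrite) one shared item
            item = copy.deepcopy(base_item)
            item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
            item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract()[0].strip()
            yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)

Hard-coding 127 just mirrors the numbers in the description; reading the last page number out of the pagination links would be more robust. Note also that CrawlSpider uses parse internally for its Rule handling, so pointing the Rule's callback at parse, as the spider above does, can interfere with the rules; a differently named callback avoids that.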