我正在研究抓取这个网站:
第1页:http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626
所以(我认为)出错的是它从第1页获取所有链接,转到子页面(因此它转到它提取的链接,“子页面”)然后转到第2页再次执行,但我认为在第1页之后它只获得第一个链接(而不是第2页的所有链接),然后继续第3页并执行相同的操作。
我尝试了很多不同的代码,我仍然无法做到正确,我希望你能看看我的代码并帮助我解决我的错误
Code Spider
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from craig.items import CraigItem
from scrapy.http import Request
import re
class CraigSpiderSpider(CrawlSpider):
name = "craig_spider"
allowed_domains = ["randstad.nl"]
start_urls = (
"http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626",
"http://www.randstad.nl/mwp2/faces/baanZoeken?"
)
rules = (Rule (SgmlLinkExtractor(allow=("filters=vakgebied!5626", "pagina=")), callback="parse", follow= True),
)
def parse(self, response):
sel = Selector(response)
#Haalt alle links op
for link in sel.xpath(".//a[contains(@class, 'outer-read-more-link')]/@href").extract():
yield Request(link, callback=self.parse)
#Gaat alle links af en haalt alle text op
text_list = sel.xpath('//div[@id="basePage:page:twoColumn:r2:0:functieOmschrijvingPanel::content"]/text()').extract()
title_list = sel.xpath('//div[@id="basePage:page:panelTitleHeader"]//td[@class="af_panelBox_header-text"]//h1[@class="af_panelBox_header-element"]/text()').extract()
label_samenvatting = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pfl1b"]//table//td//label/text()').extract()
opleidingniveau_list = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pl1"]//ul//li/text()').extract()
soortbaan_list = sel.xpath('//table[@id="basePage:page:twoColumn:r1:0:soortDienstverbandRNL"]//td[@class="AFContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
uren_per_week_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:it5"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
vakgebied_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:vakgebieden"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]//li/text()').extract()
branche_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:aanvraagBranch"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
datum = sel.xpath('//span[@class="date-changed"]/text()').extract()
if text_list:
title = ' '.join(title_list)
text = ' '.join(text_list)
samenvatting = ' '.join(label_samenvatting)
opleidingniveau = ' '.join(opleidingniveau_list)
soortbaan = ' '.join(soortbaan_list)
urenperweek = ' '.join(uren_per_week_list)
vakgebied = ' '.join(vakgebied_list)
branche = ' '.join(branche_list)
item = CraigItem()
item['link'] = response.url
item['title'] = title
item['text'] = text
item['samenvatting'] = samenvatting
item['opleidingniveau'] = opleidingniveau
item['soortbaan'] = soortbaan
item['urenperweek'] = urenperweek
item['vakgebied'] = vakgebied
item['branche'] = branche
item['date'] = datum
yield item
代码项
from scrapy.item import Item, Field
class CraigItem(Item):
title = Field()
text = Field()
link = Field()
site = Field()
date = Field()
samenvatting = Field()
opleidingniveau = Field()
soortbaan = Field()
urenperweek = Field()
vakgebied = Field()
branche = Field()
答案 0 :(得分:1)
我认为在需要以下链接时应该使用CrawlSpider,而不是BaseSpider。
class CraigSpider(CrawlSpider):