我需要在多个页面上刮取车辆的品牌,型号和版本并将其放入单个项目中。我希望我的蜘蛛从page1中检索汽车品牌列表。然后为每个品牌转到第2页并获取所有模型。对于每个型号,请转到第3页并获取所有相关版本和规格。 每个版本的品牌和型号需要放入一个汽车项目(并产生?)。 不幸的是,它只是在第1页上循环了第一个品牌的所有步骤。在获得下一个品牌并开始流程之前,如何强制它循环遍历所有模型和相关版本?我在不同的版本中看到过这个问题,但是当我试图控制处理成一个项目的顺序时,我仍然很难理解多个回调是如何工作的。
class Spider(scrapy.Spider):
name = "fdm_bot"
allowed_domains = ["fdm.dk"]
start_urls = ["http://www.fdm.dk/bildatabasen",]
def parse(self, response):
rows = response.xpath('//td//a[contains(@href,"bildatabasen")]')
for row in rows:
car = CarItem()
car['brand'] = row.xpath('./text()').extract()
absolute_URL = response.urljoin(row.xpath('./@href').extract()[0])
request = Request(absolute_URL, callback=self.parse_model, meta={'car':car})
yield request
def parse_model(self, response):
rows = response.xpath('//td//a[contains(@href,"bildatabasen")]')
for row in rows:
car = response.meta['car']
brand = car['brand'][0]
model = row.xpath('./text()').extract()[0]
car['model'] = string.replace(model,brand, "")
absolute_URL = response.urljoin(row.xpath('./@href').extract()[0])
request = Request(absolute_URL, callback=self.parse_editions, meta={'car':car})
return request
def parse_editions(self, response):
rows = response.xpath('//td//a[contains(@href,"bildatabasen")]')
for row in rows:
car = response.meta['car']
brand = car['brand'][0]
model = car['model']
edition = row.xpath('./text()').extract()[0]
edition = string.replace(edition,brand, "")
car['edition'] = string.replace(edition,model, "")
absolute_URL = response.urljoin(row.xpath('./@href').extract()[0])
request = Request(absolute_URL, callback=self.parse_specs, meta={'car':car})
return request
def parse_specs(self, response):
car = response.meta['car']
car['engineVolume'] = 1.0
return car