我正在尝试抓取MercadoLibre的产品列表,使用的是Scrapy 1.5.0。当Scrapy尝试转到下一页时,它会在第一页和第二页之间来回循环。
代码
# Python 3.5
# Scrapy 1.5.0
import scrapy
from scrapy.http import Request
class MercadoLibreSpider(scrapy.Spider):
    """Spider that scrapes microwave listings from MercadoLibre Argentina.

    Each results page produces one ``{'title': ..., 'price': ...}`` item per
    price found, followed by a request for the URL taken from the first
    pagination link on the page.
    """

    name = "mlspider"
    allowed_domains = ['mercadolibre.com.ar']
    start_urls = ['https://listado.mercadolibre.com.ar/microondas#D[A:microondas]']

    def parse(self, response):
        """Yield title/price items for one results page, then the next request."""
        # Titles and prices are gathered as two independent flat lists and
        # paired purely by position.
        titles = response.css(".main-title::text").extract()
        prices = response.css(".price__fraction::text").extract()
        for index, price in enumerate(prices):
            yield {
                'title': titles[index],
                'price': price
            }

        # Next page: the selector is not restricted to a particular link, so
        # extract_first() returns the href of the first pagination link in
        # the document.
        next_url = response.css('.andes-pagination__link::attr(href)').extract_first()
        if next_url is None:
            return
        yield Request(url=next_url, callback=self.parse)
答案 0 :(得分:0)
编辑
我找到了另一个解决方案:查找带有 `prefetch` 类的分页链接。
<a class="andes-pagination__link prefetch" href="https://listado.mercadolibre.com.ar/electrodomesticos/coccion/microondas/microondas_Desde_49">2</a>
代码更正
import scrapy
from scrapy.http import Request
class MercadoLibreSpider(scrapy.Spider):
    """Spider that scrapes microwave listings from MercadoLibre Argentina.

    Each results page produces ``{'title': ..., 'price': ...}`` items and then
    a request for the page referenced by the ``a.prefetch`` link.
    """

    name = "mlspider"
    allowed_domains = ['mercadolibre.com.ar']
    start_urls = ['https://listado.mercadolibre.com.ar/microondas#D[A:microondas]']

    def parse(self, response):
        """Yield title/price items for one results page, then the next request.

        Titles and prices are extracted as two flat lists and paired by
        position.
        """
        # Prices
        priceData = response.css(".price__fraction::text").extract()
        # Product listings
        titleData = response.css(".main-title::text").extract()

        # zip() stops at the shorter list. Indexing titleData by the length of
        # priceData would raise IndexError when a page has fewer titles than
        # prices, aborting this generator before the next-page request below
        # is ever yielded.
        for title, price in zip(titleData, priceData):
            yield {
                'title': title,
                'price': price
            }

        # Next page, using the prefetch identifier
        nextPage = response.css('a.prefetch::attr(href)').extract_first()
        if nextPage is not None:
            yield Request(url=nextPage, callback=self.parse)
答案 1 :(得分:0)
更可靠的方法是利用容器元素并遍历它们,以收集 titles 和 prices。
import scrapy
from scrapy.http import Request
class MercadoLibreSpider(scrapy.Spider):
    """Spider that scrapes microwave listings from MercadoLibre Argentina.

    Walks every ``.results-item`` container on a results page, yields one
    ``{'title': ..., 'price': ...}`` item per container, then requests the
    page referenced by the ``a.prefetch`` link.
    """

    name = "mlspider"
    start_urls = ['https://listado.mercadolibre.com.ar/microondas#D[A:microondas]']

    def parse(self, response):
        """Yield one item per result container, then the next-page request."""
        for container in response.css(".results-item"):
            # Both fields are read from the same container, so a title is
            # always paired with the price found alongside it.
            title = container.css(".main-title::text").extract_first()
            price = container.css(".price__fraction::text").extract_first()
            record = {
                'title': title,
                'price': price
            }
            yield record

        next_url = response.css('a.prefetch::attr(href)').extract_first()
        # A missing or empty href yields no further request.
        if not next_url:
            return
        yield Request(url=next_url, callback=self.parse)