我正在尝试从amazon.in收集产品和卖方信息,该信息仅在选择了产品尺寸后才会显示(以下是我正在尝试的一种示例产品)。我正在尝试收集所有尺寸的数据。
https://www.amazon.in/Puma-Black-Running-Shoes-9-19163804/dp/B0792KBCRW
我试图借助下面的选择器取得各尺寸选项的 value 值,并据此重新构造每个尺寸对应的URL:response.xpath('//*[@id="native_dropdown_selected_size_name"]/option/@value').extract()
下面是它返回的列表:
[u'-1', u'0,B0792KBCRW', u'1,B07928YZVZ', u'2,B07929CLRY', u'3,B07928HYJ7', u'4,B07927MZ19', u'5,B07928PZXT']
基于上面的列表,我试图为每种尺寸加载一个特定的URL,以下是我的爬虫(spider)代码。
def parse(self, response):
    """Dispatch a product page to the callback matching its state.

    Reads ``start_url_id`` from ``response.meta`` and forwards it as
    ``productUID`` in the meta of every request it yields:

    * availability text contains "Currently unavailable"
      -> one request to ``self.productUnavailable``;
    * no size-variation widget on the page
      -> one request to ``self.singleProductAvailable``;
    * otherwise one request per size option to ``self.productAvailable``,
      with the raw option value passed along as ``optionValue``.
    """
    availability = response.xpath('//*[@id="availability"]/span/text()').get()
    size_widget = response.xpath('//*[@id="variation_size_name"]/span[2]').get()
    productUID = response.meta['start_url_id']

    # .get() returns None when the availability span is absent; guard so the
    # substring test cannot raise TypeError on such pages.
    if availability is not None and 'Currently unavailable' in availability:
        # NOTE(review): this re-requests response.url; Scrapy's duplicate
        # filter may drop it depending on how the first request was issued
        # -- confirm, or pass dont_filter=True.
        yield scrapy.Request(
            url=response.url,
            meta={'productUID': productUID},
            callback=self.productUnavailable,
        )
        return

    if size_widget is None:
        yield scrapy.Request(
            url=response.url,
            meta={'productUID': productUID},
            callback=self.singleProductAvailable,
        )
        return

    option_values = response.xpath(
        '//*[@id="native_dropdown_selected_size_name"]/option/@value'
    ).extract()
    for option_value in option_values:
        # Real options look like "<index>,<ASIN>"; the placeholder entry
        # ("-1") has no comma and is skipped.
        parts = option_value.split(',')
        if len(parts) < 2 or not parts[1]:
            continue
        # scrapy.Request requires an absolute URL including the scheme;
        # a bare "www.amazon.in/..." is rejected with "Missing scheme".
        size_page_url = (
            'https://www.amazon.in/dp/' + parts[1]
            + '/ref=twister_dp_update?_encoding=UTF8&th=1&psc=1'
        )
        yield scrapy.Request(
            url=size_page_url,
            meta={'productUID': productUID, 'optionValue': option_value},
            callback=self.productAvailable,
        )
根据产品是否可用以及是否存在尺寸下拉列表,我向相应的回调函数发出请求。以下是存在尺寸下拉菜单时所使用的函数。
def productAvailable(self, response):
    """Build one AmzProductdetailsItem from a size-specific product page.

    Expects ``productUID`` and ``optionValue`` in ``response.meta``.
    Every field is read with ``.get()``, so a node missing from the page
    yields ``None`` for that field instead of raising IndexError and
    discarding the whole item.
    """
    productUID = response.meta['productUID']
    optionValue = response.meta['optionValue']

    def first(xpath):
        # First match of the XPath as a string, or None when nothing matches.
        return response.xpath(xpath).get()

    def strip_rupee(text):
        # Drop the rupee sign + non-breaking space prefix; tolerate None.
        if text is None:
            return None
        return text.replace(u'\u20b9\xa0', '')

    formattedMRP = strip_rupee(
        first('//*[@id="price"]/table/tr[1]/td[2]/span[1]/text()')
    )
    # .get() returns a single string (or None); .extract() returned a list,
    # which has no .replace() method.
    formattedSellingPrice = strip_rupee(
        first('//*[@id="priceblock_ourprice"]/text()')
    )

    # .extract() always returns a list (never None), so the flag was always 1.
    # With .get() the flag is 1 only when the span exists.
    fulfilledByAMZ = first('//*[@id="priceblock_ourprice_row"]/td[2]/span[2]')
    protalVarified = 0 if fulfilledByAMZ is None else 1

    # Look up the label of the option this request was issued for.
    # (The original expression ended in a stray ")" and was invalid XPath.)
    xpathForSize = (
        '//*[@id="native_dropdown_selected_size_name"]/option[@value="'
        + optionValue + '"]/@data-a-html-content'
    )
    productSize = first(xpathForSize)

    item = AmzProductdetailsItem()
    item['productUID'] = productUID
    item['available'] = 1
    item['avgRating'] = first('//*[@id="acrPopover"]/span[1]/a/i[1]/span/text()')
    item['totalReviews'] = first('//*[@id="acrCustomerReviewText"]/text()')
    item['mrp'] = formattedMRP
    item['sellingPrice'] = formattedSellingPrice
    item['portalVarified'] = protalVarified
    item['seller'] = first('//*[@id="sellerProfileTriggerId"]/text()')
    item['size'] = productSize
    item['productInfo'] = first('//*[@id="feature-bullets"]')
    item['offers'] = first('//*[@id="sopp_feature_div"]/ul')
    item['additionalInfo1'] = first('//*[@id="productDetailsTable"]/tr/td/div/ul/li[1]/text()')
    item['additionalInfo2'] = first('//*[@id="productDetailsTable"]/tr/td/div/ul/li[4]')
    item['fistSellingDate'] = first('//*[@id="productDetailsTable"]/tr/td/div/ul/li[2]/text()')
    item['linkToReviews'] = first('//*[@id="reviews-medley-footer"]/div[2]/a/@href')
    yield item
虽然这些URL在浏览器中似乎能加载出对应尺寸的数据,但在Scrapy shell中返回的页面并没有选中任何尺寸,所以我最终得到的始终是未选择尺寸的页面。
有人可以指导我如何实现按尺寸收集数据吗?