大家好,我需要帮助,这个问题已经困扰我很久了。我抓取到的 Item 数据出现了重复,而且来自不同页面的字段被混在了一起。下面是我的代码。
def start_requests(self):
    """Yield the single initial request for the job's configured start URL.

    The URL is read from ``self.job['start_url']`` and the response is
    handed to ``self.parse``.
    """
    start_url = self.job['start_url']
    yield scrapy.Request(url=start_url, callback=self.parse)
def parse(self, response):
    """Yield one request per subcategory link found in the navigation menu.

    Each request carries a fresh ``WebscrapyItem`` in ``meta['Item']`` with
    ``category`` (taken from ``self.job['category']``) and ``subcategory``
    (the link's text) filled in, and is handled by ``self.parse_data``.
    """
    subcategory_nodes = response.xpath("//div[@id='nav-menu-group']/nav[@id='menu']/ul/li[@class='_category-link-wrapper current selected ']/ul[@class='_subcategories current']/li")
    # The first seven and the last two menu entries are skipped.
    # NOTE(review): presumably those are non-product links - confirm against the site.
    for node in subcategory_nodes[7:-2]:
        item = WebscrapyItem()
        link = node.xpath('./a/@href').extract_first()
        if not link:
            # Menu entry without a link: nothing to follow.
            continue
        item['category'] = self.job['category']
        item['subcategory'] = node.xpath('./a/span//text()').extract_first()
        yield scrapy.Request(url=link, meta={'Item': item}, callback=self.parse_data)
def parse_data(self, response):
    """Yield one product-detail request per product listed on the page.

    The item received through ``response.meta['Item']`` is treated as a
    read-only template holding the fields set by the previous callback.
    Every product gets its OWN copy of that template, completed with
    ``dt``, ``product_id`` and ``url``, and passed on to
    ``self.parse_details`` through ``meta['Item']``.
    """
    template = response.meta['Item']
    for product in response.xpath('//ul[@class="product-list _productList "]/li'):
        url = product.xpath('./a/@href').extract_first()
        if not url:
            # List entry without a link: nothing to follow.
            continue
        # Requests are only scheduled here; their callbacks run later. If the
        # same item object were attached to every request, each loop iteration
        # would overwrite the previous product's fields and all callbacks
        # would end up seeing the values of the last product (duplicated and
        # mixed records). A shallow copy is enough because only top-level
        # fields are assigned.
        item = template.copy()
        item['dt'] = datetime.datetime.utcnow()
        item['product_id'] = product.xpath('./@id').extract_first()
        item['url'] = url
        yield scrapy.Request(url=url, meta={'Item': item}, callback=self.parse_details)
def parse_details(self, response):
    """Fill the item's ``data`` field from a product page and yield the item.

    The item is taken from ``response.meta['Item']``. Price and currency
    come from the JSON-LD script inside the ``product`` section (first
    element, ``offers`` entry); the remaining values are the first XPath
    match for each field. The parsed JSON-LD is also stored whole under
    ``other_data``.
    """
    def first(query):
        # First match of the XPath query, or None when nothing matches.
        return response.xpath(query).extract_first()

    item = response.meta['Item']
    ld_json = json.loads(first('//section[@id="product"]/script[@type="application/ld+json"]//text()'))
    details = {
        'header': first("//div[@class='info-section']/header/h1/text()"),
        'price': ld_json[0]['offers']['price'],
        'priceCurrency': ld_json[0]['offers']['priceCurrency'],
        'colorName': first("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/p[@class='product-color']/span[@class='_colorName']/text()"),
        'reference': first("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/p[@class='product-color']/span[2]/text()"),
        'description': first("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/div[@id='description']/p[@class='description']/text()"),
        # NOTE(review): only the first size label is extracted despite the key name - confirm intent.
        'sizeList': first("//div[@class='info-section']/form[@class='_product-detail-actions']/fieldset[@class='size-selector _size-selector opened ']/div[@class='size-select']/div[@class='size-list']/label/span/text()"),
        'other_data': ld_json,
    }
    item['data'] = details
    yield item
我尝试过在 for 循环之前和之后创建 Item 对象,但都没有用。写入 MongoDB 的数据是重复且混乱的,导出为 JSON 时也是同样的结果。
非常感谢任何帮助!