Changing the page=xxx setting (xxx = 1 to 102) to try to scrape this Lazada project returns many duplicate items. Because of this, many items on the site are never scraped.
Tried the following approaches, to no avail:
1.) Setting the following:
'CONCURRENT_REQUESTS': 1,
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'RANDOMIZE_DOWNLOAD_DELAY': True,
'CONCURRENT_REQUESTS_PER_IP': 1,
2.) Including a header in the request, e.g.: yield scrapy.Request(url=next_page, headers={'JSESSIONID': '2DE61BF1E734471FBB8C768B21D47D85'})
3.) Paging from 102 down to 1 instead of from 1 up to 102
import json
import re
import time

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Compose, Join

from diffmarts.items import DiffmartsItem
from diffmarts.util import ProductLoader, to_price, to_link, to_name
class RedmartSpider(scrapy.Spider):
    """Scrape product listings from redmart.lazada.sg category pages.

    Each category listing is paginated; the spider starts at page=102 and
    walks backwards one page at a time, following the presence of a
    rel="prev" link, emitting one item per product found in the embedded
    ``window.pageData`` JSON blob of each page.
    """

    name = 'redmart'

    # Single category kept active for debugging; the full category list is
    # preserved (disabled) in the triple-quoted string below.
    start_urls = ['https://redmart.lazada.sg/shop-party-supplies/?acm=201903252.1003.1.3731409&m=redmart&page=102&pos=17&scm=1003.1.201903252.null_16_3731409&spm=a2o42.redmart_channel.nav_category_tree.213.6fea48a6uGZyQM']

    """
    start_urls = ['https://redmart.lazada.sg/shop-groceries-fresh-produce/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.18.e7ea48a6I6BibL',
                  'https://redmart.lazada.sg/meat-and-seafood/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.30.2e5d48a6aXcB49',
                  'https://redmart.lazada.sg/shop-Groceries-DairyChilled/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.37.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-Groceries-FoodStaplesCookingEssentials/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.52.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/beverages/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.67.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/laundry-and-home-care/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.81.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/mother-baby/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.92.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-groceries-frozen/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.108.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-Groceries-ChocolateSnacksSweets/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.121.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/breakfast/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.131.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/wines-beers-spirits/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.143.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-health-beauty/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.156.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-kitchen-dining/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.173.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-furniture-decor-2/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.189.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-pet-supplies/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.203.2e5d48a6YIJ1n7',
                  'https://redmart.lazada.sg/shop-party-supplies/?m=redmart&page=102&spm=a2o42.redmart_channel.nav_category_tree.211.2e5d48a6YIJ1n7']
    """
    # NOTE(review): a comma was missing between the laundry-and-home-care and
    # mother-baby URLs above; re-enabling the list with that typo would have
    # silently concatenated the two strings into one broken URL. Fixed here.

    custom_settings = {
        'CONCURRENT_REQUESTS': 1,               # fixed: comma was missing here
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        #'CONCURRENT_REQUESTS_PER_IP': 1,
        # DOWNLOAD_DELAY seems to affect the number of items scraped:
        # 0.5 -> 663 items, 0.01 -> 691 items
        #'DOWNLOAD_DELAY': 0.01,
    }

    # Pin a single session so every paginated request shares one cookie.
    # NOTE(review): a hard-coded JSESSIONID will expire; consider obtaining it
    # dynamically (or dropping it) — TODO confirm it is still required.
    SESSION_COOKIE = {'JSESSIONID': '2DE61BF1E734471FBB8C768B21D47D85'}

    def parse(self, response):
        """Derive category metadata from the landing URL and start paging.

        Yields one Request per category landing page, carrying the category
        link/name and the starting page number (102) through ``meta``.
        """
        cat_name = re.sub('https://redmart.lazada.sg/', '', response.url)
        cat_name = re.sub('-', '', cat_name)
        cat_link = response.url
        # BUG FIX: this URL was already fetched via start_urls, so without
        # dont_filter=True the dupefilter silently drops the request and the
        # whole category is skipped.
        # BUG FIX: JSESSIONID is a cookie, not an HTTP header — passing it via
        # headers= has no effect; Scrapy's cookies= parameter sends it properly.
        yield scrapy.Request(
            url=response.url,
            cookies=self.SESSION_COOKIE,
            callback=self.parse_category,
            meta={'cat_link': cat_link, 'cat_name': cat_name, 'page': 102},
            dont_filter=True,
        )

    def parse_category(self, response):
        """Parse one listing page: yield its products, then request the previous page.

        Retries the same URL when the body comes back empty, and stops when
        the page no longer advertises a rel="prev" link.
        """
        print("@@@ Parsing: %s " % (response.url))
        if len(response.body) == 0:
            # Empty body: retry the same URL; dont_filter=True so the
            # dupefilter does not discard the repeat request.
            print("@@@ Response empty, retry parsing: %s" % (response.url))
            yield scrapy.Request(
                url=response.url,
                callback=self.parse_category,
                meta={'cat_link': response.meta['cat_link'],
                      'cat_name': response.meta['cat_name'],
                      'page': response.meta['page']},
                dont_filter=True,
            )
        else:
            # The product data lives in an inline <script> assigning
            # window.pageData; strip the assignment and parse the JSON.
            data = response.xpath("//script[contains(.,'mod')]/text()").extract_first()
            sdata = re.sub(r'window\.pageData=', '', data)  # fixed: escape the dot
            json_response = json.loads(sdata)
            if 'mods' in json_response:
                listing = json_response['mods']['listItems']
                print("@@@ %s: It's got %d items" % (response.url, len(listing)))
                for json_product in listing:
                    yield self.parse_item(response, json_product,
                                          json_response['mainInfo']['title'])
            # A rel="prev" link means an earlier page exists; rewrite the
            # page=N query parameter downwards and follow it.
            next_page = response.xpath("//link[@rel='prev']//@href").extract_first()
            page = int(response.meta['page'])
            if next_page is not None:
                next_url = re.sub('page=' + str(page), 'page=' + str(page - 1),
                                  response.url)
                # BUG FIX: cat_link/cat_name must be carried forward — the
                # empty-body retry branch above reads them and raised KeyError
                # on every page after the first in the original code.
                yield scrapy.Request(
                    url=next_url,
                    cookies=self.SESSION_COOKIE,
                    callback=self.parse_category,
                    meta={'cat_link': response.meta['cat_link'],
                          'cat_name': response.meta['cat_name'],
                          'page': page - 1},
                )

    def parse_item(self, response, json_product, cat_name):
        """Build a DiffmartsItem from one product dict of the pageData JSON.

        Returns the loaded item; discounted products carry the original price
        and promotion text, full-price products get prev_price '0'.
        """
        item_loader = ProductLoader(DiffmartsItem(), None)
        item_loader.add_value('id', str(json_product['itemId']))
        item_loader.add_value('cat_name', cat_name)
        item_loader.add_value('name', json_product['name'], Compose(to_name))
        if 'originalPrice' in json_product:
            # BUG FIX: in a regex '$' is the end-of-string anchor, so the
            # original re.sub('$','',...) never removed the dollar sign from
            # priceShow; it must be escaped.
            item_loader.add_value('price',
                                  re.sub(r'\$', '', json_product['priceShow']),
                                  Compose(to_price))
            item_loader.add_value('prev_price', json_product['originalPrice'],
                                  Compose(to_price))
            item_loader.add_value('promotion', json_product['discount'])
        else:
            item_loader.add_value('price', json_product['price'], Compose(to_price))
            item_loader.add_value('prev_price', '0')
        item_loader.add_value('link', json_product['productUrl'],
                              Compose(lambda v: to_link(v, response)))
        item_loader.add_value('image_link', json_product['image'],
                              Compose(lambda v: to_link(v, response)))
        # NOTE(review): sold_out=1 when inStock == 'Yes' looks inverted, and if
        # 'inStock' is a JSON boolean the comparison is always False — behavior
        # preserved from the original; confirm the feed format and semantics.
        if json_product['inStock'] == 'Yes':
            item_loader.add_value('sold_out', 1)
        else:
            item_loader.add_value('sold_out', 0)
        return item_loader.load_item()
Expected to get 102 (pages) × 24 = 2448 items, but only 1200 to 1300 items are scraped.