我有下面这个脚本,它在 Scrapy 和 MySQL 层面都能正常工作,数据也能正确插入。但实际情况是: - 我开始抓取,成功插入了大约 370 个商品 - 在约 370 个商品之后(这个数字有时会有所不同),价格字段的值变为空 - 脚本仍会继续运行,直到抓取到约 8000 个商品
- 如果我马上再次运行抓取,价格从一开始就是空的 - 过一段时间(超过 1 小时)之后,价格恢复正常,直到再次达到约 370 个商品
import scrapy
import urllib
import time
import datetime
import re
import requests
import simplejson
import json
from re import sub
from decimal import Decimal
#from prod.items import ProdItem
from staging.items import StagingItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# Capture the wall-clock time once at import; the spider stamps every item
# with this same local calendar date, formatted as YYYY-MM-DD.
ts = time.time()
timestamp = datetime.date.fromtimestamp(ts).isoformat()
#pagenb = 0
#pagegrp = 0
class QuotesSpider(scrapy.Spider):
    """Spider that pages through the Shopee Vietnam search API by category.

    For every configured category it requests the JSON search endpoint at
    successive ``newest`` offsets and turns each returned product into a
    ``StagingItem``.
    """

    name = "shopee"

    # (match_id, label) pairs: match_id goes into the query string, label is
    # stored on each item as ``collector_category``.
    SHOPEE_CATEGORIES = (
        ("17101", "Home care"),
        ("160", "Health & Beauty"),
        ("9824", "Grocery"),
        ("9827", "Snack food"),
        ("2352", "Baby products"),
    )

    # Placeholders, in order: page size (limit), category match_id, offset.
    SEARCH_URL = (
        'https://shopee.vn/api/v2/search_items/'
        '?by=pop&limit=%s&match_id=%s&newest=%s&page_type=search&order=desc'
    )
    PAGE_SIZE = 20      # products requested per call; also the offset step
    MAX_OFFSET = 10000  # exclusive upper bound for the ``newest`` offset

    # Sent unchanged with every request; built once instead of per iteration.
    REQUEST_HEADERS = {
        # NOTE(review): the trailing hyphen in this header name looks
        # unintentional -- kept as-is, confirm before changing.
        "if-none-match-": "55b03-c72444ae4e056f6e58a1d9eab26d36d4",
        "accept-encoding": "gzip, deflate, sdch, br",
        "x-requested-with": "XMLHttpRequest",
        "accept-language": "en-SG,en;q=0.8,en-US;q=0.6,zh-CN;q=0.4,zh;q=0.2,vi;q=0.2,fr;q=0.2",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
        "x-api-source": "pc",
        "accept": "*/*",
        "referer": "https://shopee.vn/S%E1%BB%A9c-Kh%E1%BB%8Fe-S%E1%BA%AFc-%C4%90%E1%BA%B9p-cat.160",
        "authority": "shopee.vn",
    }

    def start_requests(self):
        """Yield one search request per (category, offset) combination.

        The category label travels in ``meta["acategory"]`` so that
        ``parse`` can attach it to the items it builds.
        """
        for match_id, label in self.SHOPEE_CATEGORIES:
            for offset in range(0, self.MAX_OFFSET, self.PAGE_SIZE):
                url = self.SEARCH_URL % (self.PAGE_SIZE, match_id, offset)
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    meta={"acategory": label},
                    method='GET',
                    headers=self.REQUEST_HEADERS,
                )

    @staticmethod
    def _optional_field(product, key, label):
        """Return ``product[key]`` if present, else ``''``; print which case hit."""
        if key in product:
            value = product[key]
            print("yes %s" % label)
            print(value)
            return value
        print("no %s" % label)
        return ''

    def parse(self, response):
        """Yield a ``StagingItem`` for each product in a JSON search response.

        A response whose ``items`` entry is missing or null yields nothing.
        Missing ``price`` / ``price_before_discount`` keys are stored as
        empty strings.
        """
        print(response.headers)
        payload = json.loads(response.body)

        # ``or []`` covers both an absent key and an explicit JSON null.
        for product in payload.get('items') or []:
            shop_id = product['shopid']
            item_id = product['itemid']

            item = StagingItem()
            item['collector_sku'] = product['name']
            item['collector_price_promo'] = self._optional_field(
                product, 'price_before_discount', 'promo')
            item['collector_retailer'] = 'Shopee'
            # Product page URL: name with spaces turned into hyphens,
            # followed by "-i.<shopid>.<itemid>".
            item['collector_url'] = (
                "https://shopee.vn/" + product['name'].replace(' ', '-')
                + "-i." + str(shop_id) + "." + str(item_id)
            )
            item['collector_photo_url'] = "https://cf.shopee.vn/file/" + product['image']
            item['collector_brand'] = product['brand']
            item['collector_quantity'] = 'NA'
            item['collector_category'] = response.meta["acategory"]
            item['collector_price'] = self._optional_field(product, 'price', 'price')
            item['collector_timestamp'] = timestamp
            item['collector_local_id'] = item_id
            item['collector_location_id'] = ''
            item['collector_location_name'] = ''
            item['collector_vendor_id'] = shop_id
            item['collector_vendor_name'] = ''
            yield item
网站是否有可能在我抓取了一批商品之后才开始限制抓取?奇怪的是,为什么只限制价格这一项内容?其余内容(名称、网址等)都能正常获取,只有价格和促销价格变为空。 我无法在浏览器中访问该网站时复现这种情况,但在几次抓取之后,抓取到的响应明确显示价格为空。
有什么想法吗?