我试图用 Scrapy 抓取 “shopee.com.my” 上最畅销的产品,也尝试过使用 requests,但都未能获得有效的 JSON 对象。我的 requests 代码如下:
import requests as r
import json
# Request headers sent with the search-API call below.
data = {
    'authority': 'shopee.com.my',
    'method': 'GET',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'scheme': 'https',
    'accept': '*/*, application/json',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    'x-api-source': 'pc',
    'x-requested-with': 'XMLHttpRequest',
    'x-shopee-language': 'en',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
}

subcat_url = '/Boys-Fashion-cat.27.2427'
# The category id is the last dot-separated component of the category URL.
# Named match_id (not id) so the builtin id() is not shadowed.
match_id = subcat_url.split('.')[-1]

# Build the API path once so the 'path' header and the requested URL cannot
# drift apart; the URL previously omitted the limit=50 parameter.
api_path = f'/api/v2/search_items/?by=sales&limit=50&match_id={match_id}&newest=0&order=desc&page_type=search&version=2'
data['path'] = api_path
data['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'
url = f'https://shopee.com.my{api_path}'

# A timeout keeps the script from hanging forever on an unresponsive server.
req = r.get(url, headers=data, timeout=30)
# Surface HTTP errors directly instead of as an opaque JSON decoding failure.
req.raise_for_status()
# Guard against a missing or null 'items' field so len() below cannot fail.
items = req.json().get('items') or []
print(items)
print(f'Items length: {len(items)}')
这是我的 Scrapy 代码:
import scrapy
import json
from scrapy import Request
from scrapy.http.cookies import CookieJar
# Base request headers shared by the spider's API requests.
# No 'cookie' header is sent: a cookie string used to be kept here
# commented out and was never part of the dict.
header_data = {
    'authority': 'shopee.com.my',
    'method': 'GET',
    'scheme': 'https',

    # Content negotiation.
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',

    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',

    # Fetch-metadata headers.
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',

    # Shopee-specific / XHR headers.
    'x-api-source': 'pc',
    'x-requested-with': 'XMLHttpRequest',
    'x-shopee-language': 'en',
}
class TestSpider(scrapy.Spider):
    """Spider that requests one Shopee category from the search API.

    A single request is issued for a hard-coded sub-category, sorted by
    sales; the fields of every returned item are printed to stdout.
    """

    name = 'test'
    # allowed_domains takes bare domain names; the former
    # 'shopee.com.my/api/' entry contained a path and was not a domain.
    allowed_domains = ['shopee.com', 'shopee.com.my']

    def start_requests(self):
        """Yield the search-API request for the hard-coded sub-category."""
        subcat_url = '/Baby-Toddler-Play-cat.27.23785'
        # The category id is the last dot-separated component of the URL.
        # Named match_id (not id) so the builtin id() is not shadowed.
        match_id = subcat_url.split('.')[-1]
        api_path = f'/api/v2/search_items/?by=sales&limit=50&match_id={match_id}&newest=0&order=desc&page_type=search&version=2'

        # Work on a copy so the module-level header_data is not mutated.
        headers = dict(header_data)
        headers['path'] = api_path
        headers['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'

        url = f'https://shopee.com.my{api_path}'
        # The callback must be given explicitly; without it the response is
        # not delivered to parse_data.
        yield Request(url=url, headers=headers, callback=self.parse_data)

    def parse_data(self, response):
        """Decode the JSON response body and print each item's fields.

        If the body is not valid JSON, the error and the raw body are
        printed and nothing else is done.
        """
        try:
            jdata = json.loads(response.body)
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError.
            print(f'exception: {e}')
            print(response.body)
            return None

        # Guard against a missing or null 'items' field.
        items = jdata.get('items') or []
        for item in items:
            name = item['name']
            image_path = item['image']
            absolute_image = f'https://cf.shopee.com.my/file/{image_path}_tn'
            print(f'this is absolute image {absolute_image}')
            # Placeholder value; it is not read anywhere in this method.
            monthly_sold = 'pending'
            # NOTE(review): presumably the API reports prices scaled by
            # 100000 - confirm against a known product price.
            price = float(item['price']) / 100000
            total_sold = item['sold']
            location = item['shop_location']
            stock = item['stock']
            print(name)
            print(price)
            print(total_sold)
            print(location)
            print(stock)
目前没有使用 cookie;我也尝试过使用新获取的 cookie,但仍然没有响应。下面是一些示例链接:其中一些链接的响应始终是有效的 JSON 对象,而另一些链接则不返回任何响应。浏览器页面链接及其对应的 API 链接如下:
https://shopee.com.my/Kids-Sports-Outdoor-Play-cat.27.21700?page=0&sortBy=sales https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=21700&newest=0&order=desc&page_type=search&version=2
https://shopee.com.my/Bath-Toiletries-cat.27.2422 https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=2422&newest=0&order=desc&page_type=search&version=2
您还可以在“网络”标签中看到API链接: network tab link image
答案 0 :(得分:1)
我认为您缺少必需的请求头,所以我在请求中加上了这些请求头,结果运行正常。
from pprint import pprint
import requests
# Endpoint of the search API; the query string is supplied through `params`.
API_URL = 'https://shopee.com.my/api/v2/search_items/'

# Request headers sent with the call.
# No 'cookie' header is sent: a cookie string used to be kept here
# commented out and was never part of the dict.
headers = {
    'authority': 'shopee.com.my',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'x-shopee-language': 'en',
    'x-requested-with': 'XMLHttpRequest',
    'if-none-match-': '55b03-c3d70d78b473147beeb6551fa9df8ca0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    'x-api-source': 'pc',
    'accept': '*/*',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://shopee.com.my/Kids-Sports-Outdoor-Play-cat.27.21700?page=0&sortBy=sales',
    'accept-language': 'es-US,es;q=0.9,en-US;q=0.8,en;q=0.7,es-419;q=0.6',
}

# Query parameters, in the order they appear in the query string.
params = {
    'by': 'sales',
    'limit': '50',
    'match_id': '21700',
    'newest': '0',
    'order': 'desc',
    'page_type': 'search',
    'version': '2',
}

response = requests.get(API_URL, headers=headers, params=params)
payload = response.json()
pprint(payload)