无法刮擦“ shopee.com.my”最畅销的产品页面

时间:2020-10-23 15:29:21

标签: python api web-scraping scrapy python-requests

我试图用刮擦刮擦最畅销的产品“ shopee.com.my”,也尝试使用请求,但是未能获得有效的JSON对象。我的请求代码如下:

import requests as r
import json

data = {
'authority': 'shopee.com.my',
'method': 'GET',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'scheme': 'https',
'accept': '*/*, application/json',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
'x-api-source': 'pc',
'x-requested-with': 'XMLHttpRequest',
'x-shopee-language': 'en',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
}

subcat_url = '/Boys-Fashion-cat.27.2427'
id = subcat_url.split('.')[-1]
data['path'] = f'/api/v2/search_items/?by=sales&limit=50&match_id={id}&newest=0&order=desc&page_type=search&version=2'
data['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'
url = f'https://shopee.com.my/api/v2/search_items/?by=sales&match_id={id}&newest=0&order=desc&page_type=search&version=2'

req = r.get(url, headers=data)
items = req.json()['items']
print(items)
print(f'Items length: {len(items)}')

这是我的验证码:

import scrapy
import json
from scrapy import Request
from scrapy.http.cookies import CookieJar

header_data = {'authority': 'shopee.com.my',
    'method': 'GET',
    'scheme': 'https',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    # 'cookie': 'SPC_U=-; SPC_IA=-1; SPC_EC=-; SPC_F=7jrWAm4XYNNtyVAk83GPknN8NbCMQEIk; REC_T_ID=476673f8-eeb0-11ea-8919-48df374df85c; _gcl_au=1.1.1197882328.1599225148; _med=refer; _fbp=fb.2.1599225150134.114138691; language=en; _ga=GA1.3.1167355736.1599225151; csrftoken=mu9M72KLd73P9QJusB9zFBP6wV3NGg85; _gid=GA1.3.273342972.1603211749; SPC_SI=yxvc89nmqe97ldvpo6wgeybtc8berzyd; welcomePkgShown=true; AMP_TOKEN=%24NOT_FOUND; REC_MD_41_1000027=1603289427_0_50_0_48; SPC_CT_48918e31="1603289273.lUS7x9IuKN5vNbhzibZCOHrIf6vVQmykU/TXxiOii7w="; SPC_CT_57540430="1603289278.FLT3IdzHC32RmEzFxkOi9pI7qhKIs/yq328elYMuwps="; SPC_CT_50ee4e78="1603289299.gvjW32HwgiQGN/4kj2Ac3YFrpqyHVTO8+UjM+uzxy4E="; _dc_gtm_UA-61915055-6=1; SPC_CT_75d7a2b7="1603289557.t5FvxXhnJacZrKkjnIWCUbAgAxAQ3hG5c1tZBzafwc4="; SPC_R_T_ID="n6Ek85JJY1JZATlhgutfB4KB3qrbmFDYX1+udv1EBAPegPE9xuzM8HFeCy1duskY9+DVLJxe4RqaabhyUuojHQG0NI2TqegihbAge+s3k7w="; SPC_T_IV="SGNXqyZ1jtRYpo5kFeKtYg=="; SPC_R_T_IV="SGNXqyZ1jtRYpo5kFeKtYg=="; SPC_T_ID="n6Ek85JJY1JZATlhgutfB4KB3qrbmFDYX1+udv1EBAPegPE9xuzM8HFeCy1duskY9+DVLJxe4RqaabhyUuojHQG0NI2TqegihbAge+s3k7w="',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'x-api-source': 'pc',
    'x-requested-with': 'XMLHttpRequest',
    'x-shopee-language': 'en',
    }


class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['shopee.com', 'shopee.com.my', 'shopee.com.my/api/']


    def start_requests(self):
        subcat_url = '/Baby-Toddler-Play-cat.27.23785'
        id = subcat_url.split('.')[-1]
        header_data['path'] = f'/api/v2/search_items/?by=sales&limit=50&match_id={id}&newest=0&order=desc&page_type=search&version=2'
        header_data['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'
        url = f'https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id={id}&newest=0&order=desc&page_type=search&version=2'

       


        yield Request(url=url, headers=header_data)

    def parse_data(self, response):
        try:
            jdata = json.loads(response.body)
            return None
        except Exception as e:
            print(f'exception: {e}')
            print(response.body)
            return None

        items = jdata['items']

        for item in items:
            name = item['name']
            image_path = item['image']
            absolute_image = f'https://cf.shopee.com.my/file/{image_path}_tn'
            print(f'this is  absolute image {absolute_image}')
            monthly_sold = 'pending'
            price = float(item['price'])/100000
            total_sold = item['sold']
            location = item['shop_location']
            stock = item['stock']

            print(name)
            print(price)
            print(total_sold)
            print(location)
            print(stock)

现在不使用cookie,但也尝试使用新鲜的cookie,但没有响应。 这是一些示例链接,其中一些链接使它们的响应始终是有效的JSON对象,而某些链接则不返回任何响应。参见下面的api和直接浏览器链接:

https://shopee.com.my/Kids-Sports-Outdoor-Play-cat.27.21700?page=0&sortBy=sales https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=21700&newest=0&order=desc&page_type=search&version=2

https://shopee.com.my/Bath-Toiletries-cat.27.2422 https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=2422&newest=0&order=desc&page_type=search&version=2

您还可以在“网络”标签中看到API链接: network tab link image

1 个答案:

答案 0 :(得分:1)

我认为您缺少必填的标头,因此我将其发送给他们,并且效果很好

from pprint import pprint
import requests

    headers = {
        'authority': 'shopee.com.my',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'x-shopee-language': 'en',
        'x-requested-with': 'XMLHttpRequest',
        'if-none-match-': '55b03-c3d70d78b473147beeb6551fa9df8ca0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
        'x-api-source': 'pc',
        'accept': '*/*',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://shopee.com.my/Kids-Sports-Outdoor-Play-cat.27.21700?page=0&sortBy=sales',
        'accept-language': 'es-US,es;q=0.9,en-US;q=0.8,en;q=0.7,es-419;q=0.6',
        # 'cookie': '_gcl_au=1.1.1866522785.1603486253; _fbp=fb.2.1603486253254.1114160447; SPC_IA=-1; SPC_EC=-; SPC_U=-; SPC_F=9RO26eJM7IQiFlxki0dAdQCcCsgPwz67; REC_T_ID=71a698d6-1571-11eb-9baf-48df3757c438; SPC_SI=mall.n58BgakbNjCD5RDYlsQJ8EurmBkH5HIY; SPC_CT_c49f0fdc="1603486254.GqWz1BPlfz3MKmUufL3eTwFqgUfdKWcWVf2xiJI7nSk="; SPC_R_T_ID="89vber/2TKnfACAmGbXpxC3BzHc0ajEQMPxgMbAlZnQlgEo7YWmya0sf/KRt1FsoZvaFYKoNDk+Rh9YWLWsNMH324iqgZePbam1q9QpYQlE="; SPC_T_IV="vko6vAtWsyHuqteFHAoPIA=="; SPC_R_T_IV="vko6vAtWsyHuqteFHAoPIA=="; SPC_T_ID="89vber/2TKnfACAmGbXpxC3BzHc0ajEQMPxgMbAlZnQlgEo7YWmya0sf/KRt1FsoZvaFYKoNDk+Rh9YWLWsNMH324iqgZePbam1q9QpYQlE="; AMP_TOKEN=%24NOT_FOUND; _ga=GA1.3.602723004.1603486255; _gid=GA1.3.657631736.1603486255; _dc_gtm_UA-61915055-6=1; language=en',
    }

    params = (
        ('by', 'sales'),
        ('limit', '50'),
        ('match_id', '21700'),
        ('newest', '0'),
        ('order', 'desc'),
        ('page_type', 'search'),
        ('version', '2'),
    )

    response = requests.get('https://shopee.com.my/api/v2/search_items/', headers=headers, params=params)
    pprint(response.json())
相关问题