Question

大家好！

我正在尝试用 Scrapy 通过 API 抓取 https://guardian.com.my/health.html。在 Postman 中，请求该 URL 返回的状态是 200 OK，但用 Scrapy 抓取时却收到 Crawled (400) Bad Request（错误请求）。

    import scrapy
    from scrapy.exceptions import CloseSpider
    import json


    # Spider from the question: issues a single GET against the guardian.com.my
    # /graphql endpoint with a hand-written header set and prints the raw body.
    # NOTE(review): the indentation is reproduced exactly as posted.
    # `start_requests` and `parse` sit at the same column as `class`, so as
    # written they are not part of the class body -- presumably a paste artifact.
    class GapiSpider(scrapy.Spider):
      name = 'gapi'

      # Header set sent with the request in start_requests.
      # NOTE(review): the first four keys start with ':'. Those are HTTP/2
      # pseudo-header fields, not regular header names; the answer further down
      # identifies them as the cause of the 400 response.
      headers={
        ':authority': 'guardian.com.my',
        ':method': 'GET',
        ':path': f"/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A2%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
        ':scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'cookie': 'lzd_cid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; t_uid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; lzd_sid=1728fd4a8af615419dc17044911406d4; t_fv=1623735402455; hng=MY|en-MY|MYR|458; userLanguageML=en; _bl_uid=7qkOhpLtxynm254s1v6s7X3gjp7n; cna=aipPGYuo6GwCAXOk2DtI3+FS; _gcl_au=1.1.335118801.1623735403; _tb_token_=f311f13b31eb5; age_restriction=over%3B18%3B1; _fbp=fb.2.1623760903320.1705265880; _ga=GA1.3.1886895770.1623760957; _gid=GA1.3.71310232.1623760957; t_sid=VVEk6ELqT4zeevu7Snsvx63eqX0u9De7; utm_origin=https://www.google.com/; utm_channel=SEO; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7LqI1pVlU7ApMhhrX1Oe_NHHkRPwi6zdBuxVpdbHrc8tccMpabJfEEwLAe7yCNtlESqPvCMPcAfjqwmNQ19bjOPRdxRnSKGABpGFDMYvsTiDFT7FfaLnWnbHJr555QFqdYHSAtp73LRUYDZgaeGlJT4=; isg=BICAfUhD9ImywYiPlm818du3UQhSCWTTedKTF_oRTxsudSGfoh0tYqpEiNW1QByr; l=eBgFfq9IjfrHbUYTBOfaourza779IIRbSuPzaNbMiOCP_-1p5HU1W6OwTtT9CnhNnsgHR3lqRWpDBu8SQyz6Qxv9-egPe9oEndBG.; tfstk=ctjGBQ2yNBGSUZThPlt1IUVci2adZKke6s51Yite4tkyX3IFiWuEzDR4-BfGDq1..; _m_h5_tk=9a0a86b9d6a2af069082c39a003a0087_1623862308738; _m_h5_tk_enc=e8b5adef6a74673cb92ca952c850f019; _uetsid=ab562c30cd9b11eb835b67411ecd1bd6; _uetvid=ab587f00cd9b11eb8eea6906a86bf0ba',
        'pragma': 'no-cache',
        'referer': 'https://guardian.com.my/health.html?page=1',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'store': 'default',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }

    # Yields the one GraphQL GET request (variables: currentPage 2, id 3047,
    # pageSize 60) with the header dict above attached.
    def start_requests(self):
        yield scrapy.Request(
            url= f"https://guardian.com.my/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A2%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
            headers=self.headers
        )
    
    # Prints the raw response body; nothing is parsed or yielded.
    def parse(self, response):
        print(response.body)

我使用的是 IP 代理轮换器，我还遗漏了什么？settings.py 中已将 ROBOTSTXT_OBEY 设置为 False。

非常感谢！

Answer 1

当请求没有按服务器预期的格式发出时，Scrapy 会收到 400 Bad Request（错误请求）响应。标头、负载或参数可能有误。

就您的情况而言，您在若干标头字段名前加了冒号 `:`，这在发出请求时是无效的格式（以冒号开头的名称是浏览器开发者工具中显示的 HTTP/2 伪标头，并不是普通的请求标头）。例如，':authority' 应该改为 'authority'；同样，':method' 应该改为 'method'，依此类推。这些条目也可以直接删除。

代码

import scrapy
from scrapy.exceptions import CloseSpider
import json


class GapiSpider(scrapy.Spider):
    """Fetch one page of the guardian.com.my GraphQL product listing.

    The spider issues a single GET request to the site's ``/graphql``
    endpoint (operation ``GetCategories``, category id 3047, page 1,
    60 items per page) and prints the raw response body.
    """

    name = 'gapi'

    # Regular request headers sent with the GraphQL call.
    #
    # The browser's network panel also lists ``:authority``, ``:method``,
    # ``:path`` and ``:scheme``.  Those are HTTP/2 pseudo-header fields that
    # describe the request line; they are not request headers.  Sending them
    # with the leading colon yields a 400 response, and sending them without
    # it only adds meaningless headers (``path`` would additionally disagree
    # with the URL actually requested), so they are left out entirely.
    #
    # NOTE(review): the cookie value is a hard-coded browser session and will
    # presumably expire -- confirm whether the endpoint needs it at all.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'cookie': 'lzd_cid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; t_uid=13b492b3-2f4c-46fc-fb4f-98212b95f68d; lzd_sid=1728fd4a8af615419dc17044911406d4; t_fv=1623735402455; hng=MY|en-MY|MYR|458; userLanguageML=en; _bl_uid=7qkOhpLtxynm254s1v6s7X3gjp7n; cna=aipPGYuo6GwCAXOk2DtI3+FS; _gcl_au=1.1.335118801.1623735403; _tb_token_=f311f13b31eb5; age_restriction=over%3B18%3B1; _fbp=fb.2.1623760903320.1705265880; _ga=GA1.3.1886895770.1623760957; _gid=GA1.3.71310232.1623760957; t_sid=VVEk6ELqT4zeevu7Snsvx63eqX0u9De7; utm_origin=https://www.google.com/; utm_channel=SEO; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7LqI1pVlU7ApMhhrX1Oe_NHHkRPwi6zdBuxVpdbHrc8tccMpabJfEEwLAe7yCNtlESqPvCMPcAfjqwmNQ19bjOPRdxRnSKGABpGFDMYvsTiDFT7FfaLnWnbHJr555QFqdYHSAtp73LRUYDZgaeGlJT4=; isg=BICAfUhD9ImywYiPlm818du3UQhSCWTTedKTF_oRTxsudSGfoh0tYqpEiNW1QByr; l=eBgFfq9IjfrHbUYTBOfaourza779IIRbSuPzaNbMiOCP_-1p5HU1W6OwTtT9CnhNnsgHR3lqRWpDBu8SQyz6Qxv9-egPe9oEndBG.; tfstk=ctjGBQ2yNBGSUZThPlt1IUVci2adZKke6s51Yite4tkyX3IFiWuEzDR4-BfGDq1..; _m_h5_tk=9a0a86b9d6a2af069082c39a003a0087_1623862308738; _m_h5_tk_enc=e8b5adef6a74673cb92ca952c850f019; _uetsid=ab562c30cd9b11eb835b67411ecd1bd6; _uetvid=ab587f00cd9b11eb8eea6906a86bf0ba',
        'pragma': 'no-cache',
        'referer': 'https://guardian.com.my/health.html?page=1',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'store': 'default',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    }

    def start_requests(self):
        """Yield the single GraphQL GET request with the class headers."""
        # The URL is already percent-encoded and has no placeholders, so it is
        # a plain string literal rather than an f-string.
        yield scrapy.Request(
            url="https://guardian.com.my/graphql?query=query+GetCategories%28%24id%3AInt%21%24pageSize%3AInt%21%24currentPage%3AInt%21%24filters%3AProductAttributeFilterInput%21%24sort%3AProductAttributeSortInput%29%7Bcategory%28id%3A%24id%29%7Bid+description+name+product_count+meta_title+meta_keywords+meta_description+__typename%7Dproducts%28pageSize%3A%24pageSize+currentPage%3A%24currentPage+filter%3A%24filters+sort%3A%24sort%29%7Bitems%7Bid+name+sku+price%7BregularPrice%7Bamount%7Bcurrency+value+__typename%7D__typename%7D__typename%7Dprice_range%7Bminimum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7Dmaximum_price%7Bfinal_price%7Bcurrency+value+__typename%7Ddiscount%7Bamount_off+percent_off+__typename%7D__typename%7D__typename%7Dpromotion_label+promotion_label_name+sales_icon+small_image%7Burl+__typename%7Dstock_status+url_key+url_suffix+__typename%7Dpage_info%7Btotal_pages+__typename%7Dtotal_count+__typename%7D%7D&operationName=GetCategories&variables=%7B%22currentPage%22%3A1%2C%22id%22%3A3047%2C%22filters%22%3A%7B%22category_id%22%3A%7B%22eq%22%3A%223047%22%7D%7D%2C%22pageSize%22%3A60%2C%22sort%22%3A%7B%22position%22%3A%22ASC%22%7D%7D",
            headers=self.headers,
        )

    def parse(self, response):
        """Print the raw response body; nothing is parsed or yielded."""
        print(response.body)

Scrapy 返回 400 Bad Request（错误请求），但 Postman 中的状态为 200 OK

1 个答案: