检索下一页链接的方式如何影响输出

时间:2018-06-13 06:33:22

标签: python python-2.7 web-scraping scrapy screen-scraping

我遇到了一种非常反常的行为,怎么也想不明白。由于某种原因,输出会随检索下一页链接的方式不同而不同。为什么会这样?

下面先给出爬虫(spider)代码,随后是两种不同的检索方法以及每种方法对应的输出。

蜘蛛

import scrapy

class AmazonSpider(scrapy.Spider):
    """Spider that collects discounted products from an amazon.ca listing.

    Only result containers that display a struck-through price are scraped.
    Every container is yielded as a plain ``dict`` with the keys ``idx``
    (1-based position on the page), ``img``, ``ttl``, ``pdr``, ``rtg`` and
    ``vts``; ``prc`` (price), ``rtl`` (struck-through price) and ``dsc``
    (discount such as ``'18%'``) are added when the page provides them.
    """

    name = 'amazon'
    allowed_domains = ['amazon.ca']
    custom_settings = {'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    # Let 404 responses reach parse() so they can be skipped explicitly.
    handle_httpstatus_list = [404]

    def start_requests(self):
        """Yield the initial request for every listing page to crawl."""
        pages = ['https://www.amazon.ca/b/ref=sr_aj?node=2055586011']
        for page in pages:
            yield scrapy.Request(url=page, callback=self.parse)

    @staticmethod
    def _parse_amount(text):
        """Return the numeric amount of a price string as a ``float``.

        The amount is taken from the second whitespace-separated token, as
        in ``'<currency> 639.99'``; thousands separators (``','``) are
        dropped so that ``'<currency> 1,299.99'`` parses as well.
        """
        return float(text.split()[1].replace(',', ''))

    def parse(self, response):
        """Yield one dict per discounted item, then follow the next page.

        :param response: the listing page response handed over by Scrapy.
        :returns: a generator of item dicts, followed by at most one
            request for the next result page.
        """
        if response.status == 404:
            # Nothing to scrape and no pagination link on an error page.
            return

        cont_path = '//div[@class="a-row a-spacing-none"]/span[@class="a-size-small a-color-secondary a-text-strike"]/ancestor::div[@class="s-item-container"]'

        # Query each container with relative XPaths instead of re-running
        # the absolute container expression for every field.
        for idx, item in enumerate(response.xpath(cont_path), 1):
            imag_resp = item.xpath('./div[@class="a-row a-spacing-base"]/div/div/a/img/@src').extract_first()
            titl_resp = item.xpath('./div[@class="a-row a-spacing-mini"][1]/div[1]/a/@title').extract_first()
            prod_resp = item.xpath('./div[@class="a-row a-spacing-mini"][1]/div[2]/span[2]/text()').extract_first()
            pric_resp = item.xpath('./div[@class="a-row a-spacing-mini"][2]/div[1]/a/span/text()').extract_first()
            retl_resp = item.xpath('./div[@class="a-row a-spacing-mini"][2]/div[1]/span[@class="a-size-small a-color-secondary a-text-strike"]/text()').extract_first()
            thrd_resp = item.xpath('./div[@class="a-row a-spacing-mini"][2]/div[4]/a/span[2]/text()').extract_first()
            rtng_resp = item.xpath('./div[@class="a-row a-spacing-none"]/span/span/a/i/span/text()').extract_first()
            vots_resp = item.xpath('./div[@class="a-row a-spacing-none"]/a/text()').extract_first()

            obj = {
                'idx': idx,
                'img': imag_resp,
                'ttl': titl_resp,
                'pdr': prod_resp,
                'rtg': rtng_resp,
                'vts': vots_resp,
            }
            if pric_resp is not None:
                obj['prc'] = self._parse_amount(pric_resp)
            if retl_resp is not None:
                obj['rtl'] = self._parse_amount(retl_resp)
            # Fall back to the alternative offer price when the main price
            # is missing.
            if 'prc' not in obj and thrd_resp is not None:
                obj['prc'] = self._parse_amount(thrd_resp)
            # Skip the discount when the reference price is 0 to avoid a
            # ZeroDivisionError.
            if 'prc' in obj and obj.get('rtl'):
                obj['dsc'] = str(int(round(100 - (obj['prc'] / obj['rtl'] * 100)))) + '%'
            yield obj

        # Select the href attribute itself (not the whole <a> element) and
        # only follow it when a next page actually exists.
        next_href = response.xpath('//a[@id="pagnNextLink"]/@href').extract_first()
        if next_href is not None:
            yield response.follow(next_href, callback=self.parse)

方法#1

# NOTE: this XPath selects the whole <a> element, so extract_first() returns
# the tag's HTML markup rather than its link; append /@href to the expression
# to obtain the URL of the next page.
yield response.follow(response.xpath('//a[@id="pagnNextLink"]').extract_first(), callback=self.parse)

首次输出

[
  {
    "rtg": "5 out of 5 stars",
    "idx": 1,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41p9bSkUHlL._AA160_.jpg",
    "rtl": 779.99,
    "pdr": "Pentax",
    "vts": "2",
    "prc": 639.99,
    "ttl": "Pentax 21790 DA 55mm F1.4 SDM Lens with Case",
    "dsc": "18%"
  },
  {
    "rtg": "4.1 out of 5 stars",
    "idx": 2,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/31gMEIPs+CL._AA160_.jpg",
    "rtl": 33.85,
    "pdr": "Sony",
    "vts": "660",
    "prc": 30.35,
    "ttl": "Sony MDRZX110 Over-Ear Headphones (White)",
    "dsc": "10%"
  },
  {
    "rtg": "4.2 out of 5 stars",
    "idx": 3,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
    "rtl": 179.95,
    "pdr": "Monster",
    "vts": "81",
    "prc": 90.35,
    "ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
    "dsc": "50%"
  },
  {
    "rtg": "3.9 out of 5 stars",
    "idx": 4,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
    "rtl": 179.95,
    "pdr": "Monster",
    "vts": "120",
    "prc": 127.38,
    "ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
    "dsc": "29%"
  },
  {
    "rtg": "4.3 out of 5 stars",
    "idx": 5,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
    "rtl": 499.99,
    "pdr": "Olympus",
    "vts": "22",
    "ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
  },
  {
    "rtg": "4.1 out of 5 stars",
    "idx": 6,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51Ctux-mgzL._AA160_.jpg",
    "rtl": 13.19,
    "pdr": "Western Digital",
    "vts": "301",
    "prc": 9.99,
    "ttl": "Western Digital My Passport Carrying Case - Black (WDBABK0000NBK-WRSN)",
    "dsc": "24%"
  },
  {
    "rtg": "4.4 out of 5 stars",
    "idx": 7,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41LaVPK8wPL._AA160_.jpg",
    "rtl": 19.99,
    "pdr": "Canon",
    "vts": "20",
    "prc": 18.99,
    "ttl": "Genuine Canon CLI-42 Ink Tank, Magenta - 6386B002",
    "dsc": "5%"
  }
]

[
  {
    "rtg": "5 out of 5 stars",
    "idx": 1,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41p9bSkUHlL._AA160_.jpg",
    "rtl": 779.99,
    "pdr": "Pentax",
    "vts": "2",
    "prc": 639.99,
    "ttl": "Pentax 21790 DA 55mm F1.4 SDM Lens with Case",
    "dsc": "18%"
  },
  {
    "rtg": "4.1 out of 5 stars",
    "idx": 2,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/31gMEIPs+CL._AA160_.jpg",
    "rtl": 33.85,
    "pdr": "Sony",
    "vts": "660",
    "prc": 30.35,
    "ttl": "Sony MDRZX110 Over-Ear Headphones (White)",
    "dsc": "10%"
  },
  {
    "rtg": "4.2 out of 5 stars",
    "idx": 3,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
    "rtl": 179.95,
    "pdr": "Monster",
    "vts": "81",
    "prc": 90.35,
    "ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
    "dsc": "50%"
  },
  {
    "rtg": "3.9 out of 5 stars",
    "idx": 4,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
    "rtl": 179.95,
    "pdr": "Monster",
    "vts": "120",
    "prc": 127.38,
    "ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
    "dsc": "29%"
  },
  {
    "rtg": "4.3 out of 5 stars",
    "idx": 5,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
    "rtl": 499.99,
    "pdr": "Olympus",
    "vts": "22",
    "ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
  },
  {
    "rtg": "4.1 out of 5 stars",
    "idx": 6,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51Ctux-mgzL._AA160_.jpg",
    "rtl": 13.19,
    "pdr": "Western Digital",
    "vts": "301",
    "prc": 9.99,
    "ttl": "Western Digital My Passport Carrying Case - Black (WDBABK0000NBK-WRSN)",
    "dsc": "24%"
  },
  {
    "rtg": "4.4 out of 5 stars",
    "idx": 7,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41LaVPK8wPL._AA160_.jpg",
    "rtl": 19.99,
    "pdr": "Canon",
    "vts": "20",
    "prc": 18.99,
    "ttl": "Genuine Canon CLI-42 Ink Tank, Magenta - 6386B002",
    "dsc": "5%"
  }
]

方法#2

# XPath for the href attribute of the "next page" anchor.
next_path = '//a[@id="pagnNextLink"]/@href'
# NOTE: the argument below is the literal string 'next_path', not the
# variable defined on the previous line, so the intended XPath is never used.
next_resp = response.xpath('next_path').extract_first()
if next_resp is not None:
    # Resolve the extracted href against the URL of the current response.
    next_resp = response.urljoin(next_resp)
    yield scrapy.Request(next_resp, callback=self.parse)

第二次输出

[
  {
    "rtg": "4.2 out of 5 stars",
    "idx": 1,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
    "rtl": 179.95,
    "pdr": "Monster",
    "vts": "81",
    "prc": 90.35,
    "ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
    "dsc": "50%"
  },
  {
    "rtg": "3.9 out of 5 stars",
    "idx": 2,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
    "rtl": 179.95,
    "pdr": "Monster",
    "vts": "120",
    "prc": 127.38,
    "ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
    "dsc": "29%"
  },
  {
    "rtg": "4.3 out of 5 stars",
    "idx": 3,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
    "rtl": 499.99,
    "pdr": "Olympus",
    "vts": "22",
    "ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
  },
  {
    "rtg": "4.2 out of 5 stars",
    "idx": 4,
    "img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51mInH2UC-L._AA160_.jpg",
    "rtl": 8.93,
    "pdr": "Swingline",
    "vts": "24",
    "prc": 7.48,
    "ttl": "Swingline Optima Premium Staples, 0.25 Inch Leg Length, 45 Sheet Capacity, 3,750 Staples per Box, Silver (S7035556)",
    "dsc": "16%"
  }
]

为什么输出会根据检索下一页链接的方式而有所不同?

1 个答案:

答案 0 :(得分:0)

这可能是因为这两种方法都是错误的。

对于第一个,您忘记了在 XPath 表达式 //a[@id="pagnNextLink"] 的末尾加上 /@href,因此检索到的链接看起来像 https://www.amazon.ca/b/%3Ca%20title=%22Next%20Page%22%20id=%22pagnNextLink%22%20class=%22pagnNext%22%20href=%22/s/ref=lp_2055586011_pg_2/143-4196459-4416903?rh=n%3A667823011%2Cn%3A%212418672011%2Cn%3A%212418674011%2Cn%3A2055586011&page=2&ie  这只不过是亚马逊的基本网址与整个 HTML 标签的文本拼接在一起的结果。您最好写成:

yield response.follow(response.xpath('//a[@id="pagnNextLink"]/@href').extract_first(), callback=self.parse)

对于第二个,您写的是 next_resp = response.xpath('next_path').extract_first(),但该表达式中传入的是字符串 "next_path",而不是变量 next_path。您应该写成 next_resp = response.xpath(next_path).extract_first()

一旦修正,这两种方法的行为就完全相同了。