我遇到了一种非常反常的行为,怎么也想不明白。出于某种原因,输出会随着获取下一页链接的方式不同而不同。为什么会这样?
下面先给出爬虫(spider)的代码,然后是两种不同的获取下一页链接的方法,以及每种方法各自的输出。
爬虫(spider)
import scrapy
class AmazonSpider(scrapy.Spider):
    """Scrape discounted products from an amazon.ca category listing.

    Only result containers that show a struck-through list price are
    visited. Each one is yielded as a dict with the keys ``idx``, ``img``,
    ``ttl``, ``pdr``, ``rtg`` and ``vts``, plus ``prc``, ``rtl`` and ``dsc``
    whenever the corresponding prices could be read.
    """

    name = 'amazon'
    allowed_domains = ['amazon.ca']
    custom_settings = {
        'USER_AGENT': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
        ),
    }
    # Let 404 responses reach parse() so they can be skipped explicitly.
    handle_httpstatus_list = [404]

    def start_requests(self):
        """Yield the initial request for every configured listing page."""
        pages = ['https://www.amazon.ca/b/ref=sr_aj?node=2055586011']
        for page in pages:
            yield scrapy.Request(url=page, callback=self.parse)

    @staticmethod
    def _first(item, rel_path):
        """Return the first match of *rel_path* under *item*, or ``None``."""
        return item.xpath(rel_path).extract_first()

    @staticmethod
    def _parse_amount(text):
        """Return the second whitespace-separated token of *text* as a float.

        Returns ``None`` when *text* is ``None``, has fewer than two tokens,
        or the token is not numeric, so one odd price label cannot abort the
        whole page.
        """
        if text is None:
            return None
        tokens = text.split()
        if len(tokens) < 2:
            return None
        try:
            # Presumably the text looks like '<currency> 1,299.99'; commas
            # are dropped as thousands separators -- TODO confirm on the site.
            return float(tokens[1].replace(',', ''))
        except ValueError:
            return None

    def parse(self, response):
        """Yield one dict per discounted product found in *response*.

        A 404 response yields nothing. ``idx`` is the 1-based position of
        the product among the matched containers on the page.
        """
        if response.status == 404:
            return

        cont_path = (
            '//div[@class="a-row a-spacing-none"]'
            '/span[@class="a-size-small a-color-secondary a-text-strike"]'
            '/ancestor::div[@class="s-item-container"]'
        )
        mini = './div[@class="a-row a-spacing-mini"]'
        none = './div[@class="a-row a-spacing-none"]'

        # Query each container with relative XPaths instead of re-running
        # '(cont_path)[idx]/...' against the whole document for every field.
        for idx, item in enumerate(response.xpath(cont_path), start=1):
            first = self._first
            obj = {
                'idx': idx,
                'img': first(item, './div[@class="a-row a-spacing-base"]/div/div/a/img/@src'),
                'ttl': first(item, mini + '[1]/div[1]/a/@title'),
                'pdr': first(item, mini + '[1]/div[2]/span[2]/text()'),
                'rtg': first(item, none + '/span/span/a/i/span/text()'),
                'vts': first(item, none + '/a/text()'),
            }

            price = self._parse_amount(
                first(item, mini + '[2]/div[1]/a/span/text()'))
            retail = self._parse_amount(
                first(item, mini + '[2]/div[1]/span[@class="a-size-small a-color-secondary a-text-strike"]/text()'))
            third_party = self._parse_amount(
                first(item, mini + '[2]/div[4]/a/span[2]/text()'))

            if price is not None:
                obj['prc'] = price
            if retail is not None:
                obj['rtl'] = retail
            # Fall back to the third-party offer only when no main price
            # was found.
            if 'prc' not in obj and third_party is not None:
                obj['prc'] = third_party
            # A discount needs both prices; skip a zero list price to avoid
            # dividing by zero.
            if 'prc' in obj and 'rtl' in obj and obj['rtl']:
                obj['dsc'] = str(int(round(100 - (obj['prc'] / obj['rtl'] * 100)))) + '%'
            yield obj

        # NOTE: the two next-page snippets below are intentionally left
        # disabled inside a string literal. As written, Method #1 hands
        # response.follow() the serialized <a> element (its XPath lacks
        # /@href) and Method #2 queries the literal text 'next_path' rather
        # than the variable, so neither should be enabled verbatim.
        """
        ### Method #1
        yield response.follow(response.xpath('//a[@id="pagnNextLink"]').extract_first(), callback=self.parse)
        ### Method #2
        next_path = '//a[@id="pagnNextLink"]/@href'
        next_resp = response.xpath('next_path').extract_first()
        if next_resp is not None:
        next_resp = response.urljoin(next_resp)
        yield scrapy.Request(next_resp, callback=self.parse)
        """
方法#1
yield response.follow(response.xpath('//a[@id="pagnNextLink"]').extract_first(), callback=self.parse)
首次输出
[
{
"rtg": "5 out of 5 stars",
"idx": 1,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41p9bSkUHlL._AA160_.jpg",
"rtl": 779.99,
"pdr": "Pentax",
"vts": "2",
"prc": 639.99,
"ttl": "Pentax 21790 DA 55mm F1.4 SDM Lens with Case",
"dsc": "18%"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 2,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/31gMEIPs+CL._AA160_.jpg",
"rtl": 33.85,
"pdr": "Sony",
"vts": "660",
"prc": 30.35,
"ttl": "Sony MDRZX110 Over-Ear Headphones (White)",
"dsc": "10%"
},
{
"rtg": "4.2 out of 5 stars",
"idx": 3,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "81",
"prc": 90.35,
"ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
"dsc": "50%"
},
{
"rtg": "3.9 out of 5 stars",
"idx": 4,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "120",
"prc": 127.38,
"ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
"dsc": "29%"
},
{
"rtg": "4.3 out of 5 stars",
"idx": 5,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
"rtl": 499.99,
"pdr": "Olympus",
"vts": "22",
"ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 6,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51Ctux-mgzL._AA160_.jpg",
"rtl": 13.19,
"pdr": "Western Digital",
"vts": "301",
"prc": 9.99,
"ttl": "Western Digital My Passport Carrying Case - Black (WDBABK0000NBK-WRSN)",
"dsc": "24%"
},
{
"rtg": "4.4 out of 5 stars",
"idx": 7,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41LaVPK8wPL._AA160_.jpg",
"rtl": 19.99,
"pdr": "Canon",
"vts": "20",
"prc": 18.99,
"ttl": "Genuine Canon CLI-42 Ink Tank, Magenta - 6386B002",
"dsc": "5%"
}
]
[
{
"rtg": "5 out of 5 stars",
"idx": 1,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41p9bSkUHlL._AA160_.jpg",
"rtl": 779.99,
"pdr": "Pentax",
"vts": "2",
"prc": 639.99,
"ttl": "Pentax 21790 DA 55mm F1.4 SDM Lens with Case",
"dsc": "18%"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 2,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/31gMEIPs+CL._AA160_.jpg",
"rtl": 33.85,
"pdr": "Sony",
"vts": "660",
"prc": 30.35,
"ttl": "Sony MDRZX110 Over-Ear Headphones (White)",
"dsc": "10%"
},
{
"rtg": "4.2 out of 5 stars",
"idx": 3,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "81",
"prc": 90.35,
"ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
"dsc": "50%"
},
{
"rtg": "3.9 out of 5 stars",
"idx": 4,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "120",
"prc": 127.38,
"ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
"dsc": "29%"
},
{
"rtg": "4.3 out of 5 stars",
"idx": 5,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
"rtl": 499.99,
"pdr": "Olympus",
"vts": "22",
"ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
},
{
"rtg": "4.1 out of 5 stars",
"idx": 6,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51Ctux-mgzL._AA160_.jpg",
"rtl": 13.19,
"pdr": "Western Digital",
"vts": "301",
"prc": 9.99,
"ttl": "Western Digital My Passport Carrying Case - Black (WDBABK0000NBK-WRSN)",
"dsc": "24%"
},
{
"rtg": "4.4 out of 5 stars",
"idx": 7,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41LaVPK8wPL._AA160_.jpg",
"rtl": 19.99,
"pdr": "Canon",
"vts": "20",
"prc": 18.99,
"ttl": "Genuine Canon CLI-42 Ink Tank, Magenta - 6386B002",
"dsc": "5%"
}
]
方法#2
next_path = '//a[@id="pagnNextLink"]/@href'
next_resp = response.xpath('next_path').extract_first()
if next_resp is not None:
next_resp = response.urljoin(next_resp)
yield scrapy.Request(next_resp, callback=self.parse)
第二次输出
[
{
"rtg": "4.2 out of 5 stars",
"idx": 1,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41YquQ15BdL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "81",
"prc": 90.35,
"ttl": "Monster BackFloat High Definition Bluetooth Wireless Waterproof Floating Speaker, Black\/Blue",
"dsc": "50%"
},
{
"rtg": "3.9 out of 5 stars",
"idx": 2,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41znh7URsTL._AA160_.jpg",
"rtl": 179.95,
"pdr": "Monster",
"vts": "120",
"prc": 127.38,
"ttl": "Monster Clarity HD On-Ear Bluetooth Wireless Headphones with Digital USB Audio, Black",
"dsc": "29%"
},
{
"rtg": "4.3 out of 5 stars",
"idx": 3,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/41ZbZ3iSUHL._AA160_.jpg",
"rtl": 499.99,
"pdr": "Olympus",
"vts": "22",
"ttl": "Olympus 45mm F1.8 Interchangeable Lens for Olympus\/Panasonic Micro Cameras (Black)"
},
{
"rtg": "4.2 out of 5 stars",
"idx": 4,
"img": "https:\/\/images-na.ssl-images-amazon.com\/images\/I\/51mInH2UC-L._AA160_.jpg",
"rtl": 8.93,
"pdr": "Swingline",
"vts": "24",
"prc": 7.48,
"ttl": "Swingline Optima Premium Staples, 0.25 Inch Leg Length, 45 Sheet Capacity, 3,750 Staples per Box, Silver (S7035556)",
"dsc": "16%"
}
]
为什么输出会根据检索下一页链接的方式而有所不同?
答案 0 :(得分:0)
这可能是因为这两种方法都是错误的。
对于第一个,您忘记了在 XPath 表达式 //a[@id="pagnNextLink"] 的末尾加上 /@href,因此 extract_first() 返回的是整个 <a> 标签的 HTML,
最终得到的链接看起来像这样:
https://www.amazon.ca/b/%3Ca%20title=%22Next%20Page%22%20id=%22pagnNextLink%22%20class=%22pagnNext%22%20href=%22/s/ref=lp_2055586011_pg_2/143-4196459-4416903?rh=n%3A667823011%2Cn%3A%212418672011%2Cn%3A%212418674011%2Cn%3A2055586011&page=2&ie
这只是亚马逊的基础网址与该 HTML 标签拼接而成的结果。您应该改写为:
yield response.follow(response.xpath('//a[@id="pagnNextLink"]/@href').extract_first(), callback=self.parse)
对于第二个,您写的是 next_resp = response.xpath('next_path').extract_first(),
但该表达式中传入的是字符串字面量 "next_path",而不是变量 next_path。
您应该写成 next_resp = response.xpath(next_path).extract_first()
修正之后,这两种方法的行为就完全一样了。