我试图在 Scrapy 中获取每个网址的总计数(简单起见,只统计以 "A" 和 "B" 开头的项目)。我知道如何得到每个字母的计数,也知道如何得到累计计数(两个网址加在一起的总数)。但是,如何得到每个链接/网址各自的总数?我觉得已经很接近了,但还是没弄明白。我漏掉了什么?
import scrapy
import string
import itertools
class AmazonitemsSpider(scrapy.Spider):
    """Sum the ``narrowValue`` counts found on a set of Amazon.ca search pages.

    One request is issued per (base URL, letter) pair, the letter being
    appended to the URL's trailing ``indexField=`` parameter.  Every response
    contributes one per-page sum to ``total_count``; when the spider closes,
    the grand total of those sums is written as a single line to
    ``csvfile.csv``.

    NOTE: ``total_count`` holds one entry per response and does not record
    which URL an entry came from.
    """

    name = 'amazonitems'
    allowed_domains = ['www.amazon.ca']

    def __init__(self, *args, **kwargs):
        """Initialise the accumulator and open the output file.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``-a key=value``) keep working.
        """
        super(AmazonitemsSpider, self).__init__(*args, **kwargs)
        # One entry per parsed response: the sum of the counts on that page.
        self.total_count = []
        # Opened here (truncating any previous file); closed in ``closed``.
        self.f = open('csvfile.csv', 'w')

    def start_requests(self):
        """Yield one request per base URL and letter keyword."""
        alphabets = ['a', 'b']
        amazon_urls = [
            'https://www.amazon.ca/gp/search/other/?rh=i%2Cn%3A6647368011&pickerToList=lbr_brands_browse-bin&indexField=',
            'https://www.amazon.ca/gp/search/other/?rh=i%2Cn%3A2224133011&pickerToList=lbr_brands_browse-bin&indexField=',
        ]
        # The keywords do not depend on the URL, so build them once.
        # ``repeat`` controls the keyword length (1 -> single letters).
        keywords = [
            ''.join(letters)
            for letters in itertools.product(alphabets, repeat=1)
        ]
        for amazon_url in amazon_urls:
            for keyword in keywords:
                yield scrapy.Request(
                    url=amazon_url + keyword,
                    callback=self.parse,
                )

    def parse(self, response):
        """Add up every ``narrowValue`` count on the page and record the sum.

        List items without a ``narrowValue`` node are skipped.  Counts that
        cannot be converted to an integer are logged and skipped instead of
        aborting the whole page.
        """
        page_total = 0
        for item in response.xpath('//*[@class="a-list-item"]'):
            raw_count = item.xpath(
                './/*[@class="narrowValue"]/text()'
            ).extract_first()
            if raw_count is None:
                # This list item carries no count at all.
                continue
            # Drop the surrounding parentheses and any thousands separators,
            # e.g. "(1,234)" -> "1234".
            cleaned = (
                raw_count.replace('(', '')
                .replace(')', '')
                .replace(',', '')
                .strip()
            )
            try:
                page_total += int(cleaned)
            except ValueError:
                self.logger.warning(
                    'Skipping unparsable count %r on %s',
                    raw_count,
                    response.url,
                )
        self.total_count.append(page_total)

    def closed(self, reason):
        """Write the grand total to the output file and close it.

        ``reason`` is accepted for the spider-closed hook signature and is
        not used.
        """
        grand_total = sum(self.total_count)
        try:
            self.f.write(str(grand_total) + '\n')
        finally:
            # Close the handle even if the write fails.
            self.f.close()