我正在抓取以下网站:https://www.trollandtoad.com/magic-the-gathering/magic-2020-m20-/14878?Keywords=&min-price=&max-price=&items-pp=60&item-condition=&selected-cat=14878&sort-order=&page-no=1&view=list&Rarity=&Ruleset=&minMana=&maxMana=&minPower=&maxPower=&minToughness=&maxToughness= ,需要遍历每个商品"数量"下拉列表中的选项,直到最后一个选项为止,以便确定剩余库存。我在循环中放了一个计数器,想通过统计循环执行的次数来得到剩余库存量,但循环只执行了一次。
# Function to parse needed data
def parse(self, response):
    """Yield scraped card data for every buying-option row on the page.

    For each product row, the category, card name and stock notice are read
    once; then, for every buying-option row after the first, the seller,
    condition and price are filled in and the item is yielded.

    NOTE(review): the original indentation was lost, so the nesting below is
    reconstructed from the comments and data flow -- confirm against the
    real spider.
    """
    # For loop to run through html code until all needed data is scraped
    for data in response.css('div.card > div.row'):
        # DataItem is the item class imported from items.py
        item = DataItem()
        # Scrape Category name
        item["Category"] = data.css("div.col-12.prod-cat a::text").get()
        # Scrape card name
        item["Card_Name"] = data.css("a.card-text::text").get()
        item["Stock"] = data.css("div.font-weight-bold.font-smaller.text-muted::text").get()
        # No stock notice on the row means the card is treated as available
        if item["Stock"] is None:
            item["Stock"] = "In Stock"
        # For loop to run through all the buying information needed, skips first row
        for buying_option in data.css('div.buying-options-table div.row')[1:]:
            # Scrape seller, condition, and price
            item["Seller"] = buying_option.css('div.row.align-center.py-2.m-auto > div.col-3.text-center.p-1 > img::attr(title)').get()
            if item["Seller"] == "PRE ORDER":
                item["Seller"] = "TrollAndToad Com"
            item["Condition"] = buying_option.css("div.col-3.text-center.p-1::text").get()
            num = 0
            # Right here is where I am trying to determine the stock by
            # looping through the drop-down list.
            # NOTE: this selector matches the <select> elements themselves,
            # not their <option> children, so the counter only counts
            # drop-downs; `num` is also never stored on the item.
            for select in buying_option.css('select.w-100'):
                num = num + 1
            item["Price"] = buying_option.css("div.col-2.text-center.p-1::text").get()
            # Return data
            yield item
答案 0 :(得分:2)
XPath有一种非常简单的方法:
# The leading "." keeps the XPath relative to `row`. A bare "//select" is an
# absolute path that searches the whole document, so every row would get the
# value from the first matching drop-down on the page.
stock_quantity = row.xpath('.//select[@name="qtyToBuy"]/option[last()]/@value').get()
答案 1 :(得分:0)
我的做法是:选取所有 `<option>` 元素,提取它们的 `value` 属性,并取其中最大的整数值作为数量。像这样:
# Collect the value attribute of every <option> in the quantity drop-down.
quantity_options = p.css('.product-add-container .box-quantity option::attr(value)').getall()
# The largest option value is taken as the quantity. default=0 avoids the
# ValueError that max() raises when the selector matched no options.
quantity = max(map(int, quantity_options), default=0)
我也重构了您的代码
import scrapy
from scrapy.crawler import CrawlerProcess
class TrollandtoadSpider(scrapy.Spider):
    """Spider that yields one dict per product row of the M20 listing page.

    Each yielded dict has the keys ``title``, ``category``, ``stock``,
    ``seller``, ``condition``, ``quantity`` and ``price``.
    """

    name = 'TrollAndSpider'
    start_urls = [
        'https://www.trollandtoad.com/magic-the-gathering/magic-2020-m20-/14878'
    ]
    # NOTE: the former ``logger = None`` class attribute was dropped because
    # it shadowed the ``logger`` property inherited from scrapy.Spider.

    def parse(self, response: scrapy.http.Response):
        """Extract product data from every product row in *response*.

        Rows whose buying-options table has fewer than four columns are
        logged and skipped instead of aborting the whole page.
        """
        for product_row in response.css('.product-col > .card > .row'):
            title = product_row.css('.prod-title a::text').get()
            category = product_row.css('.prod-cat a::text').get()
            # No stock notice on the row means the card is treated as available
            stock = product_row.css("div.text-muted::text").get() or 'In Stock'

            quantity_options = product_row.css(
                '.product-add-container .box-quantity option::attr(value)'
            ).getall()
            # Largest option value is taken as the quantity; default=0 avoids
            # a ValueError when the row has no quantity options at all.
            quantity = max(map(int, quantity_options), default=0)

            # Columns of the last buying-options row; indexes 0, 1 and 3 are
            # read below as seller logo, condition and price.
            buying_opts = product_row.css(
                '.buying-options-table .row:last-child [class*=col-]'
            )
            if len(buying_opts) < 4:
                self.logger.warning(
                    'Skipping %r: unexpected buying-options layout', title
                )
                continue

            seller = buying_opts[0].css('img::attr(title)').get()
            if seller == 'PRE ORDER':
                seller = 'TrollAndToad Com'
            condition = buying_opts[1].css('::text').get()
            price = buying_opts[3].css('::text').get()

            yield {
                'title': title,
                'category': category,
                'stock': stock,
                'seller': seller,
                'condition': condition,
                'quantity': quantity,
                'price': price,
            }
# Allow running the spider directly as a script.
if __name__ == '__main__':
    process = CrawlerProcess()
    # Register the spider, then start the crawl.
    process.crawl(TrollandtoadSpider)
    process.start()
输出:
{
'title': 'Leyline of the Void 107/280',
'category': 'Magic 2020 (M20) Singles',
'stock': 'In Stock',
'seller': 'TrollAndToad Com',
'condition': 'Near Mint',
'quantity': 6,
'price': '$17.49'
},
{
'title': "Sephara, Sky's Blade 036/280",
'category': 'Magic 2020 (M20) Singles',
'stock': 'In Stock',
'seller': 'TrollAndToad Com',
'condition': 'Near Mint',
'quantity': 3,
'price': '$3.99'
}
答案 2 :(得分:0)
Items.py
import scrapy
class MagiccardsiteItem(scrapy.Item):
    """Item holding the fields scraped for one card listing."""

    # Product category and card title
    Category = scrapy.Field()
    Card_Name = scrapy.Field()
    # Availability text
    Stock = scrapy.Field()
    # Buying-option details
    Seller = scrapy.Field()
    Condition = scrapy.Field()
    Price = scrapy.Field()
    # Count of entries in the quantity drop-down
    Num = scrapy.Field()
Spider(爬虫)代码
import scrapy
from MagicCardSite.items import MagiccardsiteItem
class CardinfoSpider(scrapy.Spider):
    """Spider that yields one MagiccardsiteItem per product column.

    The quantity drop-down of each product is used twice: the text of its
    last option decides ``Stock`` and the number of options is stored in
    ``Num``.
    """

    name = 'CardInfo'
    url = 'https://www.trollandtoad.com/magic-the-gathering/magic-2020-m20-singles/15088'

    def start_requests(self):
        """Request the listing page and route the response to ``parse``."""
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """Build and yield an item for every product column in *response*.

        Missing or non-numeric quantity options are treated as zero stock,
        and a missing or unexpected seller logo URL gives ``Seller = None``,
        so one odd product does not abort the whole page.
        """
        # XPath prefixes shared by several fields, hoisted out of the loop.
        options_xpath = (
            './/div[@class="box-quantity col-2 p-1"]'
            '/select[@class="w-100"]/option'
        )
        buying_xpath = './/div[@class="buying-options-table pb-3"]'

        for row in response.xpath('//div[contains(@class,"product-col")]'):
            item = MagiccardsiteItem()
            item['Category'] = row.xpath('.//div[@class="col-12 prod-cat"]/u/a/text()').get()
            item['Card_Name'] = row.xpath('.//div[@class="col-12 prod-title"]/a/text()').get()

            # Text of the last quantity option; None when there is no
            # drop-down, which int() cannot convert.
            last_option_text = row.xpath(options_xpath + '[last()]/text()').get()
            try:
                stock = int(last_option_text)
            except (TypeError, ValueError):
                stock = 0
            item['Stock'] = 'In Stock' if stock > 0 else None

            # Seller name is taken from the logo file name: ".../logos/<name>.png"
            logo_src = row.xpath(buying_xpath + '//img/@src').get()
            if logo_src and 'logos/' in logo_src:
                item['Seller'] = logo_src.split('logos/')[1].replace('.png', '')
            else:
                item['Seller'] = None

            item['Condition'] = row.xpath(buying_xpath + '/div[2]/div[2]/text()').get()
            item['Price'] = row.xpath(buying_xpath + '/div[2]/div[4]/text()').get()

            # Number of <option> entries in the quantity drop-down.
            item['Num'] = len(row.xpath(options_xpath))
            yield item
结果