import scrapy
from scrapy.linkextractors import LinkExtractor
class WoolRich(scrapy.Spider):
    """Crawl woolrich.com listing pages and scrape each product page.

    ``parse`` handles listing pages: it schedules every product link for
    ``parse_of_individual_page`` and schedules every link returned by
    ``link_extractor`` for ``parse`` again.
    ``parse_of_individual_page`` yields one dict per product page.
    """

    name = "WoolRich_Spider"
    allowed_domains = ['woolrich.com']
    start_urls = ['https://www.woolrich.com/men/?sort=featured&page=1']

    # Built once instead of on every response. No ``allow`` pattern is
    # given, so every URL is accepted unless it matches a ``deny`` pattern.
    link_extractor = LinkExtractor(deny=['sort', 'size', 'Size', 'fsnf'])

    def parse(self, response):
        """Schedule product pages and further pages found on a listing page.

        Yields one request per product link (handled by
        ``parse_of_individual_page``) followed by one request per link
        returned by ``link_extractor`` (handled by ``parse``).
        """
        product_hrefs = response.css(
            'li.product> article> figure> a::attr(href)').extract()
        for href in product_hrefs:
            # Resolve against the page URL first: scrapy.Request raises on
            # a relative href because it has no scheme.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_of_individual_page)

        for link in self.link_extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

    def parse_of_individual_page(self, response):
        """Yield one dict of fields scraped from a product page.

        Every value is a list of extracted strings, except ``'Price'``,
        which is the first matching text node or ``None`` when the page
        has no ``span.price`` element.
        """
        yield {
            'Product Name': response.css(
                'h1.productView-title::text').extract(),
            'Style': response.css(
                '.productView-product > div:nth-child(2) > '
                'strong:nth-child(1)::text').extract(),
            # extract_first() instead of [0].extract(): a page without a
            # price yields None rather than raising IndexError.
            'Price': response.css('span.price::text').extract_first(),
            'Size': response.css(
                'span.form-option-variant::text').extract(),
            'Features': response.css(
                '#features-content > li::text').extract(),
            'Description': response.css(
                '#details-content::text').extract(),
            'Path from home': response.css(
                'a.breadcrumb-label::text').extract(),
            'Image links': response.css(
                'div.zoom> a::attr(data-zoom-image)').extract(),
            # Colour names are carried in the title attribute of each
            # swatch label's span.
            'Colors': response.css(
                'label.form-option-swatch> span::attr(title)').extract(),
        }
这是完整的代码。由于产品页面是动态的,我无法抓取到产品的颜色。以这个 URL 为例:https://www.woolrich.com/mens-wool-stag-shirt-jac-6138/
该产品有多种颜色,我只需要获取各颜色的名称。
答案 0 :(得分:0)
response.css('label.form-option-swatch> span::attr(title)').extract()
这解决了问题。我之前在 HTML 中漏看了这一行。