Question

import scrapy
from scrapy.linkextractors import LinkExtractor


class WoolRich(scrapy.Spider):
    """Crawl woolrich.com listing pages and scrape each product page.

    ``parse`` handles listing pages: it queues every product link it finds
    and then follows the other links on the page that pass the deny filter.
    ``parse_of_individual_page`` turns one product page into one item.
    """

    name = "WoolRich_Spider"
    allowed_domains = ['woolrich.com']
    start_urls = ['https://www.woolrich.com/men/?sort=featured&page=1']

    def parse(self, response):
        """Yield requests for product pages and for further pages to crawl.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: one per product link (handled by
            ``parse_of_individual_page``) and one per extracted link that
            passes the deny filter (handled by ``parse`` again).
        """
        product_links = response.css(
            'li.product> article> figure> a::attr(href)').extract()
        for href in product_links:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones; scrapy.Request rejects a URL that has no scheme.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_of_individual_page)

        next_page = LinkExtractor(allow=[''],
                                  deny=['sort', 'size', 'Size', 'fsnf'])
        for link in next_page.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

    def parse_of_individual_page(self, response):
        """Yield one item dict describing the product on ``response``.

        Every field is the list of matched text nodes or attribute values,
        except 'Price', which is the first match, or ``None`` when the page
        has no ``span.price`` text.

        Args:
            response: The downloaded product page.

        Yields:
            dict: the scraped product fields.
        """
        # The item is built locally rather than stored on the spider:
        # responses are handled concurrently, so a shared attribute would be
        # overwritten by whichever callback ran last.
        yield {
            'Product Name': response.css('h1.productView-title::text').extract(),
            'Style': response.css('.productView-product > div:nth-child(2) > strong:nth-child(1)::text').extract(),
            # extract_first() returns None instead of raising IndexError
            # when the selector matches nothing.
            'Price': response.css('span.price::text').extract_first(),
            'Size': response.css('span.form-option-variant::text').extract(),
            'Features': response.css('#features-content > li::text').extract(),
            'Description': response.css('#details-content::text').extract(),
            'Path from home': response.css('a.breadcrumb-label::text').extract(),
            'Image links': response.css('div.zoom> a::attr(data-zoom-image)').extract(),
        }

这是完整的代码。由于产品页面的内容是动态加载的，因此我无法检索到产品的颜色。以这个 URL 为例：https://www.woolrich.com/mens-wool-stag-shirt-jac-6138/

它有多种颜色。仅需要颜色的名称。

Answer 1

# Collect the `title` attribute of every <span> that is a direct child of a
# `label.form-option-swatch` element, returned as a list of strings.
# Per the surrounding answer text, these titles are the colour names.
response.css('label.form-option-swatch> span::attr(title)').extract()

这解决了问题。我之前忽略了 HTML 中的这一行。

如何在 Scrapy 中从 JavaScript 事件中提取项目？

1 个答案: