我正在尝试为某个商品网站编写一个爬虫（scraper）。我的思路是：先从导航菜单中提取所有类别链接，跟随这些链接提取所有产品链接，最后在 parse_product 函数中解析产品页面。但我不确定这是不是最好的方法——目前在 parse_menu 中跟随链接并进一步提取产品链接时遇到了困难。请帮我评审（批评）一下我的代码。
class DiorSpider(CrawlSpider):
    """Crawl dior.com: follow category pages discovered via the rules,
    extract product links from the navigation menu, and hand product
    pages to ``parse_product``.
    """

    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    # CrawlSpider applies the FIRST rule whose pattern matches a link, so
    # the specific product rule must come before the site-wide catch-all
    # (the original order made parse_product unreachable).  Dots in the
    # domain are escaped: an unescaped '.' matches any character.
    rules = (
        Rule(LinkExtractor(allow=(r'^https?://www\.dior\.com/en_us/products/.+',)),
             callback='parse_product'),
        # follow=True keeps the crawl going through category pages; without
        # it, links on pages handled by a callback are not followed.
        Rule(LinkExtractor(allow=(r'^https?://www\.dior\.com/en_us',)),
             callback='parse_menu', follow=True),
    )

    def parse_menu(self, response):
        """Yield a Request for every navigation-menu link on the page.

        The original version regexed serialized HTML and yielded bare URL
        strings; Scrapy silently drops anything that is not a Request or an
        item, so nothing was ever crawled.
        """
        # Pull the hrefs directly with XPath instead of re-parsing HTML text.
        for href in response.xpath('//a[@class="navigation-item-link"]/@href').extract():
            # response.follow resolves relative URLs and builds a Request,
            # avoiding the need to import scrapy.Request here.
            yield response.follow(href, callback=self.parse_product)

    def parse_product(self, response):
        # Stub in the question — product extraction goes here.
        pass
答案 0（得分：0）
class DiorSpider(Spider):
    """Walk the navigation bar of dior.com and request each product page.

    CrawlSpider is mostly useful together with LinkExtractors/Rules; when
    you traverse the navigation bar manually, a plain Spider with a
    ``parse`` method is simpler and sufficient.
    """

    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    def parse(self, response):
        """Extract every navigation-menu link and schedule it for parsing."""
        # The @href XPath yields the link targets directly — no regex over
        # serialized HTML is needed.
        links = response.xpath('//a[@class="navigation-item-link"]/@href').extract()
        for link in links:
            # urljoin turns relative hrefs into absolute URLs.
            absolute_url = response.urljoin(link)
            yield Request(absolute_url, self.parse_product)

    def parse_product(self, response):
        # Body not shown in the answer; implement product extraction here.
        pass