from scrapy.spiders import SitemapSpider
class MySpider(SitemapSpider):
    """Crawl the Google Play sitemap index and scrape basic app fields.

    Sitemap entries whose URL matches the ``/app/`` pattern are dispatched
    to :meth:`parse_product`, as configured in ``sitemap_rules``.
    """

    # Scrapy identifies a spider through the lowercase ``name`` attribute.
    # The uppercase alias is kept for any code that read the old attribute.
    name = "网站地图"
    NAME = name

    sitemap_urls = ["https://play.google.com/sitemaps/sitemaps-index-0.xml"]
    sitemap_rules = [
        ("/app/", "parse_product"),
    ]

    def parse(self, response):
        """Handle a response routed to the default callback.

        Delegates to :meth:`parse_product` on the response already in hand.
        Issuing a second request for ``response.url`` would be discarded by
        Scrapy's duplicate filter by default, so no item would ever be
        produced that way.

        Args:
            response: The downloaded page.

        Returns:
            The generator of items produced by :meth:`parse_product`.
        """
        return self.parse_product(response)

    def parse_product(self, response):
        """Yield one item with the ``applicationName`` and ``publishedBy`` fields.

        Each value is the UTF-8 encoded text of the first node matched by
        its CSS selector, or ``None`` when the selector matches nothing.

        Args:
            response: The downloaded page to extract the fields from.

        Yields:
            dict: A single item keyed by ``applicationName`` and
            ``publishedBy``.
        """
        field_selectors = (
            ("applicationName", "div.id-app-title ::text"),
            ("publishedBy", "div a.document-subtitle.primary span::text"),
        )
        item = {}
        for field, css in field_selectors:
            text = response.css(css).extract_first()
            # extract_first() returns None when nothing matches; calling
            # .encode() on it would raise AttributeError and lose the item.
            item[field] = text.encode("utf-8") if text is not None else None
        yield item