我正在尝试抓取Google App Store,但遇到了一些问题。我定义了关键字并获得了几个搜索结果页面:
> https://play.google.com/store/search?q=stutter&c=apps > https://play.google.com/store/search?q=stuttering&c=apps > https://play.google.com/store/search?q=speech%20therapy&c=apps > https://play.google.com/store/search?q=speech%20and%20language%20therapy&c=apps > https://play.google.com/store/search?q=aphasia&c=apps > https://play.google.com/store/search?q=apraxia&c=apps > https://play.google.com/store/search?q=dysarthria&c=apps
但是当我尝试从搜索页面获取应用程序详细信息网址时,它什么也没有返回。我自己写了xpath路径。但是,如果我直接从Chrome复制它会起作用,那么它只会刮取一个结果页面,而不会刮除全部7个页面。我不明白问题是什么。
主要代码如下:
# -*- coding: utf-8 -*-
import scrapy
from GP_Spider.items import GpItem
from scrapy import Request
class GoogleSpider(scrapy.Spider):
name = 'google'
allowed_domains = ['google.play.com']
start_urls = ['https://play.google.com/store']
def parse(self, response):
keywords = [
'stuttering', 'speech%20therapy', 'speech%20and%20language%20therapy', 'aphasia', 'apraxia', 'dysarthria'
]
link_flag = 0
for each in keywords:
app_url = ("https://play.google.com/store/search?q=" + keywords[link_flag] + '&c=apps')
print(app_url)
yield Request(url=app_url, callback=self.parse_search, dont_filter=True)
link_flag += 1
def parse_search(self, response):
print("START PARSING")
selector = scrapy.Selector(response)
#print(response.body)
urls = selector.xpath('//a[@class="poRVub" and aria-hidden="true"]/@href').extract()
#urls = selector.xpath('//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/c-wiz/c-wiz/c-wiz/div/div[2]/div[1]/c-wiz/div/div/div[1]/div/div/a/@href').extract()
print(urls)
link_flag = 0
links = []
for link in urls:
links.append(link)
for each in urls:
yield Request(url="https://play.google.com" + links[link_flag], callback=self.parse_detail, dont_filter=True)
print("https://play.google.com" + links[link_flag])
link_flag += 1
def parse_detail(self, response):
item = GpItem()
item['app_url'] = response.url
item['app_name'] = response.xpath('//h1[@itemprop="name"]/span').xpath('text()').get()
item['app_icon'] = response.xpath('//img[@itemprop="image"]/@src').get()
item['app_rate'] = response.xpath('//div[@class="K9wGie"]/div[@class="BHMmbe"]').xpath('text()').get()
item['app_version'] = response.xpath('//div[@class="IQ1z0d"]/span[@class="htlgb"]').xpath('text()').get()
item['app_description'] = response.xpath('//div[@itemprop="description"]/span/div').xpath('text()').get()
# item['app_developer'] = response.xpath('//')
# print(response.text)
yield item