I am trying to scrape the Google Play Store, but when I run the script I get the following error: "Unsupported URL scheme '': no handler available for that scheme". The code is as follows:
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
# from html.parser import HTMLParser as SGMLParser
from gp.items import GpItem


class GoogleSpider(scrapy.Spider):
    # print("HELLO STARTING")
    name = 'google'
    allowed_domains = ['play.google.com']
    start_urls = ['https://play.google.com/store/apps/']

    '''
    rules = [
        Rule(LinkExtractor(allow=("https://play\.google\.com/store/apps/details",)), callback='parse_app', follow=True),
    ]
    '''

    def parse(self, response):
        print("CALLING PARSE")
        selector = scrapy.Selector(response)
        # print(response.body)
        urls = selector.xpath('//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href').extract()
        link_flag = 0
        links = []
        for link in urls:
            print("LINK" + str(link))
            links.append(link)
        for each in urls:
            # print(links[link_flag])
            yield scrapy.Request(links[link_flag], callback=self.parse_next, dont_filter=True)
            link_flag += 1

    def parse_next(self, response):
        selector = scrapy.Selector(response)
        # print(response)
        app_urls = selector.xpath('//div[@class="details"]/a[@class="title"]/@href').extract()
        # print(app_urls)
        urls = []
        for url in app_urls:
            url = "http://play.google.com" + url
            # print(url)
            urls.append(url)
        link_flag = 0
        for each in app_urls:
            yield scrapy.Request(urls[link_flag], callback=self.parse_app, dont_filter=True)
            link_flag += 1

    def parse_app(self, response):
        item = GpItem()
        item['app_url'] = response.url
        item['app_name'] = response.xpath('//div[@itemprop="name"]').xpath('text()').extract()
        item['app_icon'] = response.xpath('//img[@itemprop="image"]/@src')
        # item['app_developer'] = response.xpath('//')
        # print(response.text)
        yield item
The error message is as follows:
[scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET :///robots.txt>: Unsupported URL scheme '': no handler available for that scheme
Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/twisted/internet/defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "/anaconda3/lib/python3.7/site-packages/twisted/python/failure.py", line 512, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "/anaconda3/lib/python3.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
  File "/anaconda3/lib/python3.7/site-packages/scrapy/utils/defer.py", line 45, in mustbe_deferred
    result = f(*args, **kw)
  File "/anaconda3/lib/python3.7/site-packages/scrapy/core/downloader/handlers/__init__.py", line 70, in download_request
    (scheme, self._notconfigured[scheme]))
scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme
2019-11-15 08:49:14 [scrapy.core.scraper] ERROR: Error downloading <GET /store/apps/collection/cluster?clp=ogoKCAEqAggBUgIIAQ%3D%3D:S:ANO1ljJG6Aw&gsr=Cg2iCgoIASoCCAFSAggB:S:ANO1ljLKNqE>
scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme
I tried changing extract() to extract_first(), but that produces a different error: raise ValueError('Missing scheme in request url: %s' % self._url). I am new to Scrapy; could someone please help me fix this?
Answer 0 (score: 1)
Try using response.urljoin() on your request URLs. The hrefs you extract from the page are relative paths with no scheme or host, which is why Scrapy reports an empty URL scheme; urljoin() resolves them against the URL of the current page:
yield scrapy.Request(response.urljoin(urls[link_flag]), ...)
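For context, here is a minimal sketch of the whole spider with response.urljoin() applied in both callbacks. The XPath selectors are copied from the question and are only assumed to still match the page (Google Play's markup changes frequently), so treat this as a starting point rather than a working scraper:

import scrapy

from gp.items import GpItem  # project item class from the question


class GoogleSpider(scrapy.Spider):
    name = 'google'
    allowed_domains = ['play.google.com']
    start_urls = ['https://play.google.com/store/apps/']

    def parse(self, response):
        # The extracted hrefs are relative (e.g. /store/apps/collection/...),
        # so resolve them against response.url before requesting them.
        for href in response.xpath('//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_next, dont_filter=True)

    def parse_next(self, response):
        # urljoin() also removes the need to hard-code "http://play.google.com".
        for href in response.xpath('//div[@class="details"]/a[@class="title"]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_app, dont_filter=True)

    def parse_app(self, response):
        item = GpItem()
        item['app_url'] = response.url
        item['app_name'] = response.xpath('//div[@itemprop="name"]/text()').extract()
        # extract_first() returns the URL string; the question's version stored the raw SelectorList.
        item['app_icon'] = response.xpath('//img[@itemprop="image"]/@src').extract_first()
        yield item

On Scrapy 1.4 and later, response.follow(href, callback=...) performs the same join for you and accepts relative URLs directly.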