I am trying to scrape Amazon. The spider runs and yields results, but it stops after 30 results. By default I get 16 results from this site, and if I change settings.py (the middleware) it returns 30 results instead.
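For reference, the settings.py change mentioned above is not shown in the question; a typical tweak of this kind (the exact values here are assumptions, not the asker's actual settings) looks like this:

# settings.py -- hypothetical sketch of the kind of middleware change
# described above; Amazon serves different result layouts depending on
# the client, so a browser-like User-Agent can change the result count
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
}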
import scrapy

site = 'https://www.amazon.in'  # base URL prepended to relative hrefs below

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.in"]  # domain names only, no scheme
    start_urls = [
        'http://www.amazon.in/s/ref=nb_sb_ss_c_1_4?url=search-alias%3Daps'
        '&fieldkeywords=lava+mobile+phones&sprefix=lava%2Caps%2C383'
        '&crid=1NBZO31560SIG&rh=i%3Aaps%2Ck%3Alava+mobile+phones'
    ]
    def parse1(self, response):
        q = '//div[contains(@class,"popover-grouping")]//a[contains(@class,"nav_a")]'
        for i in response.xpath(q):
            d1 = {
                'url': site + ' '.join(i.xpath('./@href').extract()),
                'Category': ' '.join(i.xpath('./text()').extract()),
                'MetaCategory': ' '.join(i.xpath('../../../h2/text()').extract()),
            }
            url = d1['url']
            yield scrapy.Request(url, callback=self.parseCategoryPage,
                                 meta={'src': d1})
    def parseCategoryPage(self, response):
        src = response.meta['src']
        try:
            q = '//span[@class="a-list-item"]/a/span[text() = "See more"]/../@href'
            brand_url = site + response.xpath(q).extract()[1]
            src['brand_url'] = brand_url
            yield src
        except Exception as e:
            # log and skip category pages that lack a "See more" link
            self.logger.exception(e)
    def parse(self, response):
        q = '//li[contains(@id,"result_")]'
        for i in response.xpath(q):
            title = ''.join(i.xpath('.//h2/text()').extract())
            sp = ''.join(i.xpath('.//span[contains(@class,"s-price")]/text()').extract())
            mp = ''.join(i.xpath('.//span[contains(@class,"a-text-strike")]/text()').extract())
            purl = ''.join(i.xpath('.//span[contains(@class,"s-price")]/../@href').extract())
            imgurl = ''.join(i.xpath('.//img[contains(@class,"s-access-image")]/@src').extract())
            yield {"title": title, "sp": sp, "mrp": mp, "pdp": purl, "img": imgurl}
        # pagination links are extracted here but never turned into requests
        paginate_next = response.xpath('//span[@class="pagnLink"]/a/@href').extract()
        paginate_max = response.xpath('//span[@class="pagnDisabled"]/text()').extract()
    def parseBrandsPages(self, response):
        q = '//span[@class="pagnLink"]/a/@href'
        brands = response.xpath(q).extract()
        self.logger.debug('parsed the brands pages')
        brand_pages = [site + i for i in set(brands)]
        yield {"brand_pages": brand_pages}
    def parseBrands(self, response):
        q = '//span[@class="refinementLink"]'
        self.logger.debug('parsing the brands in the page')
        for i in response.xpath(q):
            url = site + ''.join(i.xpath('../@href').extract())
            name = ''.join(i.xpath('./text()').extract())
            cnt = ''.join(i.xpath('./following-sibling::span/text()').extract())[2:-1]
            yield {"url": url, "brand": name, "count": cnt}
Starting from that Amazon search page, I only ever get 30 results.
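The most likely cause is visible in parse: paginate_next is extracted but never turned into new requests, so the crawl ends once the first page's results are scraped. A minimal sketch of following the pagination links (assuming the same pagnLink markup; response.urljoin is a standard Scrapy helper) would be to add this at the end of parse:

        # follow every visible pagination link; urljoin resolves the
        # relative href against the current page, and Scrapy's built-in
        # dupefilter drops pages that were already visited
        for href in response.xpath('//span[@class="pagnLink"]/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)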
File "/usr/lib/python2.7/dist-packages/boto/utils.py", line 210, in
retry_url
r = opener.open(req, timeout=timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1198, in do_open
raise URLError(err)
URLError: <urlopen error timed out>
2017-08-01 11:22:02 [boto] ERROR: Unable to read instance data, giving up
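This boto error is unrelated to the result count: older Scrapy versions probe EC2 instance metadata through boto at startup, which times out on machines outside AWS. One commonly suggested workaround (a sketch, assuming the S3 feed export is not needed) is to disable the S3 download handler in settings.py:

# settings.py -- stop boto from probing EC2 instance metadata by
# disabling the S3 download handler (only safe if S3 isn't used)
DOWNLOAD_HANDLERS = {'s3': None}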