Scraper only scrapes 30 results

Date: 2017-08-01 06:13:01

Tags: python web-scraping scrapy web-crawler scrapy-spider

I am trying to scrape the Amazon website. The spider runs fine and yields results, but it stops after scraping 30 results. By default I get 16 results from this site, but if I change settings.py (middleware) it shows 30 results.
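The exact settings.py change is not shown in the question. For illustration only, with every value assumed rather than taken from the post, a middleware-related tweak of the kind described often looks like this:

    # settings.py -- hypothetical sketch, not the asker's actual edit.
    # A desktop user-agent string can change how many results Amazon serves per page.
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0 Safari/537.36'
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    }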

import scrapy

site = 'https://www.amazon.in'  # assumed: `site` is referenced via `global site` below, but its definition is missing from the post

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["https://www.amazon.in"]
    start_urls = ['http://www.amazon.in/s/ref=nb_sb_ss_c_1_4?url=search-alias%3Daps&fieldkeywords=lava+mobile+phones&sprefix=lava%2Caps%2C383&crid=1NBZO31560SIG&rh=i%3Aaps%2Ck%3Alava+mobile+phones']

    def parse1(self, response):
        q = '//div[contains(@class,"popover-grouping")]//a[contains(@class,"nav_a")]'
        global site
        for i in response.xpath(q):
            d1 = {
                'url': site + ' '.join(i.xpath('./@href').extract()),
                'Category': ' '.join(i.xpath('./text()').extract()),
                'MetaCategory': ' '.join(i.xpath('../../../h2/text()').extract())
            }
            url = d1['url']
            yield scrapy.Request(url, callback=self.parseCategoryPage, meta={'src': d1})

    def parseCategoryPage(self, response):
        src = response.meta['src']
        try:
            q = '//span[@class="a-list-item"]/a/span[text() = "See more"]/../@href'
            brand_url = site + response.xpath(q).extract()[1]
            src['brand_url'] = brand_url
            yield src
        except Exception as e:
            # self.logger.debug('Exception occured:: %s', e)
            self.logger.exception(e)

    def parse(self, response):
        q = '//li[contains(@id,"result_")]'
        for i in response.xpath(q):
            title = ''.join(i.xpath('.//h2/text()').extract())
            sp = ''.join(i.xpath('.//span[contains(@class,"s-price")]/text()').extract())
            mp = ''.join(i.xpath('.//span[contains(@class,"a-text-strike")]/text()').extract())
            purl = ''.join(i.xpath('.//span[contains(@class,"s-price")]/../@href').extract())
            imgurl = ''.join(i.xpath('.//img[contains(@class,"s-access-image")]/@src').extract())
            yield {"title": title, "sp": sp, "mrp": mp, "pdp": purl, "img": imgurl}
        paginate_next = response.xpath('//span[@class="pagnLink"]/a/@href').extract()
        paginate_max = response.xpath('//span[@class="pagnDisabled"]/text()').extract()

    def parseBrandsPages(self, response):
        global site
        q = '//span[@class="pagnLink"]/a/@href'
        brands = response.xpath(q).extract()
        self.logger.debug('parsed the brands pages')
        # self.logger.debug(brands)
        brand_pages = [site + i for i in list(set(brands))]
        yield {"brand_pages": brand_pages}

    def parseBrands(self, response):
        global site
        q = '//span[@class="refinementLink"]'
        self.logger.debug('parsing the brands in the page')
        for i in response.xpath(q):
            url = site + ''.join(i.xpath('../@href').extract())
            name = ''.join(i.xpath('./text()').extract())
            cnt = ''.join(i.xpath('./following-sibling::span/text()').extract())[2:-1]
            # print url, name, cnt
            yield {"url": url, "brand": name, "count": cnt}

Output

Starting from the Amazon page, there are only 30 results.

File "/usr/lib/python2.7/dist-packages/boto/utils.py", line 210, in 
retry_url
r = opener.open(req, timeout=timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1198, in do_open
raise URLError(err)
URLError: <urlopen error timed out>
2017-08-01 11:22:02 [boto] ERROR: Unable to read instance data, giving 
   up
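The [boto] traceback above is a separate, well-known quirk of Scrapy on Python 2.7: boto attempts to read EC2 instance metadata at startup and times out when not running on EC2. It is harmless noise unrelated to the 30-result limit and, assuming S3 feed storage is not in use, can be silenced in settings.py:

    # settings.py -- disable the S3 download handler so boto is never initialized
    # (only appropriate if the project does not read from or export to S3).
    DOWNLOAD_HANDLERS = {'s3': None}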

0 Answers:

No answers yet