Scrapy蜘蛛被自动杀死

时间:2016-06-17 03:43:33

标签: python memory-leaks scrapy

我有一只爬取网店的蜘蛛。它在Cloud9上运行良好,但在我把它移到只有1核CPU、0.5GB内存的VPS之后,蜘蛛总是在10-20秒后被杀死。我猜可能是内存泄漏,于是使用了trackref工具,下面是几次调用prefs()的结果:

Selector                          526   oldest: 151s ago
ColesSpider                         1   oldest: 302s ago
Request                           452   oldest: 301s ago
ColesItem                          11   oldest: 4s ago
HtmlResponse                       73   oldest: 152s ago

Selector                          574   oldest: 749s ago
ColesSpider                         1   oldest: 900s ago
Request                           106   oldest: 899s ago
ColesItem                          13   oldest: 2s ago
HtmlResponse                       43   oldest: 750s ago


Selector                          755   oldest: 789s ago
ColesSpider                         1   oldest: 940s ago
Request                           105   oldest: 939s ago
ColesItem                          14   oldest: 6s ago
HtmlResponse                       58   oldest: 791s ago

有什么可疑的地方吗?为什么会有这么多Selector对象?我尝试调用get_oldest(),它返回了<Selector xpath=None data=u'<html class="no-js not-ready" lang="en" '>,而我认为代码中并没有选取这个内容。

任何想法都会受到赞赏。

蜘蛛:

import scrapy
import re
from coles.items import ColesItem

class ColesSpider(scrapy.Spider):
    """Crawl the Coles online shop.

    Walks the aisle menu on the national landing page down to
    sub-subcategories, then scrapes product tiles from each listing page
    and yields one ``ColesItem`` per product.
    """

    name = "coles"
    # allowed_domains = ["http://shop.coles.com.au/"]
    start_urls = ["http://shop.coles.com.au/online/national"]

    def parse(self, response):
        """Parse the aisle menu and yield one Request per sub-subcategory.

        Each Request carries the category/subcategory/sub-subcategory
        (display name, URL slug) pairs in ``meta`` for ``get_products``.
        """
        # [1:] skips the first <div>, which is not a category block.
        for div in response.xpath('//ul[@id="aisleMenu"]/li/div')[1:]:
            a = div.xpath('h2/a')
            category_name = a.xpath('text()').extract_first().strip()
            category_urlName = a.xpath('@href').re(r'/([^/]+)/*$')[0]

            # <li> elements that carry a class attribute introduce a new
            # subcategory; the plain <li> entries that follow are its
            # sub-subcategories.  Initialise to None so the first iteration
            # cannot raise NameError if a plain <li> appears before any
            # subcategory heading.
            subcategory_name = subcategory_urlName = None
            for li in div.xpath('ul/li'):
                if li.xpath('@class').extract_first():
                    subcategory_name = li.xpath('a/text()').extract_first().strip()
                    # BUG FIX: in the original this assignment was dedented
                    # between the `if` and `else`, a SyntaxError that kept
                    # the spider from starting at all.
                    subcategory_urlName = li.xpath('a/@href').re(r'/([^/]+)/*$')[0]
                else:
                    subsubcategory_name = li.xpath('a/text()').extract_first().strip()
                    subsubcategory_urlName = li.xpath('a/@href').re(r'/([^/]+)/*$')[0]

                    url = self.make_url(category_urlName, subcategory_urlName, subsubcategory_urlName)
                    # The cookie asks the site for 1000 products per page so
                    # one request covers a whole sub-subcategory.
                    request = scrapy.Request(
                        url,
                        cookies={'ColesSearchPageSizeCookie': 1000},
                        callback=self.get_products,
                    )
                    request.meta['category'] = [category_name, category_urlName]
                    request.meta['subcategory'] = [subcategory_name, subcategory_urlName]
                    request.meta['subsubcategory'] = [subsubcategory_name, subsubcategory_urlName]

                    yield request

    def get_products(self, response):
        """Scrape every product tile on a listing page into a ColesItem."""
        for div in response.xpath('//div[@class="list-view viewContainer clearfix searchEspot"]/div[@class="outer-prod prodtile"]'):
            item = ColesItem()
            # Product name/id live inside a JS snippet in @data-refresh.
            data_refresh = div.xpath('form/div/@data-refresh')
            item['name'] = data_refresh.re_first(r'catEntryName: "(.+?)",')
            item['stockcode'] = int(data_refresh.re_first(r'productId: "(.+?)",'))
            a = div.xpath('form/div/div/a')
            item['urlName'] = a.xpath('@href').re_first(r'/([^/]+)/*$')
            # Strip the "-th" thumbnail suffix to get the full-size image URL.
            item['image'] = re.sub(r'-th(?=\.jpg)', '', a.xpath('img/@src').extract_first())
            item['brand'] = div.xpath('form/div/div/div[@class="detail"]/span[@class="brand"]/text()').extract_first()
            # The size follows the product name on its own line; re_first may
            # return None for tiles without a size, so guard before .strip().
            size = div.xpath('form/div/div/div[@class="detail"]/span[@class="item"]/a/text()').re_first(r'\w+\s*\n(.+)')
            item['size'] = size.strip() if size is not None else None

            # Price appears in one of three places, tried in order:
            # regular price div, "1 for $x.xx /" standard-price text, or a
            # "no-price" placeholder ("Price unavailable").
            price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="price"]/text()').extract_first()
            if price is None:
                price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="std-price"]/text()').re_first(r'1\sfor\s\$(\d+\.\d\d)\s\/')

            if price is None:
                price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="price no-price"]/text()').extract_first()

            if price == "Price unavailable":
                item['price'] = None
                item['regular_price'] = None
            else:
                # Prices are stored as integer cents.
                item['price'] = int(float(price) * 100)

                # "was $x.xx" appears only for discounted products; otherwise
                # the regular price equals the current price.
                regular_price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="saving"]/text()').re_first(r'was\s\$(\d+\.\d\d)')

                if regular_price is None:
                    item['regular_price'] = item['price']
                else:
                    item['regular_price'] = int(float(regular_price) * 100)

            item['category'] = response.meta['category']
            item['subcategory'] = response.meta['subcategory']
            item['subsubcategory'] = response.meta['subsubcategory']

            yield item

    def make_url(self, category_urlName, subcategory_urlName, subsubcategory_urlName):
        """Build a listing-page URL from the three category URL slugs."""
        return 'http://shop.coles.com.au/online/national/{0}/{1}/{2}'.format(category_urlName, subcategory_urlName, subsubcategory_urlName)

0 个答案:

没有答案