我有一只蜘蛛爬网店。它在Cloud9
上运行得很好,但在我把它迁移到 1CPU、0.5G 内存的 VPS 之后,蜘蛛总是在 10-20 秒后被杀死。我猜可能是内存泄漏,所以我使用了 trackref
,下面是几次 prefs()
调用的输出结果
Selector 526 oldest: 151s ago
ColesSpider 1 oldest: 302s ago
Request 452 oldest: 301s ago
ColesItem 11 oldest: 4s ago
HtmlResponse 73 oldest: 152s ago
Selector 574 oldest: 749s ago
ColesSpider 1 oldest: 900s ago
Request 106 oldest: 899s ago
ColesItem 13 oldest: 2s ago
HtmlResponse 43 oldest: 750s ago
Selector 755 oldest: 789s ago
ColesSpider 1 oldest: 940s ago
Request 105 oldest: 939s ago
ColesItem 14 oldest: 6s ago
HtmlResponse 58 oldest: 791s ago
这里有什么可疑之处吗?为什么会有这么多 Selector
?我尝试了 get_oldest()
,它返回<Selector xpath=None data=u'<html class="no-js not-ready" lang="en" '>
,但我认为我的代码中并没有选择这个内容。
任何想法都会受到赞赏。
蜘蛛:
import scrapy
import re
from coles.items import ColesItem
class ColesSpider(scrapy.Spider):
    """Crawl the shop.coles.com.au category tree and yield one ColesItem
    per product tile found on each sub-sub-category listing page."""
    name = "coles"
    # allowed_domains = ["http://shop.coles.com.au/"]
    start_urls = ["http://shop.coles.com.au/online/national"]

    def parse(self, response):
        """Walk the aisle menu and schedule a request per sub-sub-category.

        Menu structure (as this code reads it): each div holds a category
        heading in h2/a, followed by a flat <ul> in which an <li> WITH a
        class attribute is a sub-category, and classless <li> entries are
        the sub-sub-categories belonging to the most recent sub-category.
        """
        for div in response.xpath('//ul[@id="aisleMenu"]/li/div')[1:]:
            a = div.xpath('h2/a')
            category_name = a.xpath('text()').extract_first().strip()
            category_urlName = a.xpath('@href').re(r'/([^/]+)/*$')[0]
            # BUG FIX: these were never initialized, so a classless <li>
            # appearing before any classed one raised NameError, and values
            # from a previous div could leak into the next category.
            subcategory_name = subcategory_urlName = None
            for li in div.xpath('ul/li'):
                if li.xpath('@class').extract_first():
                    subcategory_name = li.xpath('a/text()').extract_first().strip()
                    subcategory_urlName = li.xpath('a/@href').re(r'/([^/]+)/*$')[0]
                else:
                    if subcategory_urlName is None:
                        # No parent sub-category seen yet; a URL built from
                        # None would be bogus, so skip this entry.
                        continue
                    subsubcategory_name = li.xpath('a/text()').extract_first().strip()
                    subsubcategory_urlName = li.xpath('a/@href').re(r'/([^/]+)/*$')[0]
                    url = self.make_url(category_urlName, subcategory_urlName, subsubcategory_urlName)
                    request = scrapy.Request(url, cookies={'ColesSearchPageSizeCookie': 1000}, callback=self.get_products)
                    # Carry the breadcrumb down to get_products via meta.
                    request.meta['category'] = [category_name, category_urlName]
                    request.meta['subcategory'] = [subcategory_name, subcategory_urlName]
                    request.meta['subsubcategory'] = [subsubcategory_name, subsubcategory_urlName]
                    yield request

    def get_products(self, response):
        """Yield a populated ColesItem for every product tile on the page."""
        for div in response.xpath('//div[@class="list-view viewContainer clearfix searchEspot"]/div[@class="outer-prod prodtile"]'):
            item = ColesItem()
            data_refresh = div.xpath('form/div/@data-refresh')
            item['name'] = data_refresh.re_first(r'catEntryName: "(.+?)",')
            item['stockcode'] = int(data_refresh.re_first(r'productId: "(.+?)",'))
            a = div.xpath('form/div/div/a')
            item['urlName'] = a.xpath('@href').re_first(r'/([^/]+)/*$')
            # Thumbnail URL ends in "-th.jpg"; strip "-th" for the full image.
            item['image'] = re.sub(r'-th(?=\.jpg)', '', a.xpath('img/@src').extract_first())
            item['brand'] = div.xpath('form/div/div/div[@class="detail"]/span[@class="brand"]/text()').extract_first()
            item['size'] = div.xpath('form/div/div/div[@class="detail"]/span[@class="item"]/a/text()').re_first(r'\w+\s*\n(.+)').strip()
            # Price can live in three alternative markups; try each in turn.
            price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="price"]/text()').extract_first()
            if price is None:
                price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="std-price"]/text()').re_first(r'1\sfor\s\$(\d+\.\d\d)\s\/')
            if price is None:
                price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="price no-price"]/text()').extract_first()
            # BUG FIX: price may still be None when none of the three markups
            # matched; float(None) would raise TypeError. Treat that the same
            # as an explicit "Price unavailable".
            if price is None or price == "Price unavailable":
                item['price'] = None
                item['regular_price'] = None
            else:
                # Store prices as integer cents to avoid float rounding.
                item['price'] = int(float(price) * 100)
                regular_price = div.xpath('form/div/div/div[@class="purchasing"]/div[@class="saving"]/text()').re_first(r'was\s\$(\d+\.\d\d)')
                if regular_price is None:
                    item['regular_price'] = item['price']
                else:
                    item['regular_price'] = int(float(regular_price) * 100)
            item['category'] = response.meta['category']
            item['subcategory'] = response.meta['subcategory']
            item['subsubcategory'] = response.meta['subsubcategory']
            yield item

    def make_url(self, category_urlName, subcategory_urlName, subsubcategory_urlName):
        """Build the listing-page URL from the three URL-name path segments."""
        return 'http://shop.coles.com.au/online/national/{0}/{1}/{2}'.format(category_urlName, subcategory_urlName, subsubcategory_urlName)