I have written a spider in Scrapy that crawls several hundred thousand pages from a number of news sites. It runs fine when I launch it from the command-line tool: memory usage levels off at about 20% on my 4 GB PC. (I have already prioritised the requests to make sure that not too many of them stay alive at once.) But when I launch it from a Python script, memory usage keeps growing until the spider eats up all of the memory. This is my launch script:
import os
import sys
from datetime import datetime, timedelta
from threading import RLock

from twisted.internet import reactor
from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.log import ScrapyFileLogObserver
from scrapy.utils.project import get_project_settings

# MySpider1 and MySpider2 are imported from the project's spider modules


class CrawlersInitiator(object):

    def __init__(self, spiders, start=datetime.now() - timedelta(minutes=30), end=datetime.now()):
        self.setting = get_project_settings()
        self.crawlers = []
        self.spiders = spiders
        self.start_time = start
        self.end_time = end
        # log file
        self.info_log = None
        log_dir = self.setting.get("LOG_DIR")
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        # counter used to stop the reactor
        self.stopped_crawler = 0
        self.lock = RLock()

    def __del__(self):
        self.close_log_file()

    def create_log_file(self):
        """Create a log file with the crawl date in its name."""
        self.close_log_file()
        dir_path = self.setting.get("LOG_DIR") + "/{0}".format(self.end_time.strftime("%Y-%m"))
        file_suffix = self.end_time.strftime("%Y-%m-%d")
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        self.info_log = open("{0}/log-{1}.log".format(dir_path, file_suffix), "a")  # info

    def close_log_file(self):
        if self.info_log and not self.info_log.closed:
            self.info_log.close()
            self.info_log = None

    def get_crawler(self, spider):
        crawler = Crawler(self.setting)
        crawler.signals.connect(self.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider(start_time=self.start_time, end_time=self.end_time))
        return crawler

    def stop(self):
        """Callback that stops the reactor once every crawler has finished."""
        self.lock.acquire()
        self.stopped_crawler += 1
        if self.stopped_crawler >= len(self.crawlers):
            reactor.stop()
        self.lock.release()

    def run_spiders(self):
        """Run the spiders."""
        self.crawlers = []
        self.stopped_crawler = 0
        # get crawlers
        for Spider in self.spiders:
            self.crawlers.append(self.get_crawler(Spider))
        # log
        self.create_log_file()
        ScrapyFileLogObserver(self.info_log, level=log.INFO).start()
        self.info_log.write("\nCrawlers starting...\n")
        self.info_log.write("Crawl from {0} to {1}".format(str(self.start_time), str(self.end_time)))
        # run
        for crawler in self.crawlers:
            crawler.start()
        reactor.run()
        end = datetime.now()
        # release crawlers
        for crawler in self.crawlers:
            del crawler
        # log
        self.info_log.write("Crawlers finished in {0} !\n".format(str(end - self.end_time)))
        self.close_log_file()


def crawl(spiders, start, end):
    CrawlersInitiator(spiders, start=start, end=end).run_spiders()


SPIDERS = [MySpider1, MySpider2]

if __name__ == "__main__":
    start_time = datetime.strptime(sys.argv[1], "%Y-%m-%d_%H:%M:%S")
    end_time = datetime.strptime(sys.argv[2], "%Y-%m-%d_%H:%M:%S")
    crawl(SPIDERS, start_time, end_time)
    quit()
I have tried to track down the problem with scrapy trackref.
When launched from the command-line tool, prefs() shows (only one spider was started):
MySpider1 1 oldest: 942s ago
HtmlResponse 13 oldest: 52s ago
Request 6329 oldest: 932s ago
Item 5915 oldest: 932s ago
Selector 13 oldest: 52s ago
When launched from the script, prefs() shows:
Response 51 oldest: 657s ago
Request 6966 oldest: 661s ago
Item 5732 oldest: 661s ago
HtmlResponse 377 oldest: 661s ago
Selector 377 oldest: 661s ago
MySpider1 1 oldest: 661s ago
So when started from my script, it looks as if Scrapy never releases any objects. Why does this happen, and how can I fix it?
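For reference, the trackref helpers behind prefs() can also be queried directly (from the telnet console, or from a debugging hook in the script) to see exactly which objects are being kept alive. A minimal sketch using the scrapy.utils.trackref functions, with the class names spelled as they appear in the prefs() output above:

from scrapy.utils.trackref import print_live_refs, get_oldest, iter_all

print_live_refs()                                   # same report as prefs()
oldest_request = get_oldest('Request')              # the Request that has been alive the longest
print(oldest_request.url)
live_urls = [r.url for r in iter_all('Request')]    # URLs of every Request still referenced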
This is the superclass of all my spiders; all requests are handled in this class:
import urllib
from abc import ABCMeta, abstractmethod
from datetime import datetime

from scrapy import log
from scrapy.http import Request
from scrapy.spider import Spider


class IndexSpider(Spider):
    __metaclass__ = ABCMeta

    # _queries are spliced onto _search_url to build start_requests (index pages of news)
    _search_url = ""
    _queries = []
    _char_set = "utf8"

    def __init__(self, queries=None, start_time=datetime.min, end_time=datetime.now()):
        # a default argument cannot reference self, so fall back to the class-level _queries here
        self.queries = queries if queries is not None else self._queries
        self.start_time = start_time
        self.end_time = end_time

    def start_requests(self):
        query_count = 0
        query = None
        try:
            for query in self.queries:
                yield Request(self._search_url.format(urllib.quote(query.encode(self._char_set))),
                              self.parse_index)
                query_count += 1
        except Exception, e:
            self.log("Query No.{0} can't be encoded in {1}, because of {2}!"
                     .format(str(query_count), self.name, e), level=log.WARNING)
            yield Request(self._search_url.format(query.encode("gbk")), self.parse_index)

    def parse_index(self, response):
        """Parse an index page."""
        requests = []
        page_list = self._get_result(response)
        if not page_list:
            return requests
        next_page = True
        for item in page_list:
            if isinstance(item, Request):
                requests.append(item)
                next_page = False
                break
            if item['publish_time'] <= self.start_time:
                next_page = False
                break
            elif item['publish_time'] > self.end_time:
                continue
            else:
                req = Request(item['url'], self.parse_news, priority=1)
                req.meta["item"] = item
                requests.append(req)
        if next_page:
            next_page = self._next_index_page(response)
            if next_page:
                requests.append(Request(self._next_index_page(response), self.parse_index))
        return requests

    def parse_news(self, response):
        """Parse a news page."""
        item = response.meta["item"]
        del response.meta['item']
        return self._finish_item(item, response)

    @abstractmethod
    def _get_result(self, response):
        """Get the news list from an index page.
        :param response: index page
        :return: a list of objects of crawlers.items.Base or its subclass, each object representing one news article
        """
        pass

    @abstractmethod
    def _next_index_page(self, response):
        """
        :param response: current index page
        :return: URL of the next index page
        """
        pass

    @abstractmethod
    def _finish_item(self, item, response):
        """Parse a news page.
        :param item: news item obtained from the index page
        :param response: news page
        :return: news item or a new request
        """
        pass
Answer 0 (score: 2)
When run from a script, Scrapy may not be making use of your machine's full capabilities. You can still control the spider's behaviour through the settings:
from scrapy.crawler import CrawlerProcess

my_settings = {
    'MEMUSAGE_ENABLED': 1,
    'MEMUSAGE_LIMIT_MB': 1024,
    'CONCURRENT_REQUESTS': 100,  # lower this if it is still reaching memory limits
}

process = CrawlerProcess(my_settings)
process.crawl(MySpider)
process.start()
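If you need to run both spiders from one script, as in the question, CrawlerProcess (Scrapy 1.0+) accepts several crawl() calls before start(), which removes the need for the manual Crawler/reactor bookkeeping in the original launch script. A rough sketch under that assumption, passing the same start_time/end_time keyword arguments that the question's spiders expect:

from datetime import datetime, timedelta
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('MEMUSAGE_ENABLED', True)     # enable the memory-usage extension
settings.set('MEMUSAGE_LIMIT_MB', 1024)    # shut the crawl down if it exceeds 1 GB

process = CrawlerProcess(settings)
start = datetime.now() - timedelta(minutes=30)
end = datetime.now()
# crawl() queues a spider; extra keyword arguments are passed to the spider's __init__
process.crawl(MySpider1, start_time=start, end_time=end)
process.crawl(MySpider2, start_time=start, end_time=end)
process.start()  # blocks until every queued spider has finished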