I am using Scrapy with Scrapyrt to create an API. My application has around 20 spiders. We use an NFS server for load balancing in production. Unfortunately, the application is consuming 40% or more of the server's memory.
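For context, each API call goes through scrapyrt's HTTP endpoint, roughly like this (a minimal sketch; the host, port, and target URL are illustrative, not our exact production setup):

import requests

# scrapyrt serves /crawl.json; a POST body carries the spider name and the
# request to schedule. It runs the spider and returns items plus crawl stats.
resp = requests.post(
    "http://localhost:9080/crawl.json",
    json={
        "spider_name": "garuda_retrieve_booking",
        "request": {"url": "https://example.com/booking"},  # illustrative URL
    },
)
print(resp.json()["stats"])

The crawl stats for a single request, followed by the top output for the scrapyrt process, look like this: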
"stats": {
"downloader/request_bytes": 12033,
"downloader/request_count": 5,
"downloader/request_method_count/GET": 4,
"downloader/request_method_count/POST": 1,
"downloader/response_bytes": 20165,
"downloader/response_count": 5,
"downloader/response_status_count/200": 3,
"downloader/response_status_count/302": 1,
"downloader/response_status_count/404": 1,
"finish_reason": "finished",
"finish_time": "2019-05-23 06:05:04",
"item_scraped_count": 1,
"log_count/DEBUG": 35,
"log_count/INFO": 20,
"memusage/max": 3399057408,
"memusage/startup": 3399057408,
"request_depth_max": 2,
"response_received_count": 4,
"scheduler/dequeued": 4,
"scheduler/dequeued/memory": 4,
"scheduler/enqueued": 4,
"scheduler/enqueued/memory": 4,
"start_time": "2019-05-23 06:05:01"
}
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
14500 root 20 0 4999116 3.190g 7184 S 0.3 40.9 103:34.01 scrapyrt
I followed the Scrapy memory-leak documentation and removed the meta attribute from my requests, but memory is still increasing.
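The same docs point at scrapy.utils.trackref for finding which objects accumulate; this is roughly how it can be used from the telnet console or inside the crawler process (a sketch, not output from our system):

from scrapy.utils.trackref import print_live_refs, get_oldest

# Print counts of live tracked Scrapy objects (requests, responses,
# spiders, items) to see which class keeps growing between API calls.
print_live_refs()

# Fetch the longest-lived tracked instance of a class to inspect why it
# is still referenced; "HtmlResponse" here is just an example class name.
oldest = get_oldest("HtmlResponse")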
import json
import os

import scrapy
from scrapy import Request, Spider

# Project constants and helpers (GARUDA_SESSION_FILES_PATH, TODAY_DATE,
# GARUDA_KEEP_ALIVE_URL, CRYTO_COMMAND_URL, SITE, LANGUAGE, get_cookies)
# and the errback_httpbin method are defined elsewhere in our codebase.


class GarudaRetrieveBookingSpider(Spider):
    """get the pax, flight and fare details"""
    name = "garuda_retrieve_booking"
    # NOTE: these are class attributes, so the dicts and lists below are
    # shared by every instance of this spider.
    meta = dict()
    formdata = dict()
    start_url = ''
    booking_code = ''
    output_dict = {'schedule_detail': [], 'pax_details': [], 'reservation_name': '', 'fare_details': {}}
    pax_count = 0
    adult_count = 0
    child_count = 0
    infant_count = 0
    ticket_list_adt_child = []
    ticket_list_inf = []
    # this variable is created to save the RT command response data to pass
    # into the next call if there are no tickets
    rt_response = ''

    def start_requests(self):
        """
        :return: Request object
        """
        post_data = self.data
        garuda_session_id = post_data['parameter']['jSessionId']
        post_data["command"] = "IG"
        file_path = os.path.join(GARUDA_SESSION_FILES_PATH, TODAY_DATE, garuda_session_id + "_session.txt")
        session_data = get_cookies(self, file_path)
        self.start_url = GARUDA_KEEP_ALIVE_URL.format(session_id=session_data["jSessionId"], site=SITE, lang=LANGUAGE)
        self.meta = {"session_data": session_data, "post_data": post_data}
        return [Request(self.start_url, self.parse, errback=self.errback_httpbin)]

    def parse(self, response):
        """
        :param response:
        :return: FormRequest
        description: submit IG command
        """
        self.log("\n\nparse response: {}\n\n".format(response.text))
        if response.status != 200:
            error_message = 'parse method failed.'
            # yield the error item; a bare ``return <value>`` inside this
            # generator would silently discard it
            yield {"status": False, "error_message": error_message}
            return
        session_data = self.meta["session_data"]
        command = self.meta["post_data"]["command"]
        # override the command with the current command
        session_data["tasks"][0]["command"]["command"] = command
        self.formdata = {
            "data": json.dumps(session_data)
        }
        yield scrapy.FormRequest(CRYTO_COMMAND_URL, formdata=self.formdata,
                                 callback=self.ig_command_response, errback=self.errback_httpbin)
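One suspicion: since all of the state above lives in class attributes, the mutable containers (meta, formdata, output_dict, the ticket lists) are shared by every instance of the spider and persist for the lifetime of the long-running scrapyrt process. A hypothetical refactor that moves them into __init__ so each crawl gets fresh containers (a sketch, not our current code):

class GarudaRetrieveBookingSpider(Spider):
    name = "garuda_retrieve_booking"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # fresh per-instance containers, so nothing outlives a single crawl
        self.meta = {}
        self.formdata = {}
        self.output_dict = {'schedule_detail': [], 'pax_details': [],
                            'reservation_name': '', 'fare_details': {}}
        self.ticket_list_adt_child = []
        self.ticket_list_inf = []
        self.rt_response = ''

Would this kind of shared state explain the steady growth, or is the leak more likely in scrapyrt itself?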