I am using Scrapy with Scrapyrt to create an API. My application has around 20 spiders. We use an NFS server for load balancing in production. Unfortunately, the application is consuming 40% or more of the server's memory.
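For context, each API call goes through scrapyrt's HTTP endpoint, roughly like this (a minimal sketch; the host, port, and target URL are illustrative, not our exact production setup):

import requests

# scrapyrt serves /crawl.json; a POST body carries the spider name and the
# request to schedule. It runs the spider and returns items plus crawl stats.
resp = requests.post(
    "http://localhost:9080/crawl.json",
    json={
        "spider_name": "garuda_retrieve_booking",
        "request": {"url": "https://example.com/booking"},  # illustrative URL
    },
)
print(resp.json()["stats"])

The crawl stats for a single request, followed by the top output for the scrapyrt process, look like this: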
"stats": {
"downloader/request_bytes": 12033,
"downloader/request_count": 5,
"downloader/request_method_count/GET": 4,
"downloader/request_method_count/POST": 1,
"downloader/response_bytes": 20165,
"downloader/response_count": 5,
"downloader/response_status_count/200": 3,
"downloader/response_status_count/302": 1,
"downloader/response_status_count/404": 1,
"finish_reason": "finished",
"finish_time": "2019-05-23 06:05:04",
"item_scraped_count": 1,
"log_count/DEBUG": 35,
"log_count/INFO": 20,
"memusage/max": 3399057408,
"memusage/startup": 3399057408,
"request_depth_max": 2,
"response_received_count": 4,
"scheduler/dequeued": 4,
"scheduler/dequeued/memory": 4,
"scheduler/enqueued": 4,
"scheduler/enqueued/memory": 4,
"start_time": "2019-05-23 06:05:01"
}
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
14500 root 20 0 4999116 3.190g 7184 S 0.3 40.9 103:34.01 scrapyrt
I followed the Scrapy memory-leak documentation and removed the meta attribute from my requests, but memory is still increasing.
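The same docs point at scrapy.utils.trackref for finding which objects accumulate; this is roughly how it can be used from the telnet console or inside the crawler process (a sketch, not output from our system):

from scrapy.utils.trackref import print_live_refs, get_oldest

# Print counts of live tracked Scrapy objects (requests, responses,
# spiders, items) to see which class keeps growing between API calls.
print_live_refs()

# Fetch the longest-lived tracked instance of a class to inspect why it
# is still referenced; "HtmlResponse" here is just an example class name.
oldest = get_oldest("HtmlResponse")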
import json
import os

import scrapy
from scrapy import Request, Spider

# Project constants and helpers (GARUDA_SESSION_FILES_PATH, TODAY_DATE,
# GARUDA_KEEP_ALIVE_URL, CRYTO_COMMAND_URL, SITE, LANGUAGE, get_cookies)
# and the errback_httpbin method are defined elsewhere in our codebase.


class GarudaRetrieveBookingSpider(Spider):
    """get the pax, flight and fare details"""
    name = "garuda_retrieve_booking"
    # NOTE: these are class attributes, so the dicts and lists below are
    # shared by every instance of this spider.
    meta = dict()
    formdata = dict()
    start_url = ''
    booking_code = ''
    output_dict = {'schedule_detail': [], 'pax_details': [], 'reservation_name': '', 'fare_details': {}}
    pax_count = 0
    adult_count = 0
    child_count = 0
    infant_count = 0
    ticket_list_adt_child = []
    ticket_list_inf = []
    # this variable is created to save the RT command response data to pass
    # into the next call if there are no tickets
    rt_response = ''

    def start_requests(self):
        """
        :return: Request object
        """
        post_data = self.data
        garuda_session_id = post_data['parameter']['jSessionId']
        post_data["command"] = "IG"
        file_path = os.path.join(GARUDA_SESSION_FILES_PATH, TODAY_DATE, garuda_session_id + "_session.txt")
        session_data = get_cookies(self, file_path)
        self.start_url = GARUDA_KEEP_ALIVE_URL.format(session_id=session_data["jSessionId"], site=SITE, lang=LANGUAGE)
        self.meta = {"session_data": session_data, "post_data": post_data}
        return [Request(self.start_url, self.parse, errback=self.errback_httpbin)]

    def parse(self, response):
        """
        :param response:
        :return: FormRequest
        description: submit IG command
        """
        self.log("\n\nparse response: {}\n\n".format(response.text))
        if response.status != 200:
            error_message = 'parse method failed.'
            # yield the error item; a bare ``return <value>`` inside this
            # generator would silently discard it
            yield {"status": False, "error_message": error_message}
            return
        session_data = self.meta["session_data"]
        command = self.meta["post_data"]["command"]
        # override the command with the current command
        session_data["tasks"][0]["command"]["command"] = command
        self.formdata = {
            "data": json.dumps(session_data)
        }
        yield scrapy.FormRequest(CRYTO_COMMAND_URL, formdata=self.formdata,
                                 callback=self.ig_command_response, errback=self.errback_httpbin)
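One suspicion: since all of the state above lives in class attributes, the mutable containers (meta, formdata, output_dict, the ticket lists) are shared by every instance of the spider and persist for the lifetime of the long-running scrapyrt process. A hypothetical refactor that moves them into __init__ so each crawl gets fresh containers (a sketch, not our current code):

class GarudaRetrieveBookingSpider(Spider):
    name = "garuda_retrieve_booking"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # fresh per-instance containers, so nothing outlives a single crawl
        self.meta = {}
        self.formdata = {}
        self.output_dict = {'schedule_detail': [], 'pax_details': [],
                            'reservation_name': '', 'fare_details': {}}
        self.ticket_list_adt_child = []
        self.ticket_list_inf = []
        self.rt_response = ''

Would this kind of shared state explain the steady growth, or is the leak more likely in scrapyrt itself?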