为什么在python + pandas代码下面有内存泄漏?

时间:2013-01-22 09:23:38

标签: python memory-leaks pandas

请检查下面的代码。我试图在下面的程序中找到内存泄漏。

run_beforerun_after支持在函数调用之前和之后计算字典数的函数。

我为每个日期调用processDate ...我会在每次调用后检查字典计数。词典总数增加了约10,000。仅创建本地词典。有一个全局功能可以更新每个呼叫,但它不会添加新的词典,至少不会计入10,000个。我不知道这个字典计数在哪里增加。它是pandas数据帧切片的内部吗?

编辑:根据要求添加完整的课程。它依赖于其他类。我很确定其他类没有这种内存泄漏,因为只有在我介绍下面的类时才会发生泄漏。

import math,logging,time
import numpy as np
import alphaCalculator,regConfig,pdb,pandas,time
import regressor
from basics import *
from matplotlib import pyplot as plt
import gc
from collections import defaultdict
from gc import get_objects


class AlphaAnalyser(alphaCalculator.AlphaCalculator):
    def __init__(self,regInfo,dateFrom=None,dateTo=None):
        alphaCalculator.AlphaCalculator.__init__(self,regInfo,dateFrom,dateTo)

        self.stockList=regInfo.stockListReg # list of stock to calc the stat on

        self.thresholdList=[-1,0,0.1,0.3,0.5,0.7] # alpha threshold it's a pct of the spread
        self.lagList=[100,600,1800,5000] # in sec

        #initialise result dictionnaries
        self.fields=(["nbPaperTrades","alpha","obj","obj2","objalpha","objalpha2"])
        for l in self.lagList:
            self.fields+=["realised_"+str(l)]

        self.results=pandas.DataFrame(columns=["date","ric","threshold"]+self.fields)
        self.results.set_index(["date","ric","threshold"],drop=False)
        self.before=defaultdict(int)
        self.after=defaultdict(int)

    def resetInfo(self,regInfo):
        self.regInfo=regInfo
        self.stockList=regInfo.stockListReg

    def getDtAlpha(self,stock,date):
        self.tickData.setDate(date)
        if stock not in self.index.refData.ix[date.date().isoformat()].ric:
            #logging.warning("[%s]: %s doesn't exist on day %s" % (__name__, stock, date.date().isoformat()))
            return()
        return(self.computeAlpha(product=stock,date=date))

    def run_before(self):
        self.before=defaultdict(int)
        for i in get_objects():self.before[type(i)]+=1

    def run_after(self):
        self.after=defaultdict(int)
        gc.collect()
        for i in get_objects():self.after[type(i)]+=1
        print("Objects which are not garbage collected: ->")
        print("Dict count diff (" + str(self.after[type({})]) + "+" + str(self.before[type({})]) + "): "+str(self.after[type({})] - self.before[type({})]))

    def processDate(self,date):
        dt1=0
        dt2=0
        if self.regInfo.regType==regConfig.RegType.INDEX_REG or self.stockList==[]:
            stockList = self.index.refData.ix[date.date().isoformat()].ric.tolist()
        else:
            stockList=self.stockList
        self.tickData.setDate(date)
        result=[]
        for stock in stockList:
            if stock not in self.index.refData.ix[date.date().isoformat()].ric:
                #logging.warning("[%s]: %s doesn't exist on day %s" % (__name__, stock, date.date().isoformat()))
                continue
            print stock

            alphaData=self.computeAlpha(stock,date) ## no dictionaries are created in this function
            if not(self.isValid):
                continue
            objData=self.computeObj(stock,date) ## no dictionaries are created in this function
            if not(self.isValid):
                continue
            nanfilter=np.isnan(alphaData["VALUES"])
            nanfilter+=np.isnan(objData["VALUES"])
            nanfilter=~nanfilter
            alphaData=alphaData["VALUES"][nanfilter]
            objData=objData["VALUES"][nanfilter]
            spread=self.index.refData.ix[date.date().isoformat(),stock]["spread"]
            paperTrades={}
            dfs=[]
            for i in range(len(self.thresholdList)):
                t=self.thresholdList[i]
                if t<0:
                    paperTrades[t]=np.repeat(True,len(alphaData))
                elif i==len(self.thresholdList)-1:
                    paperTrades[t]=(abs(alphaData)>=t*spread)
                else:
                    paperTrades[t]=(abs(alphaData)>=t*spread)*(abs(alphaData)<self.thresholdList[i+1]*spread)
                nbPaperTrades=nansum(paperTrades[t])
                sumAlpha=np.sum(abs(alphaData[paperTrades[t]]))
                sumobj=np.sum(objData[paperTrades[t]])
                sumobj2=np.sum(np.square(objData[paperTrades[t]]))
                sumoa=np.sum(objData[paperTrades[t]]-alphaData[paperTrades[t]])
                sumoa2=np.sum(np.square(objData[paperTrades[t]]-alphaData[paperTrades[t]]))
                dfs.append(pandas.DataFrame([[date,stock,t,nbPaperTrades,sumAlpha,sumobj,sumobj2,sumoa,sumoa2,]+[0.0]*len(self.lagList)],columns=self.results.columns))
                del t,nbPaperTrades,sumAlpha,sumobj,sumobj2,sumoa,sumoa2 
            for l in self.lagList:
                objData=self.computeObj(stock,date,delay=l)
                if not(self.isValid):
                    break
                objData=objData["VALUES"][nanfilter]
                for i in range(len(self.thresholdList)):
                    t=self.thresholdList[i]
                    dataIndex=paperTrades[t]
                    val=np.sign(alphaData[dataIndex])*objData[dataIndex]
                    dfs[i]["realised_"+str(l)]=nansum(val)
                del dataIndex, val, t
            self.rowResults.append(dfs)
        del alphaData, objData, nanfilter, spread, paperTrades, dfs, stockList, date

    def analyseResult(self):
        dateGen = regressor.DateGenerator(self.regInfo.indexName,self.dateFrom,self.dateTo)
        self.rowResults=[]
        for date in dateGen.getDates():
            #logging.debug("doing date "+date.date().isoformat())
            try:
                self.run_before()
                self.processDate(date) # check memory
                                self.run_after()
            except Exception:
                logging.error("Analyser can't process date "+date.date().isoformat())
        self.processFinalResult()

    def processFinalResult(self):
        if len(self.rowResults)>0:
            self.results=self.results.append(self.rowResults)

0 个答案:

没有答案