Python中的内存溢出

时间:2013-04-24 19:48:33

标签: python memory-leaks

我有 67,000 个文件,需要逐个读取并计算单词之间的相似度。但是运行代码时,我的笔记本电脑会变得非常慢,无法再打开任何其他应用程序,随后出现内存溢出错误(即使只处理大约 10,000 个文件也是如此)。有没有办法在每次 for 循环之后释放内存?还是说根本不可能对所有文件运行这段代码?以下是代码:

def isAscii(s):
    """Return True when every character of ``s`` is in ``string.printable``.

    An empty ``s`` yields True because there is no offending character.
    """
    return all(ch in string.printable for ch in s)


# How many tokens on each side of a word are treated as its context.
windowSize = 2

# Running total of (word, neighbour) pairs counted by the script.
totalWordCount = 0

# Module-level tables filled in by the script:
#   wordCount[w1]['wCount']            -> number of pairs whose first word is w1
#   wordCount[w1][w2]['count']         -> number of times w2 was seen near w1
#   probabilities[w]['wordProb']       -> wordCount[w]['wCount'] / totalWordCount
#   relationTable[w1][w2]['pairPMI']   -> log2 association score of the pair
wordCount = {}
probabilities = {}
relationTable = {}

def sim(w1, w2):
    """Score how similar two words are from their ``relationTable`` rows.

    The score is the summed ``pairPMI`` of the context words both rows
    share (each shared word contributes the value from both rows), divided
    by the summed ``pairPMI`` of every entry in both rows.

    Returns:
        -1 when either word has no row in ``relationTable``;
        0 when the combined total is zero;
        otherwise the ratio as a float.
    """
    if w1 not in relationTable or w2 not in relationTable:
        return -1

    first_row = relationTable[w1]
    second_row = relationTable[w2]

    shared_total = 0
    combined_total = 0

    # Walk the first row: every entry feeds the combined total, and entries
    # that also occur in the second row feed the shared total.
    for context_word in first_row:
        first_pmi = first_row[context_word]['pairPMI']
        combined_total += first_pmi
        if context_word in second_row:
            second_pmi = second_row[context_word]['pairPMI']
            shared_total += (first_pmi + second_pmi)

    # The second row only adds to the combined total.
    for context_word in second_row:
        combined_total += second_row[context_word]['pairPMI']

    if combined_total == 0:
        return 0
    return float(shared_total) / combined_total



# File names of every note in the corpus directory.
AllNotes = os.listdir("C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes")

# Tokens that are filtered out of every note before counting.  The file is
# read inside a ``with`` block so its handle is closed as soon as the
# contents have been tokenized instead of staying open for the whole run.
with open('C:/Users/nerry-san/Desktop/EECE 502/stopPunctuations.txt') as fileStopPunctuations:
    stopPunctuations = nltk.word_tokenize(fileStopPunctuations.read())


# Count, for the first 10 notes, how often each pair of printable words
# appears within ``windowSize`` tokens of each other.

# Built once: set membership is O(1), whereas scanning the stop-token list
# for every token of every file is linear in the list length.
stopTokenSet = frozenset(stopPunctuations)
tokenizer = nltk.WordPunctTokenizer()

# Slicing (rather than indexing 0..9) also copes with fewer than 10 files.
for noteName in AllNotes[:10]:
    # ``with`` closes each note as soon as it has been read, so file handles
    # do not pile up while iterating over thousands of files.
    with open('C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % noteName) as fileToRead:
        case1 = fileToRead.read()

    text = tokenizer.tokenize(case1.lower())
    final_text = [token for token in text if token not in stopTokenSet]

    # Evaluate isAscii once per token instead of once per (token, neighbour).
    printableFlags = [isAscii(token) for token in final_text]
    textLength = len(final_text)

    for index, w1 in enumerate(final_text):
        if not printableFlags[index]:
            continue

        # Clamp the window to the text with plain arithmetic instead of
        # testing membership in a freshly built range (a full list on
        # Python 2) for every neighbour.
        windowStart = max(0, index - windowSize)
        windowEnd = min(textLength, index + windowSize + 1)

        for neighbourIndex in range(windowStart, windowEnd):
            if neighbourIndex == index or not printableFlags[neighbourIndex]:
                continue
            w2 = final_text[neighbourIndex]

            totalWordCount += 1
            if w1 not in wordCount:
                wordCount[w1] = {'wCount': 0}

            neighbourCounts = wordCount[w1]
            if w2 in neighbourCounts:
                neighbourCounts[w2]['count'] += 1
            else:
                neighbourCounts[w2] = {'count': 1}
            neighbourCounts['wCount'] += 1


# Convert each word's pair total into its share of all counted pairs.
for word, counts in wordCount.items():
    wordEntry = {}
    probabilities[word] = wordEntry
    wordEntry['wordProb'] = float(counts['wCount']) / totalWordCount



# Fill relationTable[word][word2]['pairPMI'] for every counted pair.
# NOTE(review): the score is log2(P(word2 | word) / (P(word) * P(word2)));
# textbook PMI divides the conditional probability by P(word2) only --
# confirm that the extra P(word) factor is intended.
for word, neighbourCounts in wordCount.items():
    wordRelations = {}
    relationTable[word] = wordRelations
    for word2 in neighbourCounts:
        # 'wCount' holds the row total, not a neighbouring word.
        if word2 == 'wCount':
            continue
        pairProb = float(neighbourCounts[word2]['count']) / (neighbourCounts['wCount'])
        independentProb = probabilities[word]['wordProb'] * probabilities[word2]['wordProb']
        wordRelations[word2] = {}
        wordRelations[word2]['pairPMI'] = math.log(float(pairProb) / (independentProb), 2)

# Snapshot of every word that has a row in relationTable.
l = list(relationTable)

# For each word, collect every other word with a positive similarity and
# rank them from most to least similar.  In the visible code the ranked
# list is rebuilt for the next word without being stored or printed.
for word in l:
    simValues = []
    for word2 in l:
        if word == word2:
            continue
        simVal = sim(word, word2)
        if simVal > 0:
            simValues.append([word2, simVal])

    simValues.sort(key=operator.itemgetter(1), reverse=True)

1 个答案:

答案 0 :(得分:0)

每次打开文件时,请使用“with”语句。这将确保在循环结束时(或者当退出with块时)关闭文件。