Python脚本泄漏内存

时间:2013-09-01 17:40:07

标签: python mysql memory-management memory-leaks nltk

我在服务器上运行下面这段代码,用来从一些 RSS 订阅源(feed)的内容中提取(某种意义上的)名词短语。代码运行在 WebFaction 的服务器上,并且存在内存泄漏。如果有人能提示泄漏是如何产生的,或者指出问题是否出在我的代码上以便我修复,我将不胜感激。代码如下:

import MySQLdb
import nltk, re, pprint

def RgxChunk(document):
    """Chunk every sentence of *document* with a regular-expression NP grammar.

    The text is split into sentences; each sentence is word-tokenized,
    POS-tagged and then parsed with an ``nltk.RegexpParser`` built from the
    ``NP`` rule below.

    Returns a list holding one parse result per sentence, in document order.
    """
    chunker = nltk.RegexpParser("NP: {<RB>?<DT>?<JJ.*>*<NN.*>*}")
    chunked = []
    for raw_sentence in nltk.sent_tokenize(document):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(raw_sentence))
        chunked.append(chunker.parse(tagged_tokens))
    return chunked

def _noun_phrases(parsed):
    """Yield the lowercased, space-joined words of every NP chunk in *parsed*.

    *parsed* is the value returned by ``RgxChunk``: one parse result per
    sentence, whose children are either chunk subtrees or plain tagged tokens.
    """
    for sent in parsed:
        for chunk in sent:
            # Tokens that were not chunked are not Tree instances; skip them.
            if not isinstance(chunk, nltk.tree.Tree):
                continue
            # NOTE(review): `.node` is the NLTK 2 attribute; NLTK 3 replaced it
            # with `.label()` -- confirm against the installed NLTK version.
            if chunk.node != 'NP' or len(chunk) == 0:
                # An empty chunk has no words to join, so there is no phrase.
                continue
            yield ' '.join(token[0] for token in chunk).lower()


db = MySQLdb.connect(host="HOST", user="USER" ,  passwd="PASS", db="DB")
cursor = db.cursor()
cursor2 = db.cursor()
try:
    cursor.execute("SELECT * FROM `rss_posts` WHERE length(`text`) > 100 AND `link` LIKE 'http%'")
    numrows = int(cursor.rowcount)

    # Parameterized statement: the driver quotes the values, so no manual
    # escape_string calls or string-built SQL are needed.
    insert_sql = 'INSERT INTO `nltk_terms` VALUES (%s, %s, %s)'

    # iter(fetchone, None) stops cleanly when the result set is exhausted
    # instead of indexing into None if fewer rows arrive than rowcount said.
    for processed, row in enumerate(iter(cursor.fetchone, None), 1):
        text = row[6]
        thelink = row[2]
        # Third column is phrase + link, exactly as the original built it.
        insert_values = [
            (thelink, thephrase, thephrase + thelink)
            for thephrase in _noun_phrases(RgxChunk(text))
        ]
        # An article with no NP chunks previously produced the malformed
        # statement "INSERT INTO `nltk_terms` VALUES"; skip the insert instead.
        if insert_values:
            cursor2.executemany(insert_sql, insert_values)
            db.commit()
        print('%d articles processed of %d' % (processed, numrows))
finally:
    # Release the cursors and the connection even if processing fails.
    cursor2.close()
    cursor.close()
    db.close()

0 个答案:

没有答案