Python程序占用大量内存

时间:2015-07-20 06:59:34

标签: python memory-management memory-leaks reference

def sentence_to_wordlist (sentence, remove_stopwords=False ):
    """Tokenize a sentence into a list of lowercase alphabetic words.

    Every non-letter character is replaced with a space before splitting,
    so digits and punctuation never appear in the output. When
    remove_stopwords is True, English stopwords (via NLTK) are filtered out.
    """
    # Keep ASCII letters only; everything else becomes whitespace.
    letters_only = re.sub("[^a-zA-Z]"," ", sentence)
    words = letters_only.lower().split()

    # Optional stopword filtering (off by default).
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

def doc_to_sentences( doc, tokenizer, remove_stopwords=False ):
    """Split a document into sentences, each tokenized into a word list.

    Parameters:
        doc: the raw document. May be text, or bytes which are
             ASCII-decoded with undecodable bytes dropped (this mirrors
             the original ``unicode(s, errors='ignore')`` behavior).
        tokenizer: an object with a ``tokenize(text)`` method, e.g. an
             NLTK punkt sentence tokenizer.
        remove_stopwords: forwarded to sentence_to_wordlist.

    Returns:
        A list of word lists, one per non-empty sentence.
    """
    text = doc.strip()
    # BUGFIX: the original called unicode(), which does not exist on
    # Python 3. Py2's unicode(s, errors='ignore') is an ASCII decode that
    # silently drops bad bytes; doing that explicitly keeps the old
    # behavior for bytes input and works unchanged on Python 3.
    if isinstance(text, bytes):
        text = text.decode("ascii", "ignore")

    raw_sentences = tokenizer.tokenize(text)

    # Tokenize every non-empty sentence.
    return [sentence_to_wordlist(raw_sentence, remove_stopwords)
            for raw_sentence in raw_sentences
            if len(raw_sentence) > 0]

def sentence_extractor ( news_file ) :
    """Read a tab-separated news file and return all tokenized sentences.

    Each line is expected to be tab-separated with the title in column 1
    and the content in column 2 (column 0 is ignored). Title and content
    are joined with ". " and split into per-sentence word lists.

    Parameters:
        news_file: path to the tab-separated input file.

    Returns:
        A flat list of word lists (one per sentence) across all documents.
    """
    docs = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # BUGFIX: 'with' guarantees the file handle is closed (the original
    # leaked it), and print() works on both Python 2 and 3 (the original
    # used Python-2-only print statements).
    with open(news_file) as fh:
        for counter, line in enumerate(fh):
            if counter % 1000 == 0:
                print("%d docs read" % counter)
                # NOTE: sys.getsizeof is shallow -- it measures only the
                # list object and its pointer array, NOT the nested word
                # lists and strings it references. That is why it can
                # report ~200MB while the process RSS is many GB: the
                # real memory is held by the millions of small lists and
                # strings accumulated in `docs`.
                print(sys.getsizeof(docs))
                gc.collect()

            row = line.split("\t")
            title = row[1]
            content = row[2]
            docs += doc_to_sentences(title + ". " + content, tokenizer)
    return docs

在函数 sentence_extractor 中,我不断把句子列表追加到 docs。用 getsizeof 函数检查它的大小时,只显示约 200MB;但 top 显示进程内存使用量超过了 15GB。有人能告诉我上面的代码有什么问题吗?

0 个答案:

没有答案