import gc
import re
import sys

import nltk
from nltk.corpus import stopwords


def sentence_to_wordlist(sentence, remove_stopwords=False):
    # Keep letters only, then lower-case and split into words
    review_text = re.sub("[^a-zA-Z]", " ", sentence)
    words = review_text.lower().split()
    # Optionally remove stop words (off by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words


def doc_to_sentences(doc, tokenizer, remove_stopwords=False):
    # Split the document into raw sentences, then each sentence into word lists
    raw_sentences = tokenizer.tokenize(unicode(doc.strip(), errors='ignore'))
    sentences = []
    for raw_sentence in raw_sentences:
        # Skip empty sentences; otherwise convert to a list of words
        if len(raw_sentence) > 0:
            sentences.append(sentence_to_wordlist(raw_sentence, remove_stopwords))
    return sentences


def sentence_extractor(news_file):
    docs = []
    counter = 0
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for line in open(news_file):
        if counter % 1000 == 0:
            print counter, "docs read"
            print sys.getsizeof(docs)
            gc.collect()
        counter += 1
        row = line.split("\t")
        title = row[1]
        content = row[2]
        docs += doc_to_sentences(title + ". " + content, tokenizer)
    return docs
In the function sentence_extractor, I keep appending word lists to docs. When I check its size with getsizeof, it reports only about 200 MB, but top shows the process using more than 15 GB of memory. Can someone tell me what is wrong with the code above?
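One thing I am aware of is that sys.getsizeof is shallow: for a list it counts only the list object and its pointer array, not the nested word lists or the strings they contain. The snippet below is a minimal sketch of that difference, using a hypothetical deep_getsizeof helper that is not part of my original code:

import sys

def deep_getsizeof(obj, seen=None):
    # Hypothetical helper: recursively sum the sizes of a nested structure,
    # guarding against counting the same object twice.
    if seen is None:
        seen = set()
    if id(obj) in seen:
        return 0
    seen.add(id(obj))
    size = sys.getsizeof(obj)
    if isinstance(obj, (list, tuple, set)):
        size += sum(deep_getsizeof(item, seen) for item in obj)
    elif isinstance(obj, dict):
        size += sum(deep_getsizeof(k, seen) + deep_getsizeof(v, seen)
                    for k, v in obj.items())
    return size

docs = [["hello", "world"], ["another", "sentence"]]
print(sys.getsizeof(docs))    # shallow size: list object + pointer slots only
print(deep_getsizeof(docs))   # also includes the nested lists and strings

Even so, I do not see how the nested strings alone would account for a jump from 200 MB to 15 GB, so I suspect something else in the code.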