
时间:2015-07-20 06:59:34

标签: python memory-management memory-leaks reference

def sentence_to_wordlist(sentence, remove_stopwords=False):
    """Convert a sentence into a list of lower-case words.

    Every character that is not an ASCII letter is replaced by a space
    before splitting, so digits and punctuation act as word separators.

    Args:
        sentence: Text of a single sentence.
        remove_stopwords: When true, drop every word contained in
            ``stopwords.words("english")``. Defaults to False.

    Returns:
        A list of lower-case word strings; empty when the sentence
        contains no ASCII letters.
    """
    # Replace anything that is not an ASCII letter with a space.
    review_text = re.sub(r"[^a-zA-Z]", " ", sentence)

    # Convert to lower case and split on whitespace.
    words = review_text.lower().split()

    # Optionally remove stop words (off by default). The stop-word list is
    # turned into a set so each membership test is a hash lookup.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # Previously the function fell off the end here and returned None.
    return words

def doc_to_sentences( doc, tokenizer, remove_stopwords=False ):
    """Split a document into sentences and turn each one into a word list.

    The document is stripped of surrounding whitespace and decoded with
    ``unicode(..., errors='ignore')`` before it is handed to the tokenizer.

    Args:
        doc: Raw document text.
        tokenizer: Object whose ``tokenize`` method splits text into
            sentences.
        remove_stopwords: Forwarded unchanged to ``sentence_to_wordlist``.

    Returns:
        A list holding the result of ``sentence_to_wordlist`` for every
        non-empty sentence, in tokenizer order.
    """
    decoded = unicode(doc.strip(), errors = 'ignore')

    # Empty sentences are skipped; every other sentence is converted.
    return [
        sentence_to_wordlist( piece, remove_stopwords )
        for piece in tokenizer.tokenize(decoded)
        if len(piece) > 0
    ]

def sentence_extractor ( news_file ) :
    """Read a tab-separated news file and collect the sentences of every row.

    Each line is split on tab characters; the second field is used as the
    title and the third as the content. Title and content are joined with
    ". " and passed to ``doc_to_sentences``; the results of all lines are
    concatenated into one flat list.

    Progress (number of lines read and the size of the result list) is
    printed every 1000 lines.

    Args:
        news_file: Path of the file to read.

    Returns:
        A flat list containing the items returned by ``doc_to_sentences``
        for every line of the file.

    Raises:
        IndexError: If a line has fewer than three tab-separated fields.
    """
    # Imported locally so this function does not depend on a module-level
    # import that is not visible here.
    import nltk.data

    docs = []
    # NOTE(review): the original line read
    #     tokenizer ='tokenizers/punkt/english.pickle')
    # which is a syntax error. It is restored as an nltk.data.load() call
    # because the path is an NLTK punkt resource -- confirm this matches
    # the intended loader.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # The context manager closes the file even if a row raises.
    with open(news_file) as news:
        for counter, line in enumerate(news):
            if counter % 1000 == 0:
                # Written so the output is the same on Python 2 and 3.
                print("%d docs read" % counter)
                # sys.getsizeof is shallow: it reports the list object
                # itself, not the word lists it references.
                print(sys.getsizeof(docs))

            row = line.split("\t")
            title = row[1]
            content = row[2]
            docs.extend(doc_to_sentences(title + ". " + content, tokenizer))
    return docs


0 个答案:
