Gensim word2vec:迭代器不会因为yield而停止

时间:2016-10-20 15:01:36

标签: python gensim word2vec

我有一个使用 yield 的句子迭代器,所以严格来说它是一个生成器:

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import json
import os

class LyricsCorpus(object):
    """Re-iterable stream of lyrics token lists read from a directory of JSON files.

    Every entry of the ``corpus`` directory is parsed as a JSON document and
    iterated item by item. Each item that provides
    ``item["lyrics"]["lyrics_body"]`` contributes one list of tokens.

    All reading happens inside ``__iter__``, so every ``iter()`` call starts a
    fresh pass over the directory and the object can be consumed repeatedly.

    Attributes:
        corpus: Path of the directory holding the JSON files.
        tokenize: When true, lyrics go through ``tokens()``; otherwise they
            are split on whitespace with ``str.split()``.
        deaccent: Forwarded to ``simple_preprocess`` as its ``deacc`` argument.
    """

    def __init__(self, corpus, tokenize=False, deaccent=False):
        self.corpus = corpus
        self.tokenize = tokenize
        self.deaccent = deaccent

    def __iter__(self):
        """Yield one list of tokens per lyrics body found in the corpus."""
        # os.listdir() returns names in arbitrary order; sorting makes every
        # pass over the corpus visit the files in the same sequence.
        for fname in sorted(os.listdir(self.corpus)):
            # Binary mode leaves decoding to the json module (UTF-8/16/32 are
            # detected on Python 3.6+, and Python 2 reads bytes anyway) instead
            # of depending on the platform's default text encoding.
            with open(os.path.join(self.corpus, fname), "rb") as data_file:
                data = json.load(data_file)

            # The file is closed at this point, so no handle stays open while
            # the generator is suspended at a yield.
            for item in data:
                if "lyrics" not in item:
                    continue
                lyrics = item["lyrics"]
                if "lyrics_body" not in lyrics:
                    continue
                body = lyrics["lyrics_body"]
                if self.tokenize:
                    yield self.tokens(body)
                else:
                    yield body.split()

    def tokens(self, text):
        """Return the final tokens of ``text`` with stop words removed.

        The text is passed through gensim's ``simple_preprocess`` (tokens
        shorter than 2 or longer than 15 characters are dropped; accents are
        stripped when ``self.deaccent`` is true), then every token listed in
        ``STOPWORDS`` is filtered out. The result is not processed any further.
        """
        candidates = simple_preprocess(
            text, deacc=self.deaccent, min_len=2, max_len=15
        )
        return [token for token in candidates if token not in STOPWORDS]

当把它传给 Word2Vec 时,例如:

# Word2Vec hyper-parameters, kept as module-level names.
min_count, size, window = 1, 50, 4

# NOTE(review): gensim's Word2Vec presumably walks the corpus more than once
# (a vocabulary scan plus several training epochs), so repeated passes over
# the corpus files would be expected rather than an endless loop -- confirm
# against the gensim documentation.
model = Word2Vec(
    corpus_iterator,
    min_count=min_count,
    size=size,
    window=window,
)

迭代器不会停止,而是无限期地循环遍历语料库文件夹中的文件。而在下面这样的普通迭代中,则不会出现这种情况:

from LyricsCorpus import *
# Build the corpus over the local ./corpus directory and consume it once.
it = LyricsCorpus('./corpus')
# The original comprehension yielded the undefined name `item` while looping
# over `k`, which raises NameError; materialize the iterator directly instead.
list(it)

0 个答案:

没有答案