我有一个使用 yield 的句子迭代器，所以它严格来说是一个生成器：
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import json
import os
class LyricsCorpus(object):
    """Streamed corpus of song lyrics read from a directory of JSON files.

    Each file under *corpus* must contain a JSON array of items; every item
    carrying a ``lyrics.lyrics_body`` field yields one token list.  Iteration
    re-opens the files on each pass, so the corpus can be streamed multiple
    times (as gensim models require) without holding it all in memory.
    """

    def __init__(self, corpus, tokenize=False, deaccent=False):
        # corpus:   path to a directory of JSON files
        # tokenize: if True, run gensim's simple_preprocess + stopword removal;
        #           otherwise just whitespace-split the raw lyrics body
        # deaccent: forwarded to simple_preprocess (strip accents)
        self.corpus = corpus
        self.tokenize = tokenize
        self.deaccent = deaccent

    def __iter__(self):
        # sorted() gives a deterministic document order across runs
        # (os.listdir order is arbitrary and filesystem-dependent).
        for fname in sorted(os.listdir(self.corpus)):
            with open(os.path.join(self.corpus, fname), encoding="utf-8") as data_file:
                data = json.load(data_file)
            for item in data:
                # Skip items without a lyrics body instead of nesting ifs.
                body = item.get("lyrics", {}).get("lyrics_body")
                if body is None:
                    continue
                if self.tokenize:
                    yield self.tokens(body)
                else:
                    yield body.split()

    def tokens(self, text):
        """Lowercase, tokenize and (optionally) de-accent *text*.

        The output tokens are final unicode strings that won't be processed
        any further; gensim STOPWORDS are removed.
        """
        return [token
                for token in simple_preprocess(text, deacc=self.deaccent,
                                               min_len=2, max_len=15)
                if token not in STOPWORDS]
当把它传给 Word2Vec 之类的模型时：
# Word2Vec hyperparameters: keep every word (min_count=1), 50-dim vectors,
# context window of 4 words on each side.
min_count = 1
size = 50
window = 4
# NOTE(review): `Word2Vec` and `corpus_iterator` must be defined/imported
# above this snippet; also, `size` was renamed `vector_size` in gensim 4.x —
# confirm the installed gensim version.
model = Word2Vec(corpus_iterator, min_count=min_count, size=size, window=window)
该迭代器似乎不会停止，而是无限期地循环遍历语料库文件夹中的文件。像下面这样的普通迭代则不会出现这种情况：
from LyricsCorpus import *
# One full pass over the corpus.  The original comprehension was
# `[item for k in it]`, which raises NameError: the loop variable is `k`
# but the expression references the never-bound `item`.
it = LyricsCorpus('./corpus')
lyrics = [item for item in it]