在可迭代的语料库对象上调用 gensim.models.Word2Vec() 时,我收到以下错误:
File "/anaconda/envs/py36/lib/python3.6/site-packages/gensim/models/word2vec.py", line 542, in __init__
self.build_vocab(sentences, trim_rule=trim_rule)
File "/anaconda/envs/py36/lib/python3.6/site-packages/gensim/models/word2vec.py", line 621, in build_vocab
self.finalize_vocab(update=update) # build tables & arrays
File "/anaconda/envs/py36/lib/python3.6/site-packages/gensim/models/word2vec.py", line 845, in finalize_vocab
self.reset_weights()
File "/anaconda/envs/py36/lib/python3.6/site-packages/gensim/models/word2vec.py", line 1270, in reset_weights
self.wv.syn0[i] = self.seeded_vector(self.wv.index2word[i] + str(self.seed))
TypeError: can only concatenate tuple (not "str") to tuple
语料库的定义如下:
class DataSet:
    """
    Holds the dataset and the methods associated with it.

    NOTE(review): ``__iter__`` yields sparse bag-of-words vectors
    (``doc2bow``), which is the right shape for corpus models such as
    LDA/TF-IDF but NOT for ``gensim.models.Word2Vec``, which expects each
    item to be a list of token *strings* — passing an instance of this
    class to Word2Vec is what produces the ``TypeError`` quoted above.
    """

    def __init__(self, dir, verbose, categories):
        self.dir = dir                # root directory of the raw data
        self.verbose = verbose        # verbosity flag
        self.dictionary = None        # gensim Dictionary; assigned externally
        self.categories = categories  # category labels for this dataset
        self.type = None              # presumably set by subclasses — TODO confirm

    @staticmethod
    def iter_documents():
        """
        Generator: iterate over all relevant documents.

        :return: yields one document (= iterable of utf8 tokens) at a time
        """
        for root, dirs, files in os.walk(DIR_PROCESSED):
            for fname in filter(lambda fname: fname.endswith('.txt'), files):
                # Use a context manager so the file handle is always closed;
                # the original called open() without ever closing, leaking
                # one handle per document.
                with open(os.path.join(root, fname)) as fh:
                    document = fh.read()
                yield gensim.utils.tokenize(document, errors='ignore')

    def __iter__(self):
        """
        __iter__ is a generator => DataSet is a streamed iterable.

        :return: yields one sparse bag-of-words vector per document
        """
        # Each yielded item is a list of (token_id, count) tuples, not a
        # list of words — see the class docstring for the Word2Vec caveat.
        for tokens in DataSet.iter_documents():
            yield self.dictionary.doc2bow(tokens)
实际使用的是子类 XMLDataset(它继承了上述静态方法);
作为语料库传递给 gensim.models.Word2Vec() 的是 XMLDataset 的一个实例。
问题出在哪里?
编辑:
字典更新如下:
self.dictionary = gensim.corpora.Dictionary(DataSet.iter_documents())
然后在数据集上调用gensim:
corpus = DataSet() #roughly
model = gensim.models.Word2Vec(corpus, size=dim, window=5, workers=workers) # mincount
如果我要遍历语料库并打印每个元素,我会获得训练模型所需的输出,如下所示:
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 3), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 2), (30, 1), (31, 2), (32, 2), (33, 2), (34, 5), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 2), (64, 1), (65, 1), (66, 1), (67, 2), (68, 1), (69, 2), (70, 1), (71, 1), (72, 1), (73, 2), (74, 1), (75, 2), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 2), (87, 1), (88, 2), (89, 1), (90, 3), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 3), (97, 2), (98, 1)]