我正在使用高斯LDA,并且在运行 g.fit(2)时遇到不知道如何处理uri 。该问题应与已定义的函数 fit(self,erations = 1,init = True)有关。以下是错误。
TypeError Traceback (most recent call last)
<ipython-input-6-2f3f6e257f87> in <module>
13 model_t = KeyedVectors.load_word2vec_format(tmp_file)
14 g = Gauss_LDA(2, title_document, model_t)
---> 15 g.fit(2) #bug appears here
<ipython-input-3-89d3f37594a9> in fit(self, iterations, init)
64 def fit(self, iterations=1, init=True):
65 if init == True:
---> 66 self.init()
67 init = False
<ipython-input-3-89d3f37594a9> in init(self)
76 self.process_corpus(self.corpus)
---> 77 self.process_wordvectors(self.wordvecFP)
78 #setting wishhart priors
79 self.priors = Wishart(self.word_vecs)
<ipython-input-3-89d3f37594a9> in process_wordvectors(self, filepath)
---> 37 vectors = gensim.models.KeyedVectors.load_word2vec_format(fname=filepath, binary=False)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gensim/models/keyedvectors.py in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
1474 return _load_word2vec_format(
1475 cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
-> 1476 limit=limit, datatype=datatype)
1477
1478 def get_keras_embedding(self, train_embeddings=False):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gensim/models/utils_any2vec.py in _load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
341 logger.info("loading projection weights from %s", fname)
--> 342 with utils.smart_open(fname) as fin:
343 header = utils.to_unicode(fin.readline(), encoding=encoding)
344 vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/smart_open/smart_open_lib.py in smart_open(uri, mode, **kw)
229 except KeyError:
230 binary_mode = mode
--> 231 binary, filename = _open_binary_stream(uri, binary_mode, **kw)
232 if ignore_extension:
233 decompressed = binary
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/smart_open/smart_open_lib.py in _open_binary_stream(uri, mode, **kw)
378 return uri, filename
379 else:
--> 380 raise TypeError('don\'t know how to handle uri %s' % repr(uri))
TypeError: don't know how to handle uri <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x10a80d128>
我引用此站点来修改我的代码:https://github.com/mansweet/Gaussian-LDA-word2vec 而且我发现原始代码在输入数据格式方面存在问题,因此,我通过以下方式将手套格式转换为word2vec格式:https://radimrehurek.com/gensim/scripts/glove2word2vec.html?fbclid=IwAR3ZAkpX795pyRR-XMnDYb7EW5qTNZpcZqKxTQcDkfdOqMDQO2uG-jumaWg
现在,g.fit(2)出现了问题。我查看了网站,但没有看到任何类似的问题。最接近的是:https://github.com/mansweet/Gaussian-LDA-word2vec/issues/2。
不知道如何处理uri 出现在最后一行代码中:
if __name__ == "__main__":
wordvec_fileapth = "/Users/julia/Desktop/homework/android.en.vector"
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(wordvec_fileapth, tmp_file)
model_t = KeyedVectors.load_word2vec_format(tmp_file)
g = Gauss_LDA(2, title_document, model_t)
g.fit(2) ###bug appears here
错误回溯应该与以下代码相关(3个问题###):
def fit(self, iterations=1, init=True):
if init == True:
self.init() ### traceback problem one
init = False
for i in xrange(iterations):
self.sample()
def init(self):
self.process_corpus(self.corpus)
self.process_wordvectors(self.wordvecFP) ###traceback problem two
#setting wishhart priors
self.priors = Wishart(self.word_vecs)
self.doc_topic_CT = np.zeros((len(self.corpus.keys()), self.numtopics))
self.word_topics = {word: random.choice(range(self.numtopics)) for word in self.vocab}
# get Doc-Topic Counts
for docID, doc in self.corpus.iteritems():
for word in doc:
topicID = self.word_topics[word]
self.doc_topic_CT[docID, topicID] += 1
# Init parameters for topic distributions
for k in range(self.numtopics):
self.recalculate_topic_params(k)
def process_wordvectors(self, filepath):
print("Processing word-vectors, this takes a moment")
vectors = gensim.models.KeyedVectors.load_word2vec_format(fname=filepath, binary=False) ###traceback problem three
useable_vocab = 0
unusable_vocab = 0
self.word_vec_size = vectors.vector_size
for word in self.vocab:
try:
vectors[word]
useable_vocab += 1
except KeyError: unusable_vocab += 1
# self.word_vecs = np.zeros((useable_vocab, vectors.vector_size))
index = 0
for word in self.vocab:
try:
self.word_vecs[word] = vectors[word]
index += 1
except KeyError: continue
print ("Word-vectors for the corpus are created")
我希望可以通过提供任何想法或指导来解决g.fit()问题的原因。谢谢!