我正在使用gensim,但当我尝试使用MmCorpus.serialize将语料库保存到s3位置时,会抛出如下错误:
corpora.MmCorpus.serialize('s3://my_bucket/corpus.mm', corpus)
2016-01-12 15:55:41,957 : INFO : storing corpus in Matrix Market format to s3://my_bucket/corpus.mm
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-33-513a98b2dfd4> in <module>()
----> 1 corpora.MmCorpus.serialize('s3://my_bucket/corpus.mm', corpus)
/home/nanounanue/.pyenv/versions/3.4.3/lib/python3.4/site-packages/gensim/corpora/indexedcorpus.py in serialize(serializer, fname, corpus, id2word, index_fname, progress_cnt, labels, metadata)
92 offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
93 else:
---> 94 offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)
95
96 if offsets is None:
/home/nanounanue/.pyenv/versions/3.4.3/lib/python3.4/site-packages/gensim/corpora/mmcorpus.py in save_corpus(fname, corpus, id2word, progress_cnt, metadata)
47 logger.info("storing corpus in Matrix Market format to %s" % fname)
48 num_terms = len(id2word) if id2word is not None else None
---> 49 return matutils.MmWriter.write_corpus(fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata)
50
51 # endclass MmCorpus
/home/nanounanue/.pyenv/versions/3.4.3/lib/python3.4/site-packages/gensim/matutils.py in write_corpus(fname, corpus, progress_cnt, index, num_terms, metadata)
484 is allowed to be larger than the available RAM.
485 """
--> 486 mw = MmWriter(fname)
487
488 # write empty headers to the file (with enough space to be overwritten later)
/home/nanounanue/.pyenv/versions/3.4.3/lib/python3.4/site-packages/gensim/matutils.py in __init__(self, fname)
434 if fname.endswith(".gz") or fname.endswith('.bz2'):
435 raise NotImplementedError("compressed output not supported with MmWriter")
--> 436 self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing
437 self.headers_written = False
438
/home/nanounanue/.pyenv/versions/3.4.3/lib/python3.4/site-packages/smart_open/smart_open_lib.py in smart_open(uri, mode, **kw)
132 return S3OpenWrite(key, **kw)
133 else:
--> 134 raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme)
135
136 elif parsed_uri.scheme in ("hdfs", ):
NotImplementedError: ('file mode %s not supported for %r scheme', 'wb+', 's3')
注意:s3://my_bucket确实存在(实际使用的是其他名称),而corpus与gensim教程中的语料库相同。
这是正确的做法吗?我想要实现的是:将语料库(或模型,如LDA)存储到S3中,之后再从S3取回并重新运行。