我正在尝试使用Gensim库建立模型。我正在使用python 3和Spyder。我也想加入维基语料库。代码如下所示:
enter code hereimport os
import sys
import bz2
import logging
import multiprocessing
import gensim
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
DATA_PATH = os.path.join(SCRIPT_PATH, 'data/')
MODEL_PATH = os.path.join(SCRIPT_PATH, 'model/')
DICTIONARY_FILEPATH = os.path.join(DATA_PATH, 'wiki-english_wordids.txt.bz2')
WIKI_DUMP_FILEPATH = os.path.join(DATA_PATH, 'enwiki-latest-pages-
articles.xml.bz2')
if __name__ == '__main__':
# Check if the required files have been downloaded
if not WIKI_DUMP_FILEPATH:
print('Wikipedia articles dump could not be found..')
print('Please see README.md for instructions!')
sys.exit()
# Get number of available cpus
cores = multiprocessing.cpu_count()
if not os.path.exists(MODEL_PATH):
os.makedirs(MODEL_PATH)
# Initialize logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
if not os.path.isfile(DICTIONARY_FILEPATH):
logging.info('Dictionary has not been created yet..')
logging.info('Creating dictionary (takes about 9h)..')
# Construct corpus
wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH)
# Remove words occuring less than 20 times, and words occuring in more
# than 10% of the documents. (keep_n is the vocabulary size)
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)
# Save dictionary to file
wiki.dictionary.save_as_text(DICTIONARY_FILEPATH)
del wiki
# Load dictionary from file
dictionary = gensim.corpora.Dictionary.load_from_text(DICTIONARY_FILEPATH)
# Construct corpus using dictionary
wiki = gensim.corpora.WikiCorpus(WIKI_DUMP_FILEPATH, dictionary=dictionary)
class SentencesIterator:
def __init__(self, wiki):
self.wiki = wiki
def __iter__(self):
for sentence in self.wiki.get_texts():
yield list(map(lambda x: x.decode('utf-8'), sentence))
# Initialize simple sentence iterator required for the Word2Vec model
sentences = SentencesIterator(wiki)
logging.info('Training word2vec model..')
model = gensim.models.Word2Vec(sentences=sentences, size=300, min_count=1, window=5, workers=cores)
# Save model
logging.info('Saving model..')
model.save(os.path.join(MODEL_PATH, 'word2vec.model'))
logging.info('Done training word2vec model!')
但是我遇到以下错误:
File "C:/Users/elli/.spyder-py3/temp.py", line 60, in <lambda>
yield list(map(lambda x: x.decode('utf-8'), sentence))
AttributeError: 'str' object has no attribute 'decode'
这段代码来自github的链接: https://github.com/LasseRegin/gensim-word2vec-model/blob/master/train.py。
我怀疑这应该是简单的排序。你能请教吗?
答案 0 :(得分:1)
这是类SentencesIterator
中的Unicode问题,您的示例代码适用于python2。对于python3,您可以删除解码部分并使它如下:
class TaggedWikiDocument(object):
def __init__(self, wiki):
self.wiki = wiki
self.wiki.metadata = True
def __iter__(self):
for content, (page_id, title) in self.wiki.get_texts():
yield TaggedDocument(content, [title])