我正在使用csv文件中的句子在gensim中训练word2vec模型,如下所示:
import string
import gensim
import csv
import nltk
# CSV file that the sentence reader is pointed at.
path = '/home/neel/Desktop/csci544_proj/test/sample.csv'
# Translation table that deletes every ASCII punctuation character
# (each punctuation code point maps to None).
translator = str.maketrans('', '', string.punctuation)
class gen(object):
    """Stream of tokenized sentences read from one CSV file.

    The file is re-opened on every iteration, so the same object can be
    iterated more than once.
    """

    def __init__(self, path):
        """Remember which CSV file to read.

        Args:
            path: Path of the CSV file. The text of each row is taken from
                the column at index 4.
        """
        self.path = path

    def __iter__(self):
        """Yield one list of lowercase, punctuation-free tokens per sentence.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a row has fewer than five columns.
        """
        # Read the file given to the constructor, not the module-level
        # ``path``; otherwise the constructor argument is silently ignored.
        # newline="" lets the csv module handle newlines inside quoted fields.
        with open(self.path, newline="") as infile:
            for row in csv.reader(infile):
                for sent in nltk.sent_tokenize(row[4]):
                    # Strip punctuation, lowercase, then split on whitespace.
                    yield sent.translate(translator).lower().split()
# NOTE(review): the indentation of this script was lost when the snippet was
# extracted; at least the Word2Vec call below belongs to the body of the loop.
# NOTE(review): `sentences` is assigned but never used by the code shown here.
sentences = [path]
# Each `p` produced by gen is ONE tokenized sentence: a list of word strings.
for p in gen(path):
# A new model is built from that single sentence on every pass of the loop.
# NOTE(review): Word2Vec presumably expects an iterable of sentences (each a
# list of tokens); given one sentence it would treat every word string as a
# sentence and every character as a token, which matches the single-character
# vocabulary reported below. Confirm against the gensim docs.
model = gensim.models.Word2Vec(p, min_count=1, iter=1)
# Print the vocabulary keys of the model that was just built.
print(model.vocab.keys())
我得到以下结果:(['b', 'u', 'm', 'h', 'e', ';', 'v', 'i', 'a', 't', 'w', 'o', 'l'])
我得到的结果不是单词,而是单个字母。程序哪里出错了?
答案 0 :(得分:0)
我修复了你的代码
import string
import gensim
import csv
import nltk
# CSV file that the sentence reader is pointed at.
path = '/home/neel/Desktop/csci544_proj/test/sample.csv'
# Translation table that deletes every ASCII punctuation character
# (each punctuation code point maps to None).
translator = str.maketrans('', '', string.punctuation)
class Generator(object):
    """Stream of tokenized sentences read from a collection of CSV files.

    The files are re-opened on every iteration, so the object can be
    iterated repeatedly as long as ``pathes`` itself can be (e.g. a list).
    """

    def __init__(self, pathes):
        """Remember which CSV files to read.

        Args:
            pathes: Iterable of CSV file paths. The text of each row is taken
                from the column at index 4.
        """
        self.pathes = pathes

    def __iter__(self):
        """Yield one list of lowercase, punctuation-free tokens per sentence.

        Raises:
            OSError: If one of the files cannot be opened.
            IndexError: If a row has fewer than five columns.
        """
        for path in self.pathes:
            # newline="" lets the csv module handle newlines that appear
            # inside quoted fields, as the csv documentation requires.
            with open(path, newline="") as infile:
                for row in csv.reader(infile):
                    for sent in nltk.sent_tokenize(row[4]):
                        # Strip punctuation, lowercase, split on whitespace.
                        yield sent.translate(translator).lower().split()
# Generator iterates over a collection of paths, so the single path is wrapped
# in a list.
corpus = Generator([path])
# Create the model without a corpus; vocabulary building and training are
# done as separate explicit steps below.
# NOTE(review): newer gensim releases (4.x) are understood to have renamed
# `iter` to `epochs`; confirm against the installed version.
model = gensim.models.Word2Vec(min_count=1, iter=1)
# First pass over the corpus.
model.build_vocab(corpus)
# Second pass over the corpus; total_examples is taken from the model itself.
# NOTE(review): epochs=2 here differs from iter=1 passed to the constructor;
# confirm which value is intended.
model.train(corpus, total_examples=model.corpus_count, epochs=2)
# Bare expression: the keys are only displayed in an interactive session,
# because nothing prints them.
# NOTE(review): gensim 4.x is understood to have replaced `wv.vocab` with
# `wv.key_to_index`; confirm against the installed version.
model.wv.vocab.keys()
运行后你会得到:dict_keys(['wassup', 'where', 'fresh', 'new', 'about', 'juice', 'whats', 'are', 'im', 'hello', 'wtf', 'd', 'hi', 'you', 'world', 'bro', 'friend'])