我是 Python 和机器学习的新手。我有一个文件(kkk.csv),其中有 101 个句子,每行一个句子。我想使用 gensim 中的 Doc2Vec 获取每个句子的向量。之后,我想使用这些向量进行聚类,以便将相同的句子归为一组。有人可以为我提供代码(获取每个句子的向量并对其进行聚类)吗?
我尝试使用此代码:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
#from sklearn.feature_extraction.text import TfidfVectorizer
import gensim,logging
# Show log records of level INFO and above, each prefixed with a timestamp
# and the level name.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
from os import listdir

# Names of the .csv files found in the folder; passed later as the labels
# argument of the corpus iterator.
docLabels = list(
    filter(
        lambda name: name.endswith('.csv'),
        listdir('C:/Users/rekhasharma/new'),
    )
)
import pandas as pd

# Read the file into a single column called 'comments' and keep at most the
# first `rows` rows.
# NOTE(review): if the file really holds 101 sentences, this cut-off drops
# the last one - confirm that is intended.
rows = 100
df = pd.read_csv('C:/Users/rekhasharma/kkk.csv', names=['comments']).iloc[:rows]

# One inner list per row, each holding that row's single cell.
list1 = df.values.tolist()
# The first row is discarded.
# NOTE(review): presumably a header line - verify against the file,
# otherwise the first sentence is lost.
del list1[0]
class LabeledLineSentence(object):
    """Iterable corpus that feeds tagged documents to gensim's Doc2Vec.

    Each item of ``doc_list`` is one document. It may be a plain string or a
    sequence of cells (for example one row from
    ``DataFrame.values.tolist()``). Either way it is split on whitespace
    into word tokens, because Doc2Vec expects a list of words per document
    rather than one whole-sentence string.

    Every document is yielded with exactly one tag of its own, so a trained
    model holds one vector per document. When ``labels_list`` supplies
    exactly one label per document, those labels are the tags. Otherwise the
    document's 0-based position is used as its tag.

    The object can be iterated any number of times, which Doc2Vec needs for
    vocabulary building and for each training pass.

    Args:
        doc_list: Sized sequence of documents (see above).
        labels_list: Sequence of labels, or ``None``. Used only when it has
            exactly one entry per document.
    """

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    @staticmethod
    def _tokenize(doc):
        """Return the whitespace-separated word tokens of one document."""
        cells = [doc] if isinstance(doc, str) else doc
        return [word for cell in cells for word in str(cell).split()]

    def _words_and_tags(self):
        """Yield a ``(words, tags)`` pair for every document, in order."""
        labels = self.labels_list
        # Labels are only usable when there is one per document; a list of
        # any other length cannot be matched up, so positions are used.
        use_labels = labels is not None and len(labels) == len(self.doc_list)
        for idx, doc in enumerate(self.doc_list):
            tag = labels[idx] if use_labels else idx
            yield self._tokenize(doc), [tag]

    def __iter__(self):
        """Yield one ``TaggedDocument`` per document in ``doc_list``."""
        for words, tags in self._words_and_tags():
            # TaggedDocument replaces the deprecated LabeledSentence.
            yield gensim.models.doc2vec.TaggedDocument(words=words, tags=tags)
# Corpus iterator over the loaded rows; gensim walks it once to build the
# vocabulary and once more per training epoch.
it = LabeledLineSentence(list1, docLabels)

# min_alpha is kept below alpha so that gensim decays the learning rate
# over the training run; with min_alpha == alpha it would never decrease.
model = gensim.models.Doc2Vec(vector_size=50, min_count=0, alpha=0.025,
                              min_alpha=0.0001)
model.build_vocab(it)

# A single train() call covers all epochs and lets gensim manage the
# learning-rate schedule. total_examples comes from the count recorded by
# build_vocab() rather than a hard-coded number that can drift from the data.
model.train(it, total_examples=model.corpus_count, epochs=100)

# Round-trip through disk so the vector is read from the saved model.
model.save('doc2vec.model')
d2v_model = gensim.models.doc2vec.Doc2Vec.load('doc2vec.model')

# gensim 4 exposes the document vectors as `dv`; older releases only have
# `docvecs`. Use whichever this installation provides.
doc_vectors = d2v_model.dv if hasattr(d2v_model, 'dv') else d2v_model.docvecs
# Vector stored at index 0 of the document-vector table.
docvec = doc_vectors[0]
print(docvec)