如何使用 gensim 中的 similarities.Similarity?
因为如果我使用 similarities.MatrixSimilarity:
index = similarities.MatrixSimilarity(tfidf[corpus])
它只是告诉我:
C:\Users\Administrator\AppData\Local\Enthought\Canopy\User\lib\site- packages\gensim-0.12.4-py2.7-win-amd64.egg\gensim\similarities\docsim.pyc in __init__(self, corpus, num_best, dtype, num_features, chunksize, corpus_len)
513 raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
514 logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
515 self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
516 # iterate over corpus, populating the numpy index matrix with (normalized)
517 # document vectors
我的程序:
输入内容少于 20,000 行时可以正常工作,但当行数超过 20,000 时,它就无法为 'corpus_tfidf' 建立索引。
# -*- coding: utf-8 -*-
import logging,time
# Log at INFO level, prefixing every record with its timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Reference timestamp for the first "Time Cost" report printed later in the script.
start=time.clock()
def mmin(a, b):
    """Return the smaller of ``a`` and ``b``.

    ``b`` is returned only when ``a > b``; otherwise (including when the two
    compare equal) ``a`` is returned.
    """
    return b if a > b else a
from gensim import corpora, models, similarities
# Input CSV: one document per line, each line a comma-separated list of term ids.
fsource_01 = 'E:\\cm_test\\ptfidf_test\\dim_items_terms_pre_pre.csv'
# The file that is queried against the index further down is the same file.
fsource_02 = fsource_01
# Directory prefix (including the trailing separator) for the output files.
fcontent = 'E:\\cm_test\\ptfidf_test\\'

# Split every line into its terms,
# e.g. '48909,53517,116593,55095' -> ['48909', '53517', '116593', '55095'].
# The ``with`` block closes the file even if reading fails, and iterating the
# handle directly avoids holding a second copy of the file as a list of lines.
with open(fsource_01) as f0:
    terms_list = [line.strip('\n').split(',') for line in f0]
from collections import defaultdict
# Count how many times each term occurs across the whole input (repeats inside
# a single document are counted too).
frequency = defaultdict(int)
for text in terms_list:
    for token in text:
        frequency[token] += 1

# Drop terms that occur only once overall; they cannot match any other document.
terms_list = [[token for token in text if frequency[token] > 1] for text in terms_list]

# Per-document de-duplicated copy that keeps the first occurrence of each term
# in its original order.
# NOTE(review): nothing in the visible script reads ``terms_list_qc`` afterwards
# (the dictionary and corpus are built from ``terms_list``) - confirm whether
# it was meant to be used there.
terms_list_qc = []
for ttext in terms_list:
    seen = set()
    terms_list_qc_item = []
    for token in ttext:
        if token not in seen:
            seen.add(token)
            terms_list_qc_item.append(str(token))
    terms_list_qc.append(terms_list_qc_item)
# Step 1: build the term dictionary from the filtered documents and convert
# each document to its bag-of-words representation.
dictionary = corpora.Dictionary(terms_list)
# Optional persistence: dictionary.save(fcontent+'dim_items_terms.dict')
corpus = [dictionary.doc2bow(tokens) for tokens in terms_list]
# Optional persistence: corpora.MmCorpus.serialize(fcontent+'dim_items_terms.mm', corpus)
end1 = time.clock()
print("01. Time Cost for trim_items_terms_to_sparse_matrix: %f s" % (end1 - start))

# Step 2: fit a TF-IDF model on the bag-of-words corpus and apply it to that
# same corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
end2 = time.clock()
print("2. Time Cost for bagofwords_to_tfidf: %f s" % (end2 - end1))
# Step 3a: build the similarity index over the TF-IDF corpus.
#
# ``corpus_tfidf`` is already TF-IDF weighted, so it is indexed as-is. Wrapping
# it in ``tfidf[...]`` a second time would weight the indexed documents twice
# while the query vectors are weighted only once.
#
# ``MatrixSimilarity`` allocates one dense (documents x features) matrix in
# memory, which stops being feasible once the input grows past roughly 20,000
# lines. ``Similarity`` builds the index in shards written to disk under the
# given prefix instead; it has to be told the feature count explicitly.
index = similarities.Similarity(
    fcontent + 'dim_items_terms_tfidf_shard',
    corpus_tfidf,
    num_features=len(dictionary),
)
# Optional persistence: index.save(fcontent+'dim_items_terms_tfidf.index')
# Step 3b: query the index with every input line and write, per line, the
# 1-based positions and the similarity scores of its best matches (at most 400)
# to two parallel comma-separated output files.
# The ``with`` block closes all three files even if a query fails midway.
with open(fsource_02) as f0, \
        open(fcontent + 'out_recordid_tfidf.txt', "w") as f1, \
        open(fcontent + 'out_cosine_tfidf.txt', "w") as f2:
    for line in f0:
        vec_bow = dictionary.doc2bow(line.strip('\n').split(','))
        vec_tfidf = tfidf[vec_bow]
        sims = index[vec_tfidf]
        # Highest similarity first; the sort is stable, so equal scores stay
        # in ascending document order.
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        # Slicing already copes with fewer than 400 indexed documents.
        top = sims[:400]
        # Every value, including the last one, is followed by a comma.
        f1.write(''.join(str(doc_no + 1) + ',' for doc_no, _ in top) + '\n')
        f2.write(''.join("%.2f," % score for _, score in top) + '\n')
end3 = time.clock()
print("3. Time Cost for get_sim_itemsid_top_fh: %f s" % (end3 - end2))
答案 0 :(得分:3)
我替换了这条语句:
index = similarities.MatrixSimilarity(tfidf[corpus_tfidf])
替换为:
index=similarities.Similarity('E:\\cm_test',tfidf[corpus_tfidf],len(dictionary))
这样程序就可以正常运行了,
不过大约需要 2079 秒才能得到结果。