我需要将一个文档与数据集中的所有其他文档逐一比较,并获得相似度评分。我目前使用 spaCy 的 similarity 功能来做到这一点。由于数据集中的文档数量约为 10^6,使用两层 for 循环的暴力方法需要很长时间。有没有更直接、更高效的方法可以做到这一点?任何帮助都将不胜感激。
import uuid
import time
# Brute-force similarity grouping: every query document gets a fresh UUID,
# and each document in `descs` whose similarity to the query reaches the
# threshold is tagged with that UUID. The result maps the text of a document
# to a space-separated string of the UUIDs of all queries it matched.
# `descs` is defined elsewhere; presumably a sequence of spaCy Doc objects
# (they must provide `.similarity()`) -- TODO confirm.
start_time = time.time()
counter = 1
similar_desc_uuid_dict_o = {}

# NOTE(review): only the first two documents are used as queries, while the
# progress message reports against len(descs). This looks like a debugging
# limit; lifting it makes the run quadratic in len(descs).
for doc1 in descs[:2]:
    uniqueid = str(uuid.uuid4())
    # Progress is reported for every query document.
    print("Processed %d out of %d documents." % (counter, len(descs)))
    counter += 1
    for doc2 in descs:
        if doc1.similarity(doc2) >= 0.89:
            # Documents are keyed by their text, so documents with identical
            # text share one entry.
            key = str(doc2)
            current_value = similar_desc_uuid_dict_o.get(key)
            if current_value is None:
                similar_desc_uuid_dict_o[key] = uniqueid
            else:
                # Already matched by an earlier query: append this UUID.
                similar_desc_uuid_dict_o[key] = current_value + " " + uniqueid

print('Done. Time elapsed: {:.2f}mins'.format((time.time() - start_time)/60))
# Bare expression: displays the mapping when run in a notebook/REPL cell.
similar_desc_uuid_dict_o
答案 0 :(得分:0)
我找到了一种使用 gensim 大规模执行上述任务的替代方案。以下是我可以正常运行的代码:
import logging
import sys
import time
import traceback

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
def cossim(documents, query_docs=None, task='pairwise_similarity', metric_threshold=0.85, num_best=20, **kwargs):
    """Find similar documents with a gensim ``Similarity`` index.

    A dictionary and bag-of-words corpus are built from ``documents``. The
    feature representation is then chosen from the vocabulary size:

    * up to 1000 terms: plain bag-of-words;
    * 1001 to 2000 terms: TF-IDF;
    * more than 2000 terms: 200-topic LSI applied to the TF-IDF vectors.

    Args:
        documents: Corpus to index. Presumably an iterable of token lists,
            as it is passed straight to ``Dictionary`` -- TODO confirm.
        query_docs: Documents to look up when ``task == 'batch_query'``, in
            the same form as ``documents``. When not ``None``, the result is
            post-processed by ``stats`` (defined elsewhere) for either task.
        task: ``'pairwise_similarity'`` iterates the index itself;
            ``'batch_query'`` queries the index with ``query_docs``. Any
            other value leaves the result list empty.
        metric_threshold: Minimum score a hit needs in order to be kept.
        num_best: Forwarded to ``Similarity`` as ``num_best``. The filtering
            step indexes each hit as a ``(position, score)`` pair, which is
            presumably what gensim yields when ``num_best`` is set -- TODO
            confirm.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        ``(matched_docs, unmatched_docs)`` from ``stats`` when ``query_docs``
        is given; otherwise a list with, per result row, the positions of
        the hits scoring at least ``metric_threshold``. ``None`` if any
        exception occurred (the traceback is logged, not re-raised).
    """
    try:
        dictionary = Dictionary(documents)
        tfidf = TfidfModel(dictionary=dictionary)
        corpus = [dictionary.doc2bow(doc) for doc in documents]

        # Choose the representation from the vocabulary size.
        vocab_size = len(dictionary)
        features_rep = 'bow'
        if 1000 < vocab_size <= 2000:
            corpus = [tfidf[doc] for doc in corpus]
            features_rep = 'tfidf'
        elif vocab_size > 2000:
            # NOTE(review): the LSI model is trained on the bag-of-words
            # corpus but applied to TF-IDF vectors; confirm this mismatch is
            # intended before changing it, since it affects all scores.
            model = LsiModel(corpus, id2word=dictionary, num_topics=200)
            corpus = [model[tfidf[doc]] for doc in corpus]
            features_rep = 'lsi'

        index_tmpfile = get_tmpfile("index")
        # NOTE(review): num_features is the vocabulary size even when the
        # corpus holds 200-dimensional LSI vectors -- verify whether the
        # number of topics would be the better value there.
        index = Similarity(output_prefix=index_tmpfile, corpus=corpus, num_best=num_best,
                           num_features=vocab_size, chunksize=256)

        similarities = []
        if task == 'pairwise_similarity':
            # Iterating the index produces one result row per iteration.
            similarities = list(index)
        elif task == 'batch_query':
            # Queries must go through the same transformation as the corpus.
            query_docs_features = [dictionary.doc2bow(doc) for doc in query_docs]
            if features_rep == 'tfidf':
                query_docs_features = [tfidf[doc] for doc in query_docs_features]
            elif features_rep == 'lsi':
                query_docs_features = [model[tfidf[doc]] for doc in query_docs_features]
            similarities = list(index[query_docs_features])

        # Keep only the positions of hits that reach the threshold.
        filtered_results = [
            [item[0] for item in ind_sim if item[1] >= metric_threshold]
            for ind_sim in similarities
        ]

        if query_docs is not None:
            # `stats` returns a third value that is not needed here.
            matched_docs, unmatched_docs, _ = stats(documents, query_docs, filtered_results)
            return matched_docs, unmatched_docs
        return filtered_results
    except Exception:
        # Best-effort boundary: log the full traceback and signal failure
        # to the caller with None instead of propagating.
        logging.error(
            "Exception has occurred while performing Cosine Similarity. {}".format(traceback.format_exc()))
        return None