如何使用相似之处。在gensim中的相似性?

时间:2016-04-12 15:54:14

标签: python gensim cosine-similarity

如何使用相似性。在gensim中的相似性

因为如果我使用相似性.MatrixSimilarity:

index = similarities.MatrixSimilarity(tfidf[corpus]) 它只是告诉我:

C:\Users\Administrator\AppData\Local\Enthought\Canopy\User\lib\site- packages\gensim-0.12.4-py2.7-win-amd64.egg\gensim\similarities\docsim.pyc in __init__(self, corpus, num_best, dtype, num_features, chunksize, corpus_len)
513                 raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
514             logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
515             self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
516             # iterate over corpus, populating the numpy index matrix with (normalized)
517             # document vectors

MyProgram:

输入内容小于20,000行时可以正常工作,但当行数超过20,000时,它就无法为' corpus_tfidf'

建立索引。

    # -*- coding: utf-8 -*-
    import logging,time
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    start=time.clock()

    def mmin(a,b):
        if a>b:
            return b
        else:
            return a
    from gensim import corpora, models, similarities

    fsource_01='E:\\cm_test\\ptfidf_test\\dim_items_terms_pre_pre.csv'
    fsource_02=fsource_01

    fcontent='E:\\cm_test\\ptfidf_test\\'

    f0=open(fsource_01)
    lines=f0.readlines()
    terms_list=[]
    for line in lines:
        line=line.strip('\n') #
        terms_sline=line.split(',') # es:['48909,53517,116593,55095']->['48909','53517','116593','55095']
        terms_list.append(terms_sline) 
    f0.close()

    from collections import defaultdict
    frequency = defaultdict(int)
    for text in terms_list:
            cnt_single=defaultdict(int)
            for token in text:
                frequency[token] += 1

    terms_list = [[token for token in text if frequency[token] > 1] for text in terms_list]

    terms_list_qc=[]
    for ttext in terms_list:
        cnt_single=defaultdict(int)
        terms_list_qc_item=[]
        for token in ttext:
            cnt_single[token]+=1
            if(cnt_single[token]<=1):
                terms_list_qc_item.append(str(token))
        terms_list_qc.append(terms_list_qc_item)

    dictionary = corpora.Dictionary(terms_list)
    #dictionary.save(fcontent+'dim_items_terms.dict')

    corpus = [dictionary.doc2bow(text) for text in terms_list]
    #corpora.MmCorpus.serialize(fcontent+'dim_items_terms.mm', corpus)  
    end1=time.clock()   
    print "01.  Time Cost for trim_items_terms_to_sparse_matrix: %f s" % (end1-start)

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    end2=time.clock()   
    print "2.   Time Cost for bagofwords_to_tfidf: %f s" % (end2-end1)

    index = similarities.MatrixSimilarity(tfidf[corpus_tfidf])
    #index.save(fcontent+'dim_items_terms_tfidf.index')
    f0=open(fsource_02)
    lines=f0.readlines()
    f1=open(fcontent+'out_recordid_tfidf.txt',"w")
    f2=open(fcontent+'out_cosine_tfidf.txt',"w")
    for line in lines:
        line=line.strip('\n') 
        doc = line
        vec_bow = dictionary.doc2bow(doc.split(','))
        vec_lsi = tfidf[vec_bow]
        sims = index[vec_lsi]

        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        osize=mmin(len(sims),400)
        for i in range(osize):
            f1.write(str(sims[i][0]+1)+',')
            f2.write(str("%.2f"%sims[i][1])+',')
        f1.write('\n')
        f2.write('\n')
    f0.close()
    f1.close()
    f2.close()

    end3=time.clock()   
    print "3.   Time Cost for get_sim_itemsid_top_fh: %f s" % (end3-end2)

1 个答案:

答案 0 :(得分:3)

我替换了句子

index = similarities.MatrixSimilarity(tfidf[corpus_tfidf])

index=similarities.Similarity('E:\\cm_test',tfidf[corpus_tfidf],len(dictionary))

它工作正常。

虽然花费大约2079秒才能得到结果。