Building a SciPy sparse matrix for TF-IDF

Date: 2019-09-21 10:42:53

Tags: python scipy sparse-matrix information-retrieval tf-idf

I am building a TF-IDF vector space model. The code works, but because of memory and speed problems I want to rewrite it to build a sparse matrix. Our instructor told us not to use a TF-IDF library, so I have to compute TF-IDF manually. My code stalls as the number of documents grows: I have 21,000 documents, and it works fine on 10,000, but on the full collection the TF-IDF calculation gets stuck because of the two nested for loops. I also tried ThreadPoolExecutor to speed it up, but got no improvement. Please help me improve this code: how can I convert it to a sparse matrix and use a dot product for the TF-IDF calculation? Here is my code:

import nltk
import string
import numpy as np
from nltk.corpus import stopwords
from pathlib import Path
from collections import Counter

english_stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def list_comprehension(nopunc):
    # This function removes stop words from the tokenized text
    return [word for word in nopunc.split() if word not in english_stopwords]

def wordList_removePuncs(doc):
    """
    1: Remove punctuation
    2: Remove stop words
    3: Return list of words
    """
    nopunc = [char for char in doc if char not in punctuation]
    nopunc = ''.join(nopunc)
    nopunc = nopunc.lower()
    return list_comprehension(nopunc)

def collect_file_names():
    # This function collects the names of the .txt files in the folder
    file_names = []
    for file in Path("ACL txt").rglob("*.txt"):
        file_names.append(file.name)
    return file_names

def termFrequencyInDoc(wordList):
    # This function computes the term frequencies of a document and returns them as a dictionary
    return dict(Counter(wordList))

def inverseDocFre():
    # This function calculates the IDF of every unique word
    list_dic_word = []
    for i in list_dic:
        # collect the keys of every per-document dictionary,
        # where each dictionary belongs to one document
        list_dic_word.extend(list(i.keys()))
    df_corpus = dict(Counter(list_dic_word))  # no. of docs that contain each word
                                              # keys = word, values = document frequency
    M = len(list_dic)  # number of documents
    idf_corpus = {}    # dictionary for the IDF values
    for word in df_corpus:
        idf_corpus[word] = np.log2((M + 1) / df_corpus[word])  # IDF for every unique word
    return idf_corpus
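
A minimal sketch of a first step toward the sparse version asked about: the same IDF values computed once as a NumPy array over a fixed word-to-column mapping, which is what a sparse matrix needs. The helper name idf_vector and the vocab mapping are illustrative assumptions, not part of the original code.

import numpy as np
from collections import Counter

def idf_vector(list_dic):
    # document frequency: in how many documents each word appears
    # (each per-document dict contributes each of its keys exactly once)
    df = Counter()
    for tf in list_dic:
        df.update(tf.keys())
    vocab = {word: idx for idx, word in enumerate(sorted(df))}  # word -> column index
    M = len(list_dic)  # number of documents
    idf = np.empty(len(vocab))
    for word, idx in vocab.items():
        idf[idx] = np.log2((M + 1) / df[word])  # same formula as inverseDocFre above
    return vocab, idf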

def tfidf(docList):
    for i in range(len(docList)):
        docList[i] = wordList_removePuncs(docList[i])  # function call: clean and tokenize each doc
    for i in docList:
        # function call: append one term-frequency dictionary per document to the global list
        list_dic.append(termFrequencyInDoc(i))


def read_file():
    # This function reads the files from the folder
    documents_list = []
    doclist = collect_file_names()
    for i in doclist:
        file = open("C:\\Users\\Zed\\PycharmProjects\\ACL txt\\" + i, 'r',
                    encoding="utf8", errors='ignore')
        text = file.read().strip()
        file.close()
        documents_list.append(text)
    return documents_list

list_dic = []  # global list of per-document term-frequency dictionaries

def main_fun():
    documents_list = read_file()
    tfidf(documents_list)

main_fun()  # main function call

idf_corpus = inverseDocFre()  # inverse document frequency

# This loop computes the TF-IDF weight for every (word, document) pair

for word in idf_corpus.keys():
    for i in range(len(list_dic)):  # iterate over documents; list_dic[i] is the term-frequency dict of doc i
        list_dic[i][word] = list_dic[i].get(word, 0) * idf_corpus[word]  # C(w_i, doc) * IDF(w_i)
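
This nested loop is the bottleneck: it writes a weight for every (word, document) pair, so almost all of the 21,000 x |vocabulary| entries it creates are zeros. A minimal sketch of the sparse alternative, assuming SciPy is available and reusing the vocab/idf pair from the sketch above (build_tfidf_matrix is an illustrative name, not the original code): only the terms that actually occur in a document get stored.

from scipy.sparse import csr_matrix, diags

def build_tfidf_matrix(list_dic, vocab, idf):
    rows, cols, vals = [], [], []
    for doc_id, tf in enumerate(list_dic):
        for word, count in tf.items():  # only words that occur in this document
            rows.append(doc_id)
            cols.append(vocab[word])
            vals.append(count)
    tf_matrix = csr_matrix((vals, (rows, cols)),
                           shape=(len(list_dic), len(vocab)))
    # right-multiplying by a diagonal IDF matrix scales column j by idf[j],
    # i.e. C(w_j, doc) * IDF(w_j), without materializing any zeros
    return tf_matrix @ diags(idf)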


query = "Natural language Processing" #the query
query=query.lower()

query_vocab = [] 
for word in query.split():
   if word not in query_vocab:
      query_vocab.append(word)
query_wc = {} # a dictionary to store count of a word in the query (i.e x_i according to lecture 
               slides terminology)
for word in query_vocab:
     query_wc[word] = query.split().count(word)


relevance_scores = {}  # maps doc_id (key) to the relevance score of that document (value)
for doc_id in range(len(list_dic)):
    score = 0  # initialize the score for the doc to 0 at the start
    for word in query_vocab:
        # count of the word in the query * TF-IDF weight of the word in the doc
        score += query_wc[word] * list_dic[doc_id].get(word, 0)
    relevance_scores[doc_id] = score
relevance_scores = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)
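
With such a matrix, the whole scoring loop above collapses into one sparse matrix-vector product, which is the dot-product formulation asked about. Again only a sketch under the same assumptions (rank_documents is an illustrative name):

import numpy as np

def rank_documents(query, vocab, tfidf_matrix, top_k=10):
    q = np.zeros(len(vocab))
    for word in query.lower().split():
        if word in vocab:        # ignore query words outside the corpus vocabulary
            q[vocab[word]] += 1  # query term count, the x_i from the lecture slides
    scores = tfidf_matrix @ q    # one dot product replaces the per-document loop
    ranked = np.argsort(scores)[::-1]  # doc ids, best score first
    return [(int(i), float(scores[i])) for i in ranked[:top_k]]

Calling rank_documents("natural language processing", vocab, tfidf_matrix) should produce the same ranking as relevance_scores above, while keeping memory proportional to the nonzero entries instead of documents x vocabulary.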

I am attaching the files via this link:

[https://drive.google.com/open?id=1D1GjN_JTGNBv9rPNcWJMeLB_viy9pCfJ]

0 Answers:

There are no answers