I am building a vector space model with TF-IDF. I have finished the code, but because of memory and speed problems I want to convert it to build a sparse matrix. Our instructor told us not to use a tf-idf library to compute TF-IDF, so I have to do it manually. My code gets stuck when the number of documents grows: I have 21,000 documents, and it works fine on 10,000, but with more documents it blocks in the TF-IDF calculation because of the two nested for loops. I also tried ThreadPoolExecutor to speed the code up, but got no improvement. Please help me improve this code: how can I convert it to a sparse matrix and use a dot product for the TF-IDF calculation? Here is my code:
import nltk
import string
import numpy as np
from nltk.corpus import stopwords
from pathlib import Path
from collections import Counter
english_stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation)
def list_comprehension(nopunc):
    # This function removes stop words from the token list
    return [word for word in nopunc.split() if word not in english_stopwords]
def wordList_removePuncs(doc):
    """
    1: Remove punctuation
    2: Remove stop words
    3: Return a list of words
    """
    nopunc = [char for char in doc if char not in punctuation]
    nopunc = ''.join(nopunc)
    nopunc = nopunc.lower()
    return list_comprehension(nopunc)
def collect_file_names():
    # This function collects the names of the .txt files in the folder
    file_names = []
    for file in Path("ACL txt").rglob("*.txt"):
        file_names.append(file.name)
    return file_names
def termFrequencyInDoc(wordList):
    # This function finds the term frequencies of a document and returns them as a dictionary
    return dict(Counter(wordList))
def inverseDocFre():
    # This function calculates the IDF of every unique word
    list_dic_word = []
    for i in list_dic:
        # collect every key from the per-document dictionaries (one dict per document)
        list_dic_word.extend(list(i.keys()))
    df_corpus = dict(Counter(list_dic_word))  # no. of docs that contain each word
    # keys = word, values = document frequency
    M = len(list_dic)  # number of documents
    idf_corpus = {}  # dictionary for IDF
    for word in df_corpus:
        idf_corpus[word] = np.log2((M + 1) / df_corpus[word])  # IDF for every unique word
    return idf_corpus
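(For this step alone, I imagine the per-document loop could be collapsed into one Counter pass and a vectorized log, something like the sketch below; df_counter, vocab and idf_vector are my own names, not part of my code, and numpy and Counter are already imported at the top of the file.)

# Sketch: vectorized IDF over the filled list_dic (hypothetical names)
df_counter = Counter(word for doc in list_dic for word in doc)  # df per word, one pass
vocab = sorted(df_counter)                                      # fixed word order
M = len(list_dic)
doc_freqs = np.array([df_counter[w] for w in vocab], dtype=np.float64)
idf_vector = np.log2((M + 1) / doc_freqs)                       # same formula as above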
def tfidf(docList):
    for i in range(len(docList)):
        docList[i] = wordList_removePuncs(docList[i])  # tokenize every document
    for i in docList:
        list_dic.append(termFrequencyInDoc(i))  # append one term-frequency dict per document
def read_file():
    # This function reads the files from the folder
    documents_list = []
    doclist = collect_file_names()
    for i in doclist:
        file = open("C:\\Users\\Zed\\PycharmProjects\\ACL txt\\" + i, 'r',
                    encoding="utf8", errors='ignore')
        text = file.read().strip()
        file.close()
        documents_list.append(text)
    return documents_list
list_dic = []  # global list of per-document term-frequency dictionaries
def main_fun():
    documents_list = read_file()
    tfidf(documents_list)
main_fun()  # main function call
idf_corpus = inverseDocFre()  # inverse document frequency
# This double loop calculates TF-IDF for every document (this is the part that blocks)
for word in idf_corpus.keys():
    for i in range(len(list_dic)):  # iterate over documents; i is the doc_id
        list_dic[i][word] = list_dic[i].get(word, 0) * idf_corpus[word]  # C(w_i, doc) * IDF(w_i)
query = "Natural language Processing" #the query
query=query.lower()
query_vocab = []
for word in query.split():
if word not in query_vocab:
query_vocab.append(word)
query_wc = {} # a dictionary to store count of a word in the query (i.e x_i according to lecture
slides terminology)
for word in query_vocab:
query_wc[word] = query.split().count(word)
relevance_scores = {} # a dictionary that will store the relevance score for each doc
# doc_id will be the key and relevance score the value for this dictionary
for doc_id in range(len(list_dic)):
score = 0 #initialze the score for the doc to 0 at the start
for word in query_vocab:
score += query_wc[word] * list_dic[doc_id][word] # count of word in query * term_freq of
the word
relevance_scores[doc_id] = score
relevance_scores = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)
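What I have in mind is replacing the dictionary-of-dictionaries with a SciPy sparse matrix and scoring the query against all documents with one dot product. Below is a rough sketch of the direction I am thinking of; I am assuming scipy.sparse is allowed (it is a general matrix library, not a tf-idf library), all helper names (word_index, tf, tfidf_matrix, ...) are mine and not part of the code above, and it assumes list_dic still holds the raw {word: count} dictionaries, i.e. it replaces the TF-IDF double loop.

# Sketch: sparse TF-IDF with dot-product scoring (hypothetical names)
from collections import Counter
import numpy as np
from scipy.sparse import csr_matrix, diags

df_counter = Counter(word for doc in list_dic for word in doc)  # document frequency
vocab = sorted(df_counter)
word_index = {w: j for j, w in enumerate(vocab)}                # word -> column id
M = len(list_dic)
idf = np.log2((M + 1) / np.array([df_counter[w] for w in vocab], dtype=np.float64))

# Build a CSR term-frequency matrix with one row per document.
rows, cols, vals = [], [], []
for i, doc in enumerate(list_dic):
    for word, count in doc.items():
        rows.append(i)
        cols.append(word_index[word])
        vals.append(count)
tf = csr_matrix((vals, (rows, cols)), shape=(M, len(vocab)), dtype=np.float64)

# Scale column j by idf[j]: right-multiplying by a diagonal matrix keeps it sparse.
tfidf_matrix = tf @ diags(idf)

# Score the query against every document with a single dot product.
q = np.zeros(len(vocab))
for word, count in Counter("natural language processing".split()).items():
    if word in word_index:            # ignore out-of-vocabulary query words
        q[word_index[word]] = count
scores = tfidf_matrix @ q             # dense array: one relevance score per doc
ranked_doc_ids = np.argsort(scores)[::-1]  # doc ids, best match first

Is this roughly the right way to do it, and would the dot product here be fast enough for 21,000 documents?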
I am attaching the files via this link:
[https://drive.google.com/open?id=1D1GjN_JTGNBv9rPNcWJMeLB_viy9pCfJ]