我已经检索了维基百科的文章,并提取了几个与气候变化有关的网址,并将其内容保存为文件名。现在,我想使用tfidf找出所有语料库中最受欢迎的词。这是我的代码:
from nltk.corpus import stopwords as stop
def termFrequency(term, doc):
"""
Input: term: Term in the Document, doc: Document
Return: Normalized tf: Number of times term occurs
in document/Total number of terms in the document
"""
# Splitting the document into individual terms
normalizeTermFreq = doc.lower().split()
# Number of times the term occurs in the document
term_in_document = normalizeTermFreq.count(term.lower())
# Total number of terms in the document
len_of_document = float(len(normalizeTermFreq ))
# Normalized Term Frequency
normalized_tf = term_in_document / len_of_document
print("TFVAL",normalized_tf)
return normalized_tf
def inverseDocumentFrequency(term, allDocs):
num_docs_with_given_term = 0
"""
Input: term: Term in the Document,
allDocs: List of all documents
Return: Inverse Document Frequency (idf) for term
= Logarithm ((Total Number of Documents) /
(Number of documents containing the term))
"""
# Iterate through all the documents
for doc in allDocs:
"""
Putting a check if a term appears in a document.
If term is present in the document, then
increment "num_docs_with_given_term" variable
"""
with open("2009_United_Nations_Climate_Change_Conference.txt","r+",encoding="utf-8") as f:
doc=f.read().split()
for word in doc:
print(word)
result=re.match(r'^[a-zA-Z]+$',word)
if result is not None:
document.append(word)
document = ' '.join([i for i in document if i not in stop.words()])
if term.lower() in document.lower().split():
num_docs_with_given_term += 1
f.close()
if num_docs_with_given_term > 0:
# Total number of documents
total_num_docs = len(allDocs)
# Calculating the IDF
idf_val = log(float(total_num_docs) / num_docs_with_given_term)
print("IDF_VALUE:",idf_val)
return idf_val
else:
return 0
def start(file,alldocs):
#my_function()
import nltk,re
document=[]
# print(count)
with open(file,"r+",encoding="utf-8") as f:
doc=f.read().split()
for word in doc:
try:
print(word)
result=re.match(r'^[a-zA-Z]+$',word)
if result is not None:
document.append(word)
except Exception:
pass
document = ' '.join([i for i in document if i not in stop.words()])
print("xxxxxxxxxxxxxx",document,"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
words = nltk.tokenize.word_tokenize(document)
print("xxxxxxxxxxxxxx",words,"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx###################")
words = [word for word in words if word not in stop.words()]
print("xxxxxxxxxxxxxx",words,"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx###################33")
fdist = nltk.FreqDist(words)
for term in fdist:
tf-idf(term, file) = tf(term, file)* idf(term, alldocs)```
我的代码显示了此部分代码中的问题:
for term in fdist:
tf-idf(term, file) = tf(term, file)* idf(term, alldocs)
此处“文件”是指文件名,而alldocs包含所有与气候变化相关的文本文件的列表。