I wrote some code to find the term frequency and document frequency of the words contained in the files stored at the location path. Each file goes through the function cleanDoc()
to extract the words from the text file, and I want the term frequencies reported in a tabular fashion, so that all the words from all the documents are taken into account when computing the counts. Can anyone tell me how I should implement this? I am only using NLTK.
import collections
import os.path
import glob
import nltk
wdict = set()
path = "C://Python27//Corpus Files//*.*"
#this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final

for text in glob.glob(path):
    f = open(text)
    data = f.read()
    words = cleanDoc(data)
    wdict.update(words)
Answer 0 (score: 0)
You can use a FreqDist object from nltk.probability to compute these word counts. Afterwards you can navigate it through its dict-like key-value interface and methods (e.g. freq.items() or freq['word']), or even plot the results with matplotlib.
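For instance, here is a minimal sketch of that interface; the token list is made up purely for illustration:

import nltk
from nltk.probability import FreqDist

tokens = ['term', 'foo', 'term', 'bar', 'term']  # hypothetical cleaned tokens
fd = FreqDist(tokens)

print(fd['term'])       # count of a single word -> 3
print(fd.items())       # (word, count) pairs for every word
print(fd.N())           # total number of tokens counted -> 5
print(fd.freq('term'))  # relative frequency -> 3/5 = 0.6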
import collections
import os.path
import glob
import nltk
from nltk.probability import FreqDist
term_frequency = {}
path = "C://Python27//Corpus Files//*.*"
#this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 3 and token.isalpha()]
    final = [stemmer.stem(word) for word in clean]
    return final
for text in glob.glob(path):
    f = open(text)
    data = f.read()
    f.close()
    words = cleanDoc(data)
    numbers_of_words = len(words)
    freq = FreqDist(words)  # count every cleaned/stemmed token in this file
    # term_frequency is a dict whose structure is like:
    # {
    #     'path_to_file':
    #         {'term': 13.4, 'another_term': 15},
    #     'another_file':
    #         {'term2': 12, 'foo': 15}
    # }
    for term in freq.keys():
        if text in term_frequency:
            term_frequency[text][term] = float(freq[term]) / numbers_of_words
        else:
            term_frequency[text] = {term: float(freq[term]) / numbers_of_words}
Reference: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.FreqDist-class.html
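The question also asks for document frequency (how many files each term appears in), which the snippet above does not compute. A possible follow-up sketch, reusing cleanDoc() and path from above; document_frequency is just an illustrative name:

import glob
import collections

document_frequency = collections.Counter()

for text in glob.glob(path):
    f = open(text)
    words = cleanDoc(f.read())
    f.close()
    # wrap in set() so each term is counted at most once per document
    document_frequency.update(set(words))

# e.g. document_frequency['term'] -> number of files containing 'term'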