I am trying to extract the verbs and nouns (in their stemmed form) from every sentence of a text file that contains 1000 questions. I want to compute the overall frequency of each verb and noun across all questions (number of occurrences / number of questions), and then find all common verbs and nouns whose overall frequency is greater than a threshold (e.g. 0.2; the threshold should be a parameter). For example, a stem that occurs 300 times across the 1000 questions has a frequency of 300/1000 = 0.3, so it would pass a 0.2 threshold.
So the output would look like: Q1: frequent verb1, frequency1; frequent verb2, frequency2; ...; frequent noun1, frequency1; ...
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

with open('test.txt', encoding='utf8') as Test_file:
    text = Test_file.read()
def stop_words(a):
    # filter out English stop words, lower-casing every remaining word
    stops = set(stopwords.words('english'))
    important_words = []
    for w in a:
        if w.lower() not in stops:
            important_words.append(w.lower())
    return important_words
def take_words(a):
    # split the text into sentences, tokenize each one, then POS-tag the tokens
    sentences = nltk.sent_tokenize(a)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
def stem_words(a):
    # reduce each word to its stem with the Porter stemmer
    words_stem = []
    PS = PorterStemmer()
    for r in a:
        words_stem.append(PS.stem(r).lower())
    return words_stem
def chunking(a):
    # group consecutive noun (NN*) and verb (VB*) tags into chunks
    Grammar = r"""
    NN: {<NN.*>+}
    VB: {<VB.*>+}
    """
    ChunkParser = nltk.RegexpParser(Grammar)
    chunks = []
    for w in a:
        chunks.append(ChunkParser.parse(w))
    return chunks
Now I don't know what to do next. Can anyone give me some help?
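Here is a rough sketch of what I imagine the next step might look like, assuming each line of test.txt is a separate question; frequent_words and the questions list are just names I made up, and I am not sure this is the right way to define the frequency:

from collections import Counter

def frequent_words(questions, threshold=0.2):
    # count how many times each stemmed verb/noun occurs across all questions
    counts = Counter()
    PS = PorterStemmer()
    for q in questions:
        for sent in take_words(q):            # POS-tagged sentences
            for word, tag in sent:
                if tag.startswith('VB') or tag.startswith('NN'):
                    counts[PS.stem(word).lower()] += 1
    n = len(questions)
    # keep only stems whose overall frequency (occurrences / number of questions)
    # exceeds the threshold
    return {w: c / n for w, c in counts.items() if c / n > threshold}

questions = [line.strip() for line in text.splitlines() if line.strip()]
print(frequent_words(questions, threshold=0.2))

Is this roughly the right direction, and how should the chunking function fit into it?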