I can easily get the most frequently used words with the following:
stopwords = set(nltk.corpus.stopwords.words('english'))
tagged_words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(tagged_words)
# Remove single-character tokens (mostly punctuation)
tagged_words = [tagged_word for tagged_word in tagged_words if len(tagged_word[0]) > 1]
# Remove numbers
tagged_words = [tagged_word for tagged_word in tagged_words if not tagged_word[0].isnumeric()]
# Remove stopwords
if remove_stopwords:
    tagged_words = [tagged_word for tagged_word in tagged_words if tagged_word[0] not in stopwords]
# Lemmatize the remaining words
lemmatizer = nltk.stem.WordNetLemmatizer()
words = []
for tagged_word in tagged_words:
    pos = wordnet_pos_code(tagged_word[1])
    # Ignore all words except nouns, verbs, adjectives and adverbs
    if pos is not None:
        words.append({'word': lemmatizer.lemmatize(tagged_word[0], pos=pos), 'pos': tagged_word[1]})
# Calculate the frequency distribution
# (note: FreqDist hashes its items, so feeding it dicts will fail;
# Answer 1 below switches to tuples for exactly this reason)
fdist = nltk.FreqDist(words)
# Return the top words_count words
res = []
for word, frequency in fdist.most_common(words_count):
    word_dict = {}
    word_dict['word'] = word
    word_dict['count'] = frequency
    res.append(word_dict)
return res
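For reference, the wordnet_pos_code helper called above is not shown in the question. A minimal sketch of such a helper, assuming it maps Penn Treebank tags to WordNet POS constants and returns None for anything else (so that the caller can skip those words):

from nltk.corpus import wordnet

def wordnet_pos_code(tag):
    # Map a Penn Treebank tag to the matching WordNet POS constant;
    # return None for anything that is not a noun, verb,
    # adjective or adverb.
    if tag.startswith('NN'):
        return wordnet.NOUN
    if tag.startswith('VB'):
        return wordnet.VERB
    if tag.startswith('JJ'):
        return wordnet.ADJ
    if tag.startswith('RB'):
        return wordnet.ADV
    return None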
But I get results like 'Brown' as a person's name and 'brown' as a color, and they are not the same word. OK, I could tell them apart by capitalization. But what if I get something like:

Brown is not only a color. Brown is a part of a lifestyle. Mr. Brown should agree with me.

So, nltk does the POS analysis quite well. But how do I get the most common words depending on their POS?
Answer 0 (score: 0)
Try a defaultdict with the words as keys and Counter objects as the values. For the inner Counter, the keys are the POS tags and the values are the counts of that POS for the given word.
>>> from collections import defaultdict, Counter
>>> from nltk.corpus import brown
>>> corpus_counts = defaultdict(Counter)
>>> for word, tag in brown.tagged_words():
...     # the defaultdict creates the inner Counter on first access,
...     # so no membership check is needed
...     corpus_counts[word][tag] += 1
...
>>>
>>> corpus_counts['run']
Counter({u'VB': 122, u'NN': 52, u'VBN': 31, u'VBD': 1})
>>> corpus_counts['run']['VB']
122
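To then get the most common words for one particular POS (the original question), the nested structure can be inverted into a plain Counter per tag. A sketch, not part of the original answer, using 'NN' (singular noun) as an example tag:

>>> nouns = Counter({word: tags['NN']
...                  for word, tags in corpus_counts.items()
...                  if 'NN' in tags})
>>> nouns.most_common(5)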
Answer 1 (score: 0)
I found a better solution: collect the word and its POS together as an array of tuples and get their frequencies with FreqDist.
So now I can sort them into arrays by POS (see the sketch after the code below).
tagged_words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(tagged_words)
lemmatizer = nltk.stem.WordNetLemmatizer()
words = []
for tagged_word in tagged_words:
    pos = wordnet_pos_code(tagged_word[1])
    if pos is not None:
        # (lemma, tag) tuples are hashable, so FreqDist can count them
        words.append((lemmatizer.lemmatize(tagged_word[0], pos=pos), tagged_word[1]))
fdist = nltk.FreqDist(words)
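The final "sort them into arrays" step is not shown in the answer. A minimal sketch, assuming one frequency-ordered list per POS tag is wanted:

from collections import defaultdict

# Group the (lemma, tag) counts by POS, most frequent first
by_pos = defaultdict(list)
for (word, tag), frequency in fdist.most_common():
    by_pos[tag].append({'word': word, 'count': frequency})

# by_pos['NN'] now lists the nouns by frequency,
# by_pos['VB'] the base-form verbs, and so on.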