我试图从我从维基百科复制的文章中绘制前50个单词的单词频率。我看过How to plot the number of times each element is in a list Python: Frequency of occurrences和Using Counter() in Python to build histogram?这似乎是一个很有希望的结果,直到我意识到解决方案不能保持Counter()
的顺序。有没有办法在绘图时保留Counter()
的下降?
我用来播放数据的代码:
# Standard Library
import collections
from collections import Counter
import itertools
import re
# Third Party Library
import matplotlib.pyplot as plt
import nltk
import numpy as np
file = '...\\NLP\\Word_Embedding\\Basketball.txt'
text = open(file, 'r').read()
text = re.sub(r'([\"\'.])([\)\[,.;])', r'\1 \2', text)
vocab = text.split()
vocab = [words.lower() for words in vocab]
print('There are a total of {} words in the corpus'.format(len(vocab)))
tokens = list(set(vocab))
print('There are {} unique words in the corpus'.format(len(tokens)))
vocab_labels, vocab_values = zip(*Counter(vocab).items())
vocab_freq = Counter(vocab)
indexes = np.arange(len(vocab_labels[:10]))
width = 1
# plt.bar(indexes, vocab_values[:10], width) # Random 10 items from list
# plt.xticks(indexes + width * 0.5, vocab_labels[:10])
# plt.show()
链接到Basketball.txt文件
答案 0 :(得分:1)
您可以根据vocab_values
对vocab_freq
进行排序,然后使用[::-1]
进行反向:
import collections
from collections import Counter
import itertools
import re
# Third Party Library
import matplotlib.pyplot as plt
import nltk
import numpy as np
file = '.\Basketball.txt'
text = open(file, 'r').read()
text = re.sub(r'([\"\'.])([\)\[,.;])', r'\1 \2', text)
vocab = text.split()
vocab = [words.lower() for words in vocab]
print('There are a total of {} words in the corpus'.format(len(vocab)))
tokens = list(set(vocab))
print('There are {} unique words in the corpus'.format(len(tokens)))
vocab_labels, vocab_values = zip(*Counter(vocab).items())
vocab_freq = Counter(vocab)
sorted_values = sorted(vocab_values)[::-1]
sorted_labels = [x for (y,x) in sorted(zip(vocab_values,vocab_labels))][::-1]
indexes = np.arange(len(sorted_labels[:10]))
width = 1
plt.bar(indexes, sorted_values[:10] ) # Random 10 items from list
plt.xticks(indexes + width * 0.5, sorted_labels[:10])
plt.show()
结果: