来自:python

时间:2016-03-15 10:46:33

标签: python matplotlib graph nltk tokenize

嘿,大家好。我对 Python 和编程都很陌生。我有一个作业,需要读取一个 txt 文件,统计单词、对它们进行排名,并将结果绘制成图。除了最后的绘图部分之外,其余我都已经完成了。下面是我的代码,以及存放待绘制数据的列表的示例。

"""Read en.txt, tokenize it with NLTK and count token frequencies."""
# Consolidated imports: the original imported nltk three different ways
# and collections twice (once mid-script); only these two names are used.
from collections import Counter

from nltk.tokenize import word_tokenize

# Read the whole corpus into one string; the with-statement closes the file.
with open("en.txt") as file:
    data = file.read()

# NLTK's tokenizer splits into words AND punctuation tokens (',', '.', ...),
# which is why punctuation shows up in the counts below.
word_tokenize_list = word_tokenize(data)

# Frequency of every distinct token.
counts = Counter(word_tokenize_list)
print(counts)
Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883})

我的列表没有问题。我想要的只是一个提示:在这种情况下应该用什么来绘图,因为 plt.plot 对我不起作用。

3 个答案:

答案 0 :(得分:0)

对此最有用的图可能是条形图,可以使用此answer直接从字典中绘制,

import matplotlib.pyplot as plt

# Token frequencies copied from the question's Counter output.
# Renamed from `Counter`: that name shadows collections.Counter and
# PascalCase is conventionally reserved for classes (PEP 8).
word_counts = {',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883}

# Plot bar with values from dict and label each bar with its key.
plt.bar(range(len(word_counts)), word_counts.values(), align='center')
plt.xticks(range(len(word_counts)), word_counts.keys())

# Rotate labels by 90 degrees so you can read them without overlap.
locs, labels = plt.xticks()
plt.setp(labels, rotation=90)

plt.show()

看起来像,

enter image description here

答案 1 :(得分:0)

Matplotlib是一个使用非常广泛的图形库,可以与Python一起使用。

您可能希望首先根据某些排名标准对计数器数据进行排序,下面有两种可能的解决方案:

from collections import Counter
import matplotlib.pyplot as plt

# Token counts taken from the question.
data = Counter({',': 54224, 'the': 45990, '.': 42529, 'of': 25608, 'to': 24869, 'a': 21351, 'and': 17807, 'in': 17037, "'s": 10335, 'that': 8990, 'for': 8936, '$': 8218, '``': 7733, 'The': 7724, 'is': 7695, "''": 7510, 'said': 6462, 'on': 5718, '%': 5613, 'it': 5177, 'by': 5035, 'from': 4939, 'million': 4883})
xaxis = range(len(data))

# Ranking 1: ascending frequency (most_common is descending, so reverse it).
freq_pairs = data.most_common()[::-1]
keys_freq = [word for word, _ in freq_pairs]
values_freq = [count for _, count in freq_pairs]

# Ranking 2: by word length, ties broken alphabetically.
keys_length = sorted(data, key=lambda word: (len(word), word))
values_length = [data[word] for word in keys_length]

fig = plt.figure()

# Top subplot: bars ordered by frequency, labels rotated to stay legible.
plt.subplot(211)
plt.bar(xaxis, values_freq, align='center')
plt.xticks(xaxis, keys_freq)
locs, labels = plt.xticks()
plt.setp(labels, rotation=90)

# Bottom subplot: same bars ordered by word length.
plt.subplot(212)
plt.bar(xaxis, values_length, align='center')
plt.xticks(xaxis, keys_length)
locs, labels = plt.xticks()
plt.setp(labels, rotation=90)

fig.tight_layout()
plt.show()

给你:

Matplotlib screenshot

答案 2 :(得分:0)

最后,这是我在朋友的帮助下做的事情

# Importing all the necessary libraries
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import string

# NOTE(review): this script is Python 2 (`raw_input`,
# `str.translate(None, ...)`); it is kept that way for compatibility
# with the rest of the answer.


def _plot_zipf(count, title, extremes_label):
    """Plot frequency against rank together with the fitted y=K/x curve.

    count          -- list of (token, frequency) pairs, most frequent first
    title          -- plot title
    extremes_label -- legend %-template formatted with the (token, freq)
                      pairs of the most and least used tokens
    """
    # Rank always starts at 1; y holds the frequency at each rank.
    x = np.arange(1, len(count) + 1)
    y = np.array([freq for _, freq in count])
    # Zipf's law predicts x*y ~ constant; K is the averaged
    # proportionality constant, Ks its standard deviation.
    K, Ks = round(np.average(x * y), 2), round(np.std(x * y), 2)
    plt.plot(x, y, color='red', linewidth=3)
    plt.plot(x, K / x, color='green', linewidth=2)
    plt.xlabel('Rank')
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    # Invisible marker at the origin — presumably to force the axes to
    # include (0, 0); confirm before removing.
    plt.plot(0, 0, 'o', alpha=0)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.title(title)
    plt.legend(['Experimental data',
                'y=K/x, K=%s, $\delta_{K}$ = %s' % (K, Ks),
                extremes_label % (count[0], count[-1])],
               loc='best', numpoints=1)
    plt.show()


# Opening/reading/editing file
filename = raw_input('Filename (e.g. yourfile.txt): ')
cond = raw_input('What do you want to count? \n A) Words.\n B) Characters and     Punctuation. \n Choice: ')
# 'r' allows us to read the file; the with-statement closes the handle
# (the original leaked it).  Renamed from `file`, a Python 2 builtin.
with open(filename, 'r') as source:
    text = source.read()
# Lowercase everything so sentence-initial capitalised words merge with
# their lowercase forms.
text = text.lower()
if cond in ['A', 'a', 'A)', 'a)']:
    # Punctuation to strip before word-counting.
    # BUG FIX: the original wrote `"''" '&'` with a missing comma; implicit
    # string concatenation turned that into the single element "''&", so
    # '&' was never removed.  Also renamed from `set`, which shadowed the
    # builtin.
    punctuation = ['!', '#', '"', '%', '$', "''", '&', ')', '(', '+', '*', '--', ',', '/', '.', ';', ':', '=', '<', '?', '>', '@', '[', ']', '\\', '_', '^', '`', '{', '}', '|', '~']
    # NOTE(review): this filter inspects single characters, so the
    # multi-character entries "''" and '--' never match anything; hyphens
    # and double dashes are left in the text as in the original.
    text = "".join(l for l in text if l not in punctuation)
    # Splitting the text into separate words, creating a list of strings.
    text = text.split()
    # Rank words by how often they appear, most frequent first.
    count = Counter(text).most_common()
    _plot_zipf(count,
               "Testing Zipf's Law: the relationship between the frequency and rank of a word in a text",
               'Most used word=%s, least used=%s')
elif cond in ['B', 'b', 'B)', 'b)']:
    # Drop whitespace, keep every other character (letters, digits,
    # punctuation).  Python 2 str.translate signature; Python 3 would
    # need str.maketrans instead.
    text = text.translate(None, string.whitespace)
    count = Counter(text).most_common()
    _plot_zipf(count,
               "Testing Zipf's Law: the relationship between the frequency and rank of a character/punctuation,  in a text",
               'Most used character=%s, least used=%s')