我几乎没有编程方面的实践经验,但是我已经开始学习python,并希望创建一个用于计算文本中最常用单词的函数。现在,我确定我的版本不是执行此操作的最佳方法,但是它可以工作:
import os
punctuation = "~!@#$%^&*()_-=+[{]}\\|'\";:,<.>/?"
def remove_punctuation(text):
text_wo_punctuation = ""
for word in text:
if word not in punctuation:
text_wo_punctuation += word
return text_wo_punctuation
with open(r'New Text Document.txt') as f:
text = f.read().lower()
t = remove_punctuation(text).split()
dictionary = {}
for word in t:
if word in dictionary:
dictionary[word] = dictionary[word] + 1
else:
dictionary[word] = 1
print(dictionary)
def top_five(d):
top = {}
value1 = 0
value2 = 0
value3 = 0
value4 = 0
value5 = 0
for key in dictionary:
if value1 < dictionary[key] and key not in top:
value1 = dictionary[key]
top1 = {key:value1}
else:
continue
top.update(top1)
for key in dictionary:
if value2 < dictionary[key] and key not in top:
value2 = dictionary[key]
top2 = {key:value2}
else:
continue
top.update(top2)
for key in dictionary:
if value3 < dictionary[key] and key not in top:
value3 = dictionary[key]
top3 = {key:value3}
else:
continue
top.update(top3)
for key in dictionary:
if value4 < dictionary[key] and key not in top:
value4 = dictionary[key]
top4 = {key:value4}
else:
continue
top.update(top4)
for key in dictionary:
if value5 < dictionary[key] and key not in top:
value5 = dictionary[key]
top5 = {key:value4}
else:
continue
top.update(top5)
return top
print(top_five(dictionary))
上面的代码将给出以下输出:
{'word1':“ freq1”,'word2':“ freq2”,'word3':“ freq3”,'word4':“ freq4”,'word5':“ freq5”}}
尽管这是我想要的结果,但我已尝试简化我的功能,并让用户选择应该计算多少个单词的频率:
def top_five(d,n):
top = {}
values = {}
for i in range(1,n+1):
values["value"+str(i)]=0
for i in range(1,n+1):
top["top"+str(i)]=0
for i in range(1,n+1):
for key in dictionary :
if values["value"+str(i)] < dictionary[key] and key not in top:
values["value"+str(i)] = dictionary[key]
top["top"+str(i)] = {key:values["value"+str(i)]}
else:
continue
top.update(top1)
print(top)
return top
此代码将创建一个可以在循环中使用的具有value1,value2等的字典,以及具有top1,top2等的另一本字典,但是它将不起作用,因为“和键不在顶部”将不起作用。
top["top"+str(i)] = {key:values["value"+str(i)]}
这将在字典中创建字典。我被困在这里,因为我找不到使“ top”字典有用或在循环内迭代变量名的方法。我已经读过应该使用列表或字典,并且变量名迭代不是一个好主意,但是我不明白为什么这样做,并且我想不出一种使列表或字典在for循环中有用的方法。
正如我所说,我知道这可能不是实现这种功能的最佳方法,但我的问题是:如何简化已经完成的工作并使循环正常工作?
谢谢!
答案 0 :(得分:1)
我已经按照Barmar的建议更新了代码:
def remove_punctuation(text):
""""Removes punctuation characters from given text"""
punctuation = "~`!@#$%^&*()_-=+[{]}\\|'\";:,<.>/?"
text_wo_punctuation = ""
for word in text:
if word not in punctuation:
text_wo_punctuation += word
return text_wo_punctuation
def count_words(file):
"""Returns a dictionary of words and word count from "file" """
with open(file) as f:
text = remove_punctuation(f.read()).lower().split()
dictionary = {}
for word in text:
# print(word)
if word in dictionary:
dictionary[word] = dictionary[word] + 1
# print("**Existing**")
else:
dictionary[word] = 1
# print("**New**")
# print(dictionary[word])
return dictionary
#print(dictionary)
def dict_sort(d, reverse = False):
"""Sort given dictionary "d" in ascending (default)
or descending (reverse = True) order
Outputs tuple of: list of keys, list of values and dictionary
Recommended format for output: a,b,c = dict_sort(d)"""
key_list = []
value_list = []
for key in d:
key_list.append(key)
value_list.append(d[key])
#print(key_list)
#print(value_list)
for i in range(len(value_list)-1):
for i in range(len(value_list)-1):
if reverse == False:
if value_list[i] > value_list[i+1]:
value_list[i],value_list[i+1] = value_list[i+1],value_list[i]
key_list[i],key_list[i+1] = key_list[i+1],key_list[i]
elif reverse == True:
if value_list[i] < value_list[i+1]:
value_list[i],value_list[i+1] = value_list[i+1],value_list[i]
key_list[i],key_list[i+1] = key_list[i+1],key_list[i]
d = {}
for i in range(len(value_list)):
d[key_list[i]] = value_list[i]
sorted_dict = d
return key_list,value_list,sorted_dict
def word_freq():
"""Input how many words to plot on graph"""
while True:
try:
n_freq = int(input("How many of the most frequent words would you like to display?\n"))
if (n_freq < 1 or n_freq > 10):
print("Please input an integer between 1 and 10:")
continue
except(ValueError):
print("Please input an integer between 1 and 10:")
continue
else:
break
return n_freq
def graph_word_freq(n,f,w): #create function to draw chart
"""Draw bar chart of most frequent words in text
n: number of words to plot (between 1 and 10)
f: word frequency list
w: word list"""
import turtle #import turtle module
window = turtle.Screen() #create screen
window.bgcolor("honeydew") #define screen color
window.title("Most Frequent Words") #set window title
if f[0] < 960:
y = 500
y_pos = -480
width = 60
spacing = 20
x_pos = -(30+40*(n-1))
else:
width = 100
spacing = 40
y = f[0]/2+20
y_pos = -f[0]/2
x_pos = -(50+70*(n-1))
#turtle.screensize(y,y) #set window size
turtle.setworldcoordinates(-y,-y,y,y)
tortoise = turtle.Turtle() #create turtle
tortoise.hideturtle() #hide turtle stamp
tortoise.penup() #raise turtle pen
tortoise.setposition(x_pos,y_pos) #position turtle
tortoise.pendown() #put turtle pen down
tortoise.speed(5) #set drawing speed
for i in range(n):
if abs(f[i]) < ((f[0]-f[n])/3):
tortoise.color("SeaGreen","ForestGreen") #set turtle color
elif abs(f[i]) >= ((f[0]-f[n])/3) and abs(f[i]) < ((f[0]-f[n])/1.5):
tortoise.color("orange","gold") #set turtle color
else:
tortoise.color("coral3","IndianRed") #set turtle color
tortoise.begin_fill() #begin drawing shapes
tortoise.left(90)
tortoise.forward(f[i]) #draw bar height
tortoise.right(90)
tortoise.forward(1/3*width) #prepare for text
if f[i] >= 0:
tortoise.write(f[i]) #write value
else:
tortoise.penup()
tortoise.right(90)
tortoise.forward(15)
tortoise.write(f[i])
tortoise.forward(-15)
tortoise.left(90)
tortoise.pendown()
tortoise.forward(2/3*width) #bar width
tortoise.right(90)
tortoise.forward(f[i])
tortoise.left(90)
tortoise.penup()
tortoise.right(90)
tortoise.forward(25)
tortoise.left(90)
tortoise.forward(-2/3*width)
tortoise.write(w[i]) #write word
tortoise.forward(2/3*width)
tortoise.left(90)
tortoise.forward(25)
tortoise.right(90)
tortoise.forward(spacing) #spacing
tortoise.pendown()
tortoise.end_fill() #stop drawing shapes
turtle.exitonclick()
dictionary = count_words("New Text Document.txt")
words,values,dictionary = dict_sort(dictionary, reverse = True)
n_freq = word_freq()
graph_word_freq(n_freq,values,words)
现在可以正常使用了。谢谢,队友!
答案 1 :(得分:1)
了解您想实现自己的冒泡排序,而不是使用Python的快速排序,并且您想要自己对单词进行计数,而不是使用Counter,让我们收紧代码以利用Python惯用法并减轻负担在可怜的乌龟上:
Location