我正在编写一个程序来执行以下操作:
以下是用于测试代码的文本文件数据:
This is before the start and should be ignored.
So should this
and this
*** START OF SYNTHETIC TEST CASE ***
a blah ba ba
*** END OF SYNTHETIC TEST CASE ***
This is after the end and should be ignored too.
Have a nice day.
到目前为止,这是我的代码:
import os
from collections import Counter
# Header line printed above the length/frequency table.
# NOTE(review): the spacing inside the two title strings may not line up with
# the column widths of the templates below - confirm against the intended output.
TABLE_TITLE = " Len Freq"
# One table row: word length right-aligned in 4 columns, count in 6 columns.
FREQ_TABLE_TEMPLATE = "{:>4}{:>6}"
# Header line printed above the horizontal bar graph.
GRAPH_TITLE = " Len Freq Graph"
# One graph row: length (width 4), percentage (width 5) followed by "%", then the bar.
GRAPH_LINE_TEMPLATE = "{:>4}{:>5}% {}"
def get_filename():
    """Keep prompting until the user names an existing file, then return it.

    Each rejected entry is echoed back followed by "not found..." before the
    prompt is shown again.

    Returns:
        The entered string for which ``os.path.isfile`` is true.
    """
    while True:
        candidate = input("Please enter filename: ")
        if os.path.isfile(candidate):
            return candidate
        print(candidate, "not found...")
def get_words_from_file(filename):
    """Load *filename* and return the cleaned-up words it contains.

    The lines returned by ``open_and_read`` are handed to ``strip_data`` and
    its result is returned unchanged.
    """
    return strip_data(open_and_read(filename))
def open_and_read(filename):
    """Return the lines located between the start and end marker lines.

    Collection begins after the first line starting with ``*** START`` and
    stops at the first later line starting with ``*** END OF``. Neither marker
    line is part of the result.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        The collected lines with their line endings intact, or an empty list
        when the file contains no start marker.
    """
    collected = []
    started = False
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, 'r', encoding='utf-8') as infile:
        for line in infile:
            if not started:
                # Skip everything up to and including the first start marker,
                # so the marker never has to be popped off afterwards.
                started = line.startswith("*** START")
                continue
            if line.startswith("*** END OF"):
                break
            collected.append(line)
    return collected
def strip_data(raw_data):
    """Turn raw text lines into the final word list.

    The lines are split by ``get_words`` and the pieces are then filtered by
    ``remove_punctuation``, whose result is returned.
    """
    return remove_punctuation(get_words(raw_data))
def get_words(raw_data):
    """Split each line on single spaces and trim punctuation from every piece.

    Leading and trailing newline, quote, hyphen, colon, semicolon, comma and
    full-stop characters are removed from each line and again from each piece.

    Args:
        raw_data: Iterable of text lines.

    Returns:
        A list holding one single-element list per piece, for example
        ``[['a'], ['blah']]``. The nesting is kept because the next stage
        (``remove_punctuation``) iterates two levels deep. Consecutive spaces
        produce empty-string pieces.
    """
    trim_chars = '\n"-:\';,.'
    # A piece never contains a space (the line was already split on spaces),
    # so wrapping it in a list matches what a second split(' ') would yield.
    return [
        [piece.strip(trim_chars)]
        for line in raw_data
        for piece in line.strip(trim_chars).split(' ')
    ]
def remove_punctuation(stripped_list):
    """Keep only the purely alphabetic words, converted to lower case.

    Args:
        stripped_list: Iterable of iterables of strings.

    Returns:
        A flat list of the strings for which ``str.isalpha`` is true, each
        lower-cased. Empty strings and words containing digits, apostrophes
        or other non-letters are dropped.
    """
    return [
        chunk.lower()
        for piece in stripped_list
        for chunk in piece
        if chunk.isalpha()
    ]
def avg_word_length(words):
    """Return the mean length of the strings in *words*.

    Args:
        words: List of words.

    Returns:
        The average length as a float, or ``0.0`` for an empty list so that
        callers do not hit a division by zero.
    """
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)
def max_word_length(words):
    """Return the length of the longest word in *words*.

    Returns 0 when there are no words.
    """
    return max((len(word) for word in words), default=0)
def max_frequency(words):
    """Return how many times the most frequent word occurs in *words*.

    Returns 0 when *words* is empty instead of raising IndexError.
    """
    return max(Counter(words).values(), default=0)
def length_freq(words):
    """Count how many words there are of each length.

    Args:
        words: Iterable of words.

    Returns:
        A list of ``(length, count)`` tuples. The lengths that occur come
        first, most frequent first; every length from 1 up to the longest
        word that does not occur follows as ``(length, 0)``. The list is
        empty when there are no words.
    """
    counts = Counter(len(word) for word in words)
    len_freq = counts.most_common()
    if counts:
        # Fill the gaps so that every length below the maximum has a row.
        # Membership is tested on the Counter itself rather than on a list
        # rebuilt for every candidate length.
        len_freq.extend(
            (length, 0)
            for length in range(1, max(counts))
            if length not in counts
        )
    return len_freq
def print_length_table(words):
    """Print the length/frequency table for *words*.

    Output is a blank line, the table title, then one formatted row per
    ``(length, count)`` pair from ``length_freq`` in ascending order.
    """
    rows = sorted(length_freq(words))
    print()
    print(TABLE_TITLE)
    for length, count in rows:
        print(FREQ_TABLE_TEMPLATE.format(length, count))
def print_length_graph_hori(words):
    """Print the horizontal bar graph of word-length percentages.

    After a blank line and the graph title, each ``(length, percent)`` pair
    from ``get_percentage`` becomes one row whose bar is ``percent`` copies
    of "=".
    """
    print()
    print(GRAPH_TITLE)
    for length, percent in get_percentage(words):
        print(GRAPH_LINE_TEMPLATE.format(length, percent, "=" * percent))
def get_percentage(words):
    """Return the share of words at each length as whole percentages.

    Args:
        words: List of words.

    Returns:
        A list of ``(length, percent)`` tuples sorted by length, where
        ``percent`` is the count for that length as a percentage of
        ``len(words)``, rounded down to an integer.
    """
    total = len(words)
    # Integer arithmetic avoids float rounding: int(29 / 100 * 100) is 28,
    # whereas 29 * 100 // 100 is the expected 29.
    return sorted(
        (length, count * 100 // total)
        for length, count in length_freq(words)
    )
def print_length_graph_vert(words):
    """Print the vertical bar graph of word-length percentages.

    One row is printed per percentage level, counting down from the value
    returned by ``to_next_10`` to 1. A column shows " ** " on every row at or
    below its percentage. Below the rows come the column numbers (1-based,
    zero-padded to two digits below 10) and the "word length" caption.
    """
    relative_freq = get_percentage(words)
    heights = [pair[1] for pair in relative_freq]
    top_level = to_next_10(heights)
    column_count = len(relative_freq)

    print("\n% frequency")
    for level in range(top_level, 0, -1):
        # NOTE(review): this label was produced by an if/else on level < 10
        # whose two branches were identical; they presumably once differed
        # in padding to align one-digit labels - confirm the intended widths.
        cells = [" {} ".format(level)]
        for height in heights:
            cells.append(" ** " if int(height) >= level else " " * 4)
        print("".join(cells))

    axis = " " * 5
    for position in range(1, column_count + 1):
        if position < 10:
            axis += " 0{} ".format(position)
        else:
            axis += " {} ".format(position)
    print(axis + "\n" + " " * (column_count * 4 - 7) + "word length")
def to_next_10(bars):
    """Return the smallest multiple of 10 strictly greater than the largest bar.

    A largest value that is already a multiple of 10 still moves up to the
    next one (50 gives 60), which leaves headroom above the tallest column.

    Args:
        bars: Iterable of numeric bar heights; it is not modified.

    Returns:
        The multiple of 10 as an int. Empty input is treated as a largest
        value of 0 and therefore yields 10.
    """
    tallest = max(bars, default=0)
    return (int(tallest // 10) + 1) * 10
def print_results(words):
    """Print the full report for *words*.

    The report is the word summary (count, average length, maximum length,
    maximum frequency) followed by the length table, the horizontal graph
    and the vertical graph.
    """
    # All statistics are computed before anything is printed.
    summary = (
        (" Number of words = {}", len(words)),
        (" Avg word length = {:.2f}", avg_word_length(words)),
        (" Max word length = {}", max_word_length(words)),
        (" Max frequency = {}", max_frequency(words)),
    )
    print()
    print("Word summary (all words):")
    for template, value in summary:
        print(template.format(value))
    for printer in (print_length_table, print_length_graph_hori,
                    print_length_graph_vert):
        printer(words)
def main():
    """Ask for a file name, load its words and print the report."""
    filename = get_filename()
    print(" {} loaded ok.".format(filename))
    words = get_words_from_file(filename)
    print_results(words)


# Only run the interactive program when executed as a script, so that
# importing this module does not block on input().
if __name__ == "__main__":
    main()
示例终端输入/输出:
Please enter filename: blah.txt
blah.txt loaded ok.
Word summary (all words):
Number of words = 4
Avg word length = 2.25
Max word length = 4
Max frequency = 2
Len Freq
1 1
2 2
3 0
4 1
Len Freq Graph
1 25% =========================
2 50% ==================================================
3 0%
4 25% =========================
% frequency
60
59
58
57
56
55
54
53
52
51
50 **
49 **
48 **
47 **
46 **
45 **
44 **
43 **
42 **
41 **
40 **
39 **
38 **
37 **
36 **
35 **
34 **
33 **
32 **
31 **
30 **
29 **
28 **
27 **
26 **
25 ** ** **
24 ** ** **
23 ** ** **
22 ** ** **
21 ** ** **
20 ** ** **
19 ** ** **
18 ** ** **
17 ** ** **
16 ** ** **
15 ** ** **
14 ** ** **
13 ** ** **
12 ** ** **
11 ** ** **
10 ** ** **
9 ** ** **
8 ** ** **
7 ** ** **
6 ** ** **
5 ** ** **
4 ** ** **
3 ** ** **
2 ** ** **
1 ** ** **
01 02 03 04
word length
我现在需要修改代码,使其遵守以下规则:
只能使用 re 和 os 这两个库,不得使用其他任何库。
给定的正则表达式为:"[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+"
答案 0 :(得分:0)
如果不允许使用 collections 模块,可以自己重新实现 Counter 类中实际用到的那部分功能(也就是 __init__() 方法和 most_common() 方法)。
我不明白正则表达式应该用于什么。
编辑:好的,下面是 collections.Counter 的一个极简(简单粗暴的)实现。
class MyCounter(object):
    """Minimal replacement for the parts of ``collections.Counter`` used here.

    Only construction from an iterable and ``most_common`` are provided, and
    nothing outside the built-ins is required.
    """

    def __init__(self, iterable):
        """Count how many times each item occurs in *iterable*.

        Args:
            iterable: Any iterable of hashable items.
        """
        self._data = dict()
        for item in iterable:
            self._data[item] = self._data.get(item, 0) + 1

    def most_common(self, n=None):
        """Return ``(item, count)`` pairs ordered from most to least common.

        Args:
            n: Maximum number of pairs to return. ``None`` returns all of
                them; zero or a negative number returns an empty list.

        Returns:
            A new list of tuples. Items with equal counts keep the order in
            which they were first seen.
        """
        # dict.items() is a view that cannot be indexed or assigned to, so
        # the ranking is built with the built-in sorted(), ordered by count
        # (descending) rather than by the (item, count) tuple.
        ranked = sorted(
            self._data.items(), key=lambda pair: pair[1], reverse=True
        )
        if n is None:
            return ranked
        return ranked[:max(n, 0)]
##############################################################################
## the code from here down is not part of the solution, it is proof that the
## solution works
import unittest
from collections import Counter
class MyCounterTest(unittest.TestCase):
    """Compares MyCounter with collections.Counter on a small sample."""

    def test_single_most_common(self):
        """Both counters report 'a' with a count of 2 as the top entry."""
        sample = ['a', 'a', 'b', 'c']

        # Reference behaviour of collections.Counter.
        item, count = Counter(sample).most_common(n=1)[0]
        self.assertEqual(item, 'a')
        self.assertEqual(count, 2)

        # MyCounter has to produce the same answer.
        item, count = MyCounter(sample).most_common(n=1)[0]
        self.assertEqual(item, 'a')
        self.assertEqual(count, 2)

    def test_with_none(self):
        """Without a limit, both counters return one entry per distinct item."""
        sample = ['a', 'a', 'b', 'c']
        self.assertEqual(len(Counter(sample).most_common()), 3)
        self.assertEqual(len(MyCounter(sample).most_common()), 3)


if __name__ == '__main__':
    unittest.main()