Question

我正在编写一个程序来执行以下操作：

读取给定的文件名，并打印统计信息的快速摘要
打印单词长度频率表
打印单词长度频率的图表，然后打印一个空行。
以图形方式打印每个单词长度的相对频率。

以下是用于测试代码的文本文件数据：

This is before the start and should be ignored.
So should this
and this


*** START OF SYNTHETIC TEST CASE ***
a blah ba ba
*** END OF SYNTHETIC TEST CASE ***

This is after the end and should be ignored too.
Have a nice day.

到目前为止，这是我的代码：

import os
from collections import Counter

# Header line and row template for the word-length frequency table.
TABLE_TITLE = " Len  Freq"
FREQ_TABLE_TEMPLATE = "{:>4}{:>6}"

# Header line and row template for the horizontal relative-frequency graph.
GRAPH_TITLE = " Len  Freq Graph"
GRAPH_LINE_TEMPLATE = "{:>4}{:>5}% {}"


def get_filename():
    """Prompt until the user names an existing file and return that name.

    Each rejected name is echoed back with a "not found..." message before
    the prompt is shown again.
    """
    while True:
        candidate = input("Please enter filename: ")
        if os.path.isfile(candidate):
            return candidate
        print(candidate, "not found...")

def get_words_from_file(filename):
    """Return the cleaned-up words found in the marked section of filename.

    The file is read with open_and_read() and the resulting lines are
    passed through strip_data().
    """
    return strip_data(open_and_read(filename))

def open_and_read(filename):
    """Return the lines between the START and END marker lines of a file.

    Collection begins after the first line that starts with "*** START"
    and stops at the first following line that starts with "*** END OF";
    neither marker line is included.  Lines keep their trailing newline.
    If no END marker follows, everything up to the end of the file is
    returned.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of the lines inside the marked section, or an empty list
        when the file contains no START marker.
    """
    section = []
    inside = False

    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r', encoding='utf-8') as infile:
        for line in infile:
            if not inside:
                # Everything up to and including the START marker is skipped.
                inside = line.startswith("*** START")
                continue
            if line.startswith("*** END OF"):
                break
            section.append(line)

    return section


def strip_data(raw_data):
    """Turn raw text lines into a flat list of lower-case alphabetic words.

    The lines are tokenised by get_words() and the tokens are then
    filtered by remove_punctuation().
    """
    return remove_punctuation(get_words(raw_data))


def get_words(raw_data):
    """
    Break every line of raw_data into space-separated tokens.

    Surrounding newline, quote, dash, colon, semicolon, comma and full-stop
    characters are trimmed from each line and again from each token.
    Returns a list holding one single-item list per token.
    """
    trim_chars = '\n"-:\';,.'
    return [
        token.strip(trim_chars).split(' ')
        for line in raw_data
        for token in line.strip(trim_chars).split(' ')
    ]

def remove_punctuation(stripped_list):
    """
    Flatten stripped_list (a list of lists of strings) and keep only the
    purely alphabetic strings, converted to lower case.
    Returns the surviving words as a flat list.
    """
    return [
        token.lower()
        for group in stripped_list
        for token in group
        if token.isalpha()
    ]


def avg_word_length(words):
    """Return the mean length of the strings in words.

    Args:
        words: A list of words.

    Returns:
        The average word length as a float, or 0 when words is empty.
    """
    if not words:
        # Avoid dividing by zero when there are no words to average.
        return 0
    return sum(len(word) for word in words) / len(words)

def max_word_length(words):
    """Return the length of the longest word in words (0 for an empty list)."""
    return max((len(word) for word in words), default=0)


def max_frequency(words):
    """Return how many times the most common word occurs in words.

    Args:
        words: A list of words.

    Returns:
        The highest occurrence count of any single word, or 0 when
        words is empty.
    """
    if not words:
        # most_common(1) is empty for no input, so indexing it would fail.
        return 0
    return Counter(words).most_common(1)[0][1]


def length_freq(words):
    """Count how many words there are of each length.

    Args:
        words: A list of words.

    Returns:
        A list of (length, count) tuples.  Lengths that occur come first,
        in Counter.most_common() order; every length from 1 up to the
        longest word that never occurs follows as (length, 0) in ascending
        order.  An empty words list yields an empty list.
    """
    if not words:
        # max() below cannot handle an empty sequence.
        return []

    length_counts = Counter(len(word) for word in words)
    len_freq = length_counts.most_common()

    # Pad the gaps so callers get an entry for every length up to the max.
    len_freq.extend(
        (length, 0)
        for length in range(1, max(length_counts))
        if length not in length_counts
    )
    return len_freq


def print_length_table(words):
    """Print a blank line, the table title, then one row per word length.

    Rows come from length_freq() and are printed in ascending length order.
    """
    rows = sorted(length_freq(words))

    print()
    print(TABLE_TITLE)
    for length, count in rows:
        print(FREQ_TABLE_TEMPLATE.format(length, count))


def print_length_graph_hori(words):
    """Print the horizontal bar graph of relative word-length frequencies.

    After a blank line and the graph title, each length gets one line
    showing its percentage and a bar of one "=" per percentage point.
    """
    print()
    print(GRAPH_TITLE)
    for length, percent in get_percentage(words):
        print(GRAPH_LINE_TEMPLATE.format(length, percent, "=" * percent))


def get_percentage(words):
    """Return each word length's share of words as a whole percentage.

    Args:
        words: A list of words.

    Returns:
        A list of (length, percent) tuples sorted by length, where percent
        is the truncated integer percentage of words having that length.
        An empty words list yields an empty list.
    """
    if not words:
        # Nothing to count, and dividing by len(words) would fail.
        return []

    total = len(words)
    # Integer arithmetic keeps the truncation exact; going through a float
    # (count / total * 100) can land just below the true value, e.g.
    # int(29 / 100 * 100) == 28.
    return sorted(
        (length, count * 100 // total)
        for length, count in length_freq(words)
    )

def print_length_graph_vert(words):
    """Print the vertical bar graph of relative word-length frequencies.

    One row is printed per percentage point, from the top of the scale
    (as given by to_next_10) down to 1.  Each row starts with its
    percentage label followed by one four-character cell per word length,
    filled with " ** " when that length's percentage reaches the row.
    The x-axis labels and the "word length" caption are printed underneath.

    Args:
        words: A list of words.
    """
    relative_freq = get_percentage(words)
    bars = [percent for _, percent in relative_freq]
    top = to_next_10(bars)

    print("\n% frequency")
    for level in range(top, 0, -1):
        # Right-align the label in three columns so three-digit rows line
        # up with the one- and two-digit rows below them.
        cells = "".join(" ** " if bar >= level else " " * 4 for bar in bars)
        print("{:>3}  ".format(level) + cells)

    # Columns are labelled 01, 02, ... by position.
    labels = "".join(
        " {:02d} ".format(position)
        for position in range(1, len(relative_freq) + 1)
    )
    print(" " * 5 + labels)
    print(" " * (len(relative_freq) * 4 - 7) + "word length")


def to_next_10(bars):
    """Return the smallest multiple of 10 strictly greater than max(bars).

    A largest bar of 50 therefore gives 60, and a largest bar of 0 gives 10.

    Args:
        bars: A list of integer percentages.

    Returns:
        The next multiple of 10 above the largest bar, or 10 when bars
        is empty.
    """
    if not bars:
        # No bars to measure; fall back to the smallest scale.
        return 10
    return (max(bars) // 10 + 1) * 10


def print_results(words):
    """Print the word summary followed by the table and both graphs."""
    mean_len = avg_word_length(words)
    longest = max_word_length(words)
    top_count = max_frequency(words)

    # The leading empty string produces the blank line before the heading.
    summary = "\n".join([
        "",
        "Word summary (all words):",
        " Number of words = {}".format(len(words)),
        " Avg word length = {:.2f}".format(mean_len),
        " Max word length = {}".format(longest),
        " Max frequency = {}".format(top_count),
    ])
    print(summary)

    for printer in (
        print_length_table,
        print_length_graph_hori,
        print_length_graph_vert,
    ):
        printer(words)


def main():
    """ Ask for a file, confirm it loaded, and print its word statistics """
    filename = get_filename()
    print(" {} loaded ok.".format(filename))
    print_results(get_words_from_file(filename))


# Runs at module level, so the program starts whenever this file is
# executed or imported.
main()

示例终端输入/输出：

Please enter filename: blah.txt
 blah.txt loaded ok.

Word summary (all words):
 Number of words = 4
 Avg word length = 2.25
 Max word length = 4
 Max frequency = 2

 Len  Freq
   1     1
   2     2
   3     0
   4     1

 Len  Freq Graph
   1   25% =========================
   2   50% ==================================================
   3    0% 
   4   25% =========================

% frequency
 60                  
 59                  
 58                  
 57                  
 56                  
 55                  
 54                  
 53                  
 52                  
 51                  
 50       **         
 49       **         
 48       **         
 47       **         
 46       **         
 45       **         
 44       **         
 43       **         
 42       **         
 41       **         
 40       **         
 39       **         
 38       **         
 37       **         
 36       **         
 35       **         
 34       **         
 33       **         
 32       **         
 31       **         
 30       **         
 29       **         
 28       **         
 27       **         
 26       **         
 25   **  **      ** 
 24   **  **      ** 
 23   **  **      ** 
 22   **  **      ** 
 21   **  **      ** 
 20   **  **      ** 
 19   **  **      ** 
 18   **  **      ** 
 17   **  **      ** 
 16   **  **      ** 
 15   **  **      ** 
 14   **  **      ** 
 13   **  **      ** 
 12   **  **      ** 
 11   **  **      ** 
 10   **  **      ** 
  9   **  **      ** 
  8   **  **      ** 
  7   **  **      ** 
  6   **  **      ** 
  5   **  **      ** 
  4   **  **      ** 
  3   **  **      ** 
  2   **  **      ** 
  1   **  **      ** 
      01  02  03  04 
         word length

我现在需要更改代码以执行以下规则：

我只能导入 re 和 os 库，不能使用其他任何库。
代码现在必须使用模式"[a-z]+[-'][a-z]+|[a-z]+[']?|[a-z]+"

Answer 1

如果不允许使用 collections 模块，可以自己重新实现 Counter 类中用到的那部分功能（至少是你要用到的部分，即 __init__() 方法和 most_common() 方法）。

我不明白正则表达式应该用于什么。

编辑：好的，下面是 collections.Counter 的一个简单粗暴的实现。

class MyCounter(object):
    """Minimal stand-in for collections.Counter that uses only builtins.

    Only construction from an iterable and most_common() are provided.
    """

    def __init__(self, iterable):
        """
        initialize a counter object with something iterable

        Every item of the iterable must be hashable.
        """
        # maps each distinct item to the number of times it was seen
        self._data = dict()
        for item in iterable:
            self._data[item] = self._data.get(item, 0) + 1

    def most_common(self, n=None):
        """
        return (item, count) pairs ordered from the most to the least common.

        If n=None, return the whole list; otherwise return at most the
        first n pairs.  On Python 3.7+ (insertion-ordered dicts) items with
        equal counts stay in the order in which they were first seen.
        """
        # dict.items() is a non-indexable view on Python 3, and comparing
        # the (item, count) pairs directly would order them by item rather
        # than by count, so build a real list sorted on the count alone.
        # sorted() is a builtin and stable, so no extra import is needed.
        ranked = sorted(
            self._data.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )

        if n is None:
            return ranked

        return ranked[:n]

##############################################################################
## the code from here down is not part of the solution, it is proof that the
## solution works
import unittest
from collections import Counter

class MyCounterTest(unittest.TestCase):
    """Checks MyCounter against collections.Counter on a small sample."""

    def test_single_most_common(self):
        """
        the top entry is 'a' with a count of 2 for both implementations
        """
        sample = ['a', 'a', 'b', 'c']

        # the standard-library class goes first as the reference, then ours
        for counter_type in (Counter, MyCounter):
            top = counter_type(sample).most_common(n=1)[0]
            self.assertEqual(top[0], 'a')
            self.assertEqual(top[1], 2)

    def test_with_none(self):
        """
        most_common() without n returns one entry per distinct item
        """
        sample = ['a', 'a', 'b', 'c']

        for counter_type in (Counter, MyCounter):
            self.assertEqual(len(counter_type(sample).most_common()), 3)

# Run the test cases above only when this file is executed directly.
if __name__ == '__main__':
    unittest.main()

如何通过排除库来修改代码

1 个答案: